├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── Classes.R ├── DBanalysis.R ├── DBresults.R ├── GenericFunctions.R ├── countReads.R ├── data.R ├── peakreference.R ├── plots.R ├── timeclust.R └── timecourseTable.R ├── README.md ├── TCseq.Rproj ├── data ├── countsTable.rda ├── experiment.rda ├── experiment_BAMfile.rda ├── genomicIntervals.rda └── tca_ATAC.rda ├── man ├── DBanalysis.Rd ├── DBresult.Rd ├── TCA.Rd ├── TCA.accessors.Rd ├── clust-class.Rd ├── clust.accessors.Rd ├── countReads.Rd ├── counts.Rd ├── countsTable.Rd ├── experiment.Rd ├── experiment_BAMfile.Rd ├── genomicIntervals.Rd ├── peakreference.Rd ├── tca_ATAC.Rd ├── timeclust.Rd ├── timeclustplot.Rd └── timecourseTable.Rd ├── tests ├── testthat.R └── testthat │ └── test_TCseq.R └── vignettes ├── TCseq.Rnw ├── clusterRes.png └── subcluster.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: TCseq 2 | Type: Package 3 | Title: Time course sequencing data analysis 4 | Version: 1.23.2 5 | Author: Mengjun Wu , Lei Gu 6 | Maintainer: Mengjun Wu 7 | Description: Quantitative and differential analysis of epigenomic and 8 | transcriptomic time course sequencing data, clustering analysis 9 | and visualization of temporal patterns of time course data. 
10 | Depends: R (>= 3.4) 11 | License: GPL (>= 2) 12 | LazyData: TRUE 13 | Imports: edgeR, BiocGenerics, reshape2, GenomicRanges, IRanges, 14 | SummarizedExperiment, GenomicAlignments, Rsamtools, e1071, 15 | cluster, ggplot2, grid, grDevices, stats, utils, methods, locfit 16 | Suggests: testthat 17 | biocViews: Epigenetics, TimeCourse, Sequencing, ChIPSeq, RNASeq, 18 | DifferentialExpression, Clustering, Visualization 19 | RoxygenNote: 7.2.3 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(DBanalysis) 4 | export(DBresult) 5 | export(DBresult.cluster) 6 | export(TCA) 7 | export(TCAFromSummarizedExperiment) 8 | export(clustCenters) 9 | export(clustCluster) 10 | export(clustData) 11 | export(clustMembership) 12 | export(clustResults) 13 | export(countReads) 14 | export(genomicFeature) 15 | export(peakreference) 16 | export(tcTable) 17 | export(timeclust) 18 | export(timeclustplot) 19 | export(timecourseTable) 20 | exportClasses(TCA) 21 | exportClasses(clust) 22 | exportMethods('counts<-') 23 | exportMethods(clustCenters) 24 | exportMethods(clustCluster) 25 | exportMethods(clustData) 26 | exportMethods(clustMembership) 27 | exportMethods(clustResults) 28 | exportMethods(counts) 29 | exportMethods(design) 30 | exportMethods(tcTable) 31 | import(GenomicRanges) 32 | import(SummarizedExperiment) 33 | import(cluster) 34 | import(e1071) 35 | import(edgeR) 36 | import(ggplot2) 37 | import(grid) 38 | import(locfit) 39 | import(reshape2) 40 | importFrom(BiocGenerics,"counts<-") 41 | importFrom(BiocGenerics,counts) 42 | importFrom(BiocGenerics,design) 43 | importFrom(GenomicAlignments,summarizeOverlaps) 44 | importFrom(IRanges,IRanges) 45 | importFrom(Rsamtools,BamFile) 46 | importFrom(Rsamtools,BamFileList) 47 | importFrom(grDevices,rainbow) 48 | importFrom(methods,new) 49 | 
importFrom(methods,validObject) 50 | importFrom(stats,as.dist) 51 | importFrom(stats,complete.cases) 52 | importFrom(stats,cor) 53 | importFrom(stats,cutree) 54 | importFrom(stats,hclust) 55 | importFrom(stats,kmeans) 56 | importFrom(stats,model.matrix) 57 | importFrom(stats,sd) 58 | importFrom(stats,time) 59 | importFrom(utils,capture.output) 60 | importFrom(utils,read.table) 61 | -------------------------------------------------------------------------------- /R/Classes.R: -------------------------------------------------------------------------------- 1 | #' clust class 2 | #' 3 | #'\code{clust} is a S4 class for storing results of the clustering 4 | #'analysis of time course data. 5 | #' 6 | #'@section Slots: 7 | #'Object of \code{clust} class contains the following slots: 8 | #'\describe{ 9 | #' \item{\code{method}}{clustering method used} 10 | #' \item{\code{dist}}{distance metric used} 11 | #' \item{\code{data}}{a matrix of original or standardized data used 12 | #' in the analysis} 13 | #' \item{\code{centers}}{a matrix of cluster centers} 14 | #' \item{\code{cluster}}{an integer vector of length \eqn{n} (the 15 | #' integers are the indices of clusters the data points belong to. 
#' For the fuzzy cmeans clustering method, a data point is assigned
#' to the closest cluster to which the data point has highest
#' membership value.}
#' \item{\code{membership}}{a matrix of membership values of the
#' data points to each clusters}
#'}
#'@details
#'The clust objects are returned from \code{\link{timeclust}} and have
#'a show method printing a compact summary of their contents
#'
#'@author Mengjun Wu
#'
#'@seealso \code{\link{timeclust}}, \code{\link{@}}
#'@exportClass clust

clust <- setClass("clust", slots = c(method = "character",
                                     dist = "character",
                                     data = "matrix",
                                     centers = "matrix",
                                     cluster = "integer",
                                     membership = "matrix"))

#'@rdname TCA
#'@export
setClass("TCA", slots = c(design = "data.frame", counts = "matrix",
                          genomicFeature = "data.frame",
                          DBfit = "DGEGLM", contrasts = "matrix",
                          tcTable = "matrix", clusterRes = "clust"),
         prototype = list(counts = matrix(0L, 0L, 0L),
                          design = data.frame()))

# Validity method for TCA objects.
#
# Checks, in order: counts are numeric, free of NA, non-negative and whole
# numbers; when a count table is present its dimensions agree with 'design'
# (columns) and 'genomicFeature' (rows); 'design' and 'genomicFeature' carry
# their required columns (matched case-insensitively).
#
# Problems are reported by returning the message, which is the documented
# contract of a validity method; validObject()/new() turn it into an error.
# A validity method cannot modify the object, so no coercion is attempted
# here: whole-number doubles are accepted as they are and the constructor
# TCA() is responsible for storing counts as integers.
setValidity("TCA", function(object) {
  counts <- object@counts
  design <- object@design
  genomicFeature <- object@genomicFeature

  if (!is.numeric(counts)) {
    return("All counts must be numeric.")
  }
  if (anyNA(counts)) {
    return("NA values are not allowed in counts.")
  }
  if (any(counts < 0)) {
    return("counts contain negative number(s), all counts must be non-negative.")
  }
  if (!is.integer(counts) && any(round(counts) != counts)) {
    return("All counts must be integers.")
  }

  # A 0 x 0 matrix means "no count table supplied yet"; dimensions are only
  # cross-checked once counts are present.
  has_counts <- nrow(counts) > 0L || ncol(counts) > 0L
  if (has_counts) {
    if (ncol(counts) != nrow(design)) {
      return("Number of columns in 'counts' must equal to number of rows in 'design'.")
    }
    if (nrow(counts) != nrow(genomicFeature)) {
      return("Number of rows in 'counts' must equal to number of rows in 'genomicFeature'")
    }
  }

  design_fields <- c("sampleid", "timepoint", "group")
  if (!all(design_fields %in% tolower(colnames(design)))) {
    return("One or more following required fields in 'design' are missing: 'sampleid', 'timepoint', 'group', check if the columns are correctly named or if the corresponding information is provided.")
  }
  feature_fields <- c("id", "chr", "start", "end")
  if (!all(feature_fields %in% tolower(colnames(genomicFeature)))) {
    return("One or more following required fields in 'genomicFeature' are missing: 'id', 'chr', 'start','end', check if the columns are correctly named or if the corresponding information is provided.")
  }
  TRUE
})

#'TCA class and constructor
#'
#'\code{TCA} is a S4 class for storing input data, results of
#'differential analysis and clustering analysis. A \code{TCA} object
#'can be created by the constructor function taking a table of sample
#'information, a table of the genomic coordinates of features, and read
#'count table (optional).
#'
#'@param design a data frame containing information of
#'samples/libraries.
For time course analysis, design table should 99 | #'contain at least three columns (case insensitive): \code{sampleid}, 100 | #'\code{timepoint} and \code{group} providing time point and group 101 | #'information of each sample/library. If \code{counts} is not provided 102 | #'when creating \code{TCA} object, an optional column \code{bamfile} can 103 | #'be used to provide BAM filename of each sample/library and generate 104 | #'count table using \code{\link{countReads}} function later. 105 | #' 106 | #'@param counts an integer matrix containing read counts. Rows 107 | #'correspond to genomic features and columns to samples/libraries. 108 | #'The names of the columns should be the same as the time points 109 | #'in \code{design}. 110 | #' 111 | #'@param genomicFeature a data frame or a GRanges object containing 112 | #'genomic coordinates of features of interest (e.g. genes in RNA-seq, 113 | #'binding regions in ChIP-seq). If genomicFeature is a data frame, 114 | #'four columns are required in \code{genomicFeature}: \code{id}, 115 | #'\code{chr}, \code{start}, \code{end}; if genomicFeature is a GRanges 116 | #'object, the metadata column "\code{id}" is required. For 117 | #'\code{TCAFromSummarizedExperiment}, genomicFeature must be 118 | #'provided if \code{se} is a SummarizedExperiment object. 119 | #' 120 | #' 121 | #'@param se A SummarizedExperiment or a RangedSummarizedExperiment 122 | #'object. The object might contain multiple assays in the assay list, 123 | #'only the first one will be taken to construct TCA object. 124 | #'For SummarizedExperiment object, \code{genomicFeature} 125 | #'must be provided while for RangedSummarizedExperiment object, 126 | #'the genomic features will be extracted directly from the object. 127 | #' 128 | #'@param zero.based Logical. If TRUE, the start positions of the 129 | #'genomic ranges in the returned \code{TCA} object are \emph{0-based}, 130 | #'if FALSE, the start positions will be \emph{1-based}.
131 | #' 132 | #'@return A TCA object 133 | #' 134 | #'@details A TCA object can be created without providing read counts, 135 | #'read counts can be provided by \code{\link{counts}} or generated by 136 | #'\code{\link{countReads}}. For the read counts, the number of rows 137 | #'should equal to that in \code{genomicFeature} and the number of columns 138 | #'should equal to number of rows in \code{design}; in addition, the 139 | #'column names should be the same as the time points in \code{design}. 140 | #'Input data and analysis results in a TCA object can be accessed by using 141 | #'corresponding accessors and functions. 142 | #'The TCA objects also have a show method printing a compact summary of 143 | #'their contents; see \code{\link{counts}}, \code{\link{TCA.accessors}}, 144 | #'\code{\link{DBresult}}, \code{\link{tcTable}}, \code{\link{timeclust}}. 145 | #'\code{clust} 146 | #'@author Mengjun Wu 147 | #'@seealso \code{\link{counts}}, \code{\link{TCA.accessors}}, 148 | #'\code{\link{DBresult}}, \code{\link{timeclust}}, \code{\link{clust}} 149 | #' 150 | #'@author Mengjun Wu 151 | #' 152 | #'@examples 153 | #'#create data frame of experiment design: 4 time points and 2 replicates for each time point.
#'d <- data.frame(sampleID = 1:8, group = rep(c(1, 2, 3, 4), 2),
#'                timepoint = rep(c('0h', '24h', '48h', '72h'), 2))
#'
#'
#'#create data frame of genomic intervals of interest
#'gf <- data.frame(chr = c(rep('chr1', 3), rep('chr2', 2), rep('chr4', 2)),
#'                 start = seq(100, 2000, by = 300),
#'                 end = seq(100, 2000, by = 300) + 150,
#'                 id = paste0('peak', 1:7))
#'tca <- TCA(design = d, genomicFeature = gf)
#'genomicFeature(tca)
#'
#'#if count table is available
#'c <- matrix(sample(1000, 56), nrow = 7, dimnames = list(paste0('peak', 1:7), 1:8))
#'tca <- TCA(design = d, counts = c, genomicFeature = gf)
#'# replace the count table of a \code{TCA} object
#'c2 <- matrix(sample(500, 56), nrow = 7, dimnames = list(paste0('peak', 1:7), 1:8))
#'counts(tca) <- c2
#'
#'
#'@export
TCA <- function(design, counts = matrix(0L, 0L, 0L), genomicFeature,
                zero.based = TRUE) {
  # ---- counts: numeric matrix of non-negative whole numbers ---------------
  if (!is.numeric(counts)) {
    stop("All counts must be numeric.")
  }
  if (!is.matrix(counts)) {
    stop("counts must be a matrix.")
  }
  if (anyNA(counts)) {
    stop("NA values are not allowed in counts.")
  }
  if (any(counts < 0)) {
    stop("counts contain negative number(s), all counts must be positive")
  }
  if (!is.integer(counts)) {
    if (any(round(counts) != counts)) {
      stop("All counts must be integers.")
    }
    # storage.mode<- keeps dim and dimnames while switching to integer storage
    storage.mode(counts) <- "integer"
    warning("All counts are coerced to integers.")
  }

  # ---- design / genomicFeature: accepted container types -------------------
  if (!is.data.frame(design)) {
    stop("design must be 'data.frame'.")
  }
  if (!is.data.frame(genomicFeature) &&
      !is(genomicFeature, "GRanges")) {
    stop("genomicFeature must be a data frame or a GRanges object.")
  }

  if (is.data.frame(genomicFeature)) {
    feature_fields <- c("id", "chr", "start", "end")
    if (!all(feature_fields %in% tolower(colnames(genomicFeature)))) {
      stop("One or more following required fields in genomicFeature are missing: 'id', 'chr', 'start','end', check if the columns are correctly named or if the corresponding information is provided.")
    }
    if (!all(feature_fields %in% colnames(genomicFeature))) {
      colnames(genomicFeature) <- tolower(colnames(genomicFeature))
      warning("colnames of genomicFeature are all forced to lowercase.")
    }
    # data-frame input is taken as 0-based; shift when 1-based output is asked
    if (!zero.based) {
      genomicFeature$start <- genomicFeature$start + 1
    }
  } else {
    # GRanges input
    meta_names <- colnames(elementMetadata(genomicFeature))
    if (!"id" %in% tolower(meta_names)) {
      stop("Required metadata of genomicFeature is missing: 'id'.")
    }
    if (!"id" %in% meta_names) {
      colnames(elementMetadata(genomicFeature)) <- tolower(meta_names)
      warning("Names of metadata of genomicFeature are all forced to lowercase.")
    }
    genomicFeature <- as(genomicFeature, "data.frame")
    # The data-frame form of a GRanges names its chromosome column 'seqnames',
    # while the TCA validity check requires 'chr'; rename when needed.
    if (!"chr" %in% colnames(genomicFeature) &&
        "seqnames" %in% colnames(genomicFeature)) {
      colnames(genomicFeature)[colnames(genomicFeature) == "seqnames"] <- "chr"
    }
    # GRanges input is taken as 1-based; shift when 0-based output is asked.
    # (The original assigned to the misspelled 'enomicFeature', which raised
    # "object not found" for every GRanges input with zero.based = TRUE.)
    if (zero.based) {
      genomicFeature$start <- genomicFeature$start - 1
    }
  }

  # ---- dimension agreement, only when a count table was supplied ----------
  if (nrow(counts) > 0L || ncol(counts) > 0L) {
    if (ncol(counts) != nrow(design)) {
      stop("number of columns in 'counts' must equal to number of rows in 'design'.")
    }
    if (nrow(counts) != nrow(genomicFeature)) {
      stop("Number of rows in 'counts' must equal to number of rows in 'genomicFeature'")
    }
  }

  # ---- design columns ------------------------------------------------------
  if (!all(c("sampleid", "timepoint", "group") %in%
           tolower(colnames(design)))) {
    stop("One or more following required fields in 'design' are missing: 'sampleid', 'timepoint', 'group', check if the columns are correctly named or if the corresponding information is provided.")
  }
  colnames(design) <- tolower(colnames(design))
  # is.character() instead of class(x) != "character": the latter yields a
  # length > 1 condition for columns carrying several classes.
  if (!is.character(design$timepoint)) {
    design$timepoint <- as.character(design$timepoint)
    warning("time points in 'design' are not characters, converted to characters")
  }

  new("TCA", design = design, counts = counts,
      genomicFeature = genomicFeature)
}

#'@rdname TCA
#'@export
TCAFromSummarizedExperiment <- function(se, genomicFeature = NULL) {
  if (!is(se, "SummarizedExperiment") &&
      !is(se, "RangedSummarizedExperiment")) {
    stop("se must be a SummarizedExperiment or a RangedSummarizedExperiment object.")
  }
  # Test the ranged class first: a RangedSummarizedExperiment also satisfies
  # is(se, "SummarizedExperiment"), so checking the parent class first would
  # demand 'genomicFeature' even though the ranges are stored in the object.
  if (is(se, "RangedSummarizedExperiment")) {
    genomicFeature <- rowRanges(se)
  } else if (is.null(genomicFeature)) {
    stop("genomicFeature must be provided")
  }
  # Only the first assay is used as the count table.
  TCA(design = as(colData(se), "data.frame"), counts = assay(se, 1),
      genomicFeature = genomicFeature)
}

#Set inheritance
#The Class LargeDataObject from limma has \code{show} method for objects of the class.

setIs("clust", "LargeDataObject")
setIs("TCA", "LargeDataObject")

# --------------------------------------------------------------------------------
# /R/DBanalysis.R:
# --------------------------------------------------------------------------------
#' Perform differential expression analysis
#'
#' This function is a wrapper for the \code{\link{glmFit}} in edgeR package.
#'
#' @param object a \code{TCA} object.
#'
#' @param categories character string giving which column in \code{design}
#' will be used for differential analysis. For time course analysis, the default
#' column is "\code{timepoint}".
#'
#' @param norm.lib logical indicating whether or not use effective
#' library size when perform normalization. See \code{\link{counts}} for more
#' details.
#'
#' @param filter.type character string indicating which type of count
#' (raw or normalized) is used when performing filtering. Options are
#' "\code{raw}", "\code{cpm}", "\code{rpkm}" or \code{NULL} (the default).
#' No filtering will be performed when \code{NULL}; any other value is an
#' error.
#'
#' @param filter.value a numeric value; minimum values of selected
#' \code{filter.type} ("\code{raw}", "\code{cpm}", "\code{rpkm}"). It is used in
#' combination with \code{samplePassfilter}. Required whenever
#' \code{filter.type} is not \code{NULL}.
#'
#' @param samplePassfilter a numeric value indicating the minimum number
#' of samples/libraries in which a genomic feature has counts value
#' (raw or normalized) more than \code{filter.value}. Smaller than this number,
#' the genomic feature will be filtered out.
#'
#' @param ... additional arguments passed to \code{\link{glmFit}} from
#' \code{edgeR} package. When \code{filter.type} is "\code{cpm}" or
#' "\code{rpkm}" the same arguments are also forwarded to
#' \code{\link{cpm}} / \code{\link{rpkm}}.
#'
#' @details The differential event is detected by using the generalized
#' linear model (GLM) methods (McCarthy et al, 2012). This function
#' fits the read counts of each genes to a negative binomial glm by
#' using \code{\link{glmFit}} function from edgeR. To further test the
#' significance of changes, see \code{DBresult}, \code{TopDBresult}
#'
#'
#' @return
#' A \code{TCA} object
#'
#' @author
#' Mengjun Wu, Lei Gu
#'
#' @references McCarthy,D.J.,Chen, Y., & Smyth, G. K.(2012). Differential
#' expression analysis of multifactor RNA-Seq experiments with respect to
#' biological variation. Nucleic acids research 40, 4288-4297.
#'
#' @seealso \code{DBresult}, \code{TopDBresult}
#'
#' @examples
#' data(tca_ATAC)
#' tca_ATAC <- DBanalysis(tca_ATAC)
#'
#' @export
DBanalysis <- function(object, categories = "timepoint", norm.lib = TRUE,
                       filter.type = NULL, filter.value = NULL,
                       samplePassfilter = 2, ...) {
  if (!categories %in% colnames(object@design)) {
    stop(paste0("Can not find ", categories,
                " in design, please check if the corresponding field is missing or a different name is used."))
  }

  # All pairwise contrasts between the levels of the chosen column; consumed
  # later by DBresult().
  object@contrasts <- contrastMatrix(object, categories)

  group <- object@design[[categories]]
  y <- DGEList(counts = object@counts, group = group)
  if (norm.lib) {
    y <- calcNormFactors(y)
  }

  if (!is.null(filter.type)) {
    # An unmatched switch() returns NULL, which used to replace the DGEList
    # and fail much later; reject unknown types up front.
    if (!filter.type %in% c("raw", "cpm", "rpkm")) {
      stop("\"filter.type\" should be one of 'raw', 'cpm', 'rpkm' or NULL.")
    }
    if (is.null(filter.value)) {
      # The original built this message and then raised a truncated one.
      stop(paste0("\"filter.value\" is required to be specified for the chosen filter.type ",
                  filter.type, "."))
    }
    abundance <- switch(filter.type,
      raw = y$counts,
      cpm = cpm(y, ...),
      rpkm = {
        giwidth <- object@genomicFeature$end - object@genomicFeature$start
        rpkm(y, gene.length = giwidth, ...)
      }
    )
    # keep features exceeding the threshold in enough samples
    ind <- rowSums(abundance > filter.value) >= samplePassfilter
    y <- y[ind, , keep.lib.sizes = FALSE]
  }

  design <- model.matrix(~0 + group, data = y$samples)
  colnames(design) <- levels(y$samples$group)
  # Put the columns in order of first appearance so they line up with the
  # rows of the contrast matrix. Select by NAME: unique(group) may be a
  # factor or numeric, which would otherwise index by integer code/position.
  design <- design[, as.character(unique(group)), drop = FALSE]
  y <- estimateDisp(y, design)
  object@DBfit <- glmFit(y, design, ...)
  object
}

# Initialize a contrast table with all possible combinations of groups in the
# defined categories.
#
# Rows are the unique values of object@design[[categories]] in order of first
# appearance. For every pair (i < j) two columns are produced:
# "<j>vs<i>" (row j = 1, row i = -1) followed by "<i>vs<j>" (signs swapped).
contrastMatrix <- function(object, categories) {
  ca <- unique(object@design[[categories]])
  a <- length(ca)
  b <- 2 * choose(a, 2)
  contrastM <- matrix(0, a, b)
  name <- character(b)
  pair <- 0
  for (i in seq_len(max(a - 1, 0))) {
    for (j in (i + 1):a) {
      pair <- pair + 1
      fwd <- 2 * pair - 1 # column for "<j>vs<i>"
      rev <- 2 * pair     # column for "<i>vs<j>"
      name[fwd] <- paste0(ca[j], "vs", ca[i])
      name[rev] <- paste0(ca[i], "vs", ca[j])
      contrastM[i, fwd] <- -1
      contrastM[j, fwd] <- 1
      contrastM[j, rev] <- -1
      contrastM[i, rev] <- 1
    }
  }
  dimnames(contrastM) <- list(as.character(ca), name)
  contrastM
}

# --------------------------------------------------------------------------------
# /R/DBresults.R:
# --------------------------------------------------------------------------------
#' This function tests for differential expression
#'
#' This function is a wrapper for \code{\link{glmLRT}} in edgeR package.
#' It performs likelihood ratio tests for given coefficients contrasts
#' after fitting read counts to a negative binomial glm by
#' \code{\link{DBanalysis}}. \code{DBresult} also extracts the
#' differential analysis results of given contrasts at a chosen significance level.
#' \code{DBresult.cluster} returns similar results but only
#' contain genomic features belong to a given cluster.
#'
#' @name DBresult
#'
#' @param object a \code{TCA} object, for \code{DBresult},
#' \code{DBanalysis} should already be called on the object;
#' for \code{DBresult.cluster}, both \code{DBanalysis} and
#' \code{timeclust} should be already called.
17 | #' 18 | #' @param group1 character string giving the group to be compared with, 19 | #' i.e., the denominator in the fold changes. group1 can be set NULL and 20 | #' will be ignored if the comparisons are passed to \code{contrasts} 21 | #' 22 | #' @param group2 a character vector giving the other groups to 23 | #' compare with \code{group1}, i.e., the numerator in the fold changes. 24 | #' group2 can be set NULL and will be ignored if the comparisons are 25 | #' passed to \code{contrasts} 26 | #' 27 | #' @param contrasts a character vector, each string in 28 | #' the vector gives a contrast of two groups with the format 29 | #' "group2vsgroup1", group1 is the denominator level in the fold 30 | #' changes and group2 is the numerator 31 | #' level in the fold changes. 32 | #' 33 | #' @param p.adjust character string specifying a correction method 34 | #' for p-values. Options are "\code{holm}", "\code{hochberg}", 35 | #' "\code{hommel}", "\code{bonferroni}", "\code{BH}", "\code{BY}", 36 | #' "\code{fdr}", and "\code{none}". 37 | #' 38 | #' @param top.sig logical if TRUE, only genomic regions with 39 | #' given log2-fold changes and significance levels (p-value) 40 | #' will be returned. Log2-fold changes are defined by \code{abs.fold} 41 | #' and \code{direction}; significance levels are defined by \code{pvalue} 42 | #' and \code{pvalue.threshold} 43 | #' 44 | #' @param pvalue character string specifying the type of p-values 45 | #' used for defining the significance level (\code{PValue} 46 | #' or adjusted p-value \code{paj}) 47 | #' 48 | #' @param pvalue.threshold a numeric value giving threshold of 49 | #' selected p-value. Significant changes have lower 50 | #' (adjusted) p-values than the threshold. 51 | #' 52 | #' @param abs.fold a numeric value, the minimum absolute log2-fold 53 | #' changes. The returned genomic regions have changes 54 | #' with absolute log2-fold changes exceeding \code{abs.fold}.
55 | #' 56 | #' @param direction character string specifying the direction of fold 57 | #' changes. "\code{up}": positive fold changes; "\code{down}": 58 | #' negative fold changes; "\code{both}": both positive and 59 | #' negative fold changes. 60 | #' 61 | #' @param cluster an integer giving the number of cluster from which 62 | #' genomic features are extracted. 63 | #' 64 | #' @param cmthreshold a numeric value, this argument is applicable 65 | #' only if the \code{cmeans} clustering method is selected when calling 66 | #' \code{\link{timeclust}} function. If not NULL, the result table of 67 | #' genomic features that belong to the defined \code{cluster} and 68 | #' the membership values to this cluster exceed \code{cmthreshold} 69 | #' are extracted. 70 | #' 71 | #' @param result.type character string giving the data type of return 72 | #' value. Options are "GRangesList" and "list". 73 | #' 74 | #' @details This function uses \code{\link{glmLRT}} from edgeR which 75 | #' perform likelihood ratio tests for the significance of changes. 76 | #' For more details, 77 | #' see \code{\link{glmLRT}} 78 | #' 79 | #' @note If not NULL \code{group1}, \code{group2} and \code{contrasts}, 80 | #' result tables are extracted from comparisons in \code{contrasts}. 81 | #' 82 | #' @return 83 | #' A list or a GRangesList. 84 | #' If \code{result.type} is "GRangesList", a GRangesList is returned containing 85 | #' the differential analysis results for all provided contrasts. Each GRanges 86 | #' object of the list is one contrast, the analysis results are contained in 4 87 | #' metadata columns: 88 | #' 89 | #' @return \code{logFC} log2-fold changes between two groups. 90 | #' 91 | #' @return \code{PValue} p-values. 92 | #' 93 | #' @return \code{paj} adjusted p-values 94 | #' 95 | #' @return \code{id} name of genomic features 96 | #' 97 | #' If \code{result.type} is "list", a list of data frames is returned.
#' Each data frame contains one contrast with the following columns:
#'
#' @return \code{logFC} log2-fold changes between two groups.
#'
#' @return \code{PValue} p-values.
#'
#' @return \code{paj} adjusted p-values
#'
#' @return \code{chr} name of chromosomes
#'
#' @return \code{start} starting positions of features in the
#' chromosomes
#'
#' @return \code{end} ending positions of features in the chromosomes
#'
#' @return \code{id} name of genomic features
#'
#' @author
#' Mengjun Wu, Lei Gu
#'
#' @seealso
#'
#' \code{\link{glmLRT}}
#'
#' @examples
#' data(tca_ATAC)
#' tca_ATAC <- DBanalysis(tca_ATAC)
#' ### extract differential analysis of 24h, 72h to 0h
#' # set the contrasts using the 'group1' and 'group2' parameters
#' res1 <- DBresult(tca_ATAC, group1 = '0h', group2 = c('24h', '72h'))
#' # one can get the same result by setting the contrasts using the 'contrasts' parameter
#' res2 <- DBresult(tca_ATAC, contrasts = c('24hvs0h', '72hvs0h'))
#' # extract significant differential events
#' res.sig <- DBresult(tca_ATAC, contrasts = c('24hvs0h', '72hvs0h'),
#'                     top.sig = TRUE)
#'
#' # extract differential analysis of 24h, 72h to 0h of a given cluster
#' tca_ATAC <- timecourseTable(tca_ATAC, filter = TRUE)
#' tca_ATAC <- timeclust(tca_ATAC, algo = 'cm', k = 6)
#' res_cluster1 <- DBresult.cluster(tca_ATAC, group1 = '0h',
#'                                  group2 = c('24h', '72h'),
#'                                  cluster = 1)
#'
#'
#'
#' @export
DBresult <- function(object, group1 = NULL, group2 = NULL,
                     contrasts = NULL, p.adjust = "fdr",
                     top.sig = FALSE, pvalue = "paj",
                     pvalue.threshold = 0.05, abs.fold = 2,
                     direction = "both", result.type = "GRangesList") {
  if (is.null(group1) && is.null(group2) && is.null(contrasts)) {
    stop("Either information of groups to compare or \"contrasts\" should be provided")
  }
  # 'contrasts' takes precedence; group1/group2 are only used without it.
  if (is.null(contrasts)) {
    # With only one of the two groups the generated names would be malformed
    # (e.g. "24hvs") and fail later with an unrelated subscript error.
    if (is.null(group1) || is.null(group2)) {
      stop("Both \"group1\" and \"group2\" should be provided when \"contrasts\" is not specified")
    }
    if (any(group2 %in% group1)) {
      warning("Members in group1 are also found in group2, overlapped members are removed from group2.")
      group2 <- group2[!group2 %in% group1]
    }
    contrasts <- contrastNames(group1, group2)
  }
  if (!p.adjust %in% c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")) {
    stop("Method for adjusting P-values should be one of following methods: 'holm', 'hochberg', 'hommel', 'bonferroni', 'BH', 'BY', 'fdr', 'none'. Character string is case sensitive.")
  }

  fit <- object@DBfit
  contrast.table <- object@contrasts
  unknown <- setdiff(contrasts, colnames(contrast.table))
  if (length(unknown) > 0) {
    stop(paste0("Unknown contrast(s): ", paste(unknown, collapse = ", "),
                ". Contrasts should have the format \"group2vsgroup1\" and use groups present when DBanalysis was called."))
  }

  # genomic coordinates of the features that were actually fitted
  gi <- object@genomicFeature[object@genomicFeature$id %in%
                                row.names(fit$coefficients), ]
  gi <- gi[, c("chr", "start", "end", "id")]

  res <- list()
  for (i in contrasts) {
    lrt <- glmLRT(fit, contrast = contrast.table[, i])
    restmp <- lrt$table[, c("logFC", "PValue")]
    # stats:: prefix because the argument 'p.adjust' shadows the function name
    restmp$paj <- stats::p.adjust(restmp$PValue, method = p.adjust)
    res[[i]] <- cbind(restmp, gi, stringsAsFactors = FALSE)
  }
  if (top.sig) {
    res <- DBresult.filter(x = res, pvalue = pvalue,
                           pvalue.threshold = pvalue.threshold,
                           abs.fold = abs.fold,
                           direction = direction)
  }
  if (tolower(result.type) == "grangeslist") {
    res <- resultsToGRangesList(res)
  }

  res
}

#' @rdname DBresult
#' @export
DBresult.cluster <- function(object, group1 = NULL, group2 = NULL,
                             contrasts = NULL, p.adjust = "fdr",
                             top.sig = FALSE, pvalue = "paj",
                             pvalue.threshold = 0.05, abs.fold = 2,
                             direction = "both", cluster, cmthreshold = NULL,
                             result.type = "GRangesList") {
  clusters <- object@clusterRes@cluster
  if (length(clusters) == 0) {
    stop("No cluster information provided, clustering analysis must be performed first")
  }
  DBres <- DBresult(object, group1 = group1, group2 = group2,
                    contrasts = contrasts, p.adjust = p.adjust,
                    top.sig = top.sig, pvalue = pvalue,
                    pvalue.threshold = pvalue.threshold,
                    abs.fold = abs.fold,
                    direction = direction, result.type = "list")

  # Features assigned to the requested cluster (identical for every contrast,
  # so computed once).
  in_cluster <- which(clusters == cluster)
  clusternames <- names(clusters)[in_cluster]
  if (!is.null(cmthreshold)) {
    membership <- object@clusterRes@membership
    # Test for a missing membership matrix BEFORE indexing it; indexing an
    # empty matrix fails with "subscript out of bounds".
    if (length(membership) == 0) {
      stop("No membership matrix found. To get membership matrix, please choose 'cmeans' clustering method when calling timeclust")
    }
    degree <- membership[in_cluster, cluster]
    clusternames <- clusternames[which(degree > cmthreshold)]
  }

  # Only pick cluster members that are present in each result table:
  # indexing a data frame by absent row names inserts all-NA rows (this
  # happens e.g. with top.sig = TRUE).
  res <- lapply(DBres, function(tab) {
    tab[clusternames[clusternames %in% rownames(tab)], , drop = FALSE]
  })
  if (tolower(result.type) == "grangeslist") {
    res <- resultsToGRangesList(res)
  }
  res
}

# Convert a named list of result data frames (one per contrast) into a
# GRangesList with one element per contrast.
#
# The split factor is built from the number of ROWS of each table; lengths()
# on a list of data frames gives the number of columns, which mis-assigned
# ranges to contrasts. Using the list names as factor levels keeps the
# contrasts in input order and keeps contrasts with no rows as empty elements.
resultsToGRangesList <- function(res) {
  n_rows <- vapply(res, nrow, integer(1))
  gr <- as(do.call(rbind, unname(res)), "GRanges")
  split(gr, factor(rep(names(res), n_rows), levels = names(res)))
}

# Build contrast names "<group2>vs<group1>" from given strings; group1 is a
# string, group2 can be a string or a vector of strings.
contrastNames <- function(group1, group2) {
  # paste0() would turn a zero-length group2 into "vs<group1>"
  if (length(group2) == 0) {
    return(character(0))
  }
  paste0(group2, "vs", group1)
}

# Keep, in every result table of the list 'x', only the rows passing the
# log2-fold-change and p-value thresholds.
#   pvalue           column used for significance: "PValue" or "paj"
#   pvalue.threshold rows are kept when the chosen p-value is strictly lower
#   abs.fold         minimum absolute log2 fold change (>= 0)
#   direction        "up", "down" or "both"; for "both" the up-regulated rows
#                    come first, followed by the down-regulated rows
DBresult.filter <- function(x, pvalue = "paj", pvalue.threshold = 0.05,
                            abs.fold = 2, direction = "both") {
  if (abs.fold < 0) {
    stop("\"abs.fold\" should be a non-negative number.")
  }
  # An unrecognised direction used to skip fold filtering silently.
  direction <- match.arg(direction, c("both", "up", "down"))
  if (!pvalue %in% c("PValue", "paj")) {
    stop("\"pvalue\" should be one of 'PValue', 'paj'.")
  }
  lapply(x, function(dt) {
    up <- which(dt$logFC >= abs.fold)
    down <- which(dt$logFC <= -abs.fold)
    keep <- switch(direction,
      up = up,
      down = down,
      # setdiff() only matters when abs.fold == 0, where rows with
      # logFC == 0 satisfy both conditions and must not be duplicated
      both = c(up, setdiff(down, up))
    )
    dt <- dt[keep, ]
    dt[which(dt[, pvalue] < pvalue.threshold), ]
  })
}

# --------------------------------------------------------------------------------
# /R/GenericFunctions.R:
# --------------------------------------------------------------------------------
#' @import ggplot2
#' @import edgeR
#' @import e1071
#' @import cluster
#' @import reshape2
#' @import grid
#' @import locfit
#' @import GenomicRanges
#' @import SummarizedExperiment
#' @importFrom BiocGenerics counts counts<- design
#' @importFrom IRanges IRanges
#' @importFrom Rsamtools BamFile BamFileList
#' @importFrom GenomicAlignments summarizeOverlaps
#' @importFrom grDevices rainbow
#' @importFrom stats as.dist complete.cases cor cutree hclust kmeans model.matrix sd time
#' @importFrom utils capture.output read.table
#' @importFrom methods new validObject
NULL

# Worker behind the counts accessor of TCA objects: returns the stored raw
# counts, or cpm / rpkm values computed by edgeR from them. For rpkm the
# feature length is end - start of object@genomicFeature. 'lib.norm' controls
# both whether calcNormFactors() is applied and whether the normalized library
# sizes are used; 'log' and '...' are forwarded to cpm()/rpkm().
counts.TCA <- function(object, normalization = "none", lib.norm = TRUE,
                       log = FALSE, ...) {
  if (!normalization %in% c("none", "rpkm", "cpm")) {
    stop("'normalization' method should be one of 'none', 'rpkm', 'cpm'.")
  }
  if (normalization == "none") {
    return(object@counts)
  }

  y <- DGEList(counts = object@counts, group = object@design$group)
  if (lib.norm) {
    y <- calcNormFactors(y)
  }
  if (normalization == "rpkm") {
    genomicFeature <- object@genomicFeature
    giwidth <- genomicFeature$end - genomicFeature$start
    rpkm(y, normalized.lib.sizes = lib.norm, gene.length = giwidth,
         log = log, ...)
  } else {
    cpm(y, normalized.lib.sizes = lib.norm, log = log, ...)
  }
}

#' Extracts counts of a TCA object.
#'
#' \code{counts} extract raw read counts stored in a \code{TCA} object
#' or compute normalized counts from the raw counts.
#'
#' @name counts
#' @aliases counts counts,TCA-method counts<-,TCA-method
#' @param object a \code{TCA} object.
#'
#' @param normalization character string giving the normalization method.
#' Options are "\code{none}" (original raw counts), "\code{cpm}" (counts
#' per million),
#' "\code{rpkm}" (reads per kilobase per million).
#'
#' @param lib.norm logical indicating whether or not use effective library
#' size (see Details below) when \code{normalization} is "\code{cpm}" or
#' "\code{rpkm}".
#'
#' @param log logical if \code{TRUE}, the returned value will be on a log2
#' scale.
#'
#' @param value an integer matrix.
#'
#' @param ... additional arguments passed to \code{\link{cpm}} or
#' \code{\link{rpkm}} in the edgeR package.
#'
#' @details when calculating normalized counts, library size can be rescaled
#' to minimize the log-fold changes between samples for most genomic features
#' (e.g.
genes, binding sites) by multiplying a scale factor. The rescaled 77 | #' library size is called effective library size. In this function, the scale 78 | #' factor is calculated using the weighted trimmed mean of M-values (TMM, 79 | #' Robinson et al (2010)) 80 | #' 81 | #' If log2 values are computed, a small count would be added to avoid logarithm 82 | #' of zero. The actual added count will be scaled according to the library size, 83 | #' for details see \code{\link{addPriorCount}} in the edgeR package. 84 | #' When not specified, the prior count is set to 0.25 by default. 85 | #' 86 | #' @references 87 | #' Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for 88 | #' differential expression analysis of RNA-seq data. Genome biology, 11(3), 1. 89 | #' 90 | #' @return 91 | #' A matrix of raw or normalized counts 92 | #' 93 | #' @author 94 | #' Mengjun Wu 95 | #' 96 | #' @examples 97 | #' data(tca_ATAC) 98 | #' c <- counts(tca_ATAC) 99 | #' # normalized counts table 100 | #' c_norm <- counts(tca_ATAC, normalization='rpkm') 101 | #' @export 102 | setMethod("counts", "TCA", counts.TCA) 103 | 104 | #' @rdname counts 105 | #' @exportMethod 'counts<-' 106 | setMethod("counts<-", "TCA", function(object, value) { 107 | object@counts <- value 108 | validObject(object) 109 | object 110 | }) 111 | 112 | 113 | #' Accessors to extract slots of a TCA class. 114 | #' 115 | #' Accessors are provided to extract \code{design}, \code{genomicFeature}, 116 | #' \code{tcTable}, \code{clustResults} slots of a TCA class. The \code{design} 117 | #' slot stores experimental information of samples/libraries, the 118 | #' \code{genomicFeature} slot stores genomic coordinates of features, the 119 | #' \code{tcTable} slot stores time course data as a matrix, where rows are 120 | #' genomic features and columns time points. The \code{clustResults} slot 121 | #' stores results of clustering analysis as a \code{clust} object. 
122 | #' 123 | #' @name TCA.accessors 124 | #' @aliases design design,TCA-method genomicFeature,TCA-method 125 | #' tcTable,TCA-method clustResults,TCA-method 126 | #' 127 | #' @param object a \code{TCA} object 128 | #' @return 129 | #' \code{design} returns a data frame. \code{genomicFeature} returns a data frame. 130 | #' \code{tcTable} returns a numeric matrix. \code{clustResults} returns a 131 | #' \code{clust} object, see \code{\link{clust}} for details. 132 | #' 133 | #' @author 134 | #' Mengjun Wu 135 | #' 136 | #' @seealso 137 | #' \code{\link{clust}} 138 | #' 139 | #' @examples 140 | #' data(tca_ATAC) 141 | #' genomicFeature(tca_ATAC) 142 | #' tcTable(tca_ATAC) 143 | 144 | #' @rdname TCA.accessors 145 | #' @export 146 | setMethod("design", "TCA", function(object) { 147 | object@design 148 | }) 149 | 150 | #' @rdname TCA.accessors 151 | #' @export 152 | 153 | setGeneric("genomicFeature", function(object) standardGeneric("genomicFeature")) 154 | setMethod("genomicFeature", "TCA", function(object) { 155 | object@genomicFeature 156 | }) 157 | 158 | #' @rdname TCA.accessors 159 | #' @export 160 | 161 | setGeneric("tcTable", function(object) standardGeneric("tcTable")) 162 | 163 | #' @rdname TCA.accessors 164 | #' @export 165 | 166 | setMethod("tcTable", "TCA", function(object) { 167 | object@tcTable 168 | }) 169 | 170 | #' @rdname TCA.accessors 171 | #' @export 172 | 173 | setGeneric("clustResults", function(object) standardGeneric("clustResults")) 174 | 175 | #' @rdname TCA.accessors 176 | #' @export 177 | 178 | setMethod("clustResults", "TCA", function(object) { 179 | object@clusterRes 180 | }) 181 | 182 | #' Accessors to extract slots of a clust class. 183 | #' 184 | #' Accessors are provided to extract \code{data}, \code{centers}, \code{cluster}, 185 | #' and \code{membership} slots stored in a clust class. 
186 | #' @name clust.accessors 187 | #' @aliases clustData clustData,clust-method clustCenters,clust-method 188 | #' clustCluster,clust-method clustMembership,clust-method 189 | #' 190 | #' @param object a \code{clust} object 191 | #' @return 192 | #' \code{clustData} returns a data matrix. \code{clustCenters} returns a matrix of 193 | #' centers. \code{clustCluster} returns an integer vector. \code{clustMembership} 194 | #' returns a matrix of membership, see \code{\link{clust}} for details. 195 | #' 196 | #' @author 197 | #' Mengjun Wu 198 | #' 199 | #' @seealso 200 | #' \code{\link{clust}} 201 | 202 | #' @rdname clust.accessors 203 | #' @export 204 | setGeneric("clustData", function(object) standardGeneric("clustData")) 205 | 206 | #' @rdname clust.accessors 207 | #' @export 208 | setMethod("clustData", "clust", function(object) { 209 | object@data 210 | }) 211 | 212 | #' @rdname clust.accessors 213 | #' @export 214 | setGeneric("clustCenters", function(object) standardGeneric("clustCenters")) 215 | 216 | #' @rdname clust.accessors 217 | #' @export 218 | 219 | setMethod("clustCenters", "clust", function(object) { 220 | object@centers 221 | }) 222 | 223 | #' @rdname clust.accessors 224 | #' @export 225 | setGeneric("clustCluster", function(object) standardGeneric("clustCluster")) 226 | 227 | #' @rdname clust.accessors 228 | #' @export 229 | 230 | setMethod("clustCluster", "clust", function(object) { 231 | object@cluster 232 | }) 233 | 234 | #' @rdname clust.accessors 235 | #' @export 236 | setGeneric("clustMembership", function(object) standardGeneric("clustMembership")) 237 | 238 | #' @rdname clust.accessors 239 | #' @export 240 | 241 | setMethod("clustMembership", "clust", function(object) { 242 | object@membership 243 | }) 244 | -------------------------------------------------------------------------------- /R/countReads.R: -------------------------------------------------------------------------------- 1 | #' count mapped reads overlapping genomic intervals 
2 | #' 3 | #' This function counts mapped reads from multiple BAM files 4 | #' overlapping genomic intervals in \code{genomicFeature} in a 5 | #' \code{TCA} object. The resulting count table is stored in the 6 | #' \code{counts} slot of the \code{TCA} object. 7 | #' 8 | #' @param object a \code{TCA} object. 9 | #' 10 | #' @param dir character string giving the directory of BAM files. 11 | #' 12 | #' @param method character string giving the counting method. Options 13 | #' are "\code{summarizeOverlaps}" and "\code{featureCounts}". For 14 | #' Windows system, only "\code{summarizeOverlaps}" can be used; for 15 | #' Linux system, both methods can be used. 16 | #' 17 | #' @param ... additional arguments passed to 18 | #' \code{\link{summarizeOverlaps}} in GenomicAlignments package 19 | #' or \code{\link{featureCounts}} in Rsubread package. 20 | #' 21 | #' @param zero.based Logical. If TRUE, the start positions of the 22 | #' genomic intervals are \emph{0-based}, if FALSE, the start positions 23 | #' will be \emph{1-based}. 24 | #' 25 | #' @details 26 | #' This function provides two options to count the mapped reads: 27 | #' "\code{summarizeOverlaps}" in the GenomicAlignments package and 28 | #' "\code{featureCounts}" in the Rsubread package. As Rsubread package 29 | #' is only available for linux systems, Windows users can only choose 30 | #' "\code{summarizeOverlaps}". The user could further customize the 31 | #' counting parameters by passing additional arguments (...), otherwise 32 | #' the default settings of the two methods will be used. For details 33 | #' of the counting parameters, see \code{\link{summarizeOverlaps}}, 34 | #' \code{\link{featureCounts}}. 35 | #' 36 | #' 37 | #' @return 38 | #' A TCA object with updated \code{counts} slot. 
#'
#' @author
#' Mengjun Wu
#'
#' @seealso
#' \code{\link{summarizeOverlaps}}, \code{\link{featureCounts}}
#'
#'
#' @export
countReads <- function(object, dir, method = "summarizeoverlaps",
                       zero.based = TRUE, ...) {
  # Column names of the design table are matched case-insensitively.
  colnames(object@design) <- tolower(colnames(object@design))
  if (!"bamfile" %in% colnames(object@design)) {
    stop("Can not find information of bam files in design, please check whether the corresponding field is missing or the column name is the same as required.")
  }

  # `method` is matched case-insensitively, so every later comparison has to
  # use the lower-case spelling.
  method1 <- tolower(method)
  if (!method1 %in% c("summarizeoverlaps", "featurecounts")) {
    stop("'method' should be one of 'summarizeOverlaps', 'featureCounts'.")
  }
  if (method1 == "featurecounts" && .Platform$OS.type == "windows") {
    stop(" 'featureCounts' is only available in Linux/Mac OS system.")
  }

  # BAM file names in the design are resolved relative to `dir`; the caller's
  # working directory is restored on exit.
  old <- setwd(dir)
  on.exit(setwd(old), add = TRUE)

  bamfiles <- as.vector(object@design$bamfile)
  features <- object@genomicFeature
  # Strand is honoured only when the feature table has a strand column.
  ignore.strand <- FALSE
  if (is.null(features$strand)) {
    warning("No strand information is provided, strand is ignored in reads counting")
    ignore.strand <- TRUE
  }
  gi <- makeGRangesFromDataFrame(features, keep.extra.columns = TRUE,
                                 starts.in.df.are.0based = zero.based)

  count.table <- switch(method1,
    summarizeoverlaps = {
      bamfl <- Rsamtools::BamFileList(bamfiles, yieldSize = 1e+06)
      overlaps <- GenomicAlignments::summarizeOverlaps(gi, bamfl,
                                                       ignore.strand = ignore.strand, ...)
      tab <- SummarizedExperiment::assays(overlaps)$counts
      row.names(tab) <- features$id
      tab
    },
    featurecounts = {
      warning("To use the featureCounts, you need to load 'Rsubread' package first")
      gi_rsubread <- createAnnotationFile(gi)
      stra <- if (ignore.strand) 0 else 1
      # One column of counts per BAM file; they are bound together afterwards
      # so that every library ends up in the table.
      per.file <- vector("list", length(bamfiles))
      for (j in seq_along(bamfiles)) {
        message(paste0("counting reads in bamfile ", bamfiles[j]))
        # capture.output() keeps featureCounts' console output off the screen
        o <- capture.output(x <- featureCounts(bamfiles[j],
                                               annot.ext = gi_rsubread,
                                               strandSpecific = stra, ...))
        per.file[[j]] <- x$counts
      }
      do.call(cbind, per.file)
    }
  )
  count.table <- as.matrix(count.table)
  counts(object) <- count.table
  object
}

--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
#' An example read Counts table
#'
#' A dataset of exemplary read counts
#'
#' @docType data
#' @keywords datasets
#' @name countsTable
#' @usage data(countsTable)
#' @format A data frame containing experiment design information
#' for 12 samples/libraries.
#' @return
#' A data frame
#' @examples
#' data(countsTable)
NULL

#' An example experiment design without BAM file information
#'
#' A dataset of exemplary experiment design without BAM file
#' information
#'
#' @docType data
#' @keywords datasets
#' @name experiment
#' @usage data(experiment)
#' @format A data frame containing experiment design information
#' for 12 samples/libraries.
28 | #' @return 29 | #' A data frame 30 | #' @examples 31 | #' data(experiment) 32 | NULL 33 | 34 | #' An example experiment design with BAM file information 35 | #' 36 | #' A dataset of exemplary experiment design with BAM file 37 | #' information 38 | #' 39 | #' @docType data 40 | #' @keywords datasets 41 | #' @name experiment_BAMfile 42 | #' @usage data(experiment_BAMfile) 43 | #' @format A data frame containing experiment design information 44 | #' for 12 samples/libraries. 45 | #' @return 46 | #' A data frame 47 | #' @examples 48 | #' data(experiment_BAMfile) 49 | NULL 50 | 51 | #' An example reference genomic regions 52 | #' 53 | #' A dataset of exemplary genomic regions 54 | #' 55 | #' @docType data 56 | #' @keywords datasets 57 | #' @name genomicIntervals 58 | #' @usage data(genomicIntervals) 59 | #' @format A data frame containing 2751 genomic regions. 60 | #' @return 61 | #' A data frame 62 | #' @examples 63 | #' data(genomicIntervals) 64 | NULL 65 | 66 | #' An example TCA object 67 | #' 68 | #' A TCA object storing exemplary ATAC-seq time course data, 69 | #' including the experiment design, read counts, reference 70 | #' genomic regions. 71 | #' 72 | #' @docType data 73 | #' @keywords datasets 74 | #' @name tca_ATAC 75 | #' @usage data(tca_ATAC) 76 | #' @format A TCA object of exemplary ATAC-seq time course data 77 | #' @return 78 | #' A TCA object 79 | #' @examples 80 | #' data(tca_ATAC) 81 | NULL 82 | 83 | -------------------------------------------------------------------------------- /R/peakreference.R: -------------------------------------------------------------------------------- 1 | #' combine and merge multiple BED files 2 | #' 3 | #' This function merges overlapping genomic regions into a single feature. 4 | #' The merged single feature represents the widest genomic interval 5 | #' that covers all overlapping regions. 6 | #' 7 | #' @param data a data frame containing coordinates information of peaks 8 | #' to be merged. 
Columns of the data frame should be consistent with 9 | #' the BED format where the first column contains chromosome information, 10 | #' the second column the starting position, and the third column 11 | #' the ending position. 12 | #' 13 | #' @param dir a character string giving the directory where BED files 14 | #' are stored. If \code{data} is not given, the function will read 15 | #' in the BED files under \code{dir}. 16 | #' 17 | #' @param pattern a \code{\link{regular expression}}, only files 18 | #' whose names match the regular expression will be read in. 19 | #' 20 | #' @param merge logical indicating whether to merge overlapped regions 21 | #' or not. If FALSE, regions are simply combined. 22 | #' 23 | #' @param overlap a numeric value giving the least number of base(s) 24 | #' two regions should overlap when merging them. 25 | #' 26 | #' @param ratio a numeric value giving the threshold of overlapping 27 | #' ratio between two regions to merge them. See '\code{Details}' below 28 | #' for the definition of the overlapping ratio. 29 | #' 30 | #' @return a data frame with four columns: \code{chr}, \code{start}, 31 | #' \code{end}, \code{id} 32 | #' 33 | #' @details 34 | #' The overlapping ratio (OR) is defined as: 35 | #' 36 | #' \deqn{ OR = \frac{n}{\min(length(a), length(b))}} 37 | #' 38 | #' \eqn{a}, \eqn{b} are two genomic regions, \eqn{n} is the number of 39 | #' overlapping bases between region \eqn{a} and region \eqn{b}. 
#'
#' @author
#' Mengjun Wu, Lei Gu
#'
#' @examples
#' peaks <- data.frame(chr = c(rep('chr1',4),rep('chr2', 3), rep('chr3',2)),
#'                     start = c(100,148,230,300,330,480,1000,700,801),
#'                     end = c(150,220,500,450,600,900,1050,760,900))
#'
#' merged_peaks <- peakreference(data = peaks, merge = TRUE, overlap = 1)
#'
#' @export

peakreference <- function(data = NULL, dir = NULL, pattern = NULL,
                          merge = TRUE, overlap = 1, ratio = NULL) {
  if (is.null(data) && is.null(dir)) {
    stop(paste("Either a data.frame of genomic coordinates or a directory",
               "for the BED files should be given"))
  }
  if (!is.null(data)) {
    checkBEDformat(data)
    data[, 1] <- factor(data[, 1])
    peakset <- data
  } else {
    # Read the BED files through their full paths, so the working directory
    # of the calling session is never changed.
    filenames <- list.files(path = dir, pattern = pattern, full.names = TRUE)
    if (length(filenames) == 0) {
      stop(paste0("Can not find file names containing '", pattern, "'."))
    }
    datalist <- lapply(filenames, function(f) {
      read.table(file = f, header = FALSE)
    })
    peakset <- do.call(rbind, datalist)
    checkBEDformat(peakset)
  }
  # Intervals must be ordered by chromosome and start before merging.
  peakset <- peakset[order(peakset[, 1], peakset[, 2]), ]
  if (!merge) {
    return(peakset)
  }

  if (overlap <= 0 || round(overlap) != overlap) {
    stop("\"overlap\" must be integer and greater than 0.")
  }
  peakset.sub <- split(peakset, peakset[, 1], drop = TRUE)
  level <- names(peakset.sub)
  # Merge each chromosome on its own, then stack the per-chromosome tables once.
  merged.list <- lapply(seq_along(peakset.sub), function(i) {
    temp <- peakset.sub[[i]]
    if (is.null(ratio)) {
      submerge <- intervalmerge(temp[, 2], temp[, 3], overlap = overlap)
    } else {
      submerge <- intervalmerge(temp[, 2], temp[, 3], ratio = ratio)
    }
    chr <- rep(level[i], nrow(submerge))
    data.frame(chr, submerge)
  })
  if (length(merged.list) == 0) {
    # no input intervals: return an empty table with the documented columns
    return(data.frame(chr = character(0), start = numeric(0),
                      end = numeric(0), id = character(0)))
  }
  mergedpeak <- do.call(rbind, merged.list)
  name <- paste0("peak", seq_len(nrow(mergedpeak)))
  mergedpeak <- data.frame(mergedpeak, name)
  colnames(mergedpeak) <- c("chr", "start", "end", "id")
  mergedpeak
}


# Validate that `data` looks like a BED table: at least three columns, a
# character (or factor) chromosome column, and integer-valued numeric start
# and end columns without missing values. Stops with an error otherwise.
checkBEDformat <- function(data) {
  if (ncol(data) < 3) {
    stop(paste("At least three columns should be provided. The first column contains chromosome name,",
               "the second column contains starting position, the third column contains ending position."))
  }
  if (!is.character(as.vector(data[, 1]))) {
    stop("The first column contains chromosome name and must be character.")
  }
  starts <- data[, 2]
  ends <- data[, 3]
  # Either column being invalid is enough to reject the table.
  bad.coord <- !is.numeric(starts) || !is.numeric(ends) ||
    anyNA(starts) || anyNA(ends) ||
    any(round(starts) != starts) || any(round(ends) != ends)
  if (bad.coord) {
    stop("the second and third column contain starting and ending positions, must be numeric.")
  }
}

# Merge intervals that are already sorted by start position.
#
# a0, b0   start and end positions of the intervals.
# overlap  minimum number of shared bases required to merge two intervals.
# ratio    minimum overlapping ratio (shared bases divided by the length of
#          the shorter interval) required to merge; takes precedence over
#          `overlap` when both are supplied.
#
# Returns a two-column matrix (columns `a1`, `b1`) of merged starts and ends.
intervalmerge <- function(a0, b0, overlap = NULL,
                          ratio = NULL) {
  if (is.null(overlap) && is.null(ratio)) {
    stop("Either \"overlap\" or \"ratio\" should be given.")
  }
  n <- length(a0)
  if (n == 0) {
    return(cbind(a1 = a0, b1 = b0))
  }
  # a0/b0 double as output buffers; only the first `k` entries are returned.
  a1 <- a0
  b1 <- b0
  k <- 1L
  for (i in seq_len(n - 1)) {
    shared <- b1[k] - a0[i + 1]
    if (is.null(ratio)) {
      separate <- shared < overlap
    } else {
      len <- min(b1[k] - a1[k], b0[i + 1] - a0[i + 1])
      separate <- shared / len < ratio
    }
    if (separate) {
      # not enough overlap: start a new merged interval
      k <- k + 1L
      a1[k] <- a0[i + 1]
      b1[k] <- b0[i + 1]
    } else {
      # extend the current merged interval
      b1[k] <- max(b1[k], b0[i + 1])
    }
  }
  cbind(a1 = a1[seq_len(k)], b1 = b1[seq_len(k)])
}









--------------------------------------------------------------------------------
/R/plots.R: -------------------------------------------------------------------------------- 1 | #' Plot clustering results for time course data. 2 | #' 3 | #' This function plots the clusters generated from 4 | #' \code{\link{timeclust}}. For fuzzy cmeans clustering, data points 5 | #' are color-coded according to membership values, the color palettes 6 | #' can be customized. 7 | #' 8 | #' @param object a \code{TCA} object or a \code{clust} object 9 | #' 10 | #' @param categories character string giving the x-axis label 11 | #' 12 | #' @param value character string giving the y-axis label 13 | #' 14 | #' @param cols integer value specifying number of columns in the final 15 | #' layout. 16 | #' 17 | #' @param cl.color character string specifying a color for hard 18 | #' clustering. 19 | #' 20 | #' @param membership.color color palettes, a character vector of 21 | #' n colors 22 | #' 23 | #' @param title.size numeric value specifying the font size of title 24 | #' of each 25 | #' plot in the layout 26 | #' 27 | #' @param axis.line.size numeric value specifying the size of both 28 | #' axis lines 29 | #' 30 | #' @param axis.title.size numeric value specifying the font size of 31 | #' titles of both axis 32 | #' 33 | #' @param axis.text.size numeric value specifying the font size of 34 | #' labels of both axis 35 | #' 36 | #' @param legend.title.size numeric value specifying the font size 37 | #' of legend title 38 | #' 39 | #' @param legend.text.size numeric value specifying the font size of 40 | #' legend text 41 | #' 42 | #' @return 43 | #' Plot all clusters in one plot and return a list of ggplot objects, 44 | #' each object is for one cluster. 
#' The ggplot object can be drawn by
#' calling \code{\link{print.ggplot}}
#'
#' @examples
#' x <- matrix(sample(500, 1600, replace = TRUE), nrow = 200,
#'             dimnames = list(paste0('peak', 1:200), 1:8))
#' clust_res <- timeclust(x, algo = 'cm', k = 4, standardize = TRUE)
#' p <- timeclustplot(clust_res, cols =2)
#' # to plot an individual cluster
#' print (p[[2]]) # plot cluster 2
#' print (p[[3]]) # plot cluster 3
#'
#' @author
#' Mengjun Wu
#' @export

timeclustplot <- function(object = NULL, categories = "timepoint",
                          value = "expression", cols = NULL,
                          cl.color = "gray50",
                          membership.color = rainbow(30, s = 3/4, v = 1, start = 1/6),
                          title.size = 18, axis.line.size = 0.6,
                          axis.title.size = 18,
                          axis.text.size = 16, legend.title.size = 14,
                          legend.text.size = 14) {

  if (!inherits(object, "clust") && !inherits(object, "TCA")) {
    stop("object should be a 'clust' object or a 'TCA' object")
  }
  # A TCA object carries its clustering result in the clusterRes slot.
  clust.res <- if (inherits(object, "TCA")) object@clusterRes else object
  data <- clust.res@data
  cluster <- clust.res@cluster
  membership <- clust.res@membership

  ncl <- max(cluster)
  # Hard clustering leaves an empty (0 x 0) membership matrix.
  has.membership <- !(sum(dim(membership) == 0) == 2)
  # Membership values in [0, 1] are mapped onto the palette through this grid.
  colorseq <- seq(0, 1, length.out = length(membership.color))

  # Theme shared by every panel; panel-specific legend settings go in `...`.
  panel.theme <- function(...) {
    theme(plot.title = element_text(size = title.size),
          axis.line.x = element_line(color = "black",
                                     size = axis.line.size),
          axis.line.y = element_line(color = "black",
                                     size = axis.line.size),
          axis.title = element_text(size = axis.title.size),
          axis.text = element_text(size = axis.text.size),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          ...)
  }

  plotlist <- list()
  for (i in seq_len(ncl)) {
    title <- paste0("Cluster ", i)
    dtmp <- data[cluster == i, ]
    members <- which(cluster == i)
    if (length(members) == 1) {
      # A single member: the row was dropped to a plain vector of time points.
      dtmp <- data.frame(time = seq_along(dtmp), value = dtmp)
      if (has.membership) {
        m <- membership[cluster == i, i]
        # palette colour closest to the membership value of this member
        line.colour <- membership.color[which.min(abs(colorseq - m))]
      } else {
        line.colour <- cl.color
      }
      plotlist[[i]] <- ggplot(dtmp, aes(x = time, y = value)) +
        geom_line(colour = line.colour) + theme_bw() +
        ggtitle(title) +
        scale_x_continuous(breaks = dtmp$time,
                           labels = row.names(dtmp)) +
        labs(x = categories, y = value) +
        panel.theme(legend.position = "none")
    } else {
      dtmp_m <- melt(dtmp)
      colnames(dtmp_m) <- c("group", "time", "value")
      if (!has.membership) {
        plotlist[[i]] <- ggplot(dtmp_m, aes(x = time, y = value)) +
          geom_line(aes(group = group), colour = cl.color) +
          theme_bw() + ggtitle(title) +
          labs(x = categories, y = value) +
          panel.theme(legend.position = "none")
      } else {
        mem <- membership[cluster == i, i]
        mem1 <- data.frame(group = names(mem), member = mem)
        dtmp_m1 <- merge(dtmp_m, mem1, by = "group")
        colnames(dtmp_m1) <- c("group", "time", "value", "membership")
        # Order by membership so the factor levels of `group` follow
        # increasing membership.
        dtmp_m1 <- dtmp_m1[order(dtmp_m1[, 4]), ]
        new.factor <- unique(as.vector(dtmp_m1$group))
        dtmp_m1$group <- factor(dtmp_m1$group, levels = new.factor)

        plotlist[[i]] <- ggplot(dtmp_m1, aes(x = time, y = value,
                                             colour = membership)) +
          geom_line(aes(group = group)) +
          scale_colour_gradientn(colours = membership.color) +
          guides(colour = guide_colourbar()) + theme_bw() +
          ggtitle(title) + labs(x = categories, y = value) +
          panel.theme(legend.title = element_text(size = legend.title.size),
                      legend.text = element_text(size = legend.text.size))
      }
    }
  }
  suppressWarnings(multiplot(plotlist = plotlist, cols = cols))
  plotlist
}

# Draw several ggplot objects on one page.
#
# ...       ggplot objects.
# plotlist  list of further ggplot objects, appended after `...`.
# cols      number of columns of the layout; when NULL a near-square grid is
#           used. Ignored if `layout` is given.
# layout    optional matrix of plot indices giving the position of each plot.
multiplot <- function(..., plotlist = NULL, cols = 1, layout = NULL) {
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)
  if (numPlots == 0) {
    return(invisible(NULL))
  }
  if (is.null(cols)) {
    cols <- ceiling(sqrt(numPlots))
  }
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  if (numPlots == 1) {
    print(plots[[1]])

  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout),
                                               ncol(layout))))
    for (i in seq_len(numPlots)) {
      # row/column of the layout cell that holds plot i
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

--------------------------------------------------------------------------------
/R/timeclust.R:
--------------------------------------------------------------------------------
#' time course data clustering
#'
#' This function performs clustering analysis of the time course data.
#'
#' @param x a \code{TCA} object returned from
#' \code{\link{timecourseTable}} or a matrix
#'
#' @param algo a character string giving a clustering method. Options
#' are "\code{km}" (kmeans), "\code{pam}" (partitioning around medoids),
#' "\code{hc}" (hierarchical clustering), "\code{cm}" (cmeans).
#'
#' @param k a numeric value between \eqn{1} and \eqn{n - 1} (\eqn{n}
#' is the number of data points to be clustered).
#'
#' @param dist a character string specifying either "\code{distance}" or
#' "\code{correlation}" will be used to measure the distance between data points.
#'
#' @param dist.method a character string.
It can be chosen from one of 19 | #' the correlation methods in \code{\link{cor}} function ("\code{pearson}", 20 | #' "\code{spearman}", "\code{kendall}") if \code{dist} is "\code{correlation}", 21 | #' or one of the distance measure methods in \code{\link{dist}} function 22 | #' (for example, "\code{euclidean}", "\code{manhattan}") if \code{dist} is 23 | #' "\code{distance}". 24 | #' 25 | #' @param centers a numeric matrix giving initial centers for kmeans, 26 | #' pam or cmeans. If given, number of rows of the matrix must be equal 27 | #' to k. 28 | #' 29 | #' @param standardize logical, if TRUE, z-score transformation will 30 | #' be performed on the data before clustering. See 'Details' below. 31 | #' 32 | #' @param ... additional arguments passed to \code{\link{kmeans}}, 33 | #' \code{\link{pam}}, \code{\link{hclust}}, \code{\link{cmeans}} 34 | #' 35 | #' @details 36 | #' Two types of clustering methods are provided: hard clustering 37 | #' (\code{\link{kmeans}}, \code{\link{pam}}, \code{\link{hclust}}) 38 | #' and soft clustering (\code{\link{cmeans}}). In hard clustering, 39 | #' a data point can only be allocated to exactly one cluster 40 | #' (for \code{\link{hclust}}, \code{\link{cutree}} is used to cut 41 | #' a tree into clusters), while in soft clustering (also known as 42 | #' fuzzy clustering), a data point can be assigned to multiple 43 | #' clusters, membership values are used to indicate to what 44 | #' degree a data point belongs to each cluster. 
45 | #' 46 | #' To better capture the differences of temporal patterns rather 47 | #' than expression levels, z-score transformation can be applied 48 | #' to covert the the expression values to z-scores by performing 49 | #' the following formula: 50 | #' 51 | #' \deqn{z = \frac{x - \mu}{\sigma}} 52 | #' 53 | #' \eqn{x} is the value to be converted (e.g., expression value of a 54 | #' genomic feature in one condition), \eqn{\mu} is the population 55 | #' mean (e.g., average expression value of a genomic feature across 56 | #' different conditions), \eqn{\sigma} is the standard deviation 57 | #' (e.g., standard deviation of the expression values of a genomic 58 | #' feature across different conditions). 59 | #' 60 | #' 61 | #' @return 62 | #' If x is a \code{TCA} object, a \code{TCA} object will be returned. 63 | #' If x is a matrix, a \code{clust} object will be returned 64 | #' 65 | #' @examples 66 | #' 67 | #' example.mat <- matrix(rnorm(1600,sd=0.3), nrow = 200, 68 | #' dimnames = list(paste0('peak', 1:200), 1:8)) 69 | #' clust_res <- timeclust(x = example.mat, algo = 'cm', k = 4) 70 | #' # return a clust object 71 | #' 72 | #' @author 73 | #' Mengjun Wu 74 | #' 75 | #' @seealso \code{\link{clust}}, \code{\link{kmeans}}, 76 | #' \code{\link{pam}}, \code{\link{hclust}}, \code{\link{cutree}} 77 | #' 78 | #' @export 79 | timeclust <- function(x, algo, k, dist = "distance", dist.method = "euclidean", 80 | centers = NULL, standardize = TRUE, ...) 
{ 81 | if (is.matrix(x)) { 82 | data.tmp <- x 83 | }else{ 84 | data.tmp <- x@tcTable 85 | } 86 | if (standardize) { 87 | for (i in seq_len(nrow(data.tmp))) { 88 | data.tmp[i, ] <- (data.tmp[i, ] - mean(data.tmp[i, ], na.rm = TRUE))/sd(data.tmp[i, ], na.rm = TRUE) 89 | } 90 | data.tmp <- data.tmp[complete.cases(data.tmp), ] 91 | } 92 | object <- new("clust") 93 | object@method <- algo 94 | object@dist <- dist 95 | object@data <- data.tmp 96 | 97 | res <- .timeclust(data = data.tmp, algo = algo, k = k, 98 | dist = dist, dist.method = dist.method, 99 | centers = centers, ...) 100 | 101 | if (algo == "cm") { 102 | object@cluster <- res$cluster 103 | object@membership <- res$membership 104 | object@centers <- res$centers 105 | } else { 106 | object@cluster <- res$cluster 107 | object@centers <- res$centers 108 | } 109 | if (is.matrix(x)) { 110 | object 111 | } else { 112 | x@clusterRes <- object 113 | x 114 | } 115 | } 116 | 117 | # perform time course clustering 118 | .timeclust <- function(data, algo, k, centers = NULL, 119 | dist = "distance", dist.method = "euclidean", ...) 
# perform time course clustering
#
# Internal worker of timeclust(): validates the arguments, builds the
# dissimilarity object where one is needed (pam, hc) and dispatches to the
# chosen clustering routine.
#
# @param data numeric matrix, one row per data point.
# @param algo one of "pam", "km", "hc", "cm".
# @param k number of clusters.
# @param centers optional matrix of initial centers (km, cm) or of rows
#   chosen from `data` to serve as initial medoids (pam); must have k rows.
# @param dist "distance" or "correlation"; only used for pam and hc.
# @param dist.method correlation method of cor() or distance measure of
#   dist(); for cm it is forwarded to cmeans() as its `dist` argument.
# @param ... further arguments for kmeans(), pam(), hclust() or cmeans().
# @return list with elements `cluster` and `centers` (an empty matrix for
#   hc), plus `membership` for cm.
.timeclust <- function(data, algo, k, centers = NULL,
                       dist = "distance", dist.method = "euclidean", ...) {
  cor.methods <- c("pearson", "kendall", "spearman")

  if (!algo %in% c("pam", "km", "hc", "cm")) {
    stop("clustering method should be one of 'pam','km','hc','cm'")
  }
  if (!dist %in% c("distance", "correlation")) {
    stop("Distance can only be one of either 'distance' or 'correlation'")
  }
  if (!dist.method %in% c(cor.methods, "euclidean", "maximum",
                          "manhattan", "canberra", "binary", "minkowski")) {
    stop("Distance metric should either one of correlation measures in cor function or
         one of the distance measures in dist function")
  }
  if (algo == "km" && dist.method != "euclidean") {
    stop("kmeans only support euclidean metric; for other distance metrices, please see the help page")
  }
  if (algo == "cm" && !dist.method %in% c("euclidean", "manhattan")) {
    stop("cmeans only support euclidean or mahattan distance metrics")
  }

  # Only pam and hc work on a precomputed dissimilarity object.
  d <- NULL
  if (algo %in% c("pam", "hc")) {
    if (dist == "correlation") {
      # Fail early with a clear message instead of a cryptic cor() error
      # (e.g. dist = "correlation" combined with the default "euclidean").
      if (!dist.method %in% cor.methods) {
        stop("For dist = 'correlation', dist.method should be one of 'pearson', 'kendall', 'spearman'")
      }
      d <- as.dist(1 - cor(t(data), method = dist.method))
    } else {
      if (dist.method %in% cor.methods) {
        stop("For dist = 'distance', dist.method should be one of the distance measures in dist function")
      }
      # `dist` the argument is a string; R resolves the call to the function.
      d <- dist(data, method = dist.method)
    }
  }

  if (algo != "hc" && !is.null(centers) && nrow(centers) != k) {
    stop("Number of rows of centers must be equal to k")
  }

  switch(algo, km = {
    res <- kmeans(data, centers = if (is.null(centers)) k else centers, ...)
    list(cluster = res$cluster, centers = res$centers)
  }, pam = {
    if (is.null(centers)) {
      res <- pam(d, k = k, ...)
    } else {
      # Rows of `data` are matched to the requested medoids through their
      # first column only, as before.
      ind <- which(data[, 1] %in% centers[, 1])
      if (length(ind) != k) {
        stop("For 'pam', centers must be chosen from the data")
      }
      # Previously this result was overwritten by a second pam() call
      # without medoids, so user supplied centers were silently ignored.
      res <- pam(d, k = k, medoids = ind, ...)
    }
    # drop = FALSE keeps a one-row matrix when k is 1.
    list(cluster = res$clustering,
         centers = data[res$medoids, , drop = FALSE])
  }, hc = {
    tree <- hclust(d, ...)
    # Hierarchical clustering has no cluster centers.
    list(cluster = cutree(tree, k = k), centers = matrix(0, 0, 0))
  }, cm = {
    # Forward the validated metric; it used to be checked but never passed
    # on, so "manhattan" silently ran with cmeans' euclidean default.
    res <- cmeans(data, centers = if (is.null(centers)) k else centers,
                  dist = dist.method, ...)
    list(cluster = res$cluster, centers = res$centers,
         membership = res$membership)
  })
}
#' constructs time course table for clustering analysis
#'
#' This function constructs a time course table of which rows
#' are genomic features and columns time points.
#' values can be normalized expression levels or log2-fold
#' changes compared to a control time point. The time course
#' table is used for clustering analysis.
#'
#' @param object a \code{TCA} object returned by \code{DBanalysis}.
#'
#' @param value a character string, either "\code{expression}" or
#' "\code{FC}". "\code{expression}" is the mean normalized read
#' counts of replicates, "\code{FC}" is the log2-fold changes
#' compared to the time point given in \code{control.group}.
#'
#' @param control.group a character string giving the time point to
#' be compared with, i.e., the denominator in the fold changes. It
#' should match one of the time points in the \code{design} table
#' in the \code{TCA} object. Required if \code{value} is "\code{FC}".
#'
#' @param lib.norm logical indicating whether or not use effective
#' library size (see "Details" in \code{\link{counts}}).
#'
#' @param norm.method a character string specifying the normalization
#' method if \code{value} is "\code{expression}", either "\code{rpkm}"
#' or "\code{cpm}".
#'
#' @param subset an optional character vector giving a subset of
#' genomic features, if not NULL, time course table is generated
#' for only this subset of genomic features.
#'
#' @param filter logical, whether to drop the genomic features
#' that show no significant changes (defined by \code{pvalue},
#' \code{pvalue.threshold}, \code{abs.fold} and \code{direction})
#' between any two time points.
#'
#' @param pvalue character string specify the type of p-values:
#' "\code{none}" (or "\code{PValue}") is unadjusted p-value or one of
#' adjusted p-value "\code{holm}", "\code{hochberg}", "\code{hommel}",
#' "\code{bonferroni}", "\code{BH}", "\code{BY}", "\code{fdr}".
#'
#' @param pvalue.threshold a numeric value giving threshold of
#' selected p-value, significant changes have lower
#' (adjusted) p-values than the threshold.
#'
#' @param abs.fold a numeric value, the minimum absolute log2-fold
#' changes. The returned genomic regions have changes
#' with absolute log2-fold changes exceeding \code{abs.fold}.
#'
#' @param direction character string specify the direction of fold
#' changes. "\code{up}": positive fold changes; "\code{down}":
#' negative fold changes; "\code{both}": both positive and
#' negative fold changes.
#'
#' @param ... additional arguments passing to \code{\link{rpkm}},
#' \code{\link{cpm}}
#' @note
#' If "\code{expression}" in \code{value} is chosen, the average
#' normalized expression values of replicates for each group will
#' be calculated and returned.
#'
#' @return
#' A \code{TCA} object
#'
#' @author
#' Mengjun Wu
#'
#' @examples
#' data(tca_ATAC)
#' tca_ATAC <- DBanalysis(tca_ATAC)
#' tca_ATAC <- timecourseTable(tca_ATAC, value = 'expression',
#'                             lib.norm = TRUE, norm.method = 'rpkm')
#'
#' @export
timecourseTable <- function(object, value = "expression", control.group = NULL,
                            lib.norm = TRUE, norm.method = "rpkm",
                            subset = NULL, filter = FALSE, pvalue = "fdr",
                            pvalue.threshold = 0.05, abs.fold = 2,
                            direction = "both", ...) {
  if (!value %in% c("expression", "FC")) {
    err <- paste0("The value of time course table should be either
                  normalized expression table (value=\"expression\") or
                  logarithm of fold changes (value=\"FC\")")
    stop(err)
  }

  samples <- as.character(object@design$timepoint)
  group <- unique(samples)
  # Genomic features that were kept in the fitted count table.
  genointerval <- object@genomicFeature[object@genomicFeature$id %in%
                                          row.names(object@DBfit$counts), ]

  if (value == "expression") {
    if (!norm.method %in% c("rpkm", "cpm")) {
      err <- paste0("norm.method should be one of \"rpkm\" or \"cpm\".")
      stop(err)
    }
    y <- DGEList(counts = object@DBfit$counts, group = object@design$timepoint)
    if (lib.norm) {
      y <- calcNormFactors(y)
    }
    norm.counts <- switch(norm.method, rpkm = {
      giwidth <- genointerval$end - genointerval$start
      rpkm(y, normalized.lib.size = lib.norm, gene.length = giwidth, ...)
    }, cpm = {
      cpm(y, normalized.lib.size = lib.norm, ...)
    })
    norm.counts <- as.matrix(norm.counts)
    # Average the replicates of every time point. The explicit matrix()
    # keeps the features-by-time-points shape even for a single feature.
    tc <- vapply(group, function(tp) {
      rowMeans(norm.counts[, samples == tp, drop = FALSE])
    }, numeric(nrow(norm.counts)))
    tc <- matrix(tc, nrow = nrow(norm.counts),
                 dimnames = list(rownames(norm.counts), group))
  }

  if (value == "FC") {
    if (is.null(control.group)) {
      err <- paste0("control group needs to be specified.")
      stop(err)
    }
    if (!control.group %in% group) {
      stop("control.group should be one of the time points in the design table")
    }
    group1 <- control.group
    group2 <- group[group != group1]
    fc.res <- DBresult(object, group1 = group1, group2 = group2,
                       top.sig = FALSE, result.type = "list")
    fc.res <- as(fc.res, "list")
    # The control time point has log2-fold change 0 against itself.
    tc <- cbind(rep(0, nrow(genointerval)),
                do.call(cbind, lapply(fc.res, function(res) res$logFC)))
    # Columns are built as control first, then the remaining time points;
    # label them in that order, then restore the design order. Labelling
    # with `group` directly mislabelled the columns whenever the control
    # was not the first time point.
    colnames(tc) <- c(group1, group2)
    rownames(tc) <- genointerval$id
    tc <- tc[, group, drop = FALSE]
  }
  tc <- as.matrix(tc)

  if (filter) {
    # "PValue" is accepted as a synonym for unadjusted p-values.
    p.adj <- if (pvalue == "PValue") "none" else pvalue
    sig.res <- DBresult(object, contrasts = colnames(object@contrasts),
                        p.adjust = p.adj, result.type = "list",
                        pvalue.threshold = pvalue.threshold,
                        abs.fold = abs.fold,
                        # previously not forwarded, so the documented
                        # argument had no effect
                        direction = direction,
                        top.sig = TRUE)
    # Keep every feature that is significant in at least one contrast.
    feature.filter <- unique(unlist(lapply(sig.res, rownames),
                                    use.names = FALSE))
    tc <- tc[feature.filter, , drop = FALSE]
  }

  if (!is.null(subset)) {
    tc <- tc[row.names(tc) %in% subset, , drop = FALSE]
  }

  object@tcTable <- tc
  object
}
Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /data/countsTable.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengjunWu/TCseq/f2708be21fed9fa0ea3a5c2c3f72f607cb24e84c/data/countsTable.rda -------------------------------------------------------------------------------- /data/experiment.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengjunWu/TCseq/f2708be21fed9fa0ea3a5c2c3f72f607cb24e84c/data/experiment.rda -------------------------------------------------------------------------------- /data/experiment_BAMfile.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengjunWu/TCseq/f2708be21fed9fa0ea3a5c2c3f72f607cb24e84c/data/experiment_BAMfile.rda -------------------------------------------------------------------------------- /data/genomicIntervals.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengjunWu/TCseq/f2708be21fed9fa0ea3a5c2c3f72f607cb24e84c/data/genomicIntervals.rda -------------------------------------------------------------------------------- /data/tca_ATAC.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengjunWu/TCseq/f2708be21fed9fa0ea3a5c2c3f72f607cb24e84c/data/tca_ATAC.rda -------------------------------------------------------------------------------- /man/DBanalysis.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/DBanalysis.R 3 | \name{DBanalysis} 4 | \alias{DBanalysis} 5 | \title{Perform differential expression analysis} 6 | \usage{ 7 | DBanalysis( 8 | object, 9 | categories = "timepoint", 10 | norm.lib = TRUE, 11 | filter.type = NULL, 12 | filter.value = NULL, 13 | samplePassfilter = 2, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{a \code{TCA} object.} 19 | 20 | \item{categories}{character string giving which column in \code{design} 21 | will be used for differential analysis. For time course analysis, the default 22 | column is "\code{timepoint}".} 23 | 24 | \item{norm.lib}{logical indicating whether or not use effective 25 | library size when perform normalization. See \code{\link{counts}} for more 26 | details.} 27 | 28 | \item{filter.type}{character string indicating which type of count 29 | (raw or normalized) is used when performing filtering. Options are 30 | "\code{raw}", "\code{cpm}", "\code{rpkm}", "\code{NULL}". No filtering will 31 | be performed when using "\code{NULL}'.} 32 | 33 | \item{filter.value}{a numberic value; minimum values of selected 34 | \code{filter.type} ("\code{raw}", "\code{cpm}", "\code{rpkm}"). It is used in 35 | combination with \code{samplePassfilter}.} 36 | 37 | \item{samplePassfilter}{a numberic value indicating the minimum number 38 | of samples/libraries in which a genomic feature has counts value 39 | (raw or normalized) more than \code{filter.value}. Smaller than this number, 40 | the genomic feature will be filtered out.} 41 | 42 | \item{...}{additional arguments passed to \code{\link{glmFit}} from 43 | \code{edgeR} package.} 44 | } 45 | \value{ 46 | A \code{TCA} object 47 | } 48 | \description{ 49 | This function is a wrapper for the \code{\link{glmFit}} in edgeR package. 
50 | } 51 | \details{ 52 | The differential event is detected by using the generalized 53 | linear model (GLM) methods (McCarthy et al, 2012). This function 54 | fits the read counts of each gene to a negative binomial glm by 55 | using \code{\link{glmFit}} function from edgeR. To further test the 56 | significance of changes, see \code{DBresult}, \code{TopDBresult} 57 | } 58 | \examples{ 59 | data(tca_ATAC) 60 | tca_ATAC <- DBanalysis(tca_ATAC) 61 | 62 | } 63 | \references{ 64 | McCarthy, D. J., Chen, Y., & Smyth, G. K. (2012). Differential 65 | expression analysis of multifactor RNA-Seq experiments with respect to 66 | biological variation. Nucleic acids research 40, 4288-4297. 67 | } 68 | \seealso{ 69 | \code{DBresult}, \code{TopDBresult} 70 | } 71 | \author{ 72 | Mengjun Wu, Lei Gu 73 | } 74 | -------------------------------------------------------------------------------- /man/DBresult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/DBresults.R 3 | \name{DBresult} 4 | \alias{DBresult} 5 | \alias{DBresult.cluster} 6 | \title{This function tests for differential expression} 7 | \usage{ 8 | DBresult( 9 | object, 10 | group1 = NULL, 11 | group2 = NULL, 12 | contrasts = NULL, 13 | p.adjust = "fdr", 14 | top.sig = FALSE, 15 | pvalue = "paj", 16 | pvalue.threshold = 0.05, 17 | abs.fold = 2, 18 | direction = "both", 19 | result.type = "GRangesList" 20 | ) 21 | 22 | DBresult.cluster( 23 | object, 24 | group1 = NULL, 25 | group2 = NULL, 26 | contrasts = NULL, 27 | p.adjust = "fdr", 28 | top.sig = FALSE, 29 | pvalue = "paj", 30 | pvalue.threshold = 0.05, 31 | abs.fold = 2, 32 | direction = "both", 33 | cluster, 34 | cmthreshold = NULL, 35 | result.type = "GRangesList" 36 | ) 37 | } 38 | \arguments{ 39 | \item{object}{a \code{TCA} object, for \code{DBresult}, 40 | \code{DBanalysis} should already be called on the object; 41 | for \code{DBresult.cluster}, 
both \code{DBanalysis} and 42 | \code{timeclust} should be already called.} 43 | 44 | \item{group1}{character string giving the group to be compared with, 45 | i.e., the denominator in the fold changes. group1 can be set NULL and 46 | will be ignored if the comparisons are passed to \code{contrasts}} 47 | 48 | \item{group2}{a character vetor giving the other groups to 49 | compare with \code{group1}, i.e., the numerator in the fold changes. 50 | group2 can be set NULL and will be ignored if the comparisons are 51 | passed to \code{contrasts}} 52 | 53 | \item{contrasts}{a character vector, each string in 54 | the vector gives a contrast of two groups with the format 55 | "group2vsgroup1", group1 is the denominator level in the fold 56 | changes and group2 is the numerator 57 | level in the fold changes.} 58 | 59 | \item{p.adjust}{character string specifying a correction method 60 | for p-values. Options are "\code{holm}", "\code{hochberg}", 61 | "\code{hommel}", "\code{bonferroni}", "\code{BH}", "\code{BY}", 62 | "\code{fdr}", and "\code{none}".} 63 | 64 | \item{top.sig}{logical if TRUE, only genomic regions with 65 | given log2-fold changes and significance levels (p-value) 66 | will be returned. Log2-fold changes are defined by \code{abs.fold} 67 | and \code{direction}; significance levels are defined by \code{pvalue} 68 | and \code{pvalue.threshold}} 69 | 70 | \item{pvalue}{character string specify the type of p-values 71 | used for defining the significance level(\code{PValue} 72 | or adjusted p-value \code{paj})} 73 | 74 | \item{pvalue.threshold}{a numeric value giving threshold of 75 | selected p-value, Significant changes have lower 76 | (adjusted) p-values than the threshold.} 77 | 78 | \item{abs.fold}{a numeric value, the minimum absolute log2-fold 79 | changes. The returned genomic regions have changes 80 | with absolute log2-fold changes exceeding \code{abs.fold}.} 81 | 82 | \item{direction}{character string specify the direction of fold 83 | changes. 
"\code{up}": positive fold changes; "\code{down}": 84 | negative fold changes; "\code{both}": both positive and 85 | negative fold changes.} 86 | 87 | \item{result.type}{character string giving the data type of return 88 | value. Options are "GRangesList" and "list".} 89 | 90 | \item{cluster}{an integer giving the number of cluster from which 91 | genomic features are extracted.} 92 | 93 | \item{cmthreshold}{a numeric value, this argument is applicable 94 | only if \code{cmeans}' clustering method is selected when calling 95 | \code{\link{timeclust}} function. if not NULL, the result table of 96 | genomic features that belong to the defined \code{cluster} and 97 | the membership values to this cluster exceed \code{cmthreshold} 98 | are extracted.} 99 | } 100 | \value{ 101 | A list or a GRangesList. 102 | If \code{result.type} is "GRangesList", a GRangesList is returned containing 103 | the differential analysis results for all provided contrasts. Each GRanges 104 | object of the list is one contrast, the analysis results are contained in 4 105 | metadata columns: 106 | 107 | \code{logFC} log2-fold changes between two groups. 108 | 109 | \code{PValue} p-values. 110 | 111 | \code{paj} adjusted p-values 112 | 113 | \code{id} name of genomic features 114 | 115 | If \code{result.type} is "list", a list of data frames is returned. 116 | Each data frame contains one contrast with the following columns: 117 | 118 | \code{logFC} log2-fold changes between two groups. 119 | 120 | \code{PValue} p-values. 121 | 122 | \code{paj} adjusted p-values 123 | 124 | \code{chr} name of chromosomes 125 | 126 | \code{start} starting positions of features in the 127 | chromosomes 128 | 129 | \code{end} ending postitions of features in the chromosomes 130 | 131 | \code{id} name of genomic features 132 | } 133 | \description{ 134 | This function is a wrapper for \code{\link{glmLRT}} in edgeR package. 
135 | It performs likelihood ratio tests for given coefficient contrasts 136 | after fitting read counts to a negative binomial glm by 137 | \code{\link{DBanalysis}}. \code{DBresult} also extracts the 138 | differential analysis results of given contrasts at a chosen significance level. 139 | \code{DBresult.cluster} returns similar results but only 140 | contains genomic features belonging to a given cluster. 141 | } 142 | \details{ 143 | This function uses \code{\link{glmLRT}} from edgeR which 144 | perform likelihood ratio tests for the significance of changes. 145 | For more details, 146 | see \code{\link{glmLRT}} 147 | } 148 | \note{ 149 | If not NULL \code{group1}, \code{group2} and \code{contrasts}, 150 | result tables are extracted from comparisons in \code{contrasts}. 151 | } 152 | \examples{ 153 | data(tca_ATAC) 154 | tca_ATAC <- DBanalysis(tca_ATAC) 155 | ### extract differential analysis of 24h, 72h to 0h 156 | # set the contrasts using the 'group1' and 'group2' parameters 157 | res1 <- DBresult(tca_ATAC, group1 = '0h', group2 = c('24h', '72h')) 158 | # one can get the same result by setting the contrasts using the 'contrasts' parameter 159 | res2 <- DBresult(tca_ATAC, contrasts = c('24hvs0h', '72hvs0h')) 160 | # extract significant differential events 161 | res.sig <- DBresult(tca_ATAC, contrasts = c('24hvs0h', '72hvs0h'), 162 | top.sig = TRUE) 163 | 164 | # extract differential analysis of 24h, 72h to 0h of a given cluster 165 | tca_ATAC <- timecourseTable(tca_ATAC, filter = TRUE) 166 | tca_ATAC <- timeclust(tca_ATAC, algo = 'cm', k = 6) 167 | res_cluster1 <- DBresult.cluster(tca_ATAC, group1 = '0h', 168 | group2 = c('24h', '72h'), 169 | cluster = 1) 170 | 171 | 172 | 173 | } 174 | \seealso{ 175 | \code{\link{glmLRT}} 176 | } 177 | \author{ 178 | Mengjun Wu, Lei Gu 179 | } 180 | -------------------------------------------------------------------------------- /man/TCA.Rd: -------------------------------------------------------------------------------- 1 | % 
Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Classes.R 3 | \docType{class} 4 | \name{TCA-class} 5 | \alias{TCA-class} 6 | \alias{TCA} 7 | \alias{TCAFromSummarizedExperiment} 8 | \title{TCA class and constructor} 9 | \usage{ 10 | TCA(design, counts = matrix(0L, 0L, 0L), genomicFeature, zero.based = TRUE) 11 | 12 | TCAFromSummarizedExperiment(se, genomicFeature = NULL) 13 | } 14 | \arguments{ 15 | \item{design}{a data frame containing information of 16 | samples/libraries. For time course analysis, design table should 17 | contain at least three columns (case insensitive): \code{sampleid}, 18 | \code{timepoint} and \code{group} providing time point and group 19 | information of each sample/library. If \code{counts} is not provided 20 | when creating \code{TCA} object, an optional column \code{bamfile} can 21 | be used to provide BAM filename of each sample/library and generate 22 | count table using \code{\link{countReads}} function later.} 23 | 24 | \item{counts}{an integer matrix containing read counts. Rows 25 | correspond to genomic features and columns to samples/libraries. 26 | The name of column s should be the same as the time points 27 | in \code{design}.} 28 | 29 | \item{genomicFeature}{a data frame or a GRanges object containing 30 | genomic coordinates of features of interest (e.g. genes in RNA-seq, 31 | binding regions in ChIP-seq). If genomicFeature is a data frame, 32 | four columns are required in \code{genomicFeature}: \code{id}, 33 | \code{chr}, \code{start}, \code{end}; if genomicFeature is a Granges 34 | object, the metadata column "\code{id}" is required. For 35 | \code{TCAFromSummarizedExperiment}, genomicFeature must be 36 | provided if \code{se} is a SummarizedExperiment object.} 37 | 38 | \item{zero.based}{Logical. 
If TRUE, the start positions of the 39 | genomic ranges in the returned \code{TCA} object are \emph{0-based}, 40 | if FALSE, the start positions will be \emph{1-based}.} 41 | 42 | \item{se}{A SummarizedExperiment or a RangedSummarizedExperiment 43 | object. The object might contain multiple assays in the assay list, 44 | only the first one will be taken to construct TCA object. 45 | For SummarizedExperiment object, \code{genomicFeature} 46 | must be provided while for RangedSummarizedExperiment object, 47 | the genomic features will be extracted directly from the object.} 48 | } 49 | \value{ 50 | A TCA object 51 | } 52 | \description{ 53 | \code{TCA} is a S4 class for storing input data, results of 54 | differential analysis and clustering analysis. A \code{TCA} object 55 | can be created by the constructor function taking a table of sample 56 | information, a table of the genomic coordinates of features, and read 57 | count table (optional). 58 | } 59 | \details{ 60 | A TCA object can be created without providing read counts, 61 | read counts can be provided by \code{\link{counts}} or generated by 62 | \code{\link{countReads}}. For the read counts, the number of rows 63 | should equal to that in '\code{genomicFeature} and the number of columns 64 | should equal to number of rows in \code{design}; in addition, the name 65 | of column names should be the same as the time points in \code{design}. 66 | Input data and analysis results in a TCA object can be accessed by using 67 | corresponding accessors and functions. 68 | The TCA objects also have a show method printing a compact summary of 69 | their contents see \code{\link{counts}}, \code{\link{TCA.accessors}}, 70 | \code{\link{DBresult}}, \code{\link{tcTable}}, \code{\link{timeclust}}. 71 | \code{clust} 72 | } 73 | \examples{ 74 | #create data frame of experiment design: 4 time points and 2 replicates for each time point. 
75 | d <- data.frame(sampleID = 1:8, group = rep(c(1, 2, 3, 4), 2), 76 | timepoint = rep(c('0h', '24h', '48h', '72h'), 2)) 77 | 78 | 79 | #create data frame of genomic intervals of interest 80 | gf <- data.frame(chr = c(rep('chr1', 3), rep('chr2', 2), rep('chr4', 2)), 81 | start = seq(100, 2000, by = 300), 82 | end = seq(100, 2000, by = 300) + 150, 83 | id = paste0('peak', 1:7)) 84 | tca <- TCA(design = d, genomicFeature = gf) 85 | genomicFeature(tca) 86 | 87 | #if count table is available 88 | c <- matrix(sample(1000, 56), nrow = 7, dimnames = list(paste0('peak', 1:7), 1:8)) 89 | tca <- TCA(design = d, counts = c, genomicFeature = gf) 90 | # replace the count table of a \code{TCA} object 91 | c2 <- matrix(sample(500, 56), nrow = 7, dimnames = list(paste0('peak', 1:7), 1:8)) 92 | counts(tca) <- c2 93 | 94 | 95 | } 96 | \seealso{ 97 | \code{\link{counts}}, \code{\link{TCA.accessors}}, 98 | \code{\link{DBresult}}, \code{\link{timeclust}}, \code{\link{clust}} 99 | } 100 | \author{ 101 | Mengjun Wu 102 | } 103 | -------------------------------------------------------------------------------- /man/TCA.accessors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GenericFunctions.R 3 | \name{TCA.accessors} 4 | \alias{TCA.accessors} 5 | \alias{design} 6 | \alias{design,TCA-method} 7 | \alias{genomicFeature,TCA-method} 8 | \alias{tcTable,TCA-method} 9 | \alias{clustResults,TCA-method} 10 | \alias{genomicFeature} 11 | \alias{tcTable} 12 | \alias{clustResults} 13 | \title{Accessors to extract slots of a TCA class.} 14 | \usage{ 15 | \S4method{design}{TCA}(object) 16 | 17 | genomicFeature(object) 18 | 19 | tcTable(object) 20 | 21 | \S4method{tcTable}{TCA}(object) 22 | 23 | clustResults(object) 24 | 25 | \S4method{clustResults}{TCA}(object) 26 | } 27 | \arguments{ 28 | \item{object}{\code{TCA} object object} 29 | } 30 | \value{ 31 | \code{design} returns a 
data frame. \code{genomicFeature} returns a data frame. 32 | \code{tcTable} returns a numeric matrix. \code{clustResults} returns a 33 | \code{clust} object, see \code{\link{clust}} for details. 34 | } 35 | \description{ 36 | Accessors are provided to extract \code{design}, \code{genomicFeature}, 37 | \code{tcTable}, \code{clustResults} slots of a TCA class. The \code{design} 38 | slot stores experimental information of samples/libraries, the 39 | \code{genomicFeature} slot stores genomic coordinates of features, the 40 | \code{tcTable} slot stores time couse data as a matrix, where rows are 41 | genomic features and columns time points. The \code{clustResults} slot 42 | stores results of clustering analysis as a \code{clust} object. 43 | } 44 | \examples{ 45 | data(tca_ATAC) 46 | genomicFeature(tca_ATAC) 47 | tcTable(tca_ATAC) 48 | } 49 | \seealso{ 50 | \code{\link{clust}} 51 | } 52 | \author{ 53 | Mengjun Wu 54 | } 55 | -------------------------------------------------------------------------------- /man/clust-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Classes.R 3 | \docType{class} 4 | \name{clust-class} 5 | \alias{clust-class} 6 | \alias{clust} 7 | \title{clust class} 8 | \description{ 9 | \code{clust} is a S4 class for storing results of the clustering 10 | analysis of time course data. 
11 | } 12 | \details{ 13 | The clust objects are returned from \code{\link{timeclust}} and have 14 | a show method printing a compact summary of their contents 15 | } 16 | \section{Slots}{ 17 | 18 | Object of \code{clust} class contains the following slots: 19 | \describe{ 20 | \item{\code{method}}{clustering method used} 21 | \item{\code{dist}}{distance metric used} 22 | \item{\code{data}}{a matrix of original or standardized data used 23 | in the analysis} 24 | \item{\code{centers}}{a matrix of cluster centers} 25 | \item{\code{cluster}}{an integer vector of length \eqn{n} (the 26 | integers are the indices of clusters the data points belong to. 27 | For the fuzzy cmeans clustering method, a data point is assigned 28 | to the closest cluster to which the data point has highest 29 | membership value.} 30 | \item{\code{membership}}{a matrix of membership values of the 31 | data points to each clusters} 32 | } 33 | } 34 | 35 | \seealso{ 36 | \code{\link{timeclust}}, \code{\link{@}} 37 | } 38 | \author{ 39 | Mengjun Wu 40 | } 41 | -------------------------------------------------------------------------------- /man/clust.accessors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GenericFunctions.R 3 | \name{clust.accessors} 4 | \alias{clust.accessors} 5 | \alias{clustData} 6 | \alias{clustData,clust-method} 7 | \alias{clustCenters,clust-method} 8 | \alias{clustCluster,clust-method} 9 | \alias{clustMembership,clust-method} 10 | \alias{clustCenters} 11 | \alias{clustCluster} 12 | \alias{clustMembership} 13 | \title{Accessors to extract slots of a clust class.} 14 | \usage{ 15 | clustData(object) 16 | 17 | \S4method{clustData}{clust}(object) 18 | 19 | clustCenters(object) 20 | 21 | \S4method{clustCenters}{clust}(object) 22 | 23 | clustCluster(object) 24 | 25 | \S4method{clustCluster}{clust}(object) 26 | 27 | clustMembership(object) 28 | 29 | 
\S4method{clustMembership}{clust}(object) 30 | } 31 | \arguments{ 32 | \item{object}{\code{clust} object object} 33 | } 34 | \value{ 35 | \code{clustData} returns a data matrix. \code{clustCenters} returns a matrix of 36 | centers. \code{clustCluster} returns an integer vector. \code{clustMembership} 37 | returns a matrix of membership, see \code{\link{clust}} for details. 38 | } 39 | \description{ 40 | Accessors are provided to extract \code{data}, \code{centers}, \code{cluster}, 41 | and \code{membership} slots stored in a clust class. 42 | } 43 | \seealso{ 44 | \code{\link{clust}} 45 | } 46 | \author{ 47 | Mengjun Wu 48 | } 49 | -------------------------------------------------------------------------------- /man/countReads.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/countReads.R 3 | \name{countReads} 4 | \alias{countReads} 5 | \title{count mapped reads overlap genomic intervals} 6 | \usage{ 7 | countReads(object, dir, method = "summarizeoverlaps", zero.based = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{object}{a \code{TCA} object.} 11 | 12 | \item{dir}{character string giving the directory of BAM files.} 13 | 14 | \item{method}{character string giving the counting method. Options 15 | are "\code{summarizeOverlaps}" and "\code{featureCounts}". For 16 | Windows system, only "\code{summarizeOverlaps}" can be used, For 17 | Linux system, both methods can be used.} 18 | 19 | \item{zero.based}{Logical. If TRUE, the start positions of the 20 | genomic intervals are \emph{0-based}, if FALSE, the start positions 21 | will be \emph{1-based}.} 22 | 23 | \item{...}{additional arguments passed to 24 | \code{\link{summarizeOverlaps}} in GenomicAlignments package 25 | or \code{\link{featureCounts}} in Rsubread package.} 26 | } 27 | \value{ 28 | A TCA object with updated \code{count} slot. 
29 | } 30 | \description{ 31 | This function counts mapped reads from multiple BAM files 32 | overlapping genomic intervals in \code{genomicFeature} in a 33 | \code{TCA} object. The resulting count table is stored in the 34 | \code{count} slot of the \code{TCA} object. 35 | } 36 | \details{ 37 | This function provides two options to count the mapped reads: 38 | "\code{summarizeOverlaps}" in the GenomicAlignments package and 39 | "\code{featureCounts}" in the Rsubread package. As the Rsubread package 40 | is only available for Linux systems, Windows users can only choose 41 | "\code{summarizeOverlaps}". The user could further customize the 42 | counting parameters by passing additional arguments (...), otherwise 43 | the default settings of the two methods will be used. For details 44 | of the counting parameters, see \code{\link{summarizeOverlaps}}, 45 | \code{\link{featureCounts}}. 46 | } 47 | \seealso{ 48 | \code{\link{summarizeOverlaps}}, \code{\link{featureCounts}} 49 | } 50 | \author{ 51 | Mengjun Wu 52 | } 53 | -------------------------------------------------------------------------------- /man/counts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GenericFunctions.R 3 | \name{counts} 4 | \alias{counts} 5 | \alias{counts,TCA-method} 6 | \alias{counts<-,TCA-method} 7 | \title{Extracts counts of a TCA object.} 8 | \usage{ 9 | \S4method{counts}{TCA}(object, normalization = "none", lib.norm = TRUE, log = FALSE, ...) 10 | 11 | \S4method{counts}{TCA}(object) <- value 12 | } 13 | \arguments{ 14 | \item{object}{a \code{TCA} object.} 15 | 16 | \item{normalization}{character string giving the normalization method. 
17 | Options are "\code{none}" (original raw counts), "\code{cpm}" (counts 18 | per million), 19 | "\code{rpkm}" (reads per kilobase per million).} 20 | 21 | \item{lib.norm}{logical indicating whether or not use effective library 22 | size (see Details below) when \code{normalization} is "\code{cpm}" or 23 | "\code{rpkm}".} 24 | 25 | \item{log}{logical if \code{TRUE}, the returned value will be on a log2 26 | scale.} 27 | 28 | \item{...}{additional arguments passed to \code{\link{cpm}} or 29 | \code{\link{rpkm}} in the edgeR package.} 30 | 31 | \item{value}{an integer matrix.} 32 | } 33 | \value{ 34 | An integer matrix 35 | } 36 | \description{ 37 | \code{counts} extract raw read counts stored in a \code{TCA} object 38 | or compute normalized counts from the raw counts. 39 | } 40 | \details{ 41 | when calculating normalized counts, library size can be rescaled 42 | to minimize the log-fold changes between samples for most genomic features 43 | (e.g. genes, binding sites) by multiplying a scale factor. The rescaled 44 | library size is called effective library size. In this function, the scale 45 | factor is calculated using the weighted trimmed mean of M-values (TMM, 46 | Robinson et al (2010)) 47 | 48 | If log2 values are computed, a small count would be added to avoid logarithm 49 | of zero. The actual added count will be scaled according to the library size, 50 | for details see \code{\link{addPriorCount}} in the edgeR package 51 | when not specified, the prior count is set to 0.25 by default. 52 | } 53 | \examples{ 54 | data(tca_ATAC) 55 | c <- counts(tca_ATAC) 56 | # normalized counts table 57 | c_norm <- counts(tca_ATAC, normalization='rpkm') 58 | } 59 | \references{ 60 | Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for 61 | differential expression analysis of RNA-seq data. Genome biology, 11(3), 1. 
62 | } 63 | \author{ 64 | Mengjun Wu 65 | } 66 | -------------------------------------------------------------------------------- /man/countsTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{countsTable} 5 | \alias{countsTable} 6 | \title{An example read counts table} 7 | \format{ 8 | A data frame containing experiment design information 9 | for 12 samples/libraries. 10 | } 11 | \usage{ 12 | data(countsTable) 13 | } 14 | \value{ 15 | A data frame 16 | } 17 | \description{ 18 | A dataset of exemplary read counts 19 | } 20 | \examples{ 21 | data(countsTable) 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/experiment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{experiment} 5 | \alias{experiment} 6 | \title{An example experiment design without BAM file information} 7 | \format{ 8 | A data frame containing experiment design information 9 | for 12 samples/libraries. 
10 | } 11 | \usage{ 12 | data(experiment) 13 | } 14 | \value{ 15 | A data frame 16 | } 17 | \description{ 18 | A dataset of exemplary experiment design without BAM file 19 | information 20 | } 21 | \examples{ 22 | data(experiment) 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/experiment_BAMfile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{experiment_BAMfile} 5 | \alias{experiment_BAMfile} 6 | \title{An example experiment design with BAM file information} 7 | \format{ 8 | A data frame containing experiment design information 9 | for 12 samples/libraries. 10 | } 11 | \usage{ 12 | data(experiment_BAMfile) 13 | } 14 | \value{ 15 | A data frame 16 | } 17 | \description{ 18 | A dataset of exemplary experiment design with BAM file 19 | information 20 | } 21 | \examples{ 22 | data(experiment_BAMfile) 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/genomicIntervals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{genomicIntervals} 5 | \alias{genomicIntervals} 6 | \title{An example reference genomic regions} 7 | \format{ 8 | A data frame containing 2751 genomic regions. 
9 | } 10 | \usage{ 11 | data(genomicIntervals) 12 | } 13 | \value{ 14 | A data frame 15 | } 16 | \description{ 17 | A dataset of exemplary genomic regions 18 | } 19 | \examples{ 20 | data(genomicIntervals) 21 | } 22 | \keyword{datasets} 23 | -------------------------------------------------------------------------------- /man/peakreference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/peakreference.R 3 | \name{peakreference} 4 | \alias{peakreference} 5 | \title{combine and merge multiple BED files} 6 | \usage{ 7 | peakreference( 8 | data = NULL, 9 | dir = NULL, 10 | pattern = NULL, 11 | merge = TRUE, 12 | overlap = 1, 13 | ratio = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{data}{a data frame containing coordinates information of peaks 18 | to be merged. Columns of the data frame should be consistent with 19 | the BED format where the first column contains chromosome information, 20 | the second column the starting position, and the third column 21 | the ending position.} 22 | 23 | \item{dir}{a character string giving the directory where BED files 24 | are stored. If \code{data} is not given, the function will read 25 | in the BED files under \code{dir}.} 26 | 27 | \item{pattern}{a \code{\link{regular expression}}, only files 28 | whose names match the regular expression will be read in.} 29 | 30 | \item{merge}{logical indicating whether to merge overlapped regions 31 | or not. If FALSE, regions are simply combined.} 32 | 33 | \item{overlap}{a numeric value giving the least number of base(s) 34 | two regions should overlap when merging them.} 35 | 36 | \item{ratio}{a numeric value giving the threshold of overlapping 37 | ratio between two regions to merge them. 
See '\code{Details}' below 38 | for the definition of the overlapping ratio.} 39 | } 40 | \value{ 41 | a data frame with four columns: \code{chr}, \code{start}, 42 | \code{end}, \code{id} 43 | } 44 | \description{ 45 | This function merges overlapping genomic regions into a single feature. 46 | The merged single feature represents the widest genomic interval 47 | that covers all overlapping regions. 48 | } 49 | \details{ 50 | The overlapping ratio (OR) is defined as: 51 | 52 | \deqn{ OR = \frac{n}{\min(length(a), length(b))}} 53 | 54 | \eqn{a}, \eqn{b} are two genomic regions, \eqn{n} is the number of 55 | overlapping bases between region \eqn{a} and region \eqn{b}. 56 | } 57 | \examples{ 58 | peaks <- data.frame(chr = c(rep('chr1',4),rep('chr2', 3), rep('chr3',2)), 59 | start = c(100,148,230,300,330,480,1000,700,801), 60 | end = c(150,220,500,450,600,900,1050,760,900)) 61 | 62 | merged_peaks <- peakreference(data = peaks, merge = TRUE, overlap = 1) 63 | 64 | } 65 | \author{ 66 | Mengjun Wu, Lei Gu 67 | } 68 | -------------------------------------------------------------------------------- /man/tca_ATAC.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{tca_ATAC} 5 | \alias{tca_ATAC} 6 | \title{An example TCA object} 7 | \format{ 8 | A TCA object of exemplary ATAC-seq time course data 9 | } 10 | \usage{ 11 | data(tca_ATAC) 12 | } 13 | \value{ 14 | A TCA object 15 | } 16 | \description{ 17 | A TCA object storing exemplary ATAC-seq time course data, 18 | including the experiment design, read counts, reference 19 | genomic regions. 
20 | } 21 | \examples{ 22 | data(tca_ATAC) 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/timeclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/timeclust.R 3 | \name{timeclust} 4 | \alias{timeclust} 5 | \title{time course data clustering} 6 | \usage{ 7 | timeclust( 8 | x, 9 | algo, 10 | k, 11 | dist = "distance", 12 | dist.method = "euclidean", 13 | centers = NULL, 14 | standardize = TRUE, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{a \code{TCA} object returned from 20 | \code{\link{timecourseTable}} or a matrix} 21 | 22 | \item{algo}{a character string giving a clustering method. Options 23 | are "\code{km}" (kmeans), "\code{pam}" (partitioning around medoids), 24 | "\code{hc}" (hierarchical clustering), "\code{cm}" (cmeans).} 25 | 26 | \item{k}{a numeric value between \eqn{1} and \eqn{n - 1} (\eqn{n} 27 | is the number of data points to be clustered).} 28 | 29 | \item{dist}{a character string specifying whether "\code{distance}" or 30 | "\code{correlation}" will be used to measure the distance between data points.} 31 | 32 | \item{dist.method}{a character string. It can be chosen from one of 33 | the correlation methods in \code{\link{cor}} function ("\code{pearson}", 34 | "\code{spearman}", "\code{kendall}") if \code{dist} is "\code{correlation}", 35 | or one of the distance measure methods in \code{\link{dist}} function 36 | (for example, "\code{euclidean}", "\code{manhattan}") if \code{dist} is 37 | "\code{distance}".} 38 | 39 | \item{centers}{a numeric matrix giving initial centers for kmeans, 40 | pam or cmeans. If given, number of rows of the matrix must be equal 41 | to k.} 42 | 43 | \item{standardize}{logical, if TRUE, z-score transformation will 44 | be performed on the data before clustering. 
See 'Details' below.} 45 | 46 | \item{...}{additional arguments passed to \code{\link{kmeans}}, 47 | \code{\link{pam}}, \code{\link{hclust}}, \code{\link{cmeans}}} 48 | } 49 | \value{ 50 | If x is a \code{TCA} object, a \code{TCA} object will be returned. 51 | If x is a matrix, a \code{clust} object will be returned 52 | } 53 | \description{ 54 | This function performs clustering analysis of the time course data. 55 | } 56 | \details{ 57 | Two types of clustering methods are provided: hard clustering 58 | (\code{\link{kmeans}}, \code{\link{pam}}, \code{\link{hclust}}) 59 | and soft clustering (\code{\link{cmeans}}). In hard clustering, 60 | a data point can only be allocated to exactly one cluster 61 | (for \code{\link{hclust}}, \code{\link{cutree}} is used to cut 62 | a tree into clusters), while in soft clustering (also known as 63 | fuzzy clustering), a data point can be assigned to multiple 64 | clusters; membership values are used to indicate to what 65 | degree a data point belongs to each cluster. 66 | 67 | To better capture the differences of temporal patterns rather 68 | than expression levels, z-score transformation can be applied 69 | to convert the expression values to z-scores by applying 70 | the following formula: 71 | 72 | \deqn{z = \frac{x - \mu}{\sigma}} 73 | 74 | \eqn{x} is the value to be converted (e.g., expression value of a 75 | genomic feature in one condition), \eqn{\mu} is the population 76 | mean (e.g., average expression value of a genomic feature across 77 | different conditions), \eqn{\sigma} is the standard deviation 78 | (e.g., standard deviation of the expression values of a genomic 79 | feature across different conditions). 
80 | } 81 | \examples{ 82 | 83 | example.mat <- matrix(rnorm(1600,sd=0.3), nrow = 200, 84 | dimnames = list(paste0('peak', 1:200), 1:8)) 85 | clust_res <- timeclust(x = example.mat, algo = 'cm', k = 4) 86 | # return a clust object 87 | 88 | } 89 | \seealso{ 90 | \code{\link{clust}}, \code{\link{kmeans}}, 91 | \code{\link{pam}}, \code{\link{hclust}}, \code{\link{cutree}} 92 | } 93 | \author{ 94 | Mengjun Wu 95 | } 96 | -------------------------------------------------------------------------------- /man/timeclustplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plots.R 3 | \name{timeclustplot} 4 | \alias{timeclustplot} 5 | \title{Plot clustering results for time course data.} 6 | \usage{ 7 | timeclustplot( 8 | object = NULL, 9 | categories = "timepoint", 10 | value = "expression", 11 | cols = NULL, 12 | cl.color = "gray50", 13 | membership.color = rainbow(30, s = 3/4, v = 1, start = 1/6), 14 | title.size = 18, 15 | axis.line.size = 0.6, 16 | axis.title.size = 18, 17 | axis.text.size = 16, 18 | legend.title.size = 14, 19 | legend.text.size = 14 20 | ) 21 | } 22 | \arguments{ 23 | \item{object}{a \code{TCA} object or a \code{clust} object} 24 | 25 | \item{categories}{character string giving the x-axis label} 26 | 27 | \item{value}{character string giving the y-axis label} 28 | 29 | \item{cols}{integer value specifying number of columns in the final 30 | layout.} 31 | 32 | \item{cl.color}{character string specifying a color for hard 33 | clustering.} 34 | 35 | \item{membership.color}{color palettes, a character vector of 36 | n colors} 37 | 38 | \item{title.size}{numeric value specifying the font size of title 39 | of each 40 | plot in the layout} 41 | 42 | \item{axis.line.size}{numeric value specifying the size of both 43 | axis lines} 44 | 45 | \item{axis.title.size}{numeric value specifying the font size of 46 | titles of both axis} 47 | 48 | 
\item{axis.text.size}{numeric value specifying the font size of 49 | labels of both axis} 50 | 51 | \item{legend.title.size}{numeric value specifying the font size 52 | of legend title} 53 | 54 | \item{legend.text.size}{numeric value specifying the font size of 55 | legend text} 56 | } 57 | \value{ 58 | Plot all clusters in one plot and return a list of ggplot objects, 59 | each object is for one cluster. The ggplot object can be drawed by 60 | calling \code{\link{print.ggplot}} 61 | } 62 | \description{ 63 | This function plots the clusters generated from 64 | \code{\link{timeclust}}. For fuzzy cmeans clustering, data points 65 | are color-coded according to membership values, the color palettes 66 | can be customized. 67 | } 68 | \examples{ 69 | x <- matrix(sample(500, 1600, replace = TRUE), nrow = 200, 70 | dimnames = list(paste0('peak', 1:200), 1:8)) 71 | clust_res <- timeclust(x, algo = 'cm', k = 4, standardize = TRUE) 72 | p <- timeclustplot(clust_res, cols =2) 73 | # to plot a individual cluster 74 | print (p[[2]]) # plot cluster 2 75 | print (p[[3]]) # plot cluster 3 76 | 77 | } 78 | \author{ 79 | Mengjun Wu 80 | } 81 | -------------------------------------------------------------------------------- /man/timecourseTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/timecourseTable.R 3 | \name{timecourseTable} 4 | \alias{timecourseTable} 5 | \title{constructs time course table for clustering analysis} 6 | \usage{ 7 | timecourseTable( 8 | object, 9 | value = "expression", 10 | control.group = NULL, 11 | lib.norm = TRUE, 12 | norm.method = "rpkm", 13 | subset = NULL, 14 | filter = FALSE, 15 | pvalue = "fdr", 16 | pvalue.threshold = 0.05, 17 | abs.fold = 2, 18 | direction = "both", 19 | ... 
20 | ) 21 | } 22 | \arguments{ 23 | \item{object}{a \code{TCA} object returned by \code{DBanalysis}.} 24 | 25 | \item{value}{a character string, either "\code{expression}" or 26 | "\code{FC}". "\code{expression}" is the mean normalized read 27 | counts of replicates, "\code{FC}" is the log2-fold changes 28 | compared to the first time point.} 29 | 30 | \item{control.group}{a character string giving the time point to 31 | be compared with, i.e., the denominator in the fold changes. It 32 | should match one of the time points in the \code{design} table 33 | in the \code{TCA} object.} 34 | 35 | \item{lib.norm}{logical indicating whether or not to use effective 36 | library size (see "Details" in \code{\link{counts}}).} 37 | 38 | \item{norm.method}{a character string specifying the normalization 39 | method if \code{value} is "\code{expression}"} 40 | 41 | \item{subset}{an optional character vector giving a subset of 42 | genomic features, if not NULL, time course table is generated 43 | for only this subset of genomic features.} 44 | 45 | \item{filter}{logical, whether to drop the genomic features that 46 | show no significant changes (defined by \code{pvalue}, 47 | \code{pvalue.threshold}, \code{abs.fold} and \code{direction}) 48 | between any two time points.} 49 | 50 | \item{pvalue}{character string specifying the type of p-values: 51 | "\code{none}" is unadjusted p-value or one of adjusted p-value 52 | "\code{holm}", "\code{hochberg}", "\code{hommel}", "\code{bonferroni}", 53 | "\code{BH}", "\code{BY}", "\code{fdr}".} 54 | 55 | \item{pvalue.threshold}{a numeric value giving threshold of 56 | selected p-value, significant changes have lower 57 | (adjusted) p-values than the threshold.} 58 | 59 | \item{abs.fold}{a numeric value, the minimum log2-fold 60 | changes. The returned genomic regions have changes 61 | with absolute log2-fold changes exceeding \code{abs.fold}.} 62 | 63 | \item{direction}{character string specifying the direction of fold 64 | changes. 
"\code{up}": positive fold changes; "\code{down}": 65 | negative fold changes; "\code{both}": both positive and 66 | negative fold changes.} 67 | 68 | \item{...}{additional arguments passing to \code{\link{rpkm}}, 69 | \code{\link{cpm}}} 70 | } 71 | \value{ 72 | A \code{TCA} object 73 | } 74 | \description{ 75 | This function constructs a time course table of which rows 76 | are genomic features and columns time points. 77 | values can be normalized expression levels or log2-fold 78 | changes compared to a control time point. The time course 79 | table is used for clustering analysis. 80 | } 81 | \note{ 82 | If "\code{expression}" in \code{value} is chosen, the average 83 | normalized expression values of replicates for each group will 84 | be calculated and returned. 85 | } 86 | \examples{ 87 | data(tca_ATAC) 88 | tca_ATAC <- DBanalysis(tca_ATAC) 89 | tca_ATAC <- timecourseTable(tca_ATAC, value = 'expression', 90 | lib.norm = TRUE, norm.method = 'rpkm') 91 | 92 | } 93 | \author{ 94 | Mengjun Wu 95 | } 96 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(TCseq) 3 | 4 | test_check("TCseq") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_TCseq.R: -------------------------------------------------------------------------------- 1 | ## test the TCA object constructor 2 | d1<- data.frame(timepoint = rep(c("0h", "24h", "48h", "72h"), 2), group = rep(c(1, 2, 3, 4), 2)) 3 | 4 | d3 <- data.frame(sampleid = 1:8, timepoint = rep(c("0h", "24h", "48h", "72h"), 2), 5 | group = rep(c(1, 2, 3, 4), 2)) 6 | 7 | gf1 <- data.frame(chr = c(rep("chr1", 3), rep("chr2", 2), rep("chr4", 2)), 8 | start = seq(100, 2000, by = 300), end = seq(100, 2000, by = 300) + 150) 9 | 10 | gf2 <- data.frame(CHR = c(rep("chr1", 3), rep("chr2", 2), rep("chr4", 2)), 11 | start = seq(100, 
2000, by = 300), end = seq(100, 2000, by = 300) + 150, 12 | id = paste0("peak", 1:7)) 13 | 14 | gf3 <- data.frame(chr = c(rep("chr1", 3), rep("chr2", 2), rep("chr4", 2)), 15 | start = seq(100, 2000, by = 300), end = seq(100, 2000, by = 300) + 150, 16 | id = paste0("peak", 1:7)) 17 | 18 | tca <- TCA(design = d3, genomicFeature = gf3) 19 | expect_error({ # d1 has no sampleid column 20 | tca <- TCA(design = d1, genomicFeature = gf3) 21 | }) 22 | expect_error({ # gf1 has no id column 23 | tca <- TCA(design = d3, genomicFeature = gf1) 24 | }) 25 | expect_warning({ # gf2 names its first column CHR rather than chr 26 | tca <- TCA(design = d3, genomicFeature = gf2) 27 | }) 28 | 29 | c1 <- matrix(sample(500, 56), nrow = 7, dimnames = list(paste0("peak", 30 | 1:7), 1:8)) 31 | c2 <- matrix(sample(500, 48), nrow = 6, dimnames = list(paste0("peak", 32 | 1:6), 1:8)) 33 | c3 <- matrix(sample(500, 49), nrow = 7, dimnames = list(paste0("peak", 34 | 1:7), 1:7)) 35 | tca <- TCA(design = d3, counts = c1, genomicFeature = gf3) 36 | expect_error({ # c2 has 6 rows while gf3 has 7 features 37 | TCA(design = d3, counts = c2, genomicFeature = gf3) 38 | }) 39 | expect_error({ # c3 has 7 columns while d3 has 8 samples 40 | TCA(design = d3, counts = c3, genomicFeature = gf3) 41 | }) 42 | 43 | ## test the correctness of the merge results 44 | peaks <- data.frame(chr = c(rep("chr1",4),rep("chr2", 3), rep("chr3",2)), 45 | start = c(100,148,230,300,330,480,1000,700,801), 46 | end = c(150,220,500,450,600,900,1050,760,900)) 47 | 48 | merged_peaks <- peakreference(data = peaks, merge = TRUE, overlap = 1) # TRUE, not T: T is an ordinary variable that can be reassigned 49 | 50 | peaks_expect <- data.frame(chr = c(rep("chr1",2),rep("chr2", 2), rep("chr3",2)), 51 | start = c(100, 230, 330, 1000, 700, 801), 52 | end = c(220, 500, 900, 1050, 760, 900), 53 | id = paste0("peak", 1:6)) 54 | 55 | expect_equal(merged_peaks, peaks_expect) 56 | 57 | merged_peaks2 <- peakreference(data = peaks, merge = TRUE, ratio = 0.2) # merge by overlapping ratio instead of overlapping bases 58 | peaks_expect2 <- data.frame(chr = c(rep("chr1",3),rep("chr2", 2), rep("chr3",2)), 59 | start = c(100,148, 230, 330, 1000, 700, 801), 60 | end = c(150, 220, 500, 900, 1050, 760, 900), 61 | id = paste0("peak", 1:7)) 62 | 63 | 
expect_equal(merged_peaks2, peaks_expect2) 64 | 65 | -------------------------------------------------------------------------------- /vignettes/TCseq.Rnw: -------------------------------------------------------------------------------- 1 | % \VignetteIndexEntry{TCseq Vignette} 2 | % \VignetteDepends{TCseq} 3 | % \VignetteKeywords{Time course sequencing analysis, Clustering} 4 | % \VignettePackage{TCseq} 5 | 6 | \documentclass[a4paper]{article} 7 | \usepackage{a4wide} 8 | \usepackage[utf8]{inputenc} 9 | \usepackage{float} 10 | 11 | \title{TCseq: time course sequencing data analysis} 12 | \author{Mengjun Wu, Lei Gu} 13 | \date{ \today } 14 | 15 | \begin{document} 16 | \SweaveOpts{concordance=TRUE} 17 | \maketitle 18 | 19 | The TCseq package provides a unified suite for analysis of different types of time course sequencing data. It can be applied to transcriptomic time course data such as RNA-seq as well as epigenomic time course data such as ATAC-seq and ChIP-seq. The main focuses of this package are differential analysis between different time points and temporal pattern analysis and visualization. 20 | 21 | Unlike RNA-seq, the genomic regions of interest of sequencing data like ATAC-seq and ChIP-seq are not pre-defined and are specific to each experimental condition, which limits the subsequent differential analysis between conditions. For those data types, the TCseq package provides functions to combine and merge condition-specific genomic regions and generate a set of reference genomic regions for all conditions. This package then uses the negative binomial generalized linear model implemented in edgeR to provide differential analysis \cite{Robinson}. To capture the temporal patterns of the time course data, the package includes several unsupervised clustering methods to identify the patterns and a function to visualize them. 22 | 23 | This vignette uses example ATAC-seq time course data to illustrate how to use the TCseq package. 
24 | 25 | \section{Input data} 26 | The minimal input data for the TCseq are experiment design and reference genomic regions. 27 | 28 | \subsection{Generate reference genomic regions} 29 | For RNA-seq, the reference genomic regions are predefined (genes or exons). For epigenome sequencing data, however, genomic regions of interest are usually defined as read-enriched regions, which are also called peaks. The peak set for a given condition can be identified by peak callers such as MACS and is specific to that condition. The TCseq package provides a function that reads in a set of peak files in BED format, combines these files into a single data frame, merges overlapping regions according to user-defined criteria and takes the largest bound as the reference region for all the overlapping regions. The merge criteria can be either absolute overlapping bases or overlapping ratio (absolute overlapping bases divided by the minimum length of the regions to be merged). 30 | 31 | If a set of BED files are available under a certain directory, say dir.peaks, and the file names of the BED files to be merged have the common substring "narrowpeaks", then the reference genomic regions can be generated by: 32 | <<>>= 33 | library(TCseq) 34 | @ 35 | 36 | <>= 37 | dir <- dir.peaks 38 | gf <- peakreference(dir = dir, pattern = "narrowpeaks") 39 | @ 40 | The resulting data frame has four columns as follows: 41 | <<>>= 42 | data("genomicIntervals") 43 | head(genomicIntervals) 44 | @ 45 | 46 | \subsection{Create a TCA object} 47 | The TCseq uses an S4 class TCA to store all input data for subsequent analysis. 
When a read counts table is not available, only data frames of experiment design and reference genomic regions are required to create a TCA object. TCseq also provides a function to generate the counts table; to use the function, file names of BAM files for each sample/library have to be provided in the data frame of experiment design: 48 | <<>>= 49 | # Experiment design 50 | data("experiment_BAMfile") 51 | head(experiment_BAMfile) 52 | # create a TCA object 53 | tca <- TCA(design = experiment_BAMfile, genomicFeature = genomicIntervals) 54 | tca 55 | @ 56 | The count table then can be created (suppose the BAM files are stored in the directory dir.BAM): 57 | <>= 58 | tca <- countReads(tca, dir = dir.BAM) 59 | @ 60 | When the counts table is available, BAM file information is not mandatory in the experiment design. The counts table can be provided when creating a TCA object: 61 | <<>>= 62 | #Experiment design without BAM file information 63 | data("experiment") 64 | #Counts table 65 | data("countsTable") 66 | tca <- TCA(design = experiment, genomicFeature = genomicIntervals, 67 | counts = countsTable) 68 | tca 69 | @ 70 | The counts table can also be assigned to an existing TCA object: 71 | <>= 72 | counts(tca) <- countsTable 73 | @ 74 | In addition, a TCA object can also be created from an existing RangedSummarizedExperiment or SummarizedExperiment. For a SummarizedExperiment object, additional reference genomic regions information must be provided, while for a RangedSummarizedExperiment object, the reference genomic regions will be extracted directly from the object. 75 | For a SummarizedExperiment object: 76 | <<>>= 77 | suppressWarnings(library(SummarizedExperiment)) 78 | se <- SummarizedExperiment(assays=list(counts = countsTable), colData = experiment) 79 | tca <- TCAFromSummarizedExperiment(se = se, genomicFeature = genomicIntervals) 80 | @ 81 | 82 | The TCA object with experiment design, read counts, reference genomic regions can be used for the following differential analysis. 
83 | 84 | \section{Differential Analysis} 85 | Differential events are detected by using the generalized linear model (GLM) methods \cite{McCarthy} implemented in the edgeR package. 86 | <<>>= 87 | tca <- DBanalysis(tca) 88 | @ 89 | Low quality genomic regions (read counts are low for all the time points) can also be filtered out. The following step only keeps genomic regions with two or more samples that have read counts more than 10. 90 | <<>>= 91 | tca <- DBanalysis(tca, filter.type = "raw", filter.value = 10, samplePassfilter = 2) 92 | @ 93 | Differential analysis results between given timepoints can be extracted by: 94 | <<>>= 95 | DBres <- DBresult(tca, group1 = "0h", group2 = c("24h","40h","72h")) 96 | str(DBres, strict.width = "cut") 97 | head(DBres$`24hvs0h`) 98 | @ 99 | Significant differential events (log2-fold > 2 or log2-fold < -2, adjusted p-value < 0.05) can be further extracted by: 100 | <<>>= 101 | DBres.sig <- DBresult(tca, group1 = "0h", group2 = c("24h","40h","72h"), top.sig = TRUE) 102 | str(DBres.sig, strict.width = "cut") 103 | @ 104 | 105 | \section{Temporal pattern analysis} 106 | \subsection{Construct time course table} 107 | To detect temporal patterns of the time course sequencing data, the TCseq package uses unsupervised clustering methods. First, a time course table is created for clustering analysis. The rows of the time course table are genomic regions, and the columns are time points; the values can be chosen from normalized read counts or logFC of all time points compared to a given group. Here we compare each time point with the initial time point. 
Such a table can be created as follows: 108 | <<>>= 109 | # values are logFC 110 | tca <- timecourseTable(tca, value = "FC", control.group = "0h", norm.method = "rpkm", filter = TRUE) 111 | @ 112 | or 113 | <<>>= 114 | # values are normalized read counts 115 | tca <- timecourseTable(tca, value = "expression", norm.method = "rpkm", filter = TRUE) 116 | @ 117 | When the "filter" parameter is set to TRUE, the time course table will filter out all genomic regions with no significant changes between any two time points. The table can be accessed by: 118 | <<>>= 119 | t <- tcTable(tca) 120 | head(t) 121 | @ 122 | \subsection{Clustering analysis} 123 | Two types of clustering algorithms are included in the package: hard clustering (hierarchical, pam, kmeans) and soft clustering (fuzzy cmeans \cite{Futschik}). The temporal patterns are analyzed using the following function: 124 | <<>>= 125 | tca <- timeclust(tca, algo = "cm", k = 6, standardize = TRUE) 126 | @ 127 | Instead of the absolute values of different time series, one might focus only on the change patterns and expect time series with similar patterns to be clustered in the same group. In this case, the "standardize" parameter gives an option to perform z-score transformation on the data to be clustered, which reduces the noise introduced by differences in the absolute values. 
128 | 129 | \subsection{Visualize the clustering results} 130 | The clustering results can be visualized as follows: 131 | <>= 132 | p <- timeclustplot(tca, value = "z-score(PRKM)", cols = 3) 133 | @ 134 | 135 | \begin{figure}[H] 136 | \centering 137 | \includegraphics[width=\textwidth]{clusterRes.png} 138 | \caption{Visualization of clustering results} 139 | \end{figure} 140 | 141 | Individual clusters can also be plotted: 142 | <>= 143 | #plot cluster 1: 144 | print(p[[1]]) 145 | @ 146 | \begin{figure}[H] 147 | \centering 148 | \includegraphics[width=0.5\textwidth]{subcluster.png} 149 | \caption{Visualization of cluster 1} 150 | \end{figure} 151 | 152 | To plot the cmeans clustering results, TCseq provides several color schemes to color-code the membership values, which indicate the degree to which data points belong to a cluster. 153 | 154 | %BIBLIOGRAPHY 155 | 156 | \begin{thebibliography}{} 157 | \bibitem {Robinson} Robinson, M.D., McCarthy, D.J. and Smyth, G.K. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data, Bioinformatics, 26, 139-140, 2010. 158 | \bibitem {McCarthy} McCarthy, D.J., Chen, Y., Smyth, G.K. Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic acids research 40, 4288-4297, 2012. 159 | \bibitem{Futschik} Futschik, M.E. and Carlisle, B. Noise-robust soft clustering of gene expression time-course data, Journal of bioinformatics and computational biology, 3, 965-988, 2005. 160 | \bibitem{lokesh} L. Kumar and M. 
Futschik, Mfuzz: a software package for soft clustering of microarray data, Bioinformation, 2(1),5-7,2007 161 | 162 | \end{thebibliography} 163 | 164 | \end{document} 165 | -------------------------------------------------------------------------------- /vignettes/clusterRes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengjunWu/TCseq/f2708be21fed9fa0ea3a5c2c3f72f607cb24e84c/vignettes/clusterRes.png -------------------------------------------------------------------------------- /vignettes/subcluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengjunWu/TCseq/f2708be21fed9fa0ea3a5c2c3f72f607cb24e84c/vignettes/subcluster.png --------------------------------------------------------------------------------