├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── R
    ├── core.R
    ├── data.R
    ├── estimate_abundance.R
    ├── estimate_theta.R
    ├── fit_bias.R
    ├── helper.R
    ├── plots.R
    ├── predict.R
    └── vlmm.R
├── README.md
├── data
    └── preprocessedData.rda
├── inst
    └── CITATION
├── man
    ├── alpine-package.Rd
    ├── buildFragtypes.Rd
    ├── estimateAbundance.Rd
    ├── estimateTheta.Rd
    ├── extractAlpine.Rd
    ├── fitBiasModels.Rd
    ├── getFragmentWidths.Rd
    ├── getReadLength.Rd
    ├── mergeGenes.Rd
    ├── normalizeDESeq.Rd
    ├── plotFragLen.Rd
    ├── plotGC.Rd
    ├── plotGRL.Rd
    ├── plotOrder0.Rd
    ├── plotRelPos.Rd
    ├── predictCoverage.Rd
    ├── preprocessedData.Rd
    ├── splitGenesAcrossChroms.Rd
    └── splitLongGenes.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   └── test_alpine.R
└── vignettes
    └── alpine.Rmd


/.gitignore:
--------------------------------------------------------------------------------
1 | vignettes/figure
2 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: alpine
 2 | Title: alpine
 3 | Version: 1.5.3
 4 | Author: Michael Love, Rafael Irizarry
 5 | Maintainer: Michael Love <michaelisaiahlove@gmail.com>
 6 | Description: Fragment sequence bias modeling and correction for RNA-seq
 7 |         transcript abundance estimation.
 8 | License: GPL (>=2)
 9 | VignetteBuilder: knitr
10 | Depends: R (>= 3.3)
11 | Imports: Biostrings, IRanges, GenomicRanges, GenomicAlignments,
12 |         Rsamtools, SummarizedExperiment, GenomicFeatures, speedglm,
13 |         splines, graph, RBGL, stringr, stats, methods, graphics,
14 | 	GenomeInfoDb, S4Vectors
15 | Suggests: knitr, testthat, alpineData, rtracklayer, ensembldb,
16 |         BSgenome.Hsapiens.NCBI.GRCh38, RColorBrewer
17 | biocViews: Sequencing, RNASeq, AlternativeSplicing,
18 |         DifferentialSplicing, GeneExpression, Transcription, Coverage,
19 |         BatchEffect, Normalization, Visualization, QualityControl
20 | RoxygenNote: 5.0.1
21 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(buildFragtypes)
 4 | export(estimateAbundance)
 5 | export(extractAlpine)
 6 | export(fitBiasModels)
 7 | export(getFragmentWidths)
 8 | export(getReadLength)
 9 | export(mergeGenes)
10 | export(normalizeDESeq)
11 | export(plotFragLen)
12 | export(plotGC)
13 | export(plotGRL)
14 | export(plotOrder0)
15 | export(plotOrder1)
16 | export(plotOrder2)
17 | export(plotRelPos)
18 | export(predictCoverage)
19 | export(splitGenesAcrossChroms)
20 | export(splitLongGenes)
21 | import(Biostrings)
22 | import(GenomicAlignments)
23 | import(GenomicRanges)
24 | import(IRanges)
25 | import(Rsamtools)
26 | import(SummarizedExperiment)
27 | importFrom(GenomeInfoDb,keepSeqlevels)
28 | importFrom(GenomeInfoDb,seqlevels)
29 | importFrom(GenomicFeatures,mapToTranscripts)
30 | importFrom(RBGL,connectedComp)
31 | importFrom(S4Vectors,DataFrame)
32 | importFrom(S4Vectors,queryHits)
33 | importFrom(S4Vectors,subjectHits)
34 | importFrom(graph,ftM2graphNEL)
35 | importFrom(graphics,abline)
36 | importFrom(graphics,legend)
37 | importFrom(graphics,lines)
38 | importFrom(graphics,par)
39 | importFrom(graphics,plot)
40 | importFrom(graphics,points)
41 | importFrom(graphics,segments)
42 | importFrom(methods,as)
43 | importFrom(methods,is)
44 | importFrom(speedglm,speedglm.wfit)
45 | importFrom(splines,ns)
46 | importFrom(stats,density)
47 | importFrom(stats,dpois)
48 | importFrom(stats,formula)
49 | importFrom(stats,glm)
50 | importFrom(stats,model.matrix)
51 | importFrom(stats,poisson)
52 | importFrom(stringr,str_c)
53 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | CHANGES IN VERSION 0.99.6
 2 | -------------------------
 3 | 
 4 |     o Change estimateAbundance() and predictCoverage() interface
 5 |       such that model.names is provided instead of models, using
 6 |       a simple character vector. The gene expression term '+ gene'
 7 |       is taken care of internally.
 8 | 
 9 | CHANGES IN VERSION 0.99.5
10 | -------------------------
11 | 
12 |     o Store readlength, minsize, maxsize in fitpar and so remove
13 |       as arguments to estimateAbundance() and predictCoverage()
14 | 
15 | CHANGES IN VERSION 0.99.4
16 | -------------------------
17 | 
18 |     o Renamed the mysterious estimateTheta() to estimateAbundance()
19 | 
20 | CHANGES IN VERSION 0.99.3
21 | -------------------------
22 | 
23 |     o Allow custom knots for GC and relative position
24 |     
25 | CHANGES IN VERSION 0.99.0
26 | -------------------------
27 | 
28 |     o Package submission!
29 | 


--------------------------------------------------------------------------------
/R/core.R:
--------------------------------------------------------------------------------
  1 | #' alpine: bias corrected transcript abundance estimation
  2 | #'
  3 | #' alpine is a package for estimating and visualizing many forms of sample-specific
  4 | #' biases that can arise in RNA-seq, including fragment length
  5 | #' distribution, positional bias on the transcript, read
  6 | #' start bias (random hexamer priming), and fragment GC content
  7 | #' (amplification). It also offers bias-corrected estimates of
  8 | #' transcript abundance (FPKM). It is currently designed for
  9 | #' un-stranded paired-end RNA-seq data.
 10 | #'
 11 | #' See the package vignette for a detailed workflow.
 12 | #'
 13 | #' The main functions in this package are:
 14 | #' \enumerate{
 15 | #' \item \link{buildFragtypes} - build out features for fragment types from exons of a single gene (GRanges)
 16 | #' \item \link{fitBiasModels} - fit parameters for one or more bias models over a set of ~100 medium to highly expressed single isoform genes (GRangesList)
 17 | #' \item \link{estimateAbundance} - given a set of genome alignments (BAM files) and a set of isoforms of a gene (GRangesList), estimate the transcript abundances for these isoforms (FPKM) for various bias models
 18 | #' \item \link{extractAlpine} - given a list of output from \code{estimateAbundance}, compile an FPKM matrix across transcripts and samples
 19 | #' \item \link{predictCoverage} - given the exons of a single gene (GRanges) predict the coverage for a set of samples given fitted bias parameters and compute the observed coverage
 20 | #' }
 21 | #'
 22 | #' Some helper functions for preparing gene objects:
 23 | #' \enumerate{
 24 | #' \item \link{splitGenesAcrossChroms} - split apart "genes" where isoforms are on different chromosomes
 25 | #' \item \link{splitLongGenes} - split apart "genes" which cover a suspiciously large range, e.g. 1 Mb
 26 | #' \item \link{mergeGenes} - merge overlapping isoforms into new "genes"
 27 | #' }
 28 | #'
 29 | #' Some other assorted helper functions:
 30 | #' \enumerate{
 31 | #' \item \link{normalizeDESeq} - an across-sample normalization for FPKM matrices
 32 | #' \item \link{getFragmentWidths} - return a vector estimated fragment lengths given a set of exons for a single gene (GRanges) and a BAM file
 33 | #' \item \link{getReadLength} - return the read length of the first read across BAM files
 34 | #' }
 35 | #'
 36 | #' The plotting functions are:
 37 | #' \enumerate{
 38 | #' \item \link{plotGC} - plot the fragment GC bias curves
 39 | #' \item \link{plotFragLen} - plot the framgent length distributions
 40 | #' \item \link{plotRelPos} - plot the positional bias (5' to 3')
 41 | #' \item \link{plotOrder0}, \link{plotOrder1}, \link{plotOrder2} - plot the read start bias terms
 42 | #' \item \link{plotGRL} - a simple function for visualizing GRangesList objects
 43 | #' }
 44 | #'
 45 | #' @references
 46 | #'
 47 | #' Love, M.I., Hogenesch, J.B., and Irizarry, R.A.,
 48 | #' Modeling of RNA-seq fragment sequence bias reduces
 49 | #' systematic errors in transcript abundance estimation.
 50 | #' Nature Biotechnologyh (2016) doi: 10.1038/nbt.3682
 51 | #'
 52 | #' @author Michael Love
 53 | #'
 54 | #' @importFrom splines ns
 55 | #' @importFrom speedglm speedglm.wfit
 56 | #' @importFrom stringr str_c
 57 | #' @importFrom graph ftM2graphNEL
 58 | #' @importFrom RBGL connectedComp
 59 | #' @importFrom GenomicFeatures mapToTranscripts
 60 | #' @importFrom graphics abline legend lines par plot points segments
 61 | #' @importFrom stats density dpois formula glm model.matrix poisson
 62 | #' @importFrom methods as is
 63 | #' @importFrom GenomeInfoDb seqlevels keepSeqlevels
 64 | #' @importFrom S4Vectors DataFrame queryHits subjectHits
 65 | #' @import Biostrings IRanges GenomicRanges GenomicAlignments Rsamtools SummarizedExperiment
 66 | #'
 67 | #' @docType package
 68 | #' @name alpine-package
 69 | #' @aliases alpine-package
 70 | #' @keywords package
 71 | NULL
 72 | 
 73 | #' Build fragment types from exons
 74 | #'
 75 | #' This function constructs a DataFrame of fragment features used for
 76 | #' bias modeling, with one row for every potential fragment type that could
 77 | #' arise from a transcript. The output of this function is used by
 78 | #' \link{fitBiasModels}, and this function is used inside \link{estimateAbundance}
 79 | #' in order to model the bias affecting different fragments across isoforms
 80 | #' of a gene.
 81 | #'
 82 | #' @param exons a GRanges object with the exons for a single transcript
 83 | #' @param genome a BSgenome object
 84 | #' @param readlength the length of the reads. This doesn't necessarily
 85 | #' have to be exact (+/- 1 bp is acceptable)
 86 | #' @param minsize the minimum fragment length to model. The interval between
 87 | #' \code{minsize} and \code{maxsize} should contain the at least the
 88 | #' central 95 percent of the fragment length distribution across samples
 89 | #' @param maxsize the maximum fragment length to model
 90 | #' @param gc logical, whether to calculate the fragment GC content
 91 | #' @param gc.str logical, whether to look for presence of
 92 | #' stretches of very high GC within fragments
 93 | #' @param vlmm logical, whether to calculate the Cufflinks Variable Length
 94 | #' Markov Model (VLMM) for read start bias
 95 | #'
 96 | #' @return a DataFrame with bias features (columns) for all
 97 | #' potential fragments (rows)
 98 | #'
 99 | #' @examples
100 | #'
101 | #' library(GenomicRanges)
102 | #' library(BSgenome.Hsapiens.NCBI.GRCh38)
103 | #' data(preprocessedData)
104 | #' readlength <- 100
105 | #' minsize <- 125 # see vignette how to choose
106 | #' maxsize <- 175 # see vignette how to choose
107 | #' fragtypes <- buildFragtypes(ebt.fit[["ENST00000624447"]],
108 | #'                             Hsapiens, readlength,
109 | #'                             minsize, maxsize)
110 | #'
111 | #' @export
112 | buildFragtypes <- function(exons, genome, readlength,
113 |                            minsize, maxsize,
114 |                            gc=TRUE, gc.str=TRUE, vlmm=TRUE) {
115 |   stopifnot(is(exons,"GRanges"))
116 |   stopifnot(is(genome,"BSgenome"))
117 |   stopifnot(is.numeric(minsize) & is.numeric(maxsize) & is.numeric(readlength))
118 |   stopifnot(sum(width(exons)) >= maxsize)
119 |   stopifnot(all(c("exon_rank","exon_id") %in% names(mcols(exons))))
120 |   stopifnot(!any(strand(exons) == "*"))
121 | 
122 |   # these parameters must be fixed, as dictated by fitVLMM()
123 |   npre <- 8
124 |   npost <- 12
125 | 
126 |   map <- mapTxToGenome(exons)
127 |   l <- nrow(map)
128 |   strand <- as.character(strand(exons)[1])
129 |   start <- rep(seq_len(l-minsize+1),each=maxsize-minsize+1)
130 |   end <- as.integer(start + minsize:maxsize - 1)
131 |   mid <- as.integer(0.5 * (start + end))
132 |   relpos <- mid/l
133 |   fraglen <- as.integer(end - start + 1)
134 |   id <- IRanges(start, end)
135 |   fragtypes <- DataFrame(start=start,end=end,relpos=relpos,fraglen=fraglen,id=id)
136 |   fragtypes <- fragtypes[fragtypes$end <= l,,drop=FALSE]
137 |   exon.dna <- getSeq(genome, exons)
138 |   tx.dna <- unlist(exon.dna)
139 |   if (vlmm) {
140 |     # strings needed for VLMM
141 |     fragtypes$fivep.test <- fragtypes$start - npre >= 1
142 |     fragtypes$fivep <- as(Views(tx.dna, fragtypes$start - ifelse(fragtypes$fivep.test, npre, 0),
143 |                                 fragtypes$start + npost), "DNAStringSet")
144 |     fragtypes$threep.test <- fragtypes$end + npre <= length(tx.dna)
145 |     fragtypes$threep <- as(Views(tx.dna, fragtypes$end - npost,
146 |                                  fragtypes$end + ifelse(fragtypes$threep.test, npre, 0),),
147 |                            "DNAStringSet")
148 |     # reverse complement the three prime sequence
149 |     fragtypes$threep <- reverseComplement(fragtypes$threep)
150 |   }
151 |   if (gc) {
152 |     # get the GC content for the entire fragment
153 |     fragrange <- minsize:maxsize
154 |     gc.vecs <- lapply(fragrange, function(i) {
155 |                         letterFrequencyInSlidingView(tx.dna, view.width=i, letters="CG", as.prob=TRUE)
156 |                       })
157 |     fragtypes <- fragtypes[order(fragtypes$fraglen),,drop=FALSE]
158 |     fragtypes$gc <- do.call(c, gc.vecs)
159 |     fragtypes <- fragtypes[order(fragtypes$start),,drop=FALSE]
160 |   }
161 |   if (gc.str) {
162 |     # additional features: GC in smaller sections
163 |     gc.40 <- as.numeric(letterFrequencyInSlidingView(tx.dna, 40, letters="CG", as.prob=TRUE))
164 |     max.gc.40 <- max(Views(gc.40, fragtypes$start, fragtypes$end - 40 + 1))
165 |     gc.20 <- as.numeric(letterFrequencyInSlidingView(tx.dna, 20, letters="CG", as.prob=TRUE))
166 |     max.gc.20 <- max(Views(gc.20, fragtypes$start, fragtypes$end - 20 + 1))
167 |     fragtypes$GC40.90 <- as.numeric(max.gc.40 >= 36/40)
168 |     fragtypes$GC40.80 <- as.numeric(max.gc.40 >= 32/40)
169 |     fragtypes$GC20.90 <- as.numeric(max.gc.20 >= 18/20)
170 |     fragtypes$GC20.80 <- as.numeric(max.gc.20 >= 16/20)
171 |   }
172 |   # these are the fragment start and end in genomic space
173 |   # so for minus strand tx, gstart > gend
174 |   fragtypes$gstart <- txToGenome(fragtypes$start, map)
175 |   fragtypes$gend <- txToGenome(fragtypes$end, map)
176 |   fragtypes$gread1end <- txToGenome(fragtypes$start + readlength - 1, map)
177 |   fragtypes$gread2start <- txToGenome(fragtypes$end - readlength + 1, map)
178 |   #message("nrow fragtypes: ",nrow(fragtypes))
179 |   fragtypes
180 | }
181 | 
182 | ######### unexported core functions #########
183 | 
184 | startLeft <- function(x) {
185 |   first.plus <- as.logical(strand(first(x)) == "+")
186 |   ifelse(first.plus, start(first(x)), start(last(x)))
187 | }
188 | endRight <- function(x) {
189 |   first.plus <- as.logical(strand(first(x)) == "+")
190 |   ifelse(first.plus, end(last(x)), end(first(x)))
191 | }
192 | mapTxToGenome <- function(exons) {
193 |   strand <- as.character(strand(exons)[1])
194 |   stopifnot(all(exons$exon_rank == seq_along(exons)))
195 | 
196 |   # Hack to replicate `rev` from old S4Vectors:::fancy_mseq
197 |   if(strand == "-"){
198 | 
199 |     froms <- start(exons) + width(exons) - 1L
200 |     bys <- -1L
201 | 
202 |   }else{
203 | 
204 |     froms <- start(exons)
205 |     bys <- 1L
206 | 
207 |   }
208 | 
209 |   bases <- sequence(width(exons), from = froms,
210 |                     by = bys)
211 | 
212 |   data.frame(tx=seq_along(bases),
213 |              genome=bases,
214 |              exon_rank=rep(exons$exon_rank, width(exons)))
215 | }
216 | genomeToTx <- function(genome, map) map$tx[match(genome, map$genome)]
217 | txToGenome <- function(tx, map) map$genome[match(tx, map$tx)]
218 | txToExon <- function(tx, map) map$exon_rank[match(tx, map$tx)]
219 | gaToReadsOnTx <- function(ga, grl, fco=NULL) {
220 |   reads <- list()
221 |   for (i in seq_along(grl)) {
222 |     exons <- grl[[i]]
223 |     strand <- as.character(strand(exons)[1])
224 |     read.idx <- if (is.null(fco)) {
225 |       seq_along(ga)
226 |     } else {
227 |       queryHits(fco)[subjectHits(fco) == i]
228 |     }
229 |     map <- mapTxToGenome(exons)
230 |     # depending on strand of gene:
231 |     # start of left will be the first coordinate on the transcript (+ gene)
232 |     # or start of left will be the last coordinate on the transcript (- gene)
233 |     if (strand == "+") {
234 |       start <- genomeToTx(startLeft(ga[read.idx]), map)
235 |       end <- genomeToTx(endRight(ga[read.idx]), map)
236 |     } else if (strand == "-") {
237 |       start <- genomeToTx(endRight(ga[read.idx]), map)
238 |       end <- genomeToTx(startLeft(ga[read.idx]), map)
239 |     }
240 |     valid <- start < end & !is.na(start) & !is.na(end)
241 |     reads[[i]] <- IRanges(start[valid], end[valid])
242 |   }
243 |   names(reads) <- names(grl)
244 |   reads
245 | }
246 | matchReadsToFraglist <- function(reads, fraglist) {
247 |   for (tx.idx in seq_along(fraglist)) {
248 |     uniq.reads <- unique(reads[[tx.idx]])
249 |     readtab <- table(match(reads[[tx.idx]], uniq.reads))
250 |     fraglist[[tx.idx]]$count <- 0
251 |     # this can be slow (up to 1 min) when fraglist has many millions of rows
252 |     match.uniq <- match(uniq.reads, fraglist[[tx.idx]]$id)
253 |     reads.in.fraglist <- !is.na(match.uniq)
254 |     # uniq.reads <- uniq.reads[reads.in.fraglist] # not needed
255 |     readtab <- readtab[reads.in.fraglist]
256 |     # the map between {uniq.reads that are in fraglist} and {rows of fraglist}
257 |     match.uniq.non.na <- match.uniq[!is.na(match.uniq)]
258 |     fraglist[[tx.idx]][match.uniq.non.na,"count"] <- as.numeric(readtab)
259 |   }
260 |   fraglist
261 | }
262 | 
263 | subsetAndWeightFraglist <- function(fraglist, downsample=20, minzero=2000) {
264 |   unique.zero.list <- list()
265 |   for (tx in seq_len(length(fraglist))) {
266 |     # need to make a unique id for each fragment
267 |     fraglist[[tx]]$genomic.id <- str_c(fraglist[[tx]]$gstart,"-",
268 |                                        fraglist[[tx]]$gread1end,"-",
269 |                                        fraglist[[tx]]$gread2start,"-",
270 |                                        fraglist[[tx]]$gend)
271 |     unique.zero.list[[tx]] <- fraglist[[tx]]$genomic.id[fraglist[[tx]]$count == 0]
272 |   }
273 |   unique.zero <- unique(do.call(c, unique.zero.list))
274 |   sumzero <- length(unique.zero)
275 |   numzero <- round(sumzero / downsample)
276 |   numzero <- max(numzero, minzero)
277 |   numzero <- min(numzero, sumzero)
278 |   unique.ids <- sample(unique.zero, numzero, replace=FALSE)
279 |   # once again, this time grab all fragments with positive count or in our list of zeros
280 |   fraglist.sub <- list()
281 |   for (tx in seq_len(length(fraglist))) {
282 |     idx.pos <- which(fraglist[[tx]]$count > 0)
283 |     idx.zero <- which(fraglist[[tx]]$genomic.id %in% unique.ids)
284 |     fraglist.sub[[tx]] <- fraglist[[tx]][c(idx.pos,idx.zero),,drop=FALSE]
285 |   }
286 |   fragtypes <- do.call(rbind, fraglist.sub)
287 |   # the zero weight is the number of unique zero count fragtypes in the original fraglist
288 |   # divided by the current (down-sampled) number of zero count fragtypes
289 |   zero.wt <- sumzero / numzero
290 |   # return fragtypes, but with duplicate rows for selected fragments
291 |   fragtypes$wts <- rep(1, nrow(fragtypes))
292 |   fragtypes$wts[fragtypes$count == 0] <- zero.wt
293 |   fragtypes
294 | }
295 | 
296 | matchToDensity <- function(x, d) {
297 |   idx <- cut(x, c(-Inf, d$x, Inf))
298 |   pdf <- c(0, d$y)
299 |   pdf.x <- pdf[ idx ] + 1e-6
300 |   stopifnot(all(pdf.x > 0))
301 |   pdf.x
302 | }
303 | getFPBP <- function(genes, bam.file) {
304 |   gene.ranges <- unlist(range(genes))
305 |   gene.lengths <- sum(width(genes))
306 |   res <- countBam(bam.file, param=ScanBamParam(which=gene.ranges))
307 |   # two records per fragment
308 |   out <- (res$records / 2)/gene.lengths
309 |   names(out) <- names(genes)
310 |   out
311 | }
312 | getLogLambda <- function(fragtypes, models, modeltype, fitpar, bamname) {
313 | 
314 |   # knots and boundary knots need to come from the fitted parameters object
315 |   # (just use the first sample, knots will be the same across samples)
316 |   model.params <- fitpar[[1]][["model.params"]]
317 |   stopifnot(!is.null(model.params))
318 | 
319 |   gc.knots <- model.params$gc.knots
320 |   gc.bk <- model.params$gc.bk
321 |   relpos.knots <- model.params$relpos.knots
322 |   relpos.bk <- model.params$relpos.bk
323 | 
324 |   # which formula to use
325 |   f <- models[[modeltype]]$formula
326 | 
327 |   offset <- numeric(nrow(fragtypes))
328 |   if ("fraglen" %in% models[[modeltype]]$offset) {
329 |     # message("-- fragment length correction")
330 |     offset <- offset + fragtypes$logdfraglen
331 |   }
332 |   if ("vlmm" %in% models[[modeltype]]$offset) {
333 |     # message("-- VLMM fragment start/end correction")
334 |     offset <- offset + fragtypes$fivep.bias + fragtypes$threep.bias
335 |   }
336 |   if (!is.null(f)) {
337 |     stopifnot(modeltype %in% names(fitpar[[bamname]][["coefs"]]))
338 |     # assume: no intercept in formula
339 |     # sparse.model.matrix produces different column names, so don't use
340 |     # mm.big <- sparse.model.matrix(f, data=fragtypes)
341 |     mm.big <- model.matrix(formula(f), data=fragtypes)
342 |     beta <- fitpar[[bamname]][["coefs"]][[modeltype]]
343 |     stopifnot(any(colnames(mm.big) %in% names(beta)))
344 |     if (all(is.na(beta))) stop("all coefs are NA")
345 |     beta[is.na(beta)] <- 0 # replace NA coefs with 0: these were not observed in the training data
346 |     # this gets rid of the gene1, gene2 and Intercept terms
347 |     beta <- beta[match(colnames(mm.big), names(beta))]
348 |     # add offset
349 |     log.lambda <- as.numeric(mm.big %*% beta) + offset
350 |   } else {
351 |     log.lambda <- offset
352 |   }
353 |   if (!all(is.finite(log.lambda))) stop("log.lambda is not finite")
354 |   log.lambda
355 | }
356 | namesToModels <- function(model.names, fitpar) {
357 |   # create the model.bank
358 |   model.bank <- c(fitpar[[1]][["models"]],
359 |                   list("null"=list(formula=NULL, offset=NULL),
360 |                        "fraglen"=list(formula=NULL, offset="fraglen"),
361 |                        "vlmm"=list(formula=NULL, offset="vlmm"),
362 |                        "fraglen.vlmm"=list(formula=NULL, offset=c("fraglen","vlmm"))))
363 |   models <- model.bank[model.names]
364 |   # replace '+ gene' with '+ 0' in formula
365 |   for (m in model.names) {
366 |     if (!is.null(models[[m]]$formula)) {
367 |       if (!grepl("\\+ gene$",models[[m]]$formula)) {
368 |         stop("was expecting '+ gene' to be at the end of the formula string from fitpar")
369 |       }
370 |       models[[m]]$formula <- sub("\\+ gene$","\\+ 0",models[[m]]$formula)
371 |     }
372 |   }
373 |   models
374 | }
375 | 
376 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' Preprocessed data for vignettes and examples 
 2 | #'
 3 | #' The following data objects are prepared for use
 4 | #' in the alpine vignette and examples pages,
 5 | #' as the preparation of these objects requires
 6 | #' either long running time or a large amount of disk
 7 | #' space.
 8 | #'
 9 | #' \itemize{
10 | #'   \item \strong{ebt.fit} - the GRangesList prepared in the vignette
11 | #' for fitting the bias models
12 | #'   \item \strong{fitpar} - the fitted parameters, similar to those
13 | #' made in the vignette, but using \code{minsize=80} and \code{maxsize=350}
14 | #'   \item \strong{fitpar.small} - the fitted parameters from the
15 | #' vignette, returned by fitBiasModels
16 | #'   \item \strong{res} - the results object from the vignette,
17 | #' returned by estimateAbundance
18 | #'   \item \strong{ebt.theta} - the GRangesList prepared in the vignette
19 | #' for running estimateAbundance
20 | #'   \item \strong{genes.theta} - the names of genes used in the vignette
21 | #' for running estimateAbundance
22 | #'   \item \strong{txdf.theta} - the DataFrame of gene and transcript
23 | #' information used in the vignette for running estimateAbundance
24 | #' }
25 | #'
26 | #' @docType data
27 | #' @name preprocessedData
28 | #' @aliases ebt.fit ebt.theta fitpar fitpar.small genes.theta res txdf.theta
29 | #' 
30 | #' 
31 | #' @format \code{ebt.fit} and \code{ebt.theta} are GRangesList.
32 | #' \code{fitpar}, \code{fitpar.small}, \code{res} are lists created
33 | #' by alpine functions. \code{genes.theta} is a character vector.
34 | #' \code{txdf.theta} is a DataFrame.
35 | #' @source See vignette for details of object construction.
36 | #' The alignments come from alpineData (4 samples from GEUVADIS project),
37 | #' the Ensembl gene annotations come from \code{Homo_sapiens.GRCh38.84.gtf},
38 | #' and the genome is \code{BSgenome.Hsapiens.NCBI.GRCh38}.
39 | NULL
40 | 


--------------------------------------------------------------------------------
/R/estimate_abundance.R:
--------------------------------------------------------------------------------
  1 | #' Estimate bias-corrected transcript abundances (FPKM) 
  2 | #'
  3 | #' This function takes the fitted bias parameters from \link{fitBiasModels}
  4 | #' and uses this information to derive bias corrected estimates of
  5 | #' transcript abundance for a gene (with one or more isoforms)
  6 | #' across multiple samples.
  7 | #' 
  8 | #' @param transcripts a GRangesList of the exons for multiple isoforms of a gene.
  9 | #' For a single-isoform gene, just wrap the exons in \code{GRangesList()}
 10 | #' @param bam.files a named vector pointing to the indexed BAM files
 11 | #' @param fitpar the output of \link{fitBiasModels}
 12 | #' @param genome a BSGenome object
 13 | #' @param model.names a character vector of the bias models to use.
 14 | #' These should have already been specified when calling \link{fitBiasModels}.
 15 | #' Four exceptions are models that use none, one or both of the offsets,
 16 | #' and these are called with:
 17 | #' \code{"null"}, \code{"fraglen"}, \code{"vlmm"}, or \code{"fraglen.vlmm"}.
 18 | #' @param subset logical, whether to downsample the non-observed fragments. Default is TRUE
 19 | #' @param niter the number of EM iterations. Default is 100.
 20 | #' @param lib.sizes a named vector of library sizes to use in calculating the FPKM.
 21 | #' If NULL (the default) a value of 1e6 is used for all samples.
 22 | #' @param optim logical, whether to use numerical optimization instead of the EM.
 23 | #' Default is FALSE.
 24 | #' @param custom.features an optional function to add custom features
 25 | #' to the fragment types DataFrame. This function takes in a DataFrame
 26 | #' returned by \link{buildFragtypes} and returns a DataFrame
 27 | #' with additional columns added. Default is NULL, adding no custom features.
 28 | #'
 29 | #' @return a list of lists. For each sample, a list with elements:
 30 | #' theta, lambda and count.
 31 | #' \itemize{
 32 | #' \item \strong{theta} gives the FPKM estimates for the
 33 | #' isoforms in \code{transcripts}
 34 | #' \item \strong{lambda} gives the average bias term
 35 | #' for the isoforms
 36 | #' \item \strong{count} gives the number of fragments which are
 37 | #' compatible with any of the isoforms in \code{transcripts}
 38 | #' }
 39 | #'
 40 | #' @references
 41 | #'
 42 | #' The model describing how bias estimates are used to
 43 | #' estimate bias-corrected abundances is described in
 44 | #' the Supplemental Note of the following publication:
 45 | #'
 46 | #' Love, M.I., Hogenesch, J.B., and Irizarry, R.A.,
 47 | #' Modeling of RNA-seq fragment sequence bias reduces
 48 | #' systematic errors in transcript abundance estimation.
 49 | #' Nature Biotechnologyh (2016) doi: 10.1038/nbt.3682
 50 | #'
 51 | #' The likelihood formulation and EM algorithm
 52 | #' for finding the maximum likelihood estimate for abundances
 53 | #' follows this publication:
 54 | #'
 55 | #' Salzman, J., Jiang, H., and Wong, W.H.,
 56 | #' Statistical Modeling of RNA-Seq Data.
 57 | #' Statistical Science (2011) doi: 10.1214/10-STS343
 58 | #' 
 59 | #' @examples
 60 | #'
 61 | #' # see vignette for a more realistic example
 62 | #'
 63 | #' # these next lines just write out a BAM file from R
 64 | #' # typically you would already have a BAM file
 65 | #' library(alpineData)
 66 | #' library(GenomicAlignments)
 67 | #' library(rtracklayer)
 68 | #' gap <- ERR188088()
 69 | #' dir <- system.file(package="alpineData", "extdata")
 70 | #' bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 71 | #' export(gap, con=bam.file)
 72 | #' 
 73 | #' data(preprocessedData)
 74 | #' library(GenomicRanges)
 75 | #' library(BSgenome.Hsapiens.NCBI.GRCh38)
 76 | #' 
 77 | #' model.names <- c("fraglen","GC")
 78 | #' 
 79 | #' txs <- txdf.theta$tx_id[txdf.theta$gene_id == "ENSG00000198918"]
 80 | #' 
 81 | #' res <- estimateAbundance(transcripts=ebt.theta[txs],
 82 | #'                          bam.files=bam.file,
 83 | #'                          fitpar=fitpar.small,
 84 | #'                          genome=Hsapiens,
 85 | #'                          model.names=model.names)
 86 | #' 
 87 | #' @export
 88 | estimateAbundance <- function(transcripts, bam.files, fitpar, genome, model.names, 
 89 |                               subset=TRUE, niter=100, lib.sizes=NULL, optim=FALSE,
 90 |                               custom.features=NULL) {
 91 | 
 92 |   stopifnot(is(transcripts, "GRangesList"))
 93 |   stopifnot(length(transcripts) >= 1)
 94 |   singleiso <- length(transcripts) == 1
 95 |   stopifnot(!is.null(names(transcripts)))
 96 |   stopifnot(all(c("exon_rank","exon_id") %in% names(mcols(transcripts[[1]]))))
 97 | 
 98 |   stopifnot(!is.null(fitpar))
 99 |   stopifnot(all(!is.null(names(bam.files))))
100 |   stopifnot(all(names(bam.files) %in% names(fitpar)))
101 |   stopifnot(all(file.exists(bam.files)))
102 | 
103 |   # pull out some model parameters
104 |   stopifnot(all(c("readlength","minsize","maxsize","maxsize") %in%
105 |                 names(fitpar[[1]][["model.params"]])))
106 |   readlength <- fitpar[[1]][["model.params"]][["readlength"]]
107 |   minsize <- fitpar[[1]][["model.params"]][["minsize"]]
108 |   maxsize <- fitpar[[1]][["model.params"]][["maxsize"]]
109 | 
110 |   # take model names and fitpar models and make the
111 |   # models suitable for bias calculation
112 |   models <- namesToModels(model.names, fitpar)
113 |   
114 |   stopifnot(all(file.exists(paste0(bam.files, ".bai"))))
115 |   if (!is.null(lib.sizes)) stopifnot(all(names(bam.files) %in% names(lib.sizes)))
116 |   if (is.null(lib.sizes)) {
117 |     lib.sizes <- rep(1e6, length(bam.files))
118 |     names(lib.sizes) <- names(bam.files)
119 |   }
120 |   
121 |   w <- sum(width(transcripts))
122 | 
123 |   # is VLMM one of the offsets for any model
124 |   any.vlmm <- any(sapply(models, function(m) "vlmm" %in% m$offset))
125 | 
126 |   # TODO: give better output for genes with smaller length than minsize
127 |   if (min(w) <= minsize + 1) return(NULL)
128 | 
129 |   if (min(w) <= maxsize) {
130 |     maxsize <- min(w)
131 |   }
132 |   
133 |   # this is a list of fragment types for each transcript
134 |   st <- system.time({
135 |     # TODO: could also save time by only doing GC stretches if necessary
136 |     fraglist <- lapply(seq_along(transcripts), function(i) {
137 |       out <- buildFragtypes(transcripts[[i]], genome, readlength,
138 |                             minsize, maxsize, vlmm=any.vlmm)
139 |       # optionally add more features to the fragment types DataFrame
140 |       if (!is.null(custom.features)) {
141 |         out <- custom.features(out)
142 |       }
143 |       out$tx <- names(transcripts)[i]
144 |       out
145 |     })
146 |   })
147 | 
148 |   #message("building fragment types: ",round(unname(st[3]),1)," seconds")
149 |   names(fraglist) <- names(transcripts)
150 | 
151 |   res <- lapply(seq_along(bam.files), function(i) {
152 |     bam.file <- bam.files[i]
153 |     bamname <- names(bam.file)
154 |     txrange <- unlist(range(transcripts))
155 |     strand(txrange) <- "*"
156 |     generange <- range(txrange)
157 |     
158 |     #message("align reads to txs")
159 |     suppressWarnings({
160 |       ga <- readGAlignAlpine(bam.file, generange)
161 |     })
162 |     #message("-- ",length(ga)," reads")
163 | 
164 |     outputZero <- FALSE
165 |     if (length(ga) == 0) {
166 |       numCompatible <- 0
167 |       outputZero <- TRUE
168 |     } else { 
169 |       ga <- keepSeqlevels(ga, as.character(seqnames(transcripts[[1]])[1]))
170 |       fco <- findCompatibleOverlaps(ga, transcripts)
171 |       numCompatible <- length(unique(queryHits(fco)))
172 |       #message("-- ",round(numCompatible/length(ga),2)," compatible overlaps")
173 |       #message("---- ",seqnames(generange),":",start(generange),"-",end(generange))
174 |       # table(strand(ga)[unique(queryHits(fco))]) # are the read counts even across strand?
175 |       # boxplot(lapply(reads, function(x) width(x)))
176 |       # here the variable is called "reads" although they are really fragments.
177 |       # everything is already called fragments :-/
178 |       reads <- gaToReadsOnTx(ga, transcripts, fco)
179 |       fraglist.temp <- matchReadsToFraglist(reads, fraglist)
180 |       txcounts <- sapply(fraglist.temp, function(x) sum(x$count))
181 |       #message("---- ",paste(txcounts, collapse=" "))
182 |       if (all(txcounts == 0)) outputZero <- TRUE
183 |     }
184 | 
185 |     # report 0 output for all models if all txs have 0 count
186 |     names(model.names) <- model.names
187 |     nms.tx <- names(transcripts)
188 |     if (outputZero) {
189 |       #message("all transcripts have 0 counts")
190 |       theta <- numeric(length(nms.tx))
191 |       lambda <- rep(NA,length(nms.tx)) # don't bother calculating lambda
192 |       names(lambda) <- names(theta) <- nms.tx
193 |       # for all models:
194 |       res.sub <- lapply(model.names, function(x) {
195 |                    list(theta=theta, lambda=lambda)
196 |                  })
197 |       return(c(res.sub,count=0)) # return results for this sample
198 |     }
199 |     
200 |     if (subset) {
201 |       st <- system.time({
202 |         fragtypes <- subsetAndWeightFraglist(fraglist.temp)
203 |       })
204 |      #message("subset and weight fragment types: ", round(unname(st[3]),1), " seconds")
205 |     } else {
206 |         fragtypes <- do.call(rbind, fraglist.temp)
207 |         # this is done in subsetAndWeightFraglist()
208 |         fragtypes$genomic.id <- paste0(fragtypes$gstart,"-",fragtypes$gread1end,"-",
209 |                                        fragtypes$gread2start,"-",fragtypes$gend)
210 |     }
211 | 
212 |     # message("fragment bias")
213 |     ## -- fragment bias --
214 |     fraglen.density <- fitpar[[bamname]][["fraglen.density"]]
215 |     fragtypes$logdfraglen <- log(matchToDensity(fragtypes$fraglen, fraglen.density))
216 | 
217 |     if (any.vlmm) {
218 |       stopifnot( "vlmm.fivep" %in% names(fitpar[[bamname]]) )
219 |       # message("priming bias")
220 |       ## -- random hexamer priming bias with VLMM --
221 |       vlmm.fivep <- fitpar[[bamname]][["vlmm.fivep"]]
222 |       vlmm.threep <- fitpar[[bamname]][["vlmm.threep"]]
223 |       fragtypes <- addVLMMBias(fragtypes, vlmm.fivep, vlmm.threep)
224 |     }
225 |       
226 |     # specific code for one isoform
227 |     if (singleiso) {
228 |       n.obs <- fragtypes$count
229 |       # this gives list output for one BAM file
230 |       res.sub <- lapply(model.names, function(modeltype) {
231 |         log.lambda <- getLogLambda(fragtypes, models, modeltype, fitpar, bamname)
232 |         log.lambda <- as.numeric(log.lambda)
233 |         N <- if (is.null(lib.sizes)) {
234 |           mean(n.obs)
235 |         } else {
236 |             # TODO: here fix like below
237 |             lib.sizes[bamname] / (1e9 * (maxsize - minsize))
238 |           }
239 |         A <- N * exp(log.lambda)
240 |         wts <- if (subset) { fragtypes$wts } else { 1 } 
241 |         theta <- sum(n.obs  * wts)/sum(A * wts)
242 |         lambda <- sum(wts * exp(log.lambda)) / sum(wts)
243 |         names(lambda) <- names(theta) <- names(transcripts)
244 |         list(theta=theta, lambda=lambda)
245 |       })
246 |       return(c(res.sub,count=numCompatible)) # return results for this sample
247 |     }
248 | 
249 |     # make incidence matrix
250 |     # duplicate genomic ID across tx will be a single column 
251 |     mat <- incidenceMat(fragtypes$tx, fragtypes$genomic.id)
252 |     # make sure the rows are in correct order
253 |     stopifnot(all(rownames(mat) == names(transcripts)))
254 | 
255 |     # NOTE: duplicated weights and bias are not the same for each tx.
256 |     # The bias will often be identical for read start bias,
257 |     # and very close for fragment length and fragment GC content given long reads.
258 |     # It will not be so similar for relative position bias.
259 |     # Zhonghui Xu points out: why not do the extra bookkeeping and
260 |     # have the proper lambda-hat_ij fill out the A matrix.
261 |     fragtypes.sub <- fragtypes[!duplicated(fragtypes$genomic.id),,drop=FALSE]
262 |     stopifnot(all(fragtypes.sub$genomic.id == colnames(mat)))
263 | 
264 |     #message("run EM for models: ",paste(names(models), collapse=", "))
265 |     n.obs <- fragtypes.sub$count
266 |     
267 |     # run EM for different models
268 |     # this gives list output for one BAM file
269 |     res.sub <- lapply(model.names, function(modeltype) {
270 |       log.lambda <- getLogLambda(fragtypes, models, modeltype, fitpar, bamname)
271 |       ## pred0 <- as.numeric(exp(log.lambda))
272 |       ## pred <- pred0/mean(pred0)*mean(fragtypes.sub$count)
273 |       ## boxplot(pred ~ factor(cut(fragtypes.sub$count,c(-1:10 + .5,20,Inf))), main=modeltype, range=0)
274 |       if (is.null(lib.sizes)) {
275 |         N <- mean(n.obs)
276 |       } else {
277 |         # TODO: in addition to the interval of considered lengths L
278 |         # account for the triangle of fragments not in the count matrix
279 |         # (analagous to effective length)
280 |         N <- lib.sizes[bamname] / (1e9 * (maxsize - minsize))
281 |       }
282 |       
283 |       # transcript-specific bias
284 |       lambda.mat <- mat
285 |       for (tx in names(transcripts)) {
286 |         tx.id <- fragtypes$genomic.id[fragtypes$tx == tx]
287 |         tx.idx <- match(tx.id, colnames(mat))
288 |         lambda.mat[tx, tx.idx] <- exp(log.lambda[fragtypes$tx == tx])
289 |       }
290 | 
291 |       wts <- if (subset) { fragtypes.sub$wts } else { 1 }
292 | 
293 |       # A also includes the library size
294 |       A <- N * lambda.mat 
295 |       theta <- runEM(n.obs, A, wts, niter, optim)
296 | 
297 |       # the average lambda for each transcript is stored in results
298 |       lambda <- if (subset) {
299 |         lambda.mat %*% wts / mat %*% wts
300 |       } else {
301 |         rowSums(lambda.mat) / rowSums(mat)
302 |       }
303 | 
304 |       lambda <- as.numeric(lambda)
305 |       names(lambda) <- names(transcripts)
306 |       list(theta=theta, lambda=lambda)
307 |     })
308 |     return(c(res.sub, count=numCompatible))
309 |   })
310 |   names(res) <- names(bam.files)
311 |   res
312 | }
313 | 
314 | ######### unexported EM functions #########
315 | 
316 | incidenceMat <- function(x, y, numeric=TRUE) {
317 |   # borrowed from Wolfgang Huber
318 |   ux = unique(x)
319 |   uy = unique(y)
320 |   im = matrix(FALSE, nrow=length(ux), ncol=length(uy), dimnames=list(ux, uy))
321 |   im[ cbind(x, y) ] = TRUE
322 |   if (numeric) {
323 |     mode(im) <- "numeric"
324 |   }
325 |   return(im)
326 | }
327 | runEM <- function(n.obs, A, wts=1, niter=20, optim=FALSE) {
328 |   J <- ncol(A)
329 |   ntx <- nrow(A)
330 |   log.like <- function(theta.hat) {
331 |     sum(wts * dpois(n.obs, colSums(A * theta.hat), log=TRUE))
332 |   }
333 |   theta.hat <- rep(1, ntx)
334 |   theta.0 <- rep(1, ntx)
335 |   n.obs.sub <- n.obs[n.obs > 0]
336 |   A.sub <- A[,n.obs > 0,drop=FALSE]
337 |   rowSumsA <- rowSums(t(t(A) * wts))
338 |   if (!optim) {
339 |     for (tt in 1:niter) {
340 |       n.hat <- t(t(theta.hat * A.sub) * n.obs.sub / colSums(theta.hat * A.sub))
341 |       theta.hat <- rowSums(n.hat) / rowSumsA
342 |     }
343 |   } else {
344 |     theta.hat <- optim(theta.hat, log.like,
345 |                        lower=rep(1e-6,ntx), upper=rep(1e6,ntx),
346 |                        control=list(fnscale=-1), method="L-BFGS-B")$par
347 |   }
348 |   theta.hat
349 | }
350 | 


--------------------------------------------------------------------------------
/R/estimate_theta.R:
--------------------------------------------------------------------------------
  1 | #' Estimate bias-corrected transcript abundances (FPKM) 
  2 | #'
  3 | #' This function takes the fitted bias parameters from \link{fitBiasModels}
  4 | #' and uses this information to derive bias corrected estimates of
  5 | #' transcript abundance for a gene (with one or more isoforms)
  6 | #' across multiple samples.
  7 | #' 
  8 | #' @param transcripts a GRangesList of the exons for multiple isoforms of a gene.
  9 | #' For a single-isoform gene, just wrap the exons in \code{GRangesList()}
 10 | #' @param bam.files a named vector pointing to the indexed BAM files
 11 | #' @param fitpar the output of \link{fitBiasModels}
 12 | #' @param genome a BSGenome object
 13 | #' @param models a list of character strings or formula describing the bias models, see vignette
 14 | #' @param readlength the read length
 15 | #' @param minsize the minimum fragment length to model
 16 | #' @param maxsize the maximum fragment length to model
 17 | #' @param subset logical, whether to downsample the non-observed fragments. Default is TRUE
 18 | #' @param niter the number of EM iterations. Default is 100.
 19 | #' @param lib.sizes a named vector of library sizes to use in calculating the FPKM.
 20 | #' If NULL (the default) a value of 1e6 is used for all samples.
 21 | #' @param optim logical, whether to use numerical optimization instead of the EM.
 22 | #' Default is FALSE.
 23 | #' @param custom.features an optional function to add custom features
 24 | #' to the fragment types DataFrame. This function takes in a DataFrame
 25 | #' returned by \link{buildFragtypes} and returns a DataFrame
 26 | #' with additional columns added. Default is NULL, adding no custom features.
 27 | #'
 28 | #' @return a list of lists. For each sample, a list with elements:
 29 | #' theta, lambda and count.
 30 | #' \itemize{
 31 | #' \item \strong{theta} gives the FPKM estimates for the
 32 | #' isoforms in \code{transcripts}
 33 | #' \item \strong{lambda} gives the average bias term
 34 | #' for the isoforms
 35 | #' \item \strong{count} gives the number of fragments which are
 36 | #' compatible with any of the isoforms in \code{transcripts}
 37 | #' }
 38 | #'
 39 | #' @examples
 40 | #'
 41 | #' # see vignette for a more realistic example
 42 | #'
 43 | #' # these next lines just write out a BAM file from R
 44 | #' # typically you would already have a BAM file
 45 | #' library(alpineData)
 46 | #' library(GenomicAlignments)
 47 | #' library(rtracklayer)
 48 | #' gap <- ERR188088()
 49 | #' dir <- system.file(package="alpineData", "extdata")
 50 | #' bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 51 | #' export(gap, con=bam.file)
 52 | #' 
 53 | #' data(preprocessedData)
 54 | #' library(GenomicRanges)
 55 | #' library(BSgenome.Hsapiens.NCBI.GRCh38)
 56 | #' models <- list(
 57 | #'  "GC"=list(formula="count~
 58 | #'  ns(gc,knots=gc.knots,Boundary.knots=gc.bk) +
 59 | #'  ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) +
 60 | #'  0",
 61 | #'  offset=c("fraglen"))
 62 | #' )
 63 | #'
 64 | #' readlength <- 75
 65 | #' minsize <- 125 # see vignette how to choose
 66 | #' maxsize <- 175 # see vignette how to choose
 67 | #' txs <- txdf.theta$tx_id[txdf.theta$gene_id == "ENSG00000198918"]
 68 | #' 
 69 | #' res <- estimateTheta(transcripts=ebt.theta[txs],
 70 | #'                      bam.files=bam.file,
 71 | #'                      fitpar=fitpar.small,
 72 | #'                      genome=Hsapiens,
 73 | #'                      models=models,
 74 | #'                      readlength=readlength,
 75 | #'                      minsize=minsize,
 76 | #'                      maxsize=maxsize)
 77 | #' 
 78 | #' @export
 79 | estimateTheta <- function(transcripts, bam.files, fitpar, genome,
 80 |                           models, readlength, minsize, maxsize,
 81 |                           subset=TRUE, niter=100, 
 82 |                           lib.sizes=NULL, optim=FALSE,
 83 |                           custom.features=NULL) {
 84 |   
 85 |   stopifnot(is(transcripts, "GRangesList"))
 86 |   stopifnot(length(transcripts) >= 1)
 87 |   singleiso <- length(transcripts) == 1
 88 |   stopifnot(!is.null(names(transcripts)))
 89 |   stopifnot(all(c("exon_rank","exon_id") %in% names(mcols(transcripts[[1]]))))
 90 | 
 91 |   stopifnot(!is.null(fitpar))
 92 |   stopifnot(all(!is.null(names(bam.files))))
 93 |   stopifnot(all(names(bam.files) %in% names(fitpar)))
 94 |   stopifnot(all(file.exists(bam.files)))
 95 | 
 96 |   stopifnot(all(file.exists(paste0(bam.files, ".bai"))))
 97 |   if (!is.null(lib.sizes)) stopifnot(all(names(bam.files) %in% names(lib.sizes)))
 98 |   if (is.null(lib.sizes)) {
 99 |     lib.sizes <- rep(1e6, length(bam.files))
100 |     names(lib.sizes) <- names(bam.files)
101 |   }
102 |   
103 |   w <- sum(width(transcripts))
104 | 
105 |   # is VLMM one of the offsets for any model
106 |   any.vlmm <- any(sapply(models, function(m) "vlmm" %in% m$offset))
107 | 
108 |   # TODO: give better output for genes with smaller length than minsize
109 |   if (min(w) <= minsize + 1) return(NULL)
110 | 
111 |   if (min(w) <= maxsize) {
112 |     maxsize <- min(w)
113 |   }
114 |   
115 |   # TODO: come up with a check on whether models is compatible with fitpar
116 |   
117 |   # this is a list of fragment types for each transcript
118 |   st <- system.time({
119 |     # TODO: could also save time by only doing GC stretches if necessary
120 |     fraglist <- lapply(seq_along(transcripts), function(i) {
121 |       out <- buildFragtypes(transcripts[[i]], genome, readlength,
122 |                             minsize, maxsize, vlmm=any.vlmm)
123 |       # optionally add more features to the fragment types DataFrame
124 |       if (!is.null(custom.features)) {
125 |         out <- custom.features(out)
126 |       }
127 |       out$tx <- names(transcripts)[i]
128 |       out
129 |     })
130 |   })
131 | 
132 |   #message("building fragment types: ",round(unname(st[3]),1)," seconds")
133 |   names(fraglist) <- names(transcripts)
134 | 
135 |   res <- lapply(seq_along(bam.files), function(i) {
136 |     bam.file <- bam.files[i]
137 |     bamname <- names(bam.file)
138 |     txrange <- unlist(range(transcripts))
139 |     strand(txrange) <- "*"
140 |     generange <- range(txrange)
141 |     
142 |     #message("align reads to txs")
143 |     suppressWarnings({
144 |       ga <- readGAlignAlpine(bam.file, generange)
145 |     })
146 |     #message("-- ",length(ga)," reads")
147 | 
148 |     outputZero <- FALSE
149 |     if (length(ga) == 0) {
150 |       numCompatible <- 0
151 |       outputZero <- TRUE
152 |     } else { 
153 |       ga <- keepSeqlevels(ga, as.character(seqnames(transcripts[[1]])[1]))
154 |       fco <- findCompatibleOverlaps(ga, transcripts)
155 |       numCompatible <- length(unique(queryHits(fco)))
156 |       #message("-- ",round(numCompatible/length(ga),2)," compatible overlaps")
157 |       #message("---- ",seqnames(generange),":",start(generange),"-",end(generange))
158 |       # table(strand(ga)[unique(queryHits(fco))]) # are the read counts even across strand?
159 |       # boxplot(lapply(reads, function(x) width(x)))
160 |       # here called "reads" although they are fragments. everything is already called fragments :-/
161 |       reads <- gaToReadsOnTx(ga, transcripts, fco)
162 |       fraglist.temp <- matchReadsToFraglist(reads, fraglist)
163 |       txcounts <- sapply(fraglist.temp, function(x) sum(x$count))
164 |       #message("---- ",paste(txcounts, collapse=" "))
165 |       if (all(txcounts == 0)) outputZero <- TRUE
166 |     }
167 | 
168 |     # report 0 output for all models if all txs have 0 count
169 |     model.names <- names(models)
170 |     names(model.names) <- model.names
171 |     nms.tx <- names(transcripts)
172 |     if (outputZero) {
173 |       #message("all transcripts have 0 counts")
174 |       theta <- numeric(length(nms.tx))
175 |       lambda <- rep(NA,length(nms.tx)) # don't bother calculating lambda
176 |       names(lambda) <- names(theta) <- nms.tx
177 |       # for all models:
178 |       res.sub <- lapply(model.names, function(x) {
179 |                    list(theta=theta, lambda=lambda)
180 |                  })
181 |       return(c(res.sub,count=0)) # return results for this sample
182 |     }
183 |     
184 |     if (subset) {
185 |       st <- system.time({
186 |         fragtypes <- subsetAndWeightFraglist(fraglist.temp)
187 |       })
188 |      #message("subset and weight fragment types: ", round(unname(st[3]),1), " seconds")
189 |     } else {
190 |         fragtypes <- do.call(rbind, fraglist.temp)
191 |         # this is done in subsetAndWeightFraglist()
192 |         fragtypes$genomic.id <- paste0(fragtypes$gstart,"-",fragtypes$gread1end,"-",
193 |                                        fragtypes$gread2start,"-",fragtypes$gend)
194 |     }
195 | 
196 |     # message("fragment bias")
197 |     ## -- fragment bias --
198 |     fraglen.density <- fitpar[[bamname]][["fraglen.density"]]
199 |     fragtypes$logdfraglen <- log(matchToDensity(fragtypes$fraglen, fraglen.density))
200 | 
201 |     if (any.vlmm) {
202 |       stopifnot( "vlmm.fivep" %in% names(fitpar[[bamname]]) )
203 |       # message("priming bias")
204 |       ## -- random hexamer priming bias with VLMM --
205 |       vlmm.fivep <- fitpar[[bamname]][["vlmm.fivep"]]
206 |       vlmm.threep <- fitpar[[bamname]][["vlmm.threep"]]
207 |       fragtypes <- addVLMMBias(fragtypes, vlmm.fivep, vlmm.threep)
208 |     }
209 |       
210 |     # specific code for one isoform
211 |     if (singleiso) {
212 |       n.obs <- fragtypes$count
213 |       # this gives list output for one BAM file
214 |       res.sub <- lapply(model.names, function(modeltype) {
215 |         log.lambda <- getLogLambda(fragtypes, models, modeltype, fitpar, bamname)
216 |         log.lambda <- as.numeric(log.lambda)
217 |         N <- if (is.null(lib.sizes)) {
218 |           mean(n.obs)
219 |         } else {
220 |             # TODO: here fix like below
221 |             lib.sizes[bamname] / (1e9 * (maxsize - minsize))
222 |           }
223 |         A <- N * exp(log.lambda)
224 |         wts <- if (subset) { fragtypes$wts } else { 1 } 
225 |         theta <- sum(n.obs  * wts)/sum(A * wts)
226 |         lambda <- sum(wts * exp(log.lambda)) / sum(wts)
227 |         names(lambda) <- names(theta) <- names(transcripts)
228 |         list(theta=theta, lambda=lambda)
229 |       })
230 |       return(c(res.sub,count=numCompatible)) # return results for this sample
231 |     }
232 | 
233 |     # make incidence matrix
234 |     # duplicate genomic ID across tx will be a single column 
235 |     mat <- incidenceMat(fragtypes$tx, fragtypes$genomic.id)
236 |     # make sure the rows are in correct order
237 |     stopifnot(all(rownames(mat) == names(transcripts)))
238 | 
239 |     # NOTE: duplicated weights and bias are not the same for each tx.
240 |     # The bias will often be identical for read start bias,
241 |     # and very close for fragment length and fragment GC content given long reads.
242 |     # It will not be so similar for relative position bias.
243 |     # Zhonghui Xu points out: why not do the extra bookkeeping and
244 |     # have the proper lambda-hat_ij fill out the A matrix.
245 |     fragtypes.sub <- fragtypes[!duplicated(fragtypes$genomic.id),,drop=FALSE]
246 |     stopifnot(all(fragtypes.sub$genomic.id == colnames(mat)))
247 | 
248 |     #message("run EM for models: ",paste(names(models), collapse=", "))
249 |     n.obs <- fragtypes.sub$count
250 |     
251 |     # run EM for different models
252 |     # this gives list output for one BAM file
253 |     res.sub <- lapply(model.names, function(modeltype) {
254 |       log.lambda <- getLogLambda(fragtypes, models, modeltype, fitpar, bamname)
255 |       ## pred0 <- as.numeric(exp(log.lambda))
256 |       ## pred <- pred0/mean(pred0)*mean(fragtypes.sub$count)
257 |       ## boxplot(pred ~ factor(cut(fragtypes.sub$count,c(-1:10 + .5,20,Inf))), main=modeltype, range=0)
258 |       if (is.null(lib.sizes)) {
259 |         N <- mean(n.obs)
260 |       } else {
261 |         # TODO: in addition to the interval of considered lengths L
262 |         # account for the triangle of fragments not in the count matrix
263 |         N <- lib.sizes[bamname] / (1e9 * (maxsize - minsize))
264 |       }
265 |       
266 |       # transcript-specific bias
267 |       lambda.mat <- mat
268 |       for (tx in names(transcripts)) {
269 |         tx.id <- fragtypes$genomic.id[fragtypes$tx == tx]
270 |         tx.idx <- match(tx.id, colnames(mat))
271 |         lambda.mat[tx, tx.idx] <- exp(log.lambda[fragtypes$tx == tx])
272 |       }
273 | 
274 |       wts <- if (subset) { fragtypes.sub$wts } else { 1 }
275 | 
276 |       # A also includes the library size
277 |       A <- N * lambda.mat 
278 |       theta <- runEM(n.obs, A, wts, niter, optim)
279 | 
280 |       # the average lambda for each transcript is stored in results
281 |       lambda <- if (subset) {
282 |         lambda.mat %*% wts / mat %*% wts
283 |       } else {
284 |         rowSums(lambda.mat) / rowSums(mat)
285 |       }
286 | 
287 |       lambda <- as.numeric(lambda)
288 |       names(lambda) <- names(transcripts)
289 |       list(theta=theta, lambda=lambda)
290 |     })
291 |     return(c(res.sub, count=numCompatible))
292 |   })
293 |   names(res) <- names(bam.files)
294 |   res
295 | }
296 | 
297 | ######### unexported EM functions #########
298 | 
299 | incidenceMat <- function(x, y, numeric=TRUE) {
300 |   # borrowed from Wolfgang Huber
301 |   ux = unique(x)
302 |   uy = unique(y)
303 |   im = matrix(FALSE, nrow=length(ux), ncol=length(uy), dimnames=list(ux, uy))
304 |   im[ cbind(x, y) ] = TRUE
305 |   if (numeric) {
306 |     mode(im) <- "numeric"
307 |   }
308 |   return(im)
309 | }
310 | runEM <- function(n.obs, A, wts=1, niter=20, optim=FALSE) {
311 |   J <- ncol(A)
312 |   ntx <- nrow(A)
313 |   log.like <- function(theta.hat) {
314 |     sum(wts * dpois(n.obs, colSums(A * theta.hat), log=TRUE))
315 |   }
316 |   theta.hat <- rep(1, ntx)
317 |   theta.0 <- rep(1, ntx)
318 |   n.obs.sub <- n.obs[n.obs > 0]
319 |   A.sub <- A[,n.obs > 0,drop=FALSE]
320 |   rowSumsA <- rowSums(t(t(A) * wts))
321 |   if (!optim) {
322 |     for (tt in 1:niter) {
323 |       n.hat <- t(t(theta.hat * A.sub) * n.obs.sub / colSums(theta.hat * A.sub))
324 |       theta.hat <- rowSums(n.hat) / rowSumsA
325 |     }
326 |   } else {
327 |     theta.hat <- optim(theta.hat, log.like,
328 |                        lower=rep(1e-6,ntx), upper=rep(1e6,ntx),
329 |                        control=list(fnscale=-1), method="L-BFGS-B")$par
330 |   }
331 |   theta.hat
332 | }
333 | 


--------------------------------------------------------------------------------
/R/fit_bias.R:
--------------------------------------------------------------------------------
  1 | #' Fit bias models over single-isoform genes
  2 | #'
  3 | #' This function estimates parameters for one or more bias models
  4 | #' for a single sample over a set of single-isoform
  5 | #' genes. ~100 medium to highly expressed genes should be sufficient to
  6 | #' estimate the parameters robustly.
  7 | #' 
  8 | #' @param genes a GRangesList with the exons of different
  9 | #' single-isoform genes
 10 | #' @param bam.file a character string pointing to an indexed BAM file
 11 | #' @param fragtypes the output of \link{buildFragtypes}. must contain
 12 | #' the potential fragment types for the genes named in \code{genes}
 13 | #' @param genome a BSgenome object
 14 | #' @param models a list of lists: the outer list describes multiple models.
 15 | #' each element of the inner list has two elements: \code{formula} and \code{offset}.
 16 | #' \code{formula} should be a character strings of an R formula
 17 | #' describing the bias models, e.g. \code{"count ~ ns(gc) + gene"}.
 18 | #' The end of the string is required to be \code{"+ gene"}.
 19 | #' \code{offset} should be a character vector
 20 | #' listing possible bias offsets to be used (\code{"fraglen"} or \code{"vlmm"}).
 21 | #' Either \code{offset} or \code{formula} can be NULL for a model.
 22 | #' See vignette for recommendations and details.
 23 | #' @param readlength the read length
 24 | #' @param minsize the minimum fragment length to model
 25 | #' @param maxsize the maximum fragment length to model
 26 | #' @param speedglm logical, whether to use speedglm to estimate the coefficients.
 27 | #' Default is TRUE.
 28 | #' @param gc.knots knots for the GC splines
 29 | #' @param gc.bk boundary knots for the GC splines
 30 | #' @param relpos.knots knots for the relative position splines
 31 | #' @param relpos.bk boundary knots for the relative position splines
 32 | #'
 33 | #' @return a list with elements: coefs, summary, models, model.params,
 34 | #' and optional offets: fraglen.density, vlmm.fivep,
 35 | #' and vlmm.threep.
 36 | #' \itemize{
 37 | #' \item \strong{coefs} gives the estimated coefficients
 38 | #' for the different models that specified formula.
 39 | #' \item \strong{summary} gives the tables with coefficients, standard
 40 | #' errors and p-values,
 41 | #' \item \strong{models} stores the incoming \code{models} list,
 42 | #' \item \strong{model.params} stores parameters for the
 43 | #' models, such as knot locations
 44 | #' \item \strong{fraglen.density} is a
 45 | #' estimated density object for the fragment length distribution,
 46 | #' \item \strong{vlmm.fivep} and \strong{vlmm.threep}
 47 | #' store the observed and expected tabulations for the different
 48 | #' orders of the VLMM for read start bias.
 49 | #' }
 50 | #'
 51 | #' @references
 52 | #'
 53 | #' The complete bias model including fragment sequence bias
 54 | #' is described in detail in the Supplemental Note of the
 55 | #' following publication:
 56 | #' 
 57 | #' Love, M.I., Hogenesch, J.B., and Irizarry, R.A.,
 58 | #' Modeling of RNA-seq fragment sequence bias reduces
 59 | #' systematic errors in transcript abundance estimation.
 60 | #' Nature Biotechnologyh (2016) doi: 10.1038/nbt.3682
 61 | #' 
 62 | #' The read start variable length Markov model (VLMM) for
 63 | #' addressing bias introduced by random hexamer priming
 64 | #' was introduced in the following publication (the sequence
 65 | #' bias model used in Cufflinks):
 66 | #'
 67 | #' Roberts, A., Trapnell, C., Donaghey, J., Rinn, J.L., and Pachter, L.,
 68 | #' Improving RNA-Seq expression estimates by correcting for fragment bias.
 69 | #' Genome Biology (2011) doi: 10.1186/gb-2011-12-3-r22
 70 | #' 
 71 | #' @examples
 72 | #'
 73 | #' # see vignette for a more realistic example
 74 | #'
 75 | #' # these next lines just write out a BAM file from R
 76 | #' # typically you would already have a BAM file
 77 | #' library(alpineData)
 78 | #' library(GenomicAlignments)
 79 | #' library(rtracklayer)
 80 | #' gap <- ERR188088()
 81 | #' dir <- system.file(package="alpineData", "extdata")
 82 | #' bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 83 | #' export(gap, con=bam.file)
 84 | #' 
 85 | #' library(GenomicRanges)
 86 | #' library(BSgenome.Hsapiens.NCBI.GRCh38)
 87 | #' data(preprocessedData)
 88 | #'
 89 | #' readlength <- 75
 90 | #' minsize <- 125 # see vignette how to choose
 91 | #' maxsize <- 175 # see vignette how to choose
 92 | #' 
 93 | #' # here a very small subset, should be ~100 genes
 94 | #' gene.names <- names(ebt.fit)[6:8] 
 95 | #' names(gene.names) <- gene.names
 96 | #' fragtypes <- lapply(gene.names, function(gene.name) {
 97 | #'                       buildFragtypes(ebt.fit[[gene.name]],
 98 | #'                                      Hsapiens, readlength,
 99 | #'                                      minsize, maxsize)
100 | #' })
101 | #' models <- list(
102 | #'   "GC" = list(formula = "count ~ ns(gc,knots=gc.knots, Boundary.knots=gc.bk) + gene",
103 | #'               offset=c("fraglen","vlmm"))
104 | #' )
105 | #' 
106 | #' fitpar <- fitBiasModels(genes=ebt.fit[gene.names],
107 | #'                         bam.file=bam.file,
108 | #'                         fragtypes=fragtypes,
109 | #'                         genome=Hsapiens,
110 | #'                         models=models,
111 | #'                         readlength=readlength,
112 | #'                         minsize=minsize,
113 | #'                         maxsize=maxsize)
114 | #'
115 | #' @export
116 | fitBiasModels <- function(genes, bam.file, fragtypes, genome,
117 |                           models, readlength, minsize, maxsize,
118 |                           speedglm=TRUE,
119 |                           gc.knots=seq(from=.4, to=.6, length=3),
120 |                           gc.bk=c(0,1),
121 |                           relpos.knots=seq(from=.25, to=.75, length=3),
122 |                           relpos.bk=c(0,1)) {
123 |   stopifnot(file.exists(bam.file))
124 |   stopifnot(file.exists(paste0(as.character(bam.file),".bai")))
125 |   stopifnot(is(genes, "GRangesList"))
126 |   stopifnot(all(!is.na(sapply(models, function(x) x$formula))))
127 |   stopifnot(is.numeric(readlength) & length(readlength) == 1)
128 |   stopifnot(all(names(genes) %in% names(fragtypes)))
129 |   if (any(sapply(models, function(m) "vlmm" %in% m$offset))) {
130 |     stopifnot("fivep" %in% colnames(fragtypes[[1]]))
131 |   }
132 |   for (m in models) {
133 |     if (!is.null(m$formula)) {
134 |       stopifnot(is.character(m$formula))
135 |       if (!grepl("+ gene$",m$formula)) {
136 |         stop("'+ gene' needs to be at the end of the formula string")
137 |       }
138 |     }
139 |   }
140 |   exon.dna <- getSeq(genome, genes)
141 |   gene.seqs <- as(lapply(exon.dna, unlist), "DNAStringSet")
142 |   # FPBP needed to downsample to a target fragment per kilobase
143 |   fpbp <- getFPBP(genes, bam.file)
144 | 
145 |   # TODO check these downsampling parameters now that subset 
146 |   # routine is not related to number of positive counts
147 |   
148 |   # want ~1000 rows per gene, so ~300 reads per gene
149 |   # so ~300/1500 = 0.2 fragments per basepair 
150 |   target.fpbp <- 0.4
151 |   fitpar.sub <- list()
152 |   fitpar.sub[["coefs"]] <- list()
153 |   fitpar.sub[["summary"]] <- list()
154 |   # create a list over genes, populated with read info from this 'bam.file'
155 |   # so we create a new object, and preserve the original 'fragtypes' object
156 |   fragtypes.sub.list <- list()
157 |   for (i in seq_along(genes)) {
158 |     gene.name <- names(genes)[i]
159 |     gene <- genes[[gene.name]]
160 |     l <- sum(width(gene))
161 |     # add counts per sample and subset
162 |     generange <- range(gene)
163 |     strand(generange) <- "*" # not necessary
164 |     if (!as.character(seqnames(generange)) %in% seqlevels(BamFile(bam.file))) next
165 |     # this necessary to avoid hanging on highly duplicated regions
166 |     ## roughNumFrags <- countBam(bam.file, param=ScanBamParam(which=generange))$records/2
167 |     ## if (roughNumFrags > 10000) next
168 |     suppressWarnings({
169 |                        ga <- readGAlignAlpine(bam.file, generange)
170 |                      })
171 |     if (length(ga) < 20) next
172 |     ga <- keepSeqlevels(ga, as.character(seqnames(gene)[1]))
173 |     # downsample to a target FPBP
174 |     nfrags <- length(ga)
175 |     this.fpbp <- nfrags / l
176 |     if (this.fpbp > target.fpbp) {
177 |       ga <- ga[sample(nfrags, round(nfrags * target.fpbp / this.fpbp), FALSE)]
178 |     } 
179 |     fco <- findCompatibleOverlaps(ga, GRangesList(gene))
180 |     # message("-- ",round(length(fco)/length(ga),2)," compatible overlaps")
181 |     # as.numeric(table(as.character(strand(ga))[queryHits(fco)])) # strand balance
182 |     reads <- gaToReadsOnTx(ga, GRangesList(gene), fco)
183 |     # fraglist.temp is a list of length 1
184 |     # ...(matchReadsToFraglist also works for multiple transcripts)
185 |     # it will only last for a few lines...
186 |     fraglist.temp <- matchReadsToFraglist(reads, fragtypes[gene.name])
187 |     # remove first and last bp for fitting the bias terms
188 |     not.first.or.last.bp <- !(fraglist.temp[[1]]$start == 1 | fraglist.temp[[1]]$end == l)
189 |     fraglist.temp[[1]] <- fraglist.temp[[1]][not.first.or.last.bp,]
190 |     if (sum(fraglist.temp[[1]]$count) < 20) next
191 |     # randomly downsample and up-weight
192 |     fragtypes.sub.list[[gene.name]] <- subsetAndWeightFraglist(fraglist.temp,
193 |                                                                downsample=200,
194 |                                                                minzero=700)
195 |   }
196 |   if (length(fragtypes.sub.list) == 0) stop("not enough reads to model: ",bam.file)
197 |   # collapse the list over genes into a
198 |   # single DataFrame with the subsetted and weighted
199 |   # potential fragment types from all genes
200 |   # message("num genes w/ suf. reads: ",length(fragtypes.sub.list))
201 |   if (length(fragtypes.sub.list) < 2) stop("requires at least two genes to fit model")
202 |   gene.nrows <- sapply(fragtypes.sub.list, nrow)
203 |   # message("mean rows per gene: ", round(mean(gene.nrows)))
204 |   # a DataFrame of the subsetted fragtypes
205 |   fragtypes.sub <- do.call(rbind, fragtypes.sub.list)
206 | 
207 |   # check the FPBP after downsampling:
208 |   ## gene.counts <- sapply(fragtypes.sub.list, function(x) sum(x$count))
209 |   ## gene.lengths <- sum(width(genes))
210 |   ## round(unname(gene.counts / gene.lengths[names(gene.counts)]), 2)
211 | 
212 |   # save the models and parameters
213 |   fitpar.sub[["models"]] <- models
214 |   fitpar.sub[["model.params"]] <- list(
215 |     readlength=readlength,
216 |     minsize=minsize,
217 |     maxsize=maxsize,
218 |     gc.knots=gc.knots,
219 |     gc.bk=gc.bk,
220 |     relpos.knots=relpos.knots,
221 |     relpos.bk=relpos.bk
222 |   )
223 |   
224 |   if (any(sapply(models, function(m) "fraglen" %in% m$offset))) {
225 |     ## -- fragment bias --
226 |     pos.count <- fragtypes.sub$count > 0
227 |     fraglens <- rep(fragtypes.sub$fraglen[pos.count], fragtypes.sub$count[pos.count])
228 |     fraglen.density <- density(fraglens)
229 |     fragtypes.sub$logdfraglen <- log(matchToDensity(fragtypes.sub$fraglen, fraglen.density))
230 |     # with(fragtypes.sub, plot(fraglen, exp(logdfraglen), cex=.1))
231 |     fitpar.sub[["fraglen.density"]] <- fraglen.density
232 |   }
233 | 
234 |   if (any(sapply(models, function(m) "vlmm" %in% m$offset))) {
235 |     ## -- random hexamer priming bias with VLMM --
236 |     pos.count <- fragtypes.sub$count > 0
237 |     fivep <- fragtypes.sub$fivep[fragtypes.sub$fivep.test & pos.count]
238 |     threep <- fragtypes.sub$threep[fragtypes.sub$threep.test & pos.count]
239 |     vlmm.fivep <- fitVLMM(fivep, gene.seqs)
240 |     vlmm.threep <- fitVLMM(threep, gene.seqs)
241 |     ## par(mfrow=c(2,1))
242 |     ## plotOrder0(vlmm.fivep$order0)
243 |     ## plotOrder0(vlmm.threep$order0)
244 |     
245 |     # now calculate log(bias) for each fragment based on the VLMM
246 |     fragtypes.sub <- addVLMMBias(fragtypes.sub, vlmm.fivep, vlmm.threep)
247 |     fitpar.sub[["vlmm.fivep"]] <- vlmm.fivep
248 |     fitpar.sub[["vlmm.threep"]] <- vlmm.threep
249 |   }
250 | 
251 |   # allow a gene-specific intercept (although mostly handled already with downsampling)
252 |   fragtypes.sub$gene <- factor(rep(seq_along(gene.nrows), gene.nrows))
253 |   for (modeltype in names(models)) {
254 |     if (is.null(models[[modeltype]]$formula)) {
255 |       next
256 |     }
257 |     # message("fitting model type: ",modeltype)
258 |     f <- models[[modeltype]]$formula
259 |     offset <- numeric(nrow(fragtypes.sub))
260 |     if ("fraglen" %in% models[[modeltype]]$offset) {
261 |       # message("-- fragment length correction")
262 |       offset <- offset + fragtypes.sub$logdfraglen
263 |     }
264 |     if ("vlmm" %in% models[[modeltype]]$offset) {
265 |       # message("-- VLMM fragment start/end correction")
266 |       offset <- offset + fragtypes.sub$fivep.bias + fragtypes.sub$threep.bias
267 |     }
268 |     if (!all(is.finite(offset))) stop("offset needs to be finite")
269 |     fragtypes.sub$offset <-  offset
270 |     if ( speedglm ) {
271 |       # mm.small <- sparse.model.matrix(f, data=fragtypes.sub)
272 |       mm.small <- model.matrix(formula(f), data=fragtypes.sub)
273 |       stopifnot(all(colSums(abs(mm.small)) > 0))
274 |       fit <- speedglm.wfit(fragtypes.sub$count, mm.small,
275 |                            family=poisson(), 
276 |                            weights=fragtypes.sub$wts,
277 |                            offset=fragtypes.sub$offset)
278 |     } else {
279 |       fit <- glm(formula(f),
280 |                  family=poisson,
281 |                  data=fragtypes.sub,
282 |                  weights=fragtypes.sub$wts,
283 |                  offset=fragtypes.sub$offset)
284 |     }
285 |     fitpar.sub[["coefs"]][[modeltype]] <- fit$coefficients
286 |     fitpar.sub[["summary"]][[modeltype]] <- summary(fit)$coefficients
287 |   }
288 |   fitpar.sub
289 | }
290 | 


--------------------------------------------------------------------------------
/R/helper.R:
--------------------------------------------------------------------------------
  1 | #' Extract results from estimateAbundance run across genes
  2 | #'
  3 | #' This function extracts estimates for a given model from a list
  4 | #' over many genes, returning a matrix with dimensions:
  5 | #' number of transcript x number of samples.
  6 | #' Here, the count of compatible fragments aligning to the
  7 | #' genes is used to estimate the FPKM, dividing out the previously
  8 | #' used estimate \code{lib.sizes}.
  9 | #' 
 10 | #' @param res a list where each element is the output of \link{estimateAbundance}
 11 | #' @param model the name of a model, corresponds to names of \code{models}
 12 | #' used in \link{fitBiasModels}
 13 | #' @param lib.sizes the vector of library sizes passed to \link{estimateAbundance}.
 14 | #' not needed if \code{divide.out=FALSE}
 15 | #' @param divide.out logical, whether to divide out the initial estimate of
 16 | #' library size and to instead use the count of compatible fragments for
 17 | #' genes calculated by \link{estimateAbundance}. Default is TRUE
 18 | #' @param transcripts an optional GRangesList of the exons for each
 19 | #' transcript. If this is provided, the output will be a
 20 | #' SummarizedExperiment. The transcripts do not need
 21 | #' to be provided in the correct order, extractAlpine will
 22 | #' find the correct transcript by the names in \code{res} and
 23 | #' put them in the correct order.
 24 | #' 
 25 | #' @return a matrix of FPKM values across transcripts and samples,
 26 | #' or a SummarizedExperiment if \code{transcripts} is provided
 27 | #'
 28 | #' @examples
 29 | #' 
 30 | #' data(preprocessedData)
 31 | #' extractAlpine(res, "GC")
 32 | #' 
 33 | #' @export
 34 | extractAlpine <- function(res, model, lib.sizes=1e6,
 35 |                           divide.out=TRUE, transcripts=NULL) {
 36 |   # some rough code to figure out how many samples:
 37 |   # look at the first 10 (or fewer) elements of res and
 38 |   # calculate the length. why? the result for a given gene
 39 |   # could be NULL if it didn't pass some tests in estimateAbundance
 40 |   nsamp <- max(sapply(res[seq_len(min(10,length(res)))], length))
 41 |   fpkm <- extractRes(res, model, "theta", nsamp)
 42 |   lambda <- extractRes(res, model, "lambda", nsamp)
 43 |   count <- extractRes(res, model, "count", nsamp)
 44 |   lambdaBar <- colMeans(lambda, na.rm=TRUE)
 45 |   colSumsCount <- colSums(count)
 46 |   multFactor <- if (divide.out) {
 47 |     lambdaBar * lib.sizes / colSumsCount
 48 |   } else {
 49 |     lambdaBar
 50 |   }
 51 |   mat <- sweep(fpkm, 2, multFactor, `*`)
 52 |   if (is.null(transcripts)) {
 53 |     return(mat)
 54 |   } else {
 55 |     row.ranges <- transcripts[rownames(mat)]
 56 |     se <- SummarizedExperiment(assays=list(FPKM=mat),
 57 |                                rowRanges=row.ranges)
 58 |     return(se)
 59 |   }
 60 | }
 61 | 
 62 | #' Split genes that have isoforms across chromosomes
 63 | #'
 64 | #' This function simply splits apart genes which have isoforms across multiple
 65 | #' chromosomes. New "genes" are created with the suffix "_cs" and a number.
 66 | #' 
 67 | #' @param ebg an exons-by-genes GRangesList, created with \code{exonsBy}
 68 | #' @param txdf a data.frame created by running \code{select} on a TxDb object.
 69 | #' Must have columns TXCHROM and GENEID
 70 | #'
 71 | #' @return a list of manipulated \code{ebg} and \code{txdf}
 72 | #'
 73 | #' @examples
 74 | #'
 75 | #' library(GenomicRanges)
 76 | #' txdf <- data.frame(TXCHROM=c("1","1","2"),
 77 | #'                    GENEID=c("101","102","102")) 
 78 | #' ebg <- GRangesList(GRanges("1",IRanges(c(100,200),width=50)),
 79 | #'                    GRanges(c("1","2"),IRanges(c(400,100),width=50)))
 80 | #' names(ebg) <- c("101","102")
 81 | #' splitGenesAcrossChroms(ebg, txdf)
 82 | #' 
 83 | #' @export
 84 | splitGenesAcrossChroms <- function(ebg, txdf) {
 85 |   txdf$GENEID <- as.character(txdf$GENEID)
 86 |   split.chroms <- sapply(split(txdf$TXCHROM, txdf$GENEID), function(x) !all(x == x[1]))
 87 |   message("found ",sum(split.chroms),
 88 |           " genes split over chroms out of ",length(split.chroms))
 89 |   split.chroms <- names(split.chroms)[split.chroms]
 90 |   new.genes <- GRangesList()
 91 |   for (gid in split.chroms) {
 92 |     chroms <- unique(txdf$TXCHROM[txdf$GENEID == gid])
 93 |     exs <- ebg[[gid]]
 94 |     for (i in seq_along(chroms)) {
 95 |       # cs = chromosome split
 96 |       new.name <- paste0(gid,"_cs",i)
 97 |       txdf$GENEID[txdf$GENEID == gid & txdf$TXCHROM == chroms[i]] <- new.name
 98 |       new.genes[[new.name]] <- exs[seqnames(exs) == chroms[i]]
 99 |     }
100 |     ebg[[gid]] <- NULL
101 |   }
102 |   ebg <- c(ebg, new.genes)
103 |   list(ebg=ebg, txdf=txdf)
104 | }
105 | 
106 | #' Split very long genes
107 | #'
108 | #' This function splits genes which have a very long range (e.g. 1 Mb),
109 | #' and new "genes" are formed where each isoform is its own "gene",
110 | #' with the suffix "_ls" and a number.
111 | #' It makes sense to turn each isoform into its own gene only if this
112 | #' function is followed by \link{mergeGenes}.
113 | #' 
114 | #' @param ebg an exons-by-genes GRangesList, created with \code{exonsBy}
115 | #' @param ebt an exons-by-tx GRangesList, created with \code{exonsBy}
116 | #' @param txdf a data.frame created by running \code{select} on a TxDb object.
117 | #' Must have columns GENEID and TXID, where TXID corresponds to the
118 | #' names of \code{ebt}. 
119 | #' @param long a numeric value such that ranges longer than this are "long"
120 | #'
121 | #' @return a list of manipulated \code{ebg} and \code{txdf}
122 | #'
123 | #' @examples
124 | #'
125 | #' library(GenomicRanges)
126 | #' txdf <- data.frame(GENEID=c("101","101","102"),
127 | #'                    TXID=c("201","202","203"))
128 | #' ebt <- GRangesList(GRanges("1",IRanges(c(100,200),width=50)),
129 | #'                    GRanges("1",IRanges(2e6 + c(100,200),width=50)),
130 | #'                    GRanges("1",IRanges(3e6 + c(100,200),width=50)))
131 | #' names(ebt) <- c("201","202","203")
132 | #' ebg <- GRangesList(reduce(unlist(ebt[1:2])),ebt[[3]])
133 | #' names(ebg) <- c("101","102")
134 | #' splitLongGenes(ebg, ebt, txdf)
135 | #' 
136 | #' @export
137 | splitLongGenes <- function(ebg, ebt, txdf, long=1e6) {
138 |   txdf$GENEID <- as.character(txdf$GENEID)
139 |   txdf$TXID <- as.character(txdf$TXID)
140 |   strand(ebg) <- "*"
141 |   r <- unlist(range(ebg))
142 |   w <- width(r)
143 |   stopifnot(length(w) == length(ebg))
144 |   long.genes <- names(ebg)[w > long]
145 |   message("found ",length(long.genes),
146 |           " long genes (1e",log10(long)," bp) out of ",length(ebg))
147 |   new.genes <- GRangesList()
148 |   for (gid in long.genes) {
149 |     # ls = long split
150 |     new.names <- paste0(gid,"_ls",seq_len(sum(txdf$GENEID == gid)))
151 |     ebg[[gid]] <- NULL
152 |     txdf$GENEID[txdf$GENEID == gid] <- new.names
153 |     for (new.gene in new.names) {
154 |       gr <- ebt[[txdf$TXID[txdf$GENEID == new.gene]]]
155 |       mcols(gr)$exon_rank <- NULL
156 |       new.genes[[new.gene]] <- gr
157 |     }
158 |   }
159 |   ebg <- c(ebg, new.genes)
160 |   list(ebg=ebg, txdf=txdf)
161 | }
162 | 
163 | #' Merge overlapping "genes" into gene clusters
164 | #'
165 | #' This function looks for overlapping exons in \code{ebg}.
166 | #' The overlapping "genes" are used to form a graph.
167 | #' Any connected components in the graph (sets of "genes"
168 | #' which can be reached from each other through overlap relations)
169 | #' are connected into a new gene cluster, which is given the
170 | #' suffix "_mrg" and using one of the original gene names.
171 | #' 
172 | #' @param ebg an exons-by-genes GRangesList, created with \code{exonsBy}
173 | #' @param txdf a data.frame created by running \code{select} on a TxDb object.
174 | #' Must have a column GENEID.
175 | #' @param ignore.strand Default is TRUE.
176 | #'
177 | #' @return a manipulated \code{txdf}.
178 | #'
179 | #' @examples
180 | #' 
181 | #' library(GenomicRanges)
182 | #' txdf <- data.frame(GENEID=c("101","102","103","104"))
183 | #' ebg <- GRangesList(GRanges("1",IRanges(c(100,200),width=50)),
184 | #'                    GRanges("1",IRanges(c(200,300),width=50)),
185 | #'                    GRanges("1",IRanges(c(300,400),width=50)),
186 | #'                    GRanges("1",IRanges(c(500,600),width=50)))
187 | #' names(ebg) <- c("101","102","103","104")
188 | #' mergeGenes(ebg, txdf)
189 | #' 
190 | #' @export
191 | mergeGenes <- function(ebg, txdf, ignore.strand=TRUE) {
192 |   txdf$GENEID <- as.character(txdf$GENEID)
193 |   fo <- findOverlaps(ebg, ignore.strand=ignore.strand)
194 |   fo <- fo[queryHits(fo) < subjectHits(fo)]
195 |   mat <- as.matrix(fo)
196 |   graph <- ftM2graphNEL(mat, edgemode="undirected")
197 |   components <- connectedComp(graph)
198 |   message("found ",length(components), " clusters from ",length(ebg)," genes")
199 |   components <- lapply(components, function(x) names(ebg)[as.numeric(x)])
200 |   for (cluster in components) {
201 |     txdf$GENEID[txdf$GENEID %in% cluster] <- paste0(cluster[1],"_mrg")
202 |   }
203 |   txdf
204 | }
205 | 
206 | #' DESeq median ratio normalization for matrix
207 | #'
208 | #' Simple implementation of DESeq median ratio normalization
209 | #'
210 | #' @param mat a matrix of numeric values
211 | #' @param cutoff a numeric value to be used as the cutoff
212 | #' for the row means of \code{mat}. Only rows with row mean
213 | #' larger than \code{cutoff} are used for calculating
214 | #' the size factors
215 | #'
216 | #' @return a matrix with the median ratio size factors
217 | #' divided out
218 | #'
219 | #' @references Anders, S. and Huber, W.,
220 | #' Differential expression analysis for sequence count data.
221 | #' Genome Biology (2010) doi: 10.1186/gb-2010-11-10-r106
222 | #'
223 | #' @examples
224 | #'
225 | #' x <- runif(50,1,100)
226 | #' mat <- cbind(x, 2*x, 3*x)
227 | #' norm.mat <- normalizeDESeq(mat, 5)
228 | #' 
229 | #' @export
230 | normalizeDESeq <- function(mat, cutoff) {
231 |   mat2 <- mat[rowMeans(mat) > cutoff,,drop=FALSE]
232 |   loggeomeans <- rowMeans(log(mat2))
233 |   logratio <- (log(mat2) - loggeomeans)[is.finite(loggeomeans),,drop=FALSE]
234 |   sf <- exp(apply(logratio, 2, median, na.rm=TRUE))
235 |   sweep(mat, 2, sf, "/")
236 | }
237 | 
238 | #' Get fragment widths
239 | #'
240 | #' From a BAM file and a particular transcript (recommened
241 | #' to be the single isoform of a gene), this function
242 | #' returns estimates of the fragment widths, by mapping the
243 | #' fragment alignments to the transcript coordinates.
244 | #'
245 | #' @param bam.file a character string pointing to a BAM file
246 | #' @param tx a GRanges object of the exons of a single isoform gene
247 | #'
248 | #' @return a numeric vector of estimated fragment widths
249 | #'
250 | #' @examples
251 | #'
252 | #' # these next lines just write out a BAM file from R
253 | #' # typically you would already have a BAM file
254 | #' library(alpineData)
255 | #' library(GenomicAlignments)
256 | #' library(rtracklayer)
257 | #' gap <- ERR188088()
258 | #' dir <- system.file(package="alpineData", "extdata")
259 | #' bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
260 | #' export(gap, con=bam.file)
261 | #' 
262 | #' data(preprocessedData)
263 | #'
264 | #' w <- getFragmentWidths(bam.file, ebt.fit[[2]])
265 | #' quantile(w, c(.025, .975))
266 | #' 
267 | #' @export
268 | getFragmentWidths <- function(bam.file, tx) {
269 |   gap <- readGAlignmentPairs(bam.file, param=ScanBamParam(which=range(tx)))
270 |   stopifnot(length(gap) > 0)
271 |   fo <- findCompatibleOverlaps(gap, GRangesList(tx=tx))
272 |   stopifnot(length(fo) > 0)
273 |   gap <- gap[queryHits(fo)]
274 |   left <- first(gap)
275 |   right <- last(gap)
276 |   first.minus <- as.vector(strand(first(gap)) == "-")
277 |   left[first.minus] <- last(gap)[first.minus]
278 |   right[first.minus] <- first(gap)[first.minus]
279 |   left.tx <- start(mapToTranscripts(GRanges(seqnames(gap),
280 |                                             IRanges(start(left),width=1)),
281 |                                     GRangesList(tx=tx)))
282 |   right.tx <- end(mapToTranscripts(GRanges(seqnames(gap),
283 |                                            IRanges(end(right),width=1)),
284 |                                    GRangesList(tx=tx)))
285 |   w <- right.tx - left.tx + 1
286 |   if (as.character(strand(tx)[1]) == "-") {
287 |     w <- w * -1
288 |   }
289 |   return(w)
290 | }
291 | 
292 | #' Get read length
293 | #'
294 | #' Gets the length of the first read in a BAM file
295 | #'
296 | #' @param bam.files a character vector pointing to BAM files
297 | #'
298 | #' @return a numeric vector, one number per BAM file, the
299 | #' length of the first read in the file
300 | #'
301 | #' @examples
302 | #'
303 | #' # these next lines just write out a BAM file from R
304 | #' # typically you would already have a BAM file
305 | #' library(alpineData)
306 | #' library(GenomicAlignments)
307 | #' library(rtracklayer)
308 | #' gap <- ERR188088()
309 | #' dir <- system.file(package="alpineData", "extdata")
310 | #' bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
311 | #' export(gap, con=bam.file)
312 | #'
313 | #' getReadLength(bam.file)
314 | #'
315 | #' @export
316 | getReadLength <- function(bam.files) {
317 |   getRL1 <- function(file) {
318 |     qwidth(readGAlignments(BamFile(file, yieldSize=1)))
319 |   }
320 |   sapply(bam.files, getRL1)
321 | }
322 | 
323 | ######### unexported helper functions #########
324 | 
325 | extractRes <- function(res, model, what, nsamp) {
326 |   do.call(rbind, lapply(res, function(x) {
327 |     if (is.null(x)) {
328 |       if (what == "count") return(rep(0, nsamp))
329 |       return(rep(NA, nsamp)) # the whole gene gets a single row of NA
330 |     }
331 |     if (what == "count") {
332 |       return(sapply(x, `[[`, what))
333 |     } else {
334 |       res.list <- lapply(x, function(y) y[[model]][[what]])
335 |       return(do.call(cbind, res.list))
336 |     }
337 |   }))
338 | }
339 | alpineFlag <- function() scanBamFlag(isSecondaryAlignment=FALSE)
340 | readGAlignAlpine <- function(bam.file, generange, manual=TRUE) {
341 |   if (manual) {
342 |     param <- ScanBamParam(which=generange, what=c("flag","mrnm","mpos"), flag=alpineFlag())
343 |     gal <- readGAlignments(bam.file, use.names=TRUE, param=param)
344 |     makeGAlignmentPairs(gal)
345 |   } else {
346 |     readGAlignmentPairs(bam.file,param=ScanBamParam(which=generange,flag=alpineFlag()))
347 |   }
348 | }
349 | 


--------------------------------------------------------------------------------
/R/plots.R:
--------------------------------------------------------------------------------
  1 | #' Plot the fragment GC bias over samples
  2 | #' 
  3 | #' Plots smooth curves of the log fragment rate over fragment GC content.
  4 | #'
  5 | #' @param fitpar a list of the output of \link{fitBiasModels} over samples
  6 | #' @param model the name of one of the models 
  7 | #' @param col a vector of colors
  8 | #' @param lty a vector of line types
  9 | #' @param ylim the y limits for the plot
 10 | #' @param gc.range a numeric of length two,
 11 | #' the range of the fragment GC content. By default,
 12 | #' [.2,.8] for plotting and [0,1] for returning a matrix
 13 | #' @param return.type a numeric, either
 14 | #' 0: make a plot,
 15 | #' 1: skip the plot and return a matrix of log fragment rate,
 16 | #' 2: skip the plot and return a matrix of probabilities 
 17 | #'
 18 | #' @return Either plot, or if \code{return.type} is 1 or 2, a matrix
 19 | #' 
 20 | #' @examples
 21 | #'
 22 | #' # fitpar was fit using identical code
 23 | #' # as found in the vignette, except with
 24 | #' # 25 genes, and with fragment size in 80-350 bp
 25 | #' data(preprocessedData)
 26 | #' perf <- rep(1:2, each=2)
 27 | #' plotGC(fitpar, "all", col=perf)
 28 | #' 
 29 | #' @export
 30 | plotGC <- function(fitpar, model, col, lty, ylim,
 31 |                    gc.range=NULL, return.type=0) {
 32 | 
 33 |   if (is.null(gc.range)) {
 34 |     gc.range <- if (return.type == 0) {
 35 |       c(.2, .8)
 36 |     } else {
 37 |       c(0,1)
 38 |     }
 39 |   }
 40 |   stopifnot(length(gc.range) == 2)
 41 |   stopifnot(length(return.type) == 1 & return.type %in% 0:2)
 42 | 
 43 |   # just a single sample?
 44 |   if ("models" %in% names(fitpar)) {
 45 |     fitpar <- list(fitpar)
 46 |   }
 47 |   
 48 |   knots <- fitpar[[1]][["model.params"]]$gc.knots
 49 |   bk <- fitpar[[1]][["model.params"]]$gc.bk
 50 |   
 51 |   n <- length(knots)
 52 |   coef.nms <- names(fitpar[[1]][["coefs"]][[model]])
 53 |   coef.idx <- c(grep("\\(Intercept\\)",coef.nms), grep("ns\\(gc", coef.nms))
 54 |   coefmat <- sapply(fitpar, function(elem) elem[["coefs"]][[model]][coef.idx])
 55 |   gene.coefs <- lapply(fitpar, function(elem)
 56 |     elem[["coefs"]][[model]][ grep("gene", names(elem[["coefs"]][[model]])) ])
 57 |   # new intercept: the average of the intercept + gene coefficients
 58 |   coefmat[1,] <- coefmat[1,] + sapply(gene.coefs, mean)
 59 |   z <- seq(from=gc.range[1],to=gc.range[2],length=101)
 60 |   x <- model.matrix(~ ns(z, knots=knots, Boundary.knots=bk))
 61 |   logpred <- x %*% coefmat
 62 |   rownames(logpred) <- as.character(z)
 63 |   if (return.type == 1) {
 64 |     return(logpred)
 65 |   } else if (return.type == 2) {
 66 |     probmat <- exp(logpred)
 67 |     probmat <- sweep(probmat, 2, apply(probmat, 2, max), "/")
 68 |     return(probmat)
 69 |   }
 70 |   if (missing(ylim)) {
 71 |     ylim <- c(min(logpred),max(logpred))
 72 |   }
 73 |   plot(0,0,type="n",xlim=c(gc.range[1],gc.range[2]),ylim=ylim,
 74 |        ylab="log fragment rate", xlab="fragment GC content",
 75 |        main="fragment sequence bias")
 76 |   if (missing(col)) {
 77 |     col <- rep("black", ncol(logpred))
 78 |   }
 79 |   if (missing(lty)) {
 80 |     lty <- rep(1, ncol(logpred))
 81 |   }
 82 |   for (i in 1:ncol(logpred)) {
 83 |     lines(z, logpred[,i], col=col[i], lwd=2, lty=lty[i])
 84 |   }
 85 | }
 86 | 
 87 | #' Plot relative position bias over samples
 88 | #'
 89 | #' Plots the smooth curves of log fragment rate over relative position.
 90 | #' 
 91 | #' @param fitpar a list of the output of \link{fitBiasModels} over samples
 92 | #' @param model the name of one of the models 
 93 | #' @param col a vector of colors
 94 | #' @param lty a vector of line types
 95 | #' @param ylim the y limits for the plot
 96 | #'
 97 | #' @return plot
 98 | #' 
 99 | #' @examples
100 | #'
101 | #' # fitpar was fit using identical code
102 | #' # as found in the vignette, except with
103 | #' # 25 genes, and with fragment size in 80-350 bp
104 | #' data(preprocessedData)
105 | #' perf <- rep(1:2, each=2)
106 | #' plotRelPos(fitpar, "all", col=perf)
107 | #' 
108 | #' @export
109 | plotRelPos <- function(fitpar, model, col, lty, ylim) {
110 | 
111 |   # just a single sample?
112 |   if ("models" %in% names(fitpar)) {
113 |     fitpar <- list(fitpar)
114 |   }
115 | 
116 |   knots <- fitpar[[1]][["model.params"]]$relpos.knots
117 |   bk <- fitpar[[1]][["model.params"]]$relpos.bk
118 | 
119 |   n <- length(knots)
120 |   coef.nms <- names(fitpar[[1]][["coefs"]][[model]])
121 |   coef.idx <- c(grep("\\(Intercept\\)",coef.nms), grep("ns\\(relpos", coef.nms))
122 |   coefmat <- sapply(fitpar, function(elem) elem[["coefs"]][[model]][coef.idx])
123 |   z <- seq(from=0,to=1,length=101)
124 |   x <- model.matrix(~ ns(z, knots=knots, Boundary.knots=bk))
125 |   logpred <- x %*% coefmat
126 |   logpred <- scale(logpred, scale=FALSE)
127 |   if (missing(ylim)) {
128 |     ylim <- c(min(logpred),max(logpred))
129 |   }
130 |   plot(0,0,type="n",xlim=c(0,1),ylim=ylim,
131 |        ylab="log fragment rate", xlab="5' -- position in transcript -- 3'",
132 |        main="relative position bias")
133 |   if (missing(col)) {
134 |     col <- rep("black", ncol(logpred))
135 |   }
136 |   if (missing(lty)) {
137 |     lty <- rep(1, ncol(logpred))
138 |   }
139 |   for (i in 1:ncol(logpred)) {
140 |     lines(z, logpred[,i], col=col[i], lwd=2, lty=lty[i])
141 |   }
142 | }
143 | 
144 | #' Plot fragment length distribution over samples
145 | #'
146 | #' Plots the fragment length distribution.
147 | #' 
148 | #' @param fitpar a list of the output of \link{fitBiasModels} over samples
149 | #' @param col a vector of colors
150 | #' @param lty a vector of line types
151 | #'
152 | #' @return plot
153 | #' 
154 | #' @examples
155 | #' 
156 | #' # fitpar was fit using identical code
157 | #' # as found in the vignette, except with
158 | #' # 25 genes, and with fragment size in 80-350 bp
159 | #' data(preprocessedData)
160 | #' perf <- rep(1:2, each=2)
161 | #' plotFragLen(fitpar, col=perf)
162 | #' 
163 | #' @export
164 | plotFragLen <- function(fitpar, col, lty) {
165 | 
166 |   # just a single sample?
167 |   if ("models" %in% names(fitpar)) {
168 |     fitpar <- list(fitpar)
169 |   }
170 | 
171 |   if (missing(col)) {
172 |     col <- rep("black", length(fitpar))
173 |   }
174 |   if (missing(lty)) {
175 |     lty <- rep(1, length(fitpar))
176 |   }
177 |   ymax <- max(sapply(fitpar, function(x) max(x$fraglen.density$y)))
178 |   plot(fitpar[[1]]$fraglen.density, ylim=c(0,ymax*1.1),
179 |        xlab="fragment length", ylab="density", main="fragment length distribution",
180 |        col=col[1], lty=lty[1], lwd=2)
181 |   for (i in seq_along(fitpar)[-1]) {
182 |     lines(fitpar[[i]]$fraglen.density, col=col[i], lty=lty[i], lwd=2)
183 |   }
184 | }
185 | 
186 | #' Plot parameters of the variable length Markov model (VLMM) for read starts
187 | #'
188 | #' This function plots portions of the Cufflinks VLMM for read start bias.
189 | #' The natural log of observed over expected is shown, such that 0
190 | #' indicates no contribution of a position to the read start bias.
191 | #' As the variable lenght Markov model has different dependencies for different
192 | #' positions (see Roberts et al, 2011), it is difficult
193 | #' to show all the 744 parameters simultaneously. Instead this function
194 | #' offers to show the 0-order terms for all positions, or the 1st and 2nd
195 | #' order terms for selected positions within the read start sequence.
196 | #' For the 1- and 2-order terms, the log bias is shown for each nucleotide
197 | #' (A,C,T,G) given the previous nucleotide (1-order) or di-nucleotide (2-order).
198 | #'
199 | #' @references
200 | #'
201 | #' Roberts et al, "Improving RNA-Seq expression estimates by correcting for fragment bias"
202 | #' Genome Biology (2011) doi:101186/gb-2011-12-3-r22
203 | #' 
204 | #' @param order0 the "order0" element of the list named "vlmm.fivep" or "vlmm.threep"
205 | #' within the list that is the output of \link{fitBiasModels}
206 | #' @param order1 as for "order0" but "order1"
207 | #' @param pos1 the position of the 1st order VLMM to plot
208 | #' @param order2 as for "order0" but "order2"
209 | #' @param pos2 the position of the 2nd order VLMM to plot
210 | #' @param ... parameters passed to \code{plot}
211 | #'
212 | #' @return plot
213 | #' 
214 | #' @examples
215 | #'
216 | #' # fitpar was fit using identical code
217 | #' # as found in the vignette, except with
218 | #' # 25 genes, and with fragment size in 80-350 bp
219 | #' data(preprocessedData)
220 | #' plotOrder0(fitpar[[1]][["vlmm.fivep"]][["order0"]])
221 | #' plotOrder1(fitpar[[1]][["vlmm.fivep"]][["order1"]], pos1=5:19)
222 | #' plotOrder2(fitpar[[1]][["vlmm.fivep"]][["order2"]], pos2=8:17)
223 | #' 
224 | #' @export
225 | plotOrder0 <- function(order0, ...) {
226 |   dna.letters <- c("A","C","G","T")
227 |   mat <- log(order0$obs/order0$expect)
228 |   xpos <- -8:12
229 |   dna.cols <- c("green3","blue3","orange3","red3")
230 |   plot(0,0,xlim=c(-8,12),type="n",xlab="position",ylab="log(observed / expected)", ...)
231 |   for (i in 1:4) {
232 |     points(xpos, mat[i,], col=dna.cols[i], type="b", lwd=2)
233 |   }
234 |   abline(v=0, h=0)
235 |   legend("topright",dna.letters,pch=1,lty=1,col=dna.cols,bg="white")
236 | }
237 | 
238 | #' @describeIn plotOrder0 Plot first order parameters for a position
239 | #' @export
240 | plotOrder1 <- function(order1, pos1) {
241 |   dna.letters <- c("A","C","G","T")
242 |   order <- 1
243 |   npos <- length(pos1)
244 |   dna.cols <- c("green3","blue3","orange3","red3")
245 |   par(mfrow=c(1,npos+1),mar=c(5,1,3,1))
246 |   for (i in seq_len(npos)) {
247 |     plot(as.vector(log(order1$obs[,,i]/order1$expect)), rev(seq_len(4 * 4^order)),
248 |          col=rep(dna.cols,each=4), xlim=c(-1,1),
249 |          ylab="",xlab="",yaxt="n",main=pos1[i] - 10 + 1,cex=2)
250 |     abline(v=0)
251 |   }
252 |   plot(0,0,type="n",xaxt="n",yaxt="n",xlab="",ylab="")
253 |   alpha <- alphafun(dna.letters, order-1)
254 |   legend("center",alpha,pch=1,col=dna.cols,cex=2,title="prev")
255 | }
256 | 
257 | #' @describeIn plotOrder0 Plot second order parameters for a position
258 | #' @export
259 | plotOrder2 <- function(order2, pos2) {
260 |   dna.letters <- c("A","C","G","T")
261 |   order <- 2
262 |   npos <- length(pos2)
263 |   dna.cols <- c("green3","blue3","orange3","red3")
264 |   par(mfrow=c(1,npos+1),mar=c(5,1,3,1))
265 |   for (i in 1:npos) {
266 |     plot(as.vector(log(order2$obs[,,i]/order2$expect)), rev(seq_len(4 * 4^order)),
267 |          col=rep(dna.cols,each=4),pch=rep(1:4,each=16), xlim=c(-1,1),
268 |          ylab="",xlab="",yaxt="n",main=pos2[i] - 10 + 1,cex=2)
269 |     abline(v=0)
270 |   }
271 |   plot(0,0,type="n",xaxt="n",yaxt="n",xlab="",ylab="")
272 |   alpha <- alphafun(dna.letters, order-1)
273 |   legend("center",alpha,pch=rep(1:4,each=4),col=dna.cols,cex=2,title="prev")
274 | }
275 | 
276 | #' Simple segments plot for GRangesList
277 | #' 
278 | #' Simple segments plot for GRangesList
279 | #'
280 | #' @param grl GRangesList object
281 | #' @param ... passed to plot
282 | #'
283 | #' @return plot
284 | #' 
285 | #' @examples
286 | #' 
287 | #' library(GenomicRanges)
288 | #' grl <- GRangesList(GRanges("1",IRanges(c(100,200,300),width=50)),
289 | #'                    GRanges("1",IRanges(c(100,300),width=c(75,50))),
290 | #'                    GRanges("1",IRanges(c(100,200,400),width=c(75,50,50))),
291 | #'                    GRanges("1",IRanges(c(200,300,400),width=50)))
292 | #' plotGRL(grl)
293 | #' 
294 | #' @export
295 | plotGRL <- function(grl, ...) {
296 |   df <- as.data.frame(grl)
297 |   plot(0, 0, xlim=range(c(df$start,df$end)), ylim=c(1,max(df$group)),
298 |        type="n", xlab="position", ylab="group", ...)
299 |   segments(df$start, df$group, df$end, df$group, lwd=3)
300 | }
301 | 


--------------------------------------------------------------------------------
/R/predict.R:
--------------------------------------------------------------------------------
  1 | #' Predict coverage for a single-isoform gene
  2 | #'
  3 | #' Predict coverage for a single-isoform gene given
  4 | #' fitted bias parameters in a set of models,
  5 | #' and compare to the observed fragment coverage.
  6 | #'
  7 | #' Note that if the range between \code{minsize} and \code{maxsize}
  8 | #' does not cover most of the fragment length distribution, the
  9 | #' predicted coverage will underestimate the observed coverage.
 10 | #'
 11 | #' @param gene a GRangesList with the exons of different genes
 12 | #' @param bam.files a character string pointing to indexed BAM files
 13 | #' @param fitpar the output of running \link{fitBiasModels}
 14 | #' @param genome a BSgenome object
 15 | #' @param model.names a character vector listing the models,
 16 | #' see same argument in \link{estimateAbundance}
 17 | #'
 18 | #' @return a list with elements frag.cov, the observed fragment coverage
 19 | #' from the \code{bam.files} and pred.cov, a list with the predicted
 20 | #' fragment coverage for each of the \code{models}. 
 21 | #'
 22 | #' @examples
 23 | #'
 24 | #' # these next lines just write out a BAM file from R
 25 | #' # typically you would already have a BAM file
 26 | #' library(alpineData)
 27 | #' library(GenomicAlignments)
 28 | #' library(rtracklayer)
 29 | #' gap <- ERR188088()
 30 | #' dir <- system.file(package="alpineData", "extdata")
 31 | #' bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 32 | #' export(gap, con=bam.file)
 33 | #' 
 34 | #' data(preprocessedData)
 35 | #' library(BSgenome.Hsapiens.NCBI.GRCh38)
 36 | #' 
 37 | #' model.names <- c("fraglen","fraglen.vlmm","GC","all")
 38 | #' 
 39 | #' pred.cov <- predictCoverage(gene=ebt.fit[["ENST00000379660"]],
 40 | #'                             bam.files=bam.file,
 41 | #'                             fitpar=fitpar.small,
 42 | #'                             genome=Hsapiens,
 43 | #'                             model.names=model.names)
 44 | #' 
 45 | #' # plot the coverage:
 46 | #' # note that, because [125,175] bp range specified in fitpar.small
 47 | #' # does not cover the fragment width distribution, the predicted curves
 48 | #' # will underestimate the observed. we correct here post-hoc
 49 | #' 
 50 | #' frag.cov <- pred.cov[["ERR188088"]][["frag.cov"]]
 51 | #' plot(frag.cov, type="l", lwd=3, ylim=c(0,max(frag.cov)*1.5))
 52 | #' for (i in seq_along(model.names)) {
 53 | #'   m <- model.names[i]
 54 | #'   pred <- pred.cov[["ERR188088"]][["pred.cov"]][[m]]
 55 | #'   lines(pred/mean(pred)*mean(frag.cov), col=i+1, lwd=3)
 56 | #' }
 57 | #' legend("topright", legend=c("observed",model.names),
 58 | #'        col=seq_len(length(model.names)+1), lwd=3)
 59 | #' 
 60 | #' @export
 61 | predictCoverage <- function(gene, bam.files, fitpar, genome, model.names) {
 62 |   stopifnot(is(gene, "GRanges"))
 63 |   stopifnot(!is.null(fitpar))
 64 |   stopifnot(all(names(bam.files) %in% names(fitpar)))
 65 |   if (is.null(names(bam.files))) {
 66 |     names(bam.files) <- seq_along(bam.files)
 67 |   }
 68 | 
 69 |   # pull out some model parameters
 70 |   stopifnot(all(c("readlength","minsize","maxsize","maxsize") %in%
 71 |                 names(fitpar[[1]][["model.params"]])))
 72 |   readlength <- fitpar[[1]][["model.params"]][["readlength"]]
 73 |   minsize <- fitpar[[1]][["model.params"]][["minsize"]]
 74 |   maxsize <- fitpar[[1]][["model.params"]][["maxsize"]]
 75 | 
 76 |   # take model names and fitpar models and make the
 77 |   # models suitable for bias calculation
 78 |   models <- namesToModels(model.names, fitpar)
 79 |   
 80 |   fragtypes <- buildFragtypes(gene, genome, readlength=readlength,
 81 |                               minsize=minsize, maxsize=maxsize)
 82 |   res <- list()
 83 |   for (bamname in names(bam.files)) {
 84 |     # add counts
 85 |     bam.file <- bam.files[bamname]
 86 |     generange <- range(gene)
 87 |     strand(generange) <- "*" # not necessary
 88 |     suppressWarnings({
 89 |       ga <- readGAlignAlpine(bam.file, generange)
 90 |     })
 91 |     if (length(ga) == 0) {
 92 |       res[[bamname]] <- as.list(rep(NA,length(models)))
 93 |       names(res[[bamname]]) <- names(models)
 94 |       next
 95 |     }
 96 |     ga <- keepSeqlevels(ga, as.character(seqnames(gene)[1]))
 97 |     fco <- findCompatibleOverlaps(ga, GRangesList(gene))
 98 |     # message("-- ",round(length(fco)/length(ga),2)," compatible overlaps")
 99 |     reads <- gaToReadsOnTx(ga, GRangesList(gene), fco)
100 | 
101 |     # save fragment coverage for later
102 |     l <- sum(width(gene))
103 |     frag.cov <- coverage(reads[[1]][start(reads[[1]]) != 1 & end(reads[[1]]) != l])
104 |     
105 |     fragtypes.temp <- matchReadsToFraglist(reads, list(fragtypes))[[1]]
106 |     ## -- fragment bias --
107 |     fraglen.density <- fitpar[[bamname]][["fraglen.density"]]
108 |     stopifnot(!is.null(fraglen.density))
109 |     fragtypes.temp$logdfraglen <- log(matchToDensity(fragtypes.temp$fraglen,
110 |                                                      fraglen.density))
111 |     ## -- random hexamer priming bias with VLMM --
112 |     vlmm.fivep <- fitpar[[bamname]][["vlmm.fivep"]]
113 |     vlmm.threep <- fitpar[[bamname]][["vlmm.threep"]]
114 |     stopifnot(!is.null(vlmm.fivep))
115 |     stopifnot(!is.null(vlmm.threep))
116 |     fragtypes.temp <- addVLMMBias(fragtypes.temp, vlmm.fivep, vlmm.threep)
117 |     
118 |     # -- fit models --
119 |     res[[bamname]] <- list()
120 | 
121 |     # remove first and last bp for predicting coverage along transcript
122 |     not.first.or.last.bp <- !(fragtypes.temp$start == 1 | fragtypes.temp$end == l)
123 |     fragtypes.temp <- fragtypes.temp[not.first.or.last.bp,]
124 | 
125 |     ir <- IRanges(fragtypes.temp$start, fragtypes.temp$end)
126 |     res[[bamname]]$l <- l
127 |     res[[bamname]]$frag.cov <- frag.cov
128 |     res[[bamname]]$pred.cov <- list()
129 |     for (modeltype in names(models)) {
130 |       # message("predicting model type: ",modeltype)
131 |       log.lambda <- getLogLambda(fragtypes.temp, models, modeltype, fitpar, bamname)
132 |       pred0 <- exp(log.lambda)
133 |       pred <- pred0/mean(pred0)*mean(fragtypes.temp$count)
134 |       res[[bamname]][["pred.cov"]][[modeltype]] <- coverage(ir, weight=pred)
135 |     }
136 |   }
137 |   res
138 | }
139 | 


--------------------------------------------------------------------------------
/R/vlmm.R:
--------------------------------------------------------------------------------
  1 | # unexported functions for estimating a variable length Markov model (VLMM).
  2 | #
  3 | # Here we implement the 21 bp VLMM used in Cufflinks to estimate read start
  4 | # biases. The method is described in:
  5 | #
  6 | # Roberts et al, "Improving RNA-Seq expression estimates by correcting for fragment bias"
  7 | # Genome Biology (2011) doi:101186/gb-2011-12-3-r22
  8 | 
  9 | alphafun <- function(x, order) {
 10 |   if (order == 0) {
 11 |     return(x)
 12 |   } else{
 13 |     return(as.vector(t(outer( alphafun(x, order-1), x, paste0))))
 14 |   }
 15 | }
 16 | getKmerFreqs <- function(seqs, dna.letters, order, pc=1) {
 17 |   alpha <- alphafun(dna.letters, order)
 18 |   n <- sum(width(seqs)) - order*length(seqs)
 19 |   if (order > 0) {
 20 |     out <- sapply(alpha, function(p) sum(vcountPattern(p, seqs)))
 21 |   } else {
 22 |     out <- colSums(letterFrequency(seqs, dna.letters))
 23 |   }
 24 |   stopifnot(sum(out) == n)
 25 |   out <- out + pc
 26 |   out/sum(out)
 27 | }
 28 | getPositionalKmerFreqs <- function(seqs, dna.letters, order, pos, pc=1) {
 29 |   alpha <- alphafun(dna.letters, order)
 30 |   out <- as.numeric(table(factor(substr(seqs, pos-order, pos), alpha)))
 31 |   names(out) <- alpha
 32 |   out <- out + pc
 33 |   out/sum(out)
 34 | }
 35 | getPositionalObsOverExp <- function(seqs, gene.seqs, dna.letters, order, pos) { 
 36 |   npos <- length(pos)
 37 |   res <- sapply(pos, function(i) getPositionalKmerFreqs(seqs,
 38 |                                    dna.letters, order=order, pos=i))
 39 |   # 'obs' is a 3 dimensional array:
 40 |   #   1st dim: A,C,G,T the current base
 41 |   #   2nd dim: the 4^order previous bases
 42 |   #   3rd dim: the position, which is within a subset of the full VLMM order
 43 |   obs <- array(0, dim=c(4, 4^order, npos), dimnames=
 44 |                list(dna.letters, alphafun(dna.letters, order-1), seq_len(npos)))  
 45 |   for (i in 1:npos) {
 46 |     obs[,,i] <- res[,i]
 47 |     obs[,,i] <- sweep(obs[,,i], 2, colSums(obs[,,i]), "/")
 48 |   }
 49 |   res.gene <- getKmerFreqs(gene.seqs, dna.letters, order)
 50 |   alpha <- alphafun(dna.letters, order-1)
 51 |   # 'expect' has dims 1 and 2 from above
 52 |   expect <- array(0, dim=c(4, 4^order),
 53 |                   dimnames=list(dna.letters, alphafun(dna.letters, order-1)))
 54 |   for (p in alpha) {
 55 |     prob <- res.gene[grep(paste0("^",p), names(res.gene))]
 56 |     expect[,p] <- prob/sum(prob)
 57 |   }
 58 |   list(obs=obs, expect=expect)
 59 | }
 60 | fitVLMM <- function (seqs, gene.seqs) {
 61 |   # fit a VLMM according to Roberts et al. (2011), doi:101186/gb-2011-12-3-r22
 62 |   dna.letters <- c("A","C","G","T")
 63 |   vlmm.order <- c(0,0,0,0,1,1,1,2,2,2,2,2,2,2,2,2,2,1,1,0,0)
 64 |   # sum(table(vlmm.order) * c(4,4^2,4^3)) # the 744 parameters
 65 |   # order 0
 66 |   order0 <- list()
 67 |   order0$obs <- sapply(seq_along(vlmm.order), function(i)
 68 |     getPositionalKmerFreqs(seqs, dna.letters, order=0, pos=i))
 69 |   colnames(order0$obs) <- seq_along(vlmm.order)
 70 |   order0$expect <- getKmerFreqs(gene.seqs, dna.letters, 0)
 71 |   # order 1
 72 |   order <- 1
 73 |   pos1 <- which(vlmm.order >= order)
 74 |   order1 <- getPositionalObsOverExp(seqs, gene.seqs, dna.letters, order, pos1)
 75 |   # order 2
 76 |   order <- 2
 77 |   pos2 <- which(vlmm.order >= order)
 78 |   order2 <- getPositionalObsOverExp(seqs, gene.seqs, dna.letters, order, pos2)
 79 |   list(order0=order0, order1=order1, order2=order2)
 80 | }
 81 | calcVLMMBias <- function(seqs, vlmm.model, short=FALSE, pseudocount=1) {
 82 |   stopifnot(!is.null(vlmm.model))
 83 |   dna.letters <- c("A","C","G","T")
 84 |   vlmm.order <- if (short) {
 85 |     # short = the VLMM when the reads are 8 or less positions from the end of transcript
 86 |     c(0,1,2,2,2,2,2,2,2,1,1,0,0)
 87 |   } else {
 88 |     c(0,0,0,0,1,1,1,2,2,2,2,2,2,2,2,2,2,1,1,0,0)
 89 |   }
 90 |   # maps from position in the seq to the VLMM matrices
 91 |   map <- if (short) {
 92 |     list("order0"=c(9:21),
 93 |          "order1"=c(rep(NA,1),6:15,rep(NA,2)),
 94 |          "order2"=c(rep(NA,2),4:10,rep(NA,4)))
 95 |   } else {
 96 |     list("order0"=1:21,
 97 |          "order1"=c(rep(NA,4),1:15,rep(NA,2)),
 98 |          "order2"=c(rep(NA,7),1:10,rep(NA,4)))
 99 |   }
100 |   bias <- matrix(NA, length(seqs), length(vlmm.order))
101 |   for (i in seq_along(vlmm.order)) {
102 |     order <- vlmm.order[i]
103 |     alpha <- alphafun(dna.letters, order)
104 |     o <- paste0("order",order)
105 |     kmer <- substr(seqs, i - order, i)
106 |     j <- map[[o]][i]
107 |     if (order == 0) {
108 |       bias.lookup <- vlmm.model[[o]]$obs[,j] / vlmm.model[[o]]$exp
109 |     } else {
110 |       bias.lookup <- as.vector(vlmm.model[[o]]$obs[,,j] / vlmm.model[[o]]$exp)
111 |       names(bias.lookup) <- alpha
112 |     }
113 |     bias[,i] <- bias.lookup[ kmer ]
114 |   }
115 |   bias
116 | }
117 | addVLMMBias <- function(fragtypes, vlmm.fivep, vlmm.threep) {
118 |   fivep <- fragtypes$fivep[fragtypes$fivep.test]
119 |   fivep.short <- fragtypes$fivep[!fragtypes$fivep.test]
120 |   threep <- fragtypes$threep[fragtypes$threep.test]
121 |   threep.short <- fragtypes$threep[!fragtypes$threep.test]   
122 |   fivep.bias <- numeric(nrow(fragtypes))
123 |   fivep.bias[fragtypes$fivep.test] <- rowSums(log(calcVLMMBias(fivep, vlmm.fivep, short=FALSE)))
124 |   fivep.bias[!fragtypes$fivep.test] <- rowSums(log(calcVLMMBias(fivep.short, vlmm.fivep, short=TRUE)))
125 |   fragtypes$fivep.bias <- fivep.bias
126 |   threep.bias <- numeric(nrow(fragtypes))
127 |   threep.bias[fragtypes$threep.test] <- rowSums(log(calcVLMMBias(threep, vlmm.threep, short=FALSE)))
128 |   threep.bias[!fragtypes$threep.test] <- rowSums(log(calcVLMMBias(threep.short, vlmm.threep, short=TRUE)))
129 |   fragtypes$threep.bias <- threep.bias
130 |   fragtypes
131 | }
132 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # alpine
 2 | 
 3 | ![alpine](http://mikelove.nfshost.com/img/alpine.jpg)
 4 | 
 5 | (the [Sassolungo](https://en.wikipedia.org/wiki/Langkofel) mountain in the Dolomites)
 6 | 
 7 | *alpine* is an R/Bioconductor package for modeling and correcting fragment
 8 | sequence bias for RNA-seq transcript abundance estimation. In
 9 | particular, it is the first method of its kind to take into account
10 | sample-specific dependence of RNA-seq fragments on their GC content.
11 | 
12 | An example workflow can be found in `vignettes/alpine.Rmd`, or by typing:
13 | 
14 | ```{r}
15 | vignette("alpine")
16 | ```
17 | 
18 | *alpine* is designed for un-stranded paired-end RNA-seq data.
19 | 
20 | A paper explaining the *alpine* methods was published in December 2016:
21 | 
22 | <http://dx.doi.org/10.1038/nbt.3682>
23 | 


--------------------------------------------------------------------------------
/data/preprocessedData.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikelove/alpine/779995dec261695d79f194a37d1fe73a3f8b3cca/data/preprocessedData.rda


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citEntry(entry="article",
 2 |          title = "Modeling of RNA-seq fragment sequence bias reduces systematic errors in transcript abundance estimation",
 3 |          author = personList( as.person("Michael I. Love"),
 4 |                               as.person("John B. Hogenesch"),
 5 |                               as.person("Rafael A. Irizarry")),
 6 |          year = 2016,
 7 |          journal = "Nature Biotechnology",
 8 |          doi = "10.1038/nbt.3682",
 9 |          volume = 34,
10 |          issue = 12,
11 |          pages = 1287-1291,
12 |          textVersion =
13 |          paste("Love, M.I., Hogenesch, J.B., Irizarry, R.A.",
14 |                "Modeling of RNA-seq fragment sequence bias reduces systematic errors in transcript abundance estimation",
15 |                 "Nature Biotechnology 34(12):1287-1291 (2016)" ) )
16 | 


--------------------------------------------------------------------------------
/man/alpine-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/core.R
 3 | \docType{package}
 4 | \name{alpine-package}
 5 | \alias{alpine-package}
 6 | \title{alpine: bias corrected transcript abundance estimation}
 7 | \description{
 8 | alpine is a package for estimating and visualizing many forms of sample-specific
 9 | biases that can arise in RNA-seq, including fragment length
10 | distribution, positional bias on the transcript, read
11 | start bias (random hexamer priming), and fragment GC content
12 | (amplification). It also offers bias-corrected estimates of
13 | transcript abundance (FPKM). It is currently designed for
14 | un-stranded paired-end RNA-seq data.
15 | }
16 | \details{
17 | See the package vignette for a detailed workflow.
18 | 
19 | The main functions in this package are:
20 | \enumerate{
21 | \item \link{buildFragtypes} - build out features for fragment types from exons of a single gene (GRanges)
22 | \item \link{fitBiasModels} - fit parameters for one or more bias models over a set of ~100 medium to highly expressed single isoform genes (GRangesList)
23 | \item \link{estimateAbundance} - given a set of genome alignments (BAM files) and a set of isoforms of a gene (GRangesList), estimate the transcript abundances for these isoforms (FPKM) for various bias models
24 | \item \link{extractAlpine} - given a list of output from \code{estimateAbundance}, compile an FPKM matrix across transcripts and samples
25 | \item \link{predictCoverage} - given the exons of a single gene (GRanges) predict the coverage for a set of samples given fitted bias parameters and compute the observed coverage
26 | }
27 | 
28 | Some helper functions for preparing gene objects:
29 | \enumerate{
30 | \item \link{splitGenesAcrossChroms} - split apart "genes" where isoforms are on different chromosomes
31 | \item \link{splitLongGenes} - split apart "genes" which cover a suspiciously large range, e.g. 1 Mb
32 | \item \link{mergeGenes} - merge overlapping isoforms into new "genes"
33 | }
34 | 
35 | Some other assorted helper functions:
36 | \enumerate{
37 | \item \link{normalizeDESeq} - an across-sample normalization for FPKM matrices
38 | \item \link{getFragmentWidths} - return a vector estimated fragment lengths given a set of exons for a single gene (GRanges) and a BAM file
39 | \item \link{getReadLength} - return the read length of the first read across BAM files
40 | }
41 | 
42 | The plotting functions are:
43 | \enumerate{
44 | \item \link{plotGC} - plot the fragment GC bias curves
45 | \item \link{plotFragLen} - plot the framgent length distributions
46 | \item \link{plotRelPos} - plot the positional bias (5' to 3')
47 | \item \link{plotOrder0}, \link{plotOrder1}, \link{plotOrder2} - plot the read start bias terms
48 | \item \link{plotGRL} - a simple function for visualizing GRangesList objects
49 | }
50 | }
51 | \author{
52 | Michael Love
53 | }
54 | \references{
55 | Love, M.I., Hogenesch, J.B., and Irizarry, R.A.,
56 | Modeling of RNA-seq fragment sequence bias reduces
57 | systematic errors in transcript abundance estimation.
58 | Nature Biotechnologyh (2016) doi: 10.1038/nbt.3682
59 | }
60 | \keyword{package}
61 | 
62 | 


--------------------------------------------------------------------------------
/man/buildFragtypes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/core.R
 3 | \name{buildFragtypes}
 4 | \alias{buildFragtypes}
 5 | \title{Build fragment types from exons}
 6 | \usage{
 7 | buildFragtypes(exons, genome, readlength, minsize, maxsize, gc = TRUE,
 8 |   gc.str = TRUE, vlmm = TRUE)
 9 | }
10 | \arguments{
11 | \item{exons}{a GRanges object with the exons for a single transcript}
12 | 
13 | \item{genome}{a BSgenome object}
14 | 
15 | \item{readlength}{the length of the reads. This doesn't necessarily
16 | have to be exact (+/- 1 bp is acceptable)}
17 | 
18 | \item{minsize}{the minimum fragment length to model. The interval between
19 | \code{minsize} and \code{maxsize} should contain the at least the
20 | central 95 percent of the fragment length distribution across samples}
21 | 
22 | \item{maxsize}{the maximum fragment length to model}
23 | 
24 | \item{gc}{logical, whether to calculate the fragment GC content}
25 | 
26 | \item{gc.str}{logical, whether to look for presence of
27 | stretches of very high GC within fragments}
28 | 
29 | \item{vlmm}{logical, whether to calculate the Cufflinks Variable Length
30 | Markov Model (VLMM) for read start bias}
31 | }
32 | \value{
33 | a DataFrame with bias features (columns) for all
34 | potential fragments (rows)
35 | }
36 | \description{
37 | This function constructs a DataFrame of fragment features used for
38 | bias modeling, with one row for every potential fragment type that could
39 | arise from a transcript. The output of this function is used by
40 | \link{fitBiasModels}, and this function is used inside \link{estimateAbundance}
41 | in order to model the bias affecting different fragments across isoforms
42 | of a gene.
43 | }
44 | \examples{
45 | 
46 | library(GenomicRanges)
47 | library(BSgenome.Hsapiens.NCBI.GRCh38)
48 | data(preprocessedData)
49 | readlength <- 100
50 | minsize <- 125 # see vignette how to choose
51 | maxsize <- 175 # see vignette how to choose
52 | fragtypes <- buildFragtypes(ebt.fit[["ENST00000624447"]],
53 |                             Hsapiens, readlength,
54 |                             minsize, maxsize)
55 | 
56 | }
57 | 
58 | 


--------------------------------------------------------------------------------
/man/estimateAbundance.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/estimate_abundance.R
  3 | \name{estimateAbundance}
  4 | \alias{estimateAbundance}
  5 | \title{Estimate bias-corrected transcript abundances (FPKM)}
  6 | \usage{
  7 | estimateAbundance(transcripts, bam.files, fitpar, genome, model.names,
  8 |   subset = TRUE, niter = 100, lib.sizes = NULL, optim = FALSE,
  9 |   custom.features = NULL)
 10 | }
 11 | \arguments{
 12 | \item{transcripts}{a GRangesList of the exons for multiple isoforms of a gene.
 13 | For a single-isoform gene, just wrap the exons in \code{GRangesList()}}
 14 | 
 15 | \item{bam.files}{a named vector pointing to the indexed BAM files}
 16 | 
 17 | \item{fitpar}{the output of \link{fitBiasModels}}
 18 | 
 19 | \item{genome}{a BSGenome object}
 20 | 
 21 | \item{model.names}{a character vector of the bias models to use.
 22 | These should have already been specified when calling \link{fitBiasModels}.
 23 | Four exceptions are models that use none, one or both of the offsets,
 24 | and these are called with:
 25 | \code{"null"}, \code{"fraglen"}, \code{"vlmm"}, or \code{"fraglen.vlmm"}.}
 26 | 
 27 | \item{subset}{logical, whether to downsample the non-observed fragments. Default is TRUE}
 28 | 
 29 | \item{niter}{the number of EM iterations. Default is 100.}
 30 | 
 31 | \item{lib.sizes}{a named vector of library sizes to use in calculating the FPKM.
 32 | If NULL (the default) a value of 1e6 is used for all samples.}
 33 | 
 34 | \item{optim}{logical, whether to use numerical optimization instead of the EM.
 35 | Default is FALSE.}
 36 | 
 37 | \item{custom.features}{an optional function to add custom features
 38 | to the fragment types DataFrame. This function takes in a DataFrame
 39 | returned by \link{buildFragtypes} and returns a DataFrame
 40 | with additional columns added. Default is NULL, adding no custom features.}
 41 | }
 42 | \value{
 43 | a list of lists. For each sample, a list with elements:
 44 | theta, lambda and count.
 45 | \itemize{
 46 | \item \strong{theta} gives the FPKM estimates for the
 47 | isoforms in \code{transcripts}
 48 | \item \strong{lambda} gives the average bias term
 49 | for the isoforms
 50 | \item \strong{count} gives the number of fragments which are
 51 | compatible with any of the isoforms in \code{transcripts}
 52 | }
 53 | }
 54 | \description{
 55 | This function takes the fitted bias parameters from \link{fitBiasModels}
 56 | and uses this information to derive bias corrected estimates of
 57 | transcript abundance for a gene (with one or more isoforms)
 58 | across multiple samples.
 59 | }
 60 | \examples{
 61 | 
 62 | # see vignette for a more realistic example
 63 | 
 64 | # these next lines just write out a BAM file from R
 65 | # typically you would already have a BAM file
 66 | library(alpineData)
 67 | library(GenomicAlignments)
 68 | library(rtracklayer)
 69 | gap <- ERR188088()
 70 | dir <- system.file(package="alpineData", "extdata")
 71 | bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 72 | export(gap, con=bam.file)
 73 | 
 74 | data(preprocessedData)
 75 | library(GenomicRanges)
 76 | library(BSgenome.Hsapiens.NCBI.GRCh38)
 77 | 
 78 | model.names <- c("fraglen","GC")
 79 | 
 80 | txs <- txdf.theta$tx_id[txdf.theta$gene_id == "ENSG00000198918"]
 81 | 
 82 | res <- estimateAbundance(transcripts=ebt.theta[txs],
 83 |                          bam.files=bam.file,
 84 |                          fitpar=fitpar.small,
 85 |                          genome=Hsapiens,
 86 |                          model.names=model.names)
 87 | 
 88 | }
 89 | \references{
 90 | The model describing how bias estimates are used to
 91 | estimate bias-corrected abundances is described in
 92 | the Supplemental Note of the following publication:
 93 | 
 94 | Love, M.I., Hogenesch, J.B., and Irizarry, R.A.,
 95 | Modeling of RNA-seq fragment sequence bias reduces
 96 | systematic errors in transcript abundance estimation.
 97 | Nature Biotechnologyh (2016) doi: 10.1038/nbt.3682
 98 | 
 99 | The likelihood formulation and EM algorithm
100 | for finding the maximum likelihood estimate for abundances
101 | follows this publication:
102 | 
103 | Salzman, J., Jiang, H., and Wong, W.H.,
104 | Statistical Modeling of RNA-Seq Data.
105 | Statistical Science (2011) doi: 10.1214/10-STS343
106 | }
107 | 
108 | 


--------------------------------------------------------------------------------
/man/estimateTheta.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/estimate_theta.R
  3 | \name{estimateTheta}
  4 | \alias{estimateTheta}
  5 | \title{Estimate bias-corrected transcript abundances (FPKM)}
  6 | \usage{
  7 | estimateTheta(transcripts, bam.files, fitpar, genome, models, readlength,
  8 |   minsize, maxsize, subset = TRUE, niter = 100, lib.sizes = NULL,
  9 |   optim = FALSE, custom.features = NULL)
 10 | }
 11 | \arguments{
 12 | \item{transcripts}{a GRangesList of the exons for multiple isoforms of a gene.
 13 | For a single-isoform gene, just wrap the exons in \code{GRangesList()}}
 14 | 
 15 | \item{bam.files}{a named vector pointing to the indexed BAM files}
 16 | 
 17 | \item{fitpar}{the output of \link{fitBiasModels}}
 18 | 
 19 | \item{genome}{a BSGenome object}
 20 | 
 21 | \item{models}{a list of character strings or formula describing the bias models, see vignette}
 22 | 
 23 | \item{readlength}{the read length}
 24 | 
 25 | \item{minsize}{the minimum fragment length to model}
 26 | 
 27 | \item{maxsize}{the maximum fragment length to model}
 28 | 
 29 | \item{subset}{logical, whether to downsample the non-observed fragments. Default is TRUE}
 30 | 
 31 | \item{niter}{the number of EM iterations. Default is 100.}
 32 | 
 33 | \item{lib.sizes}{a named vector of library sizes to use in calculating the FPKM.
 34 | If NULL (the default) a value of 1e6 is used for all samples.}
 35 | 
 36 | \item{optim}{logical, whether to use numerical optimization instead of the EM.
 37 | Default is FALSE.}
 38 | 
 39 | \item{custom.features}{an optional function to add custom features
 40 | to the fragment types DataFrame. This function takes in a DataFrame
 41 | returned by \link{buildFragtypes} and returns a DataFrame
 42 | with additional columns added. Default is NULL, adding no custom features.}
 43 | }
 44 | \value{
 45 | a list of lists. For each sample, a list with elements:
 46 | theta, lambda and count.
 47 | \itemize{
 48 | \item \strong{theta} gives the FPKM estimates for the
 49 | isoforms in \code{transcripts}
 50 | \item \strong{lambda} gives the average bias term
 51 | for the isoforms
 52 | \item \strong{count} gives the number of fragments which are
 53 | compatible with any of the isoforms in \code{transcripts}
 54 | }
 55 | }
 56 | \description{
 57 | This function takes the fitted bias parameters from \link{fitBiasModels}
 58 | and uses this information to derive bias corrected estimates of
 59 | transcript abundance for a gene (with one or more isoforms)
 60 | across multiple samples.
 61 | }
 62 | \examples{
 63 | 
 64 | # see vignette for a more realistic example
 65 | 
 66 | # these next lines just write out a BAM file from R
 67 | # typically you would already have a BAM file
 68 | library(alpineData)
 69 | library(GenomicAlignments)
 70 | library(rtracklayer)
 71 | gap <- ERR188088()
 72 | dir <- system.file(package="alpineData", "extdata")
 73 | bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 74 | export(gap, con=bam.file)
 75 | 
 76 | data(preprocessedData)
 77 | library(GenomicRanges)
 78 | library(BSgenome.Hsapiens.NCBI.GRCh38)
 79 | models <- list(
 80 |  "GC"=list(formula="count~
 81 |  ns(gc,knots=gc.knots,Boundary.knots=gc.bk) +
 82 |  ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) +
 83 |  0",
 84 |  offset=c("fraglen"))
 85 | )
 86 | 
 87 | readlength <- 75
 88 | minsize <- 125 # see vignette how to choose
 89 | maxsize <- 175 # see vignette how to choose
 90 | txs <- txdf.theta$tx_id[txdf.theta$gene_id == "ENSG00000198918"]
 91 | 
 92 | res <- estimateTheta(transcripts=ebt.theta[txs],
 93 |                      bam.files=bam.file,
 94 |                      fitpar=fitpar.small,
 95 |                      genome=Hsapiens,
 96 |                      models=models,
 97 |                      readlength=readlength,
 98 |                      minsize=minsize,
 99 |                      maxsize=maxsize)
100 | 
101 | }
102 | 
103 | 


--------------------------------------------------------------------------------
/man/extractAlpine.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper.R
 3 | \name{extractAlpine}
 4 | \alias{extractAlpine}
 5 | \title{Extract results from estimateAbundance run across genes}
 6 | \usage{
 7 | extractAlpine(res, model, lib.sizes = 1e+06, divide.out = TRUE,
 8 |   transcripts = NULL)
 9 | }
10 | \arguments{
11 | \item{res}{a list where each element is the output of \link{estimateAbundance}}
12 | 
13 | \item{model}{the name of a model, corresponds to names of \code{models}
14 | used in \link{fitBiasModels}}
15 | 
16 | \item{lib.sizes}{the vector of library sizes passed to \link{estimateAbundance}.
17 | not needed if \code{divide.out=FALSE}}
18 | 
19 | \item{divide.out}{logical, whether to divide out the initial estimate of
20 | library size and to instead use the count of compatible fragments for
21 | genes calculated by \link{estimateAbundance}. Default is TRUE}
22 | 
23 | \item{transcripts}{an optional GRangesList of the exons for each
24 | transcript. If this is provided, the output will be a
25 | SummarizedExperiment. The transcripts do not need
26 | to be provided in the correct order, extractAlpine will
27 | find the correct transcript by the names in \code{res} and
28 | put them in the correct order.}
29 | }
30 | \value{
31 | a matrix of FPKM values across transcripts and samples,
32 | or a SummarizedExperiment if \code{transcripts} is provided
33 | }
34 | \description{
35 | This function extracts estimates for a given model from a list
36 | over many genes, returning a matrix with dimensions:
37 | number of transcript x number of samples.
38 | Here, the count of compatible fragments aligning to the
39 | genes is used to estimate the FPKM, dividing out the previously
40 | used estimate \code{lib.sizes}.
41 | }
42 | \examples{
43 | 
44 | data(preprocessedData)
45 | extractAlpine(res, "GC")
46 | 
47 | }
48 | 
49 | 


--------------------------------------------------------------------------------
/man/fitBiasModels.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/fit_bias.R
  3 | \name{fitBiasModels}
  4 | \alias{fitBiasModels}
  5 | \title{Fit bias models over single-isoform genes}
  6 | \usage{
  7 | fitBiasModels(genes, bam.file, fragtypes, genome, models, readlength, minsize,
  8 |   maxsize, speedglm = TRUE, gc.knots = seq(from = 0.4, to = 0.6, length =
  9 |   3), gc.bk = c(0, 1), relpos.knots = seq(from = 0.25, to = 0.75, length =
 10 |   3), relpos.bk = c(0, 1))
 11 | }
 12 | \arguments{
 13 | \item{genes}{a GRangesList with the exons of different
 14 | single-isoform genes}
 15 | 
 16 | \item{bam.file}{a character string pointing to an indexed BAM file}
 17 | 
 18 | \item{fragtypes}{the output of \link{buildFragtypes}. must contain
 19 | the potential fragment types for the genes named in \code{genes}}
 20 | 
 21 | \item{genome}{a BSgenome object}
 22 | 
 23 | \item{models}{a list of lists: the outer list describes multiple models.
 24 | each element of the inner list has two elements: \code{formula} and \code{offset}.
 25 | \code{formula} should be a character strings of an R formula
 26 | describing the bias models, e.g. \code{"count ~ ns(gc) + gene"}.
 27 | The end of the string is required to be \code{"+ gene"}.
 28 | \code{offset} should be a character vector
 29 | listing possible bias offsets to be used (\code{"fraglen"} or \code{"vlmm"}).
 30 | Either \code{offset} or \code{formula} can be NULL for a model.
 31 | See vignette for recommendations and details.}
 32 | 
 33 | \item{readlength}{the read length}
 34 | 
 35 | \item{minsize}{the minimum fragment length to model}
 36 | 
 37 | \item{maxsize}{the maximum fragment length to model}
 38 | 
 39 | \item{speedglm}{logical, whether to use speedglm to estimate the coefficients.
 40 | Default is TRUE.}
 41 | 
 42 | \item{gc.knots}{knots for the GC splines}
 43 | 
 44 | \item{gc.bk}{boundary knots for the GC splines}
 45 | 
 46 | \item{relpos.knots}{knots for the relative position splines}
 47 | 
 48 | \item{relpos.bk}{boundary knots for the relative position splines}
 49 | }
 50 | \value{
 51 | a list with elements: coefs, summary, models, model.params,
 52 | and optional offets: fraglen.density, vlmm.fivep,
 53 | and vlmm.threep.
 54 | \itemize{
 55 | \item \strong{coefs} gives the estimated coefficients
 56 | for the different models that specified formula.
 57 | \item \strong{summary} gives the tables with coefficients, standard
 58 | errors and p-values,
 59 | \item \strong{models} stores the incoming \code{models} list,
 60 | \item \strong{model.params} stores parameters for the
 61 | models, such as knot locations
 62 | \item \strong{fraglen.density} is a
 63 | estimated density object for the fragment length distribution,
 64 | \item \strong{vlmm.fivep} and \strong{vlmm.threep}
 65 | store the observed and expected tabulations for the different
 66 | orders of the VLMM for read start bias.
 67 | }
 68 | }
 69 | \description{
 70 | This function estimates parameters for one or more bias models
 71 | for a single sample over a set of single-isoform
 72 | genes. ~100 medium to highly expressed genes should be sufficient to
 73 | estimate the parameters robustly.
 74 | }
 75 | \examples{
 76 | 
 77 | # see vignette for a more realistic example
 78 | 
 79 | # these next lines just write out a BAM file from R
 80 | # typically you would already have a BAM file
 81 | library(alpineData)
 82 | library(GenomicAlignments)
 83 | library(rtracklayer)
 84 | gap <- ERR188088()
 85 | dir <- system.file(package="alpineData", "extdata")
 86 | bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 87 | export(gap, con=bam.file)
 88 | 
 89 | library(GenomicRanges)
 90 | library(BSgenome.Hsapiens.NCBI.GRCh38)
 91 | data(preprocessedData)
 92 | 
 93 | readlength <- 75
 94 | minsize <- 125 # see vignette how to choose
 95 | maxsize <- 175 # see vignette how to choose
 96 | 
 97 | # here a very small subset, should be ~100 genes
 98 | gene.names <- names(ebt.fit)[6:8] 
 99 | names(gene.names) <- gene.names
100 | fragtypes <- lapply(gene.names, function(gene.name) {
101 |                       buildFragtypes(ebt.fit[[gene.name]],
102 |                                      Hsapiens, readlength,
103 |                                      minsize, maxsize)
104 | })
105 | models <- list(
106 |   "GC" = list(formula = "count ~ ns(gc,knots=gc.knots, Boundary.knots=gc.bk) + gene",
107 |               offset=c("fraglen","vlmm"))
108 | )
109 | 
110 | fitpar <- fitBiasModels(genes=ebt.fit[gene.names],
111 |                         bam.file=bam.file,
112 |                         fragtypes=fragtypes,
113 |                         genome=Hsapiens,
114 |                         models=models,
115 |                         readlength=readlength,
116 |                         minsize=minsize,
117 |                         maxsize=maxsize)
118 | 
119 | }
120 | \references{
121 | The complete bias model including fragment sequence bias
122 | is described in detail in the Supplemental Note of the
123 | following publication:
124 | 
125 | Love, M.I., Hogenesch, J.B., and Irizarry, R.A.,
126 | Modeling of RNA-seq fragment sequence bias reduces
127 | systematic errors in transcript abundance estimation.
128 | Nature Biotechnologyh (2016) doi: 10.1038/nbt.3682
129 | 
130 | The read start variable length Markov model (VLMM) for
131 | addressing bias introduced by random hexamer priming
132 | was introduced in the following publication (the sequence
133 | bias model used in Cufflinks):
134 | 
135 | Roberts, A., Trapnell, C., Donaghey, J., Rinn, J.L., and Pachter, L.,
136 | Improving RNA-Seq expression estimates by correcting for fragment bias.
137 | Genome Biology (2011) doi: 10.1186/gb-2011-12-3-r22
138 | }
139 | 
140 | 


--------------------------------------------------------------------------------
/man/getFragmentWidths.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper.R
 3 | \name{getFragmentWidths}
 4 | \alias{getFragmentWidths}
 5 | \title{Get fragment widths}
 6 | \usage{
 7 | getFragmentWidths(bam.file, tx)
 8 | }
 9 | \arguments{
10 | \item{bam.file}{a character string pointing to a BAM file}
11 | 
12 | \item{tx}{a GRanges object of the exons of a single isoform gene}
13 | }
14 | \value{
15 | a numeric vector of estimated fragment widths
16 | }
17 | \description{
18 | From a BAM file and a particular transcript (recommened
19 | to be the single isoform of a gene), this function
20 | returns estimates of the fragment widths, by mapping the
21 | fragment alignments to the transcript coordinates.
22 | }
23 | \examples{
24 | 
25 | # these next lines just write out a BAM file from R
26 | # typically you would already have a BAM file
27 | library(alpineData)
28 | library(GenomicAlignments)
29 | library(rtracklayer)
30 | gap <- ERR188088()
31 | dir <- system.file(package="alpineData", "extdata")
32 | bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
33 | export(gap, con=bam.file)
34 | 
35 | data(preprocessedData)
36 | 
37 | w <- getFragmentWidths(bam.file, ebt.fit[[2]])
38 | quantile(w, c(.025, .975))
39 | 
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/man/getReadLength.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper.R
 3 | \name{getReadLength}
 4 | \alias{getReadLength}
 5 | \title{Get read length}
 6 | \usage{
 7 | getReadLength(bam.files)
 8 | }
 9 | \arguments{
10 | \item{bam.files}{a character vector pointing to BAM files}
11 | }
12 | \value{
13 | a numeric vector, one number per BAM file, the
14 | length of the first read in the file
15 | }
16 | \description{
17 | Gets the length of the first read in a BAM file
18 | }
19 | \examples{
20 | 
21 | # these next lines just write out a BAM file from R
22 | # typically you would already have a BAM file
23 | library(alpineData)
24 | library(GenomicAlignments)
25 | library(rtracklayer)
26 | gap <- ERR188088()
27 | dir <- system.file(package="alpineData", "extdata")
28 | bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
29 | export(gap, con=bam.file)
30 | 
31 | getReadLength(bam.file)
32 | 
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/man/mergeGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper.R
 3 | \name{mergeGenes}
 4 | \alias{mergeGenes}
 5 | \title{Merge overlapping "genes" into gene clusters}
 6 | \usage{
 7 | mergeGenes(ebg, txdf, ignore.strand = TRUE)
 8 | }
 9 | \arguments{
10 | \item{ebg}{an exons-by-genes GRangesList, created with \code{exonsBy}}
11 | 
12 | \item{txdf}{a data.frame created by running \code{select} on a TxDb object.
13 | Must have a column GENEID.}
14 | 
15 | \item{ignore.strand}{Default is TRUE.}
16 | }
17 | \value{
18 | a manipulated \code{txdf}.
19 | }
20 | \description{
21 | This function looks for overlapping exons in \code{ebg}.
22 | The overlapping "genes" are used to form a graph.
23 | Any connected components in the graph (sets of "genes"
24 | which can be reached from each other through overlap relations)
25 | are connected into a new gene cluster, which is given the
26 | suffix "_mrg" and using one of the original gene names.
27 | }
28 | \examples{
29 | 
30 | library(GenomicRanges)
31 | txdf <- data.frame(GENEID=c("101","102","103","104"))
32 | ebg <- GRangesList(GRanges("1",IRanges(c(100,200),width=50)),
33 |                    GRanges("1",IRanges(c(200,300),width=50)),
34 |                    GRanges("1",IRanges(c(300,400),width=50)),
35 |                    GRanges("1",IRanges(c(500,600),width=50)))
36 | names(ebg) <- c("101","102","103","104")
37 | mergeGenes(ebg, txdf)
38 | 
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/man/normalizeDESeq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper.R
 3 | \name{normalizeDESeq}
 4 | \alias{normalizeDESeq}
 5 | \title{DESeq median ratio normalization for matrix}
 6 | \usage{
 7 | normalizeDESeq(mat, cutoff)
 8 | }
 9 | \arguments{
10 | \item{mat}{a matrix of numeric values}
11 | 
12 | \item{cutoff}{a numeric value to be used as the cutoff
13 | for the row means of \code{mat}. Only rows with row mean
14 | larger than \code{cutoff} are used for calculating
15 | the size factors}
16 | }
17 | \value{
18 | a matrix with the median ratio size factors
19 | divided out
20 | }
21 | \description{
22 | Simple implementation of DESeq median ratio normalization
23 | }
24 | \examples{
25 | 
26 | x <- runif(50,1,100)
27 | mat <- cbind(x, 2*x, 3*x)
28 | norm.mat <- normalizeDESeq(mat, 5)
29 | 
30 | }
31 | \references{
32 | Anders, S. and Huber, W.,
33 | Differential expression analysis for sequence count data.
34 | Genome Biology (2010) doi: 10.1186/gb-2010-11-10-r106
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/man/plotFragLen.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plots.R
 3 | \name{plotFragLen}
 4 | \alias{plotFragLen}
 5 | \title{Plot fragment length distribution over samples}
 6 | \usage{
 7 | plotFragLen(fitpar, col, lty)
 8 | }
 9 | \arguments{
10 | \item{fitpar}{a list of the output of \link{fitBiasModels} over samples}
11 | 
12 | \item{col}{a vector of colors}
13 | 
14 | \item{lty}{a vector of line types}
15 | }
16 | \value{
17 | plot
18 | }
19 | \description{
20 | Plots the fragment length distribution.
21 | }
22 | \examples{
23 | 
24 | # fitpar was fit using identical code
25 | # as found in the vignette, except with
26 | # 25 genes, and with fragment size in 80-350 bp
27 | data(preprocessedData)
28 | perf <- rep(1:2, each=2)
29 | plotFragLen(fitpar, col=perf)
30 | 
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/man/plotGC.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plots.R
 3 | \name{plotGC}
 4 | \alias{plotGC}
 5 | \title{Plot the fragment GC bias over samples}
 6 | \usage{
 7 | plotGC(fitpar, model, col, lty, ylim, gc.range = NULL, return.type = 0)
 8 | }
 9 | \arguments{
10 | \item{fitpar}{a list of the output of \link{fitBiasModels} over samples}
11 | 
12 | \item{model}{the name of one of the models}
13 | 
14 | \item{col}{a vector of colors}
15 | 
16 | \item{lty}{a vector of line types}
17 | 
18 | \item{ylim}{the y limits for the plot}
19 | 
20 | \item{gc.range}{a numeric of length two,
21 | the range of the fragment GC content. By default,
22 | [.2,.8] for plotting and [0,1] for returning a matrix}
23 | 
24 | \item{return.type}{a numeric, either
25 | 0: make a plot,
26 | 1: skip the plot and return a matrix of log fragment rate,
27 | 2: skip the plot and return a matrix of probabilities}
28 | }
29 | \value{
30 | Either plot, or if \code{return.type} is 1 or 2, a matrix
31 | }
32 | \description{
33 | Plots smooth curves of the log fragment rate over fragment GC content.
34 | }
35 | \examples{
36 | 
37 | # fitpar was fit using identical code
38 | # as found in the vignette, except with
39 | # 25 genes, and with fragment size in 80-350 bp
40 | data(preprocessedData)
41 | perf <- rep(1:2, each=2)
42 | plotGC(fitpar, "all", col=perf)
43 | 
44 | }
45 | 
46 | 


--------------------------------------------------------------------------------
/man/plotGRL.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plots.R
 3 | \name{plotGRL}
 4 | \alias{plotGRL}
 5 | \title{Simple segments plot for GRangesList}
 6 | \usage{
 7 | plotGRL(grl, ...)
 8 | }
 9 | \arguments{
10 | \item{grl}{GRangesList object}
11 | 
12 | \item{...}{passed to plot}
13 | }
14 | \value{
15 | plot
16 | }
17 | \description{
18 | Simple segments plot for GRangesList
19 | }
20 | \examples{
21 | 
22 | library(GenomicRanges)
23 | grl <- GRangesList(GRanges("1",IRanges(c(100,200,300),width=50)),
24 |                    GRanges("1",IRanges(c(100,300),width=c(75,50))),
25 |                    GRanges("1",IRanges(c(100,200,400),width=c(75,50,50))),
26 |                    GRanges("1",IRanges(c(200,300,400),width=50)))
27 | plotGRL(grl)
28 | 
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/man/plotOrder0.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plots.R
 3 | \name{plotOrder0}
 4 | \alias{plotOrder0}
 5 | \alias{plotOrder1}
 6 | \alias{plotOrder2}
 7 | \title{Plot parameters of the variable length Markov model (VLMM) for read starts}
 8 | \usage{
 9 | plotOrder0(order0, ...)
10 | 
11 | plotOrder1(order1, pos1)
12 | 
13 | plotOrder2(order2, pos2)
14 | }
15 | \arguments{
16 | \item{order0}{the "order0" element of the list named "vlmm.fivep" or "vlmm.threep"
17 | within the list that is the output of \link{fitBiasModels}}
18 | 
19 | \item{...}{parameters passed to \code{plot}}
20 | 
21 | \item{order1}{as for "order0" but "order1"}
22 | 
23 | \item{pos1}{the position of the 1st order VLMM to plot}
24 | 
25 | \item{order2}{as for "order0" but "order2"}
26 | 
27 | \item{pos2}{the position of the 2nd order VLMM to plot}
28 | }
29 | \value{
30 | plot
31 | }
32 | \description{
33 | This function plots portions of the Cufflinks VLMM for read start bias.
34 | The natural log of observed over expected is shown, such that 0
35 | indicates no contribution of a position to the read start bias.
36 | As the variable lenght Markov model has different dependencies for different
37 | positions (see Roberts et al, 2011), it is difficult
38 | to show all the 744 parameters simultaneously. Instead this function
39 | offers to show the 0-order terms for all positions, or the 1st and 2nd
40 | order terms for selected positions within the read start sequence.
41 | For the 1- and 2-order terms, the log bias is shown for each nucleotide
42 | (A,C,T,G) given the previous nucleotide (1-order) or di-nucleotide (2-order).
43 | }
44 | \section{Functions}{
45 | \itemize{
46 | \item \code{plotOrder1}: Plot first order parameters for a position
47 | 
48 | \item \code{plotOrder2}: Plot second order parameters for a position
49 | }}
50 | \examples{
51 | 
52 | # fitpar was fit using identical code
53 | # as found in the vignette, except with
54 | # 25 genes, and with fragment size in 80-350 bp
55 | data(preprocessedData)
56 | plotOrder0(fitpar[[1]][["vlmm.fivep"]][["order0"]])
57 | plotOrder1(fitpar[[1]][["vlmm.fivep"]][["order1"]], pos1=5:19)
58 | plotOrder2(fitpar[[1]][["vlmm.fivep"]][["order2"]], pos2=8:17)
59 | 
60 | }
61 | \references{
62 | Roberts et al, "Improving RNA-Seq expression estimates by correcting for fragment bias"
63 | Genome Biology (2011) doi:101186/gb-2011-12-3-r22
64 | }
65 | 
66 | 


--------------------------------------------------------------------------------
/man/plotRelPos.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plots.R
 3 | \name{plotRelPos}
 4 | \alias{plotRelPos}
 5 | \title{Plot relative position bias over samples}
 6 | \usage{
 7 | plotRelPos(fitpar, model, col, lty, ylim)
 8 | }
 9 | \arguments{
10 | \item{fitpar}{a list of the output of \link{fitBiasModels} over samples}
11 | 
12 | \item{model}{the name of one of the models}
13 | 
14 | \item{col}{a vector of colors}
15 | 
16 | \item{lty}{a vector of line types}
17 | 
18 | \item{ylim}{the y limits for the plot}
19 | }
20 | \value{
21 | plot
22 | }
23 | \description{
24 | Plots the smooth curves of log fragment rate over relative position.
25 | }
26 | \examples{
27 | 
28 | # fitpar was fit using identical code
29 | # as found in the vignette, except with
30 | # 25 genes, and with fragment size in 80-350 bp
31 | data(preprocessedData)
32 | perf <- rep(1:2, each=2)
33 | plotRelPos(fitpar, "all", col=perf)
34 | 
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/man/predictCoverage.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/predict.R
 3 | \name{predictCoverage}
 4 | \alias{predictCoverage}
 5 | \title{Predict coverage for a single-isoform gene}
 6 | \usage{
 7 | predictCoverage(gene, bam.files, fitpar, genome, model.names)
 8 | }
 9 | \arguments{
10 | \item{gene}{a GRangesList with the exons of different genes}
11 | 
12 | \item{bam.files}{a character string pointing to indexed BAM files}
13 | 
14 | \item{fitpar}{the output of running \link{fitBiasModels}}
15 | 
16 | \item{genome}{a BSgenome object}
17 | 
18 | \item{model.names}{a character vector listing the models,
19 | see same argument in \link{estimateAbundance}}
20 | }
21 | \value{
22 | a list with elements frag.cov, the observed fragment coverage
23 | from the \code{bam.files} and pred.cov, a list with the predicted
24 | fragment coverage for each of the \code{models}.
25 | }
26 | \description{
27 | Predict coverage for a single-isoform gene given
28 | fitted bias parameters in a set of models,
29 | and compare to the observed fragment coverage.
30 | }
31 | \details{
32 | Note that if the range between \code{minsize} and \code{maxsize}
33 | does not cover most of the fragment length distribution, the
34 | predicted coverage will underestimate the observed coverage.
35 | }
36 | \examples{
37 | 
38 | # these next lines just write out a BAM file from R
39 | # typically you would already have a BAM file
40 | library(alpineData)
41 | library(GenomicAlignments)
42 | library(rtracklayer)
43 | gap <- ERR188088()
44 | dir <- system.file(package="alpineData", "extdata")
45 | bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
46 | export(gap, con=bam.file)
47 | 
48 | data(preprocessedData)
49 | library(BSgenome.Hsapiens.NCBI.GRCh38)
50 | 
51 | model.names <- c("fraglen","fraglen.vlmm","GC","all")
52 | 
53 | pred.cov <- predictCoverage(gene=ebt.fit[["ENST00000379660"]],
54 |                             bam.files=bam.file,
55 |                             fitpar=fitpar.small,
56 |                             genome=Hsapiens,
57 |                             model.names=model.names)
58 | 
59 | # plot the coverage:
60 | # note that, because [125,175] bp range specified in fitpar.small
61 | # does not cover the fragment width distribution, the predicted curves
62 | # will underestimate the observed. we correct here post-hoc
63 | 
64 | frag.cov <- pred.cov[["ERR188088"]][["frag.cov"]]
65 | plot(frag.cov, type="l", lwd=3, ylim=c(0,max(frag.cov)*1.5))
66 | for (i in seq_along(model.names)) {
67 |   m <- model.names[i]
68 |   pred <- pred.cov[["ERR188088"]][["pred.cov"]][[m]]
69 |   lines(pred/mean(pred)*mean(frag.cov), col=i+1, lwd=3)
70 | }
71 | legend("topright", legend=c("observed",model.names),
72 |        col=seq_len(length(model.names)+1), lwd=3)
73 | 
74 | }
75 | 
76 | 


--------------------------------------------------------------------------------
/man/preprocessedData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{preprocessedData}
 5 | \alias{ebt.fit}
 6 | \alias{ebt.theta}
 7 | \alias{fitpar}
 8 | \alias{fitpar.small}
 9 | \alias{genes.theta}
10 | \alias{preprocessedData}
11 | \alias{res}
12 | \alias{txdf.theta}
13 | \title{Preprocessed data for vignettes and examples}
14 | \format{\code{ebt.fit} and \code{ebt.theta} are GRangesList.
15 | \code{fitpar}, \code{fitpar.small}, \code{res} are lists created
16 | by alpine functions. \code{genes.theta} is a character vector.
17 | \code{txdf.theta} is a DataFrame.}
18 | \source{
19 | See vignette for details of object construction.
20 | The alignments come from alpineData (4 samples from GEUVADIS project),
21 | the Ensembl gene annotations come from \code{Homo_sapiens.GRCh38.84.gtf},
22 | and the genome is \code{BSgenome.Hsapiens.NCBI.GRCh38}.
23 | }
24 | \description{
25 | The following data objects are prepared for use
26 | in the alpine vignette and examples pages,
27 | as the preparation of these objects requires
28 | either long running time or a large amount of disk
29 | space.
30 | }
31 | \details{
32 | \itemize{
33 |   \item \strong{ebt.fit} - the GRangesList prepared in the vignette
34 | for fitting the bias models
35 |   \item \strong{fitpar} - the fitted parameters, similar to those
36 | made in the vignette, but using \code{minsize=80} and \code{maxsize=350}
37 |   \item \strong{fitpar.small} - the fitted parameters from the
38 | vignette, returned by fitBiasModels
39 |   \item \strong{res} - the results object from the vignette,
40 | returned by estimateAbundance
41 |   \item \strong{ebt.theta} - the GRangesList prepared in the vignette
42 | for running estimateAbundance
43 |   \item \strong{genes.theta} - the names of genes used in the vignette
44 | for running estimateAbundance
45 |   \item \strong{txdf.theta} - the DataFrame of gene and transcript
46 | information used in the vignette for running estimateAbundance
47 | }
48 | }
49 | 
50 | 


--------------------------------------------------------------------------------
/man/splitGenesAcrossChroms.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper.R
 3 | \name{splitGenesAcrossChroms}
 4 | \alias{splitGenesAcrossChroms}
 5 | \title{Split genes that have isoforms across chromosomes}
 6 | \usage{
 7 | splitGenesAcrossChroms(ebg, txdf)
 8 | }
 9 | \arguments{
10 | \item{ebg}{an exons-by-genes GRangesList, created with \code{exonsBy}}
11 | 
12 | \item{txdf}{a data.frame created by running \code{select} on a TxDb object.
13 | Must have columns TXCHROM and GENEID}
14 | }
15 | \value{
16 | a list of manipulated \code{ebg} and \code{txdf}
17 | }
18 | \description{
19 | This function simply splits apart genes which have isoforms across multiple
20 | chromosomes. New "genes" are created with the suffix "_cs" and a number.
21 | }
22 | \examples{
23 | 
24 | library(GenomicRanges)
25 | txdf <- data.frame(TXCHROM=c("1","1","2"),
26 |                    GENEID=c("101","102","102")) 
27 | ebg <- GRangesList(GRanges("1",IRanges(c(100,200),width=50)),
28 |                    GRanges(c("1","2"),IRanges(c(400,100),width=50)))
29 | names(ebg) <- c("101","102")
30 | splitGenesAcrossChroms(ebg, txdf)
31 | 
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/man/splitLongGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper.R
 3 | \name{splitLongGenes}
 4 | \alias{splitLongGenes}
 5 | \title{Split very long genes}
 6 | \usage{
 7 | splitLongGenes(ebg, ebt, txdf, long = 1e+06)
 8 | }
 9 | \arguments{
10 | \item{ebg}{an exons-by-genes GRangesList, created with \code{exonsBy}}
11 | 
12 | \item{ebt}{an exons-by-tx GRangesList, created with \code{exonsBy}}
13 | 
14 | \item{txdf}{a data.frame created by running \code{select} on a TxDb object.
15 | Must have columns GENEID and TXID, where TXID corresponds to the
16 | names of \code{ebt}.}
17 | 
18 | \item{long}{a numeric value such that ranges longer than this are "long"}
19 | }
20 | \value{
21 | a list of manipulated \code{ebg} and \code{txdf}
22 | }
23 | \description{
24 | This function splits genes which have a very long range (e.g. 1 Mb),
25 | and new "genes" are formed where each isoform is its own "gene",
26 | with the suffix "_ls" and a number.
27 | It makes sense to turn each isoform into its own gene only if this
28 | function is followed by \link{mergeGenes}.
29 | }
30 | \examples{
31 | 
32 | library(GenomicRanges)
33 | txdf <- data.frame(GENEID=c("101","101","102"),
34 |                    TXID=c("201","202","203"))
35 | ebt <- GRangesList(GRanges("1",IRanges(c(100,200),width=50)),
36 |                    GRanges("1",IRanges(2e6 + c(100,200),width=50)),
37 |                    GRanges("1",IRanges(3e6 + c(100,200),width=50)))
38 | names(ebt) <- c("201","202","203")
39 | ebg <- GRangesList(reduce(unlist(ebt[1:2])),ebt[[3]])
40 | names(ebg) <- c("101","102")
41 | splitLongGenes(ebg, ebt, txdf)
42 | 
43 | }
44 | 
45 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(alpine)
3 | test_check("alpine")
4 | 


--------------------------------------------------------------------------------
/tests/testthat/test_alpine.R:
--------------------------------------------------------------------------------
 1 | context("alpine")
 2 | test_that("alpine works", {
 3 |   library(alpineData)
 4 |   library(GenomicAlignments)
 5 |   library(rtracklayer)
 6 |   gap <- ERR188088()
 7 |   dir <- tempdir()
 8 |   bam.file <- c("ERR188088" = file.path(dir,"ERR188088.bam"))
 9 |   export(gap, con=bam.file)
10 |   library(GenomicRanges)
11 |   library(BSgenome.Hsapiens.NCBI.GRCh38)
12 |   data(preprocessedData)
13 |   readlength <- 75
14 |   minsize <- 125
15 |   maxsize <- 175
16 |   gene.names <- names(ebt.fit)[6:8]
17 |   names(gene.names) <- gene.names
18 |   fragtypes <- lapply(gene.names, function(gene.name) {
19 |     buildFragtypes(ebt.fit[[gene.name]],
20 |                    Hsapiens, readlength,
21 |                    minsize, maxsize)
22 |   })
23 | 
24 |   
25 |   # model missing '+ gene' gives error
26 |   models <- list(
27 |     "GC"=list(formula="count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk)",
28 |               offset=c("fraglen","vlmm"))
29 |   )
30 |   expect_error(
31 |     fitBiasModels(
32 |       genes=ebt.fit[gene.names],bam.file=bam.file,fragtypes=fragtypes,genome=Hsapiens,
33 |       models=models,readlength=readlength,minsize=minsize,maxsize=maxsize
34 |     )
35 |   )
36 | 
37 |   
38 |   # works to fit only fraglen and vlmm
39 |   models <- list("readstart"=list(formula=NULL,offset=c("fraglen","vlmm")))
40 |   fitpar <- fitBiasModels(
41 |     genes=ebt.fit[gene.names],bam.file=bam.file,fragtypes=fragtypes,genome=Hsapiens,
42 |     models=models,readlength=readlength,minsize=minsize,maxsize=maxsize
43 |   )
44 | 
45 |   
46 |   # works to fit different knots
47 |   models <- list(
48 |     "GC"=list(formula="count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + gene",
49 |               offset=c("fraglen","vlmm"))
50 |   )
51 |   fitpar <- fitBiasModels(
52 |     genes=ebt.fit[gene.names],bam.file=bam.file,fragtypes=fragtypes,genome=Hsapiens,
53 |     models=models,readlength=readlength,minsize=minsize,maxsize=maxsize,
54 |     gc.knots=seq(from=.3,to=.6,length=5), gc.bk=c(0,1)
55 |   )
56 |   plotGC(fitpar, model="GC")
57 | 
58 |   
59 |   # works to estimate abundances
60 |   models <- list(
61 |     "GC"=list(formula="count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + gene",
62 |               offset=c("fraglen","vlmm"))
63 |   )
64 |   fitpar <- list(
65 |     ERR188088 = fitBiasModels(
66 |       genes=ebt.fit[gene.names],bam.file=bam.file,fragtypes=fragtypes,genome=Hsapiens,
67 |       models=models,readlength=readlength,minsize=minsize,maxsize=maxsize
68 |     )
69 |   )
70 |   # specify models using a character vector 
71 |   model.names <- c("fraglen","fraglen.vlmm","GC")
72 |   txs <- txdf.theta$tx_id[txdf.theta$gene_id == "ENSG00000198918"]
73 |   res <- estimateAbundance(transcripts=ebt.theta[txs],
74 |                            bam.files=bam.file,
75 |                            fitpar=fitpar,
76 |                            genome=Hsapiens,
77 |                            model.names=model.names)
78 | 
79 |   
80 |   # works to predict coverage
81 |   pred.cov <- predictCoverage(gene=ebt.fit[["ENST00000379660"]],
82 |                               bam.files=bam.file,
83 |                               fitpar=fitpar.small,
84 |                               genome=Hsapiens,
85 |                               model.names=model.names)
86 |   # plot
87 |   frag.cov <- pred.cov[["ERR188088"]][["frag.cov"]]
88 |   plot(frag.cov, type="l", lwd=3, ylim=c(0,max(frag.cov)*1.5))
89 |   for (i in seq_along(model.names)) {
90 |     m <- model.names[i]
91 |     pred <- pred.cov[["ERR188088"]][["pred.cov"]][[m]]
92 |     lines(pred/mean(pred)*mean(frag.cov), col=i+1, lwd=3)
93 |   }
94 |   legend("topright", legend=c("observed",model.names),
95 |          col=seq_len(length(model.names)+1), lwd=3)
96 | })
97 | 


--------------------------------------------------------------------------------
/vignettes/alpine.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteEngine{knitr::knitr}
  3 | %\VignetteIndexEntry{alpine}
  4 | -->
  5 | 
  6 | # Modeling and correcting fragment sequence bias
  7 | 
  8 | Here we show a brief example of using the *alpine* package to model
  9 | bias parameters and then using those parameters to estimate transcript
 10 | abundance. We load a metadata table and a subset of reads from four
 11 | samples from the GEUVADIS project. For more details on these files,
 12 | see `?alpineData` in the *alpineData* package.
 13 | 
 14 | ```{r, echo=FALSE} 
 15 | library(knitr)
 16 | opts_chunk$set(cache=FALSE,
 17 |                error=FALSE)
 18 | ```
 19 | 
 20 | ```{r message=FALSE}
 21 | library(alpineData)
 22 | dir <- system.file("extdata",package="alpineData")
 23 | metadata <- read.csv(file.path(dir,"metadata.csv"),
 24 |                      stringsAsFactors=FALSE)
 25 | metadata[,c("Title","Performer","Date","Population")]
 26 | ```
 27 | 
 28 | A subset of the reads from one of the samples:
 29 | 
 30 | ```{r message=FALSE}
 31 | library(GenomicAlignments)
 32 | ERR188297()
 33 | ```
 34 | 
 35 | Before we start, we need to write these paired-end reads, here stored
 36 | in a R/Bioconductor data object, out to a BAM file, because the *alpine*
 37 | software works with alignments stored as BAM files. *This is
 38 | not a typical step*, as you would normally have BAM files already on
 39 | disk.  We write out four BAM files for each of the four samples
 40 | contained in *alpineData*. So you can ignore the following code chunk
 41 | if you are working with your own BAM files.
 42 | 
 43 | ```{r message=FALSE}
 44 | library(rtracklayer)
 45 | dir <- tempdir()
 46 | for (sample.name in metadata$Title) {
 47 |   # the reads are accessed with functions named
 48 |   # after the sample name. the following line calls
 49 |   # the function with the sample name and saves 
 50 |   # the reads to `gap`
 51 |   gap <- match.fun(sample.name)()
 52 |   file.name <- file.path(dir,paste0(sample.name,".bam"))
 53 |   export(gap, con=file.name)
 54 | }
 55 | bam.files <- file.path(dir, paste0(metadata$Title, ".bam"))
 56 | names(bam.files) <- metadata$Title
 57 | stopifnot(all(file.exists(bam.files)))
 58 | ```
 59 | 
 60 | Now we continue with the typical steps in an *alpine* workflow.
 61 | To fit the bias model, we need to identify single-isoform genes.
 62 | We used the following chunk of code (here not evaluated) to generate a
 63 | *GRangesList* of exons per single-isoform gene. 
 64 | 
 65 | ```{r, eval=FALSE}
 66 | library(ensembldb)
 67 | gtf.file <- "Homo_sapiens.GRCh38.84.gtf"
 68 | txdb <- EnsDb(gtf.file) # already an EnsDb
 69 | txdf <- transcripts(txdb, return.type="DataFrame")
 70 | tab <- table(txdf$gene_id)
 71 | one.iso.genes <- names(tab)[tab == 1]
 72 | # pre-selected genes based on medium to high counts
 73 | # calculated using Rsubread::featureCounts
 74 | selected.genes <- scan("selected.genes.txt", what="char")
 75 | one.iso.txs <- txdf$tx_id[txdf$gene_id %in%
 76 |                           intersect(one.iso.genes, selected.genes)]
 77 | ebt0 <- exonsBy(txdb, by="tx")
 78 | ebt.fit <- ebt0[one.iso.txs]
 79 | ```
 80 | 
 81 | Here we pick a subset of single-isoform genes based on the
 82 | number of exons, and the length. We show in comments the recommended
 83 | parameters to use in selecting this subset of genes,
 84 | although here we use different parameters to ensure the building of
 85 | the vignette takes only a short period of time and does not use much memory.
 86 | 
 87 | ```{r message=FALSE}
 88 | library(GenomicRanges)
 89 | ```
 90 | 
 91 | ```{r}
 92 | library(alpine)
 93 | data(preprocessedData)
 94 | # filter small genes and long genes
 95 | min.bp <- 600
 96 | max.bp <- 7000 
 97 | gene.lengths <- sum(width(ebt.fit))
 98 | summary(gene.lengths)
 99 | ebt.fit <- ebt.fit[gene.lengths > min.bp & gene.lengths < max.bp]
100 | length(ebt.fit)
101 | set.seed(1)
102 | # better to use ~100 genes
103 | ebt.fit <- ebt.fit[sample(length(ebt.fit),10)] 
104 | ```
105 | 
106 | ## Defining a set of fragment types
107 | 
108 | Robust fitting of these bias parameters is best with ~100 medium to
109 | high count genes, e.g. mean count across samples between 200 and
110 | 10,000. These counts can be identified by *featureCounts* from the
111 | *Rsubread* Bioconductor package, for example.
112 | It is required to specify a minimum and maximum fragment size
113 | which should be lower and upper quantiles of the fragment length
114 | distribution. The `minsize` and `maxsize`
115 | arguments are recommended to be roughly the 2.5% and 97.5% of the
116 | fragment length distribution. This can be quickly estimated using the
117 | helper function *getFragmentWidths*, iterating over a few
118 | single-isoform genes with sufficient counts:
119 | 
120 | ```{r}
121 | w <- getFragmentWidths(bam.files[1], ebt.fit[[1]])
122 | c(summary(w), Number=length(w))
123 | quantile(w, c(.025, .975))
124 | ```
125 | 
126 | It is also required to specify the read length. Currently *alpine*
127 | only supports unstranded, paired-end RNA-seq with fixed read
128 | length. Differences of +/- 1 bp in read length across samples can be
129 | ignored.
130 | 
131 | ```{r}
132 | getReadLength(bam.files)
133 | ```
134 | 
135 | Here we use a very limited range of fragment lengths for speed, but
136 | for a real analysis we would suggest using the minimum and maximum
137 | of the quantiles computed above across all samples (the minimum of the
138 | lower quantiles and the maximum of the upper quantiles).
139 | 
140 | ```{r message=FALSE}
141 | library(alpine)
142 | library(BSgenome.Hsapiens.NCBI.GRCh38)
143 | readlength <- 75 
144 | minsize <- 125 # better 80 for this data
145 | maxsize <- 175 # better 350 for this data
146 | gene.names <- names(ebt.fit)
147 | names(gene.names) <- gene.names
148 | ```
149 | 
150 | The following function builds a list of *DataFrames* which store
151 | information about the fragment types from each gene in our
152 | training set.
153 | 
154 | ```{r buildFragtype}
155 | system.time({
156 | fragtypes <- lapply(gene.names, function(gene.name) {
157 |                       buildFragtypes(exons=ebt.fit[[gene.name]],
158 |                                      genome=Hsapiens,
159 |                                      readlength=readlength,
160 |                                      minsize=minsize,
161 |                                      maxsize=maxsize,
162 |                                      gc.str=FALSE)
163 |                     })
164 | })
165 | print(object.size(fragtypes), units="auto")
166 | ```
167 | 
168 | We can examine the information for a single gene:
169 | 
170 | ```{r}
171 | head(fragtypes[[1]], 3)
172 | ```
173 | 
174 | ## Defining and fitting bias models
175 | 
176 | The definition of bias models is extremely flexible in *alpine*.  The
177 | `models` argument should be given as a list, where each element is
178 | model.  The model itself should be provided as a list with elements
179 | `formula` and `offset`. Either `formula` or `offset` can be set to
180 | `NULL` for a given model. 
181 | The allowable offsets are `fraglen` and/or `vlmm` which should be
182 | provided in a character vector.
183 | Offsets are only estimated once for all models, so setting
184 | `formula=NULL` only makes sense if extra offsets are desired
185 | which were not already calculated by other models.
186 | 
187 | Any kind of R formula can be provided to `formula`, making use of the
188 | fragment features:
189 | 
190 | * `gc` (fragment GC content from 0 to 1)
191 | * `relpos` (fragment midpoint relative position from 0 to 1)
192 | * `GC40.80`, `GC40.90`, `GC20.80`, `GC20.90` (indicator variables
193 |   indicating the presence of, e.g. a 40 bp stretch of 80% or higher GC
194 |   content within the fragment)
195 | 
196 | These fragment features reference columns of information stored in
197 | `fragtypes`.  Interactions between these terms and offsets are also
198 | possible, e.g. `gc:fraglen`.
199 | 
200 | **Note:** It is required to provide formula as
201 | character strings, which are converted internally into formula, due to
202 | details in how R formula make copies of objects from the environment.
203 | 
204 | ```{r}
205 | models <- list(
206 |   "GC" = list(
207 |     formula = "count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) + gene",
208 |     offset=c("fraglen")
209 |   ),
210 |   "all" = list(
211 |     formula = "count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) + gene",
212 |     offset=c("fraglen","vlmm")
213 |   )
214 | )
215 | ```
216 | 
217 | Here we fit one bias model, `GC`, using fragment length, fragment GC
218 | content, relative position, and a term for differences in expression
219 | across the genes (`+ gene`).
220 | 
221 | We fit another bias model, `all`, with all the terms of the first but
222 | additionally with read start bias (encoded by a Variable Length Markov
223 | Model, or VLMM).
224 | 
225 | **Note:** It is required if a formula is provided that it end with `+
226 | gene` to account for differences in base expression levels while
227 | fitting the bias parameters.
228 | 
229 | The knots and boundary knots for GC content (`gc`) and relative
230 | position (`relpos`) splines have reasonable default values, but they
231 | can be customized using arguments to the *fitBiasModels* function.
232 | 
233 | The returned object, `fitpar`, stores the information as a list of
234 | fitted parameters across samples.
235 | 
236 | ```{r fitBiasModels}
237 | system.time({
238 | fitpar <- lapply(bam.files, function(bf) {
239 |                    fitBiasModels(genes=ebt.fit,
240 |                                  bam.file=bf,
241 |                                  fragtypes=fragtypes,
242 |                                  genome=Hsapiens,
243 |                                  models=models,
244 |                                  readlength=readlength,
245 |                                  minsize=minsize,
246 |                                  maxsize=maxsize)
247 |                  })
248 | })
249 | # this object saved was 'fitpar.small' for examples in alpine man pages
250 | # fitpar.small <- fitpar 
251 | ```
252 | 
253 | ## Visually exploring the bias parameters
254 | 
255 | Note that with more basepairs between `minsize` and `maxsize` and with
256 | more genes used for estimation, the bias parameters would be more
257 | precise. As estimated here, the curves look a bit wobbly. Compare to
258 | the curves that are fit in the *alpine* paper (see `citation("alpine")`).
259 | The estimated spline coefficients have high variance from too few
260 | observations (paired-end fragments) across too few genes.
261 | 
262 | First we set a palette to distinguish between samples
263 | 
264 | ```{r}
265 | library(RColorBrewer)
266 | palette(brewer.pal(8,"Dark2"))
267 | ```
268 | 
269 | The fragment length distribution:
270 | 
271 | ```{r fraglen}
272 | perf <- as.integer(factor(metadata$Performer))
273 | plotFragLen(fitpar, col=perf)
274 | ```
275 | 
276 | The fragment GC bias curves:
277 | 
278 | ```{r gccurve}
279 | plotGC(fitpar, model="all", col=perf)
280 | ```
281 | 
282 | 
283 | The relative position curves:
284 | 
285 | ```{r relpos}
286 | plotRelPos(fitpar, model="all", col=perf)
287 | ```
288 | 
289 | A 0-order version of the VLMM (note that the VLMM that is used in the
290 | model includes positions that are 1- and 2-order, so this plot does
291 | not represent the final VLMM used in bias estimation or in estimation
292 | of abundances).
293 | 
294 | ```{r vlmm}
295 | plotOrder0(fitpar[["ERR188297"]][["vlmm.fivep"]][["order0"]])
296 | plotOrder0(fitpar[["ERR188297"]][["vlmm.threep"]][["order0"]])
297 | ```
298 | 
299 | A coefficient table for the terms in `formula`:
300 | 
301 | ```{r}
302 | print(head(fitpar[["ERR188297"]][["summary"]][["all"]]), row.names=FALSE)
303 | ```
304 | 
305 | ## Estimating transcript abundances
306 | 
307 | We pick a subset of genes for estimating transcript abundances.  If
308 | the gene annotation includes genes with transcripts which span
309 | multiple chromosomes or which do not have any overlap and are very far
310 | apart, *splitGenesAcrossChroms* and *splitLongGenes*, respectively,
311 | can be used to split these.  For again merging any overlapping
312 | transcripts into "genes", the *mergeGenes* function can be used.  Here
313 | we use the ENSEMBL gene annotation as is.
314 | 
315 | The following code chunk is not evaluated but was used to select 
316 | a few genes for demonstrating *estimateAbundance*:
317 | 
318 | ```{r, eval=FALSE}
319 | one.iso.genes <- intersect(names(tab)[tab == 1], selected.genes)
320 | two.iso.genes <- intersect(names(tab)[tab == 2], selected.genes)
321 | three.iso.genes <- intersect(names(tab)[tab == 3], selected.genes)
322 | set.seed(1)
323 | genes.theta <- c(sample(one.iso.genes, 2),
324 |                  sample(two.iso.genes, 2),
325 |                  sample(three.iso.genes, 2)) 
326 | txdf.theta <- txdf[txdf$gene_id %in% genes.theta,]
327 | ebt.theta <- ebt0[txdf.theta$tx_id]
328 | ```
329 | 
330 | Next we specify the set of models we want to use, referring back by
331 | name to the models that were fit in the previous step. Additionally, 
332 | we can include any of the following models: `null`, `fraglen`, `vlmm`, 
333 | or `fraglen.vlmm` which are the four models that can be fit using only
334 | offsets (none, either or both of the offsets).
335 | 
336 | ```{r}
337 | model.names <- c("null","fraglen.vlmm","GC")
338 | ```
339 | 
340 | Here we estimate FPKM-scale abundances for multiple genes and multiple
341 | samples. If `lib.sizes` is not specified, a default value of 1e6
342 | is used. *estimateAbundance* works one gene at a time, where the
343 | `transcripts` argument expects a *GRangesList* of the exons for each
344 | transcript (multiple if the gene has multiple isoforms).
345 | 
346 | ```{r estimateAbundance}
347 | system.time({
348 | res <- lapply(genes.theta, function(gene.name) {
349 |          txs <- txdf.theta$tx_id[txdf.theta$gene_id == gene.name]
350 |          estimateAbundance(transcripts=ebt.theta[txs],
351 |                            bam.files=bam.files,
352 |                            fitpar=fitpar,
353 |                            genome=Hsapiens,
354 |                            model.names=model.names)
355 |        })
356 | })
357 | ```
358 | 
359 | Each element of this list has the abundances (`theta`) and average
360 | bias (`lambda`) for a single gene across all samples, all models, and all
361 | isoforms of the gene: 
362 | 
363 | ```{r}
364 | res[[1]][["ERR188297"]][["GC"]]
365 | res[[6]][["ERR188297"]][["GC"]]
366 | ```
367 | 
368 | The *extractAlpine* function can be used to collate estimates from
369 | across all genes.  *extractAlpine* will scale the estimates such that
370 | the total bias observed over all transcripts is centered at 1.  The
371 | estimates produce by *estimateAbundance* presume a default library size of
372 | 1e6, but will be rescaled using the total number of fragments across
373 | genes when using *extractAlpine* (if this library size rescaling is
374 | not desired, choose `divide.out=FALSE`).
375 | 
376 | ```{r}
377 | mat <- extractAlpine(res, model="GC")
378 | mat
379 | ```
380 | 
381 | If we provide a *GRangesList* which contains the exons for each
382 | transcript, the returned object will be a *SummarizedExperiment*.
383 | The *GRangesList* provided to `transcripts` does not have to be in the
384 | correct order, the transcripts will be extracted by name to match the
385 | rows of the FPKM matrix.
386 | 
387 | ```{r}
388 | se <- extractAlpine(res, model="GC", transcripts=ebt.theta)
389 | se
390 | ```
391 | 
392 | The matrix of FPKM values can be scaled using the median ratio method
393 | of DESeq with the *normalizeDESeq* function. This is a robust method
394 | which removes systematic differences in values across samples, and is
395 | more appropriate than using the total count which is sensitive to
396 | very large abundance estimates for a minority of transcripts. 
397 | 
398 | ```{r, eval=FALSE}
399 | norm.mat <- normalizeDESeq(mat, cutoff=0.1)
400 | ```
401 | 
402 | ## Simulating RNA-seq data with empirical GC bias
403 | 
404 | The fragment GC bias which *alpine* estimates can be used in
405 | downstream simulations, for example in the 
406 | [polyester](http://bioconductor.org/packages/polyester) Bioconductor
407 | package. All we need to do is to run the *plotGC* function, but
408 | specifying that instead of a plot, we want to return a matrix of
409 | probabilities for each percentile of fragment GC content. This matrix
410 | can be provided to the `frag_GC_bias` argument of *simulate_experiment*.
411 | 
412 | We load a `fitpar` object that was run with the fragment length range
413 | [80,350] bp. 
414 | 
415 | ```{r}
416 | data(preprocessedData)
417 | prob.mat <- plotGC(fitpar, "all", return.type=2)
418 | head(prob.mat)
419 | ```
420 | 
421 | If `return.type=0` (the default) the function makes the plot of log
422 | fragment rate over fragment GC content. If `return.type=1` the
423 | function returns the matrix of log fragment rate over percentiles of
424 | fragment GC content, and if `return.type=2`, the matrix returns
425 | probabilities of observing fragments based on percentiles of fragment
426 | GC content (the log fragment rate exponentiated and scaled to have a
427 | maximum of 1). The matrix returned by `return.type=2` is appropriate
428 | for downstream use with *polyester*.
429 | 
430 | ## Plotting predicted fragment coverage
431 | 
432 | In the *alpine* paper, it was shown that models incorporating fragment
433 | GC bias can be a better predictor of test set RNA-seq fragment
434 | coverage, compared to models incorporating read start bias. Here we
435 | show how to predict fragment coverage for a single-isoform gene using
436 | a variety of fitted bias models. As with *estimateAbundace*, the
437 | model names need to refer back to models fit using *fitBiasModels*.
438 | 
439 | ```{r}
440 | model.names <- c("fraglen","fraglen.vlmm","GC","all")
441 | ```
442 | 
443 | The following function computes the predicted coverage for one
444 | single-isoform gene. We load a `fitpar` object that was run
445 | with the fragment length range [80,350] bp.
446 | 
447 | ```{r}
448 | fitpar[[1]][["model.params"]][c("minsize","maxsize")]
449 | ```
450 | 
451 | ```{r predictCoverage}
452 | system.time({
453 |   pred.cov <- predictCoverage(gene=ebt.fit[["ENST00000245479"]],
454 |                               bam.files=bam.files["ERR188204"],
455 |                               fitpar=fitpar,
456 |                               genome=Hsapiens,
457 |                               model.names=model.names)
458 | })
459 | ```
460 | 
461 | We can plot the observed and predicted coverage for one of the
462 | genes: 
463 | 
464 | ```{r}
465 | palette(brewer.pal(9, "Set1"))
466 | frag.cov <- pred.cov[["ERR188204"]][["frag.cov"]]
467 | plot(frag.cov, type="l", lwd=3, ylim=c(0,max(frag.cov)*1.5))
468 | for (i in seq_along(model.names)) {
469 |   m <- model.names[i]
470 |   pred <- pred.cov[["ERR188204"]][["pred.cov"]][[m]]
471 |   lines(pred, col=i, lwd=3)
472 | }
473 | legend("topright", legend=c("observed",model.names),
474 |        col=c("black",seq_along(model.names)), lwd=3)
475 | ```
476 | 
477 | ## Session information
478 | 
479 | ```{r}
480 | sessionInfo()
481 | ```
482 | 
483 | 


--------------------------------------------------------------------------------