├── DESCRIPTION ├── MD5 ├── NAMESPACE ├── NEWS ├── R ├── bayes.R ├── extract.R ├── fit.R ├── gcstats.R ├── graphics.R ├── misc.R ├── model.R ├── mutations.R ├── read.R ├── results.R ├── segments.R ├── solutions.R └── windows.R ├── build └── vignette.rds ├── data ├── CP.example.RData └── example.seqz.rda ├── inst ├── CITATION ├── doc │ ├── sequenza.R │ ├── sequenza.Rmd │ └── sequenza.html └── extdata │ ├── example.seqz.txt.gz │ └── example.seqz.txt.gz.tbi ├── man ├── CP.data.Rd ├── baf.model.fit.Rd ├── bayes.Rd ├── breaks.Rd ├── chromosome.view.Rd ├── cp.plot.Rd ├── example.seqz.Rd ├── gc.Rd ├── model_points.Rd ├── mutations.Rd ├── plotWindows.Rd ├── read_seqz.Rd ├── theoretical.Rd ├── type_matrix.Rd ├── windowValues.Rd └── workflow.Rd ├── tests ├── testthat.R └── testthat │ ├── test.model.R │ └── test.read.R └── vignettes └── sequenza.Rmd /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sequenza 2 | Title: Copy Number Estimation from Tumor Genome Sequencing Data 3 | Description: Tools to analyze genomic sequencing data from 4 | paired normal-tumor samples, including cellularity and ploidy estimation; mutation 5 | and copy number (allele-specific and total copy number) detection, quantification 6 | and visualization. 7 | Version: 3.0.0 8 | Date: 2019-05-09 9 | Authors@R: c( 10 | person(given = "Francesco", family = "Favero", 11 | email = "favero.francesco@gmail.com", 12 | role = c("aut", "cre"), 13 | comment = c(ORCID = "0000-0003-3684-2659")), 14 | person(given = c("Andrea", "Marion"), family = "Marquard", 15 | role = c("rev"), 16 | comment = c(ORCID = "0000-0003-2928-6017")), 17 | person(given = "Tejal", family = "Joshi", 18 | role = c("rev"), 19 | comment = c(ORCID = "0000-0002-0939-2982")), 20 | person(given = c("Aron", "Charles"), family = "Eklund", 21 | role = c("aut", "ths"), 22 | comment = c(ORCID = "0000-0003-0861-1001")) 23 | ) 24 | Depends: R (>= 3.2.0) 25 | Imports: pbapply, squash, iotools, readr, seqminer, copynumber 26 | Suggests: testthat, knitr, rmarkdown, rmdformats 27 | License: GPL-3 28 | URL: https://sequenzatools.bitbucket.io, Mailing list: 29 | https://groups.google.com/forum/#!forum/sequenza-user-group 30 | BugReports: https://bitbucket.org/sequenzatools/sequenza/issues 31 | SystemRequirements: pandoc (>= 1.12.3) 32 | VignetteBuilder: knitr, rmarkdown 33 | Encoding: UTF-8 34 | NeedsCompilation: no 35 | Packaged: 2019-05-09 12:58:45 UTC; lgq442 36 | Author: Francesco Favero [aut, cre] (), 37 | Andrea Marion Marquard [rev] (), 38 | Tejal Joshi [rev] (), 39 | Aron Charles Eklund [aut, ths] 40 | () 41 | Maintainer: Francesco Favero 42 | Repository: CRAN 43 | Date/Publication: 2019-05-09 13:50:04 UTC 44 | -------------------------------------------------------------------------------- /MD5: -------------------------------------------------------------------------------- 1 | 60b9b3990ed91c0ce59226344dca5979 *DESCRIPTION 2 | 837dc998a0bfc9cb81b08d5583f4d980 *NAMESPACE 3 | dc74f3941cd7eb5d5d2820a78e855291 *NEWS 4 | 0090c3e305006e38f8ec8cef5616a3e2 *R/bayes.R 5 | 1c95cfc1a6597466e42c386b0c6a6196 *R/extract.R 6 | c37085ee8b851dfd42eea66341904e82 *R/fit.R 7 | 06b55a573555d6e0315478fb47fd48c0 *R/gcstats.R 8 | 3ad97d0891d98e23cd5be6b4417067c5 *R/graphics.R 9 | 79624c9cf668bfb2b05e8a339e98f393 *R/misc.R 10 | 4b7f3320bad5b252cc0727ba9e019ab8 *R/model.R 11 | 8ab48098af7628c6d2f155b8f9f24b74 *R/mutations.R 12 | 1415cb2b3e892624fb7ea4ff76a2c46f *R/read.R 13 | bfe91a98d0c94dab8ae8bcb554eb5acd *R/results.R 14 | f763a946f45e700642201c486263b969 *R/segments.R 15 | 0359f75dbcc67975a0a8654cc04e32a3 *R/solutions.R 16 | 379d0ddc4bffc4fcd9449f18c0a2654a *R/windows.R 17 | 60d166bb186a38ec469d4e76452c2052 *build/vignette.rds 18 | 33cb86ef72ebdd33f91839241f0bd890 *data/CP.example.RData 19 | 47e0915035bb00de6af2b672114b34e0 *data/example.seqz.rda 20 | 49a67dbefc97a501bc47492ca1108e0a *inst/CITATION 21 | a93878af46779d2634e55b13814cb9e2 *inst/doc/sequenza.R 22 | 25228690501dd6c00834e253f2002e07 *inst/doc/sequenza.Rmd 23 | fd4b930147525ec496116c2c84ca6b31 *inst/doc/sequenza.html 24 | 6cecdcda8da1c0a1c7971d6f3923bb20 *inst/extdata/example.seqz.txt.gz 25 | 70764a3c09eea6ae46eaa489498da14f *inst/extdata/example.seqz.txt.gz.tbi 26 | 5fcc49fe68e8207193911291677fdeb5 *man/CP.data.Rd 27 | 6fdf4884a01b8a3586464842f004c469 *man/baf.model.fit.Rd 28 | e503cc7708c4f3a8327a55827398b240 *man/bayes.Rd 29 | 10f9ad6809df484eb98f4de2bde191ec *man/breaks.Rd 30 | 31fc1f8ee27e8a7ca30dbdee27bddbae *man/chromosome.view.Rd 31 | 9c03a6ae6635e05ab4b74eafc17f1eb1 *man/cp.plot.Rd 32 | d436a0c2af880fa09e74fb4e483bb281 *man/example.seqz.Rd 33 | 1289aacdf8835553e2029932b5064948 *man/gc.Rd 34 | 7abd12a6f38af576424d55417ec9bf0d *man/model_points.Rd 35 | f14dff388f23e5bff09e926505787867 *man/mutations.Rd 36 | 3c90a936eb5f68993b38381955a71d48 *man/plotWindows.Rd 37 | c35f64702b6f29bb8a5922366b5b4e25 *man/read_seqz.Rd 38 | 73f2cc0d3bf0c141919c9d2cea3aad44 *man/theoretical.Rd 39 | a50ca8ef2629fb91193b2a0660337f43 *man/type_matrix.Rd 40 | 565da5a0ae9ca06b579cf9ce4d0c8f18 *man/windowValues.Rd 41 | ebf3602ece02d98d51766ef8d2baac78 *man/workflow.Rd 42 | ddd5ddf951a4a538c3d4cb03c3673c7a *tests/testthat.R 43 | 5c373f393040d14844556c63d0ee140d *tests/testthat/test.model.R 44 | 92691ae6d8ce244b626ab7ee777071b5 *tests/testthat/test.read.R 45 | 25228690501dd6c00834e253f2002e07 *vignettes/sequenza.Rmd 46 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | export( 2 | baf.bayes, 3 | baf.model.fit, 4 | baf.model.points, 5 | baf.types.matrix, 6 | chromosome.view, 7 | cp.plot, 8 | cp.plot.contours, 9 | find.breaks, 10 | gc.sample.stats, 11 | gc.summary.plot, 12 | get.ci, 13 | mean_gc, 14 | median_gc, 15 | mufreq.bayes, 16 | mufreq.model.fit, 17 | mufreq.model.points, 18 | mufreq.types.matrix, 19 | plotWindows, 20 | read.seqz, 21 | segment.breaks, 22 | sequenza.extract, 23 | sequenza.fit, 24 | sequenza.results, 25 | theoretical.baf, 26 | theoretical.depth.ratio, 27 | theoretical.mufreq, 28 | windowBf, 29 | windowValues 30 | ) 31 | importFrom("squash", 32 | "colorgram", 33 | "makecmap" 34 | ) 35 | 36 | importFrom("pbapply", 37 | "pblapply" 38 | ) 39 | 40 | importFrom("squash", 41 | "colorgram", 42 | "makecmap" 43 | ) 44 | importFrom("iotools", 45 | "mstrsplit", 46 | "chunk.apply" 47 | ) 48 | 49 | importFrom("readr", 50 | "read_tsv" 51 | ) 52 | 53 | importFrom("seqminer", 54 | "tabix.read" 55 | ) 56 | 57 | importFrom("copynumber", 58 | "winsorize", 59 | "pcf", 60 | "aspcf" 61 | ) 62 | 63 | importFrom("grDevices", 64 | "colorRampPalette", 65 | "dev.off", 66 | "gray.colors", 67 | "heat.colors", 68 | "palette", 69 | "pdf", 70 | "rgb" 71 | ) 72 | importFrom("graphics", 73 | "abline", 74 | "axis", 75 | "barplot", 76 | "contour", 77 | "legend", 78 | "lines", 79 | "mtext", 80 | "par", 81 | "plot", 82 | "points", 83 | "polygon", 84 | "rect", 85 | "segments", 86 | "strwidth", 87 | "text" 88 | ) 89 | importFrom("stats", 90 | "dbinom", 91 | "density", 92 | "dpois", 93 | "dt", 94 | "median", 95 | "na.exclude", 96 | "quantile", 97 | "sd", 98 | "setNames", 99 | "weighted.mean" 100 | ) 101 | importFrom("utils", 102 | "packageVersion", 103 | "read.delim", 104 | "read.table", 105 | "setTxtProgressBar", 106 | "tail", 107 | "txtProgressBar", 108 | "write.table" 109 | ) 110 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Changes in version 3.0.0 (2019-05-09) 2 | - Code cleanup and refactoring 3 | - Bug fixes 4 | - Use readr and stop relying on system commands (grep, sed, gunzip) to read seqz files 5 | - Use the package pbapply for progress bar 6 | - Normalize each sample separately 7 | - Provide coverage profile of the 2 samples separately, before and after normalization 8 | - Provide CG-content vs depth profile for each samples, before and after normalization 9 | - Draw standard "mirrored" BAF (from 0 to 1) in the raw genome view 10 | 11 | 12 | Changes in version 2.1.2 (2015-08-18) 13 | - Keep the order of the bases corresponding to the major and minor alleles as in the normal sample. (allows mocking haplotype). 14 | - Fix check for NOTES in newer R versions 15 | 16 | Changes in version 2.1.1 (2015-01-20) 17 | - Fix heterozygous detection when importing VarScan2 data of very high coverage seq. 18 | - Cleanup sequenza-utils.py 19 | - Update citation info to published manuscript 20 | 21 | Changes in version 2.1.0 (2014-10-08) 22 | - Add sequenza-utils.py function bam2seqz 23 | - Add raw data depth-ratio/Bf in the genome view plots 24 | - Results model fitting plot using the B-allele/depth-ratio plot. 25 | - Present alternative solutions using local maxima of the CP plot. 26 | - Model the expected B-allele frequencies with a t-distribution using the observed sd, rather then taking the 95% of the observed Bf. 27 | - Use dt instead of dbinom for the BAF and depth ratio model. 28 | - Fix documentation discrepancies on the sequenza-utils.py execution. 29 | - Add breaks as optional argument - enables to input custom segmentation - 30 | - Add different segmentation options, fast, het and full, corresponding to different resolutions. 31 | - Minor fixes. 32 | 33 | Changes in version 2.0.0 (2014-04-08) 34 | - Change seqz names header and some function arguments/formats to improve usability 35 | - Add to sequenza.fit a method argument to calculate cellularity and ploidy also from mutations 36 | - Change the recommended file extension from ".abfreq" to ".seqz" for clarity. 37 | - The "seqz" file now contains a column with strand orientation information. 38 | - "cp.plot" now plots the scaled log-likelihood. 39 | - "theoretical.depth.ratio" now implements the correct formula. 40 | - The "theoretical.*" functions have their arguments rearranged, and defaults changed, for consistency. 41 | - "sequenza.extract" now has additional filtering arguments. 42 | - Miscellaneous cleanup/simplification/optimization. 43 | 44 | Changes in version 1.0.5 (2014-02-04) 45 | - Add a python utility for binning the data to a desired window size (reducing vastly memory footprint in the analysis) 46 | - Fix default workflows parameter to detect CN up to 20. 47 | 48 | Changes in version 1.0.4 (2014-01-16) 49 | - Add function to import VarScan2 output 50 | - Add results function, to save results and standard plots. 51 | - Fix error on loading non-zipped files in the workflow. 52 | - Ignore error and finish the process if one of the chromosome fails in the workflows. 53 | 54 | Changes in version 1.0.3 (2013-12-12) 55 | - Fix grep instructions that broke single chromosome loading 56 | -------------------------------------------------------------------------------- /R/bayes.R: -------------------------------------------------------------------------------- 1 | mufreq.dbinom <- function(mufreq, mufreq.model, depth.t, 2 | seq.errors = 0.01, ...) { 3 | mufreq.model[mufreq.model == 0] <- seq.errors 4 | n.success <- round(mufreq * depth.t, 0) 5 | dbinom(x = n.success, size = depth.t, prob = mufreq.model, ...) 6 | } 7 | 8 | mufreq.dpois <- function(mufreq, mufreq.model, depth.t, 9 | seq.errors = 0.01, ...) { 10 | mufreq.model[mufreq.model == 0] <- seq.errors 11 | n.success <- round(mufreq * depth.t, 0) 12 | dpois( x = n.success, lambda = mufreq.model * depth.t, ...) 13 | } 14 | 15 | baf.dbinom <- function(baf, baf.model, depth.t, ...) { 16 | n.success <- round(baf * depth.t, 0) 17 | dbinom( x = n.success, size = depth.t, prob = baf.model, ...) 18 | } 19 | 20 | baf.dpois <- function(baf, baf.model, depth.t, ...) { 21 | n.success <- round(baf * depth.t, 0) 22 | dpois( x = n.success, lambda = baf.model * depth.t, ...) 23 | } 24 | 25 | depth.ratio.dbinom <- function(size, depth.ratio, depth.ratio.model, ...) { 26 | n.success <- round(size * (depth.ratio / (1 + depth.ratio)), 0) 27 | prob <- depth.ratio.model / (1 + depth.ratio.model) 28 | dbinom( x = n.success, size = size, prob = prob, ...) 29 | } 30 | 31 | depth.ratio.dpois <- function(size, depth.ratio, depth.ratio.model, ...) { 32 | x <- round(size * depth.ratio, 0) 33 | dpois( x = x, lambda = depth.ratio.model * size, ...) 34 | } 35 | 36 | 37 | 38 | baf.bayes <- function(Bf, depth.ratio, cellularity, ploidy, avg.depth.ratio, 39 | sd.Bf = 0.1, sd.ratio = 0.5, weight.Bf = 1, weight.ratio = 1, CNt.min = 0, 40 | CNt.max = 7, CNn = 2, priors.table = data.frame(CN = CNt.min:CNt.max, 41 | value = 1), ratio.priority = FALSE) { 42 | 43 | baf.tab <- data.frame(Bf = Bf, ratio = log(depth.ratio), 44 | sd.Bf = sd.Bf, sd.ratio = sd.ratio, weight.Bf = weight.Bf, 45 | weight.ratio = weight.ratio) 46 | baf_types <- baf.types.matrix(CNt.min = CNt.min, 47 | CNt.max = CNt.max, CNn = CNn) 48 | model_baf <- baf.model.points(cellularity = cellularity, 49 | ploidy = ploidy, baf_types = baf_types, 50 | avg.depth.ratio = avg.depth.ratio) 51 | model.pts <- cbind(CNt = baf_types$CNt, A = baf_types$CNt - baf_types$B, 52 | B = baf_types$B, model_baf) 53 | 54 | rows.x <- 1:nrow(baf.tab) 55 | priors <- rep(1, nrow(model.pts)) 56 | for (i in 1:nrow(priors.table)) { 57 | priors[model.pts$CNt == priors.table$CN[i]] <- priors.table$value[i] 58 | } 59 | priors <- priors / sum(priors) 60 | 61 | bayes.fit <- function (x, mat, model.pts, priors, ratio.priority) { 62 | test.ratio <- log(model.pts$depth.ratio) 63 | test.baf <- model.pts$BAF 64 | min.offset <- 1e-323 65 | score.r <- dt2(sd = mat[x, ]$sd.ratio / sqrt(mat[x, ]$weight.ratio), 66 | mean = mat[x, ]$ratio, x = test.ratio, df = 5, log = TRUE) 67 | score.r <- score.r + log(priors) 68 | if (!is.na(mat[x, ]$Bf) & !is.na(mat[x, ]$sd.Bf / 69 | sqrt(mat[x, ]$weight.Bf))) { 70 | score.b <- dt2(mean = mat[x, ]$Bf, sd = mat[x, ]$sd.Bf / 71 | sqrt(mat[x, ]$weight.Bf), x = test.baf, df = 5, log = TRUE) 72 | post.model <- score.r + score.b 73 | } else { 74 | post.model <- score.r 75 | } 76 | if (ratio.priority == FALSE) { 77 | max.lik <- which.max(post.model) 78 | max.post <- c(as.numeric(model.pts[max.lik, 1:3]), 79 | post.model[max.lik]) 80 | } else { 81 | res.cn <- model.pts$CNt[which.max(score.r)] 82 | idx.pts <- model.pts$CNt == res.cn 83 | model.lik <- cbind(model.pts[idx.pts, 1:3], post.model[idx.pts]) 84 | if (is.null(dim(model.lik))) { 85 | max.post <- model.lik 86 | } else { 87 | max.post <- model.lik[which.max(model.lik[, 4]), ] 88 | } 89 | } 90 | if (is.na(mat[x, ]$Bf)) { 91 | max.post[2:3] <- NA 92 | } 93 | max.post 94 | } 95 | bafs.L <- mapply(FUN = bayes.fit, rows.x, MoreArgs = list(mat = baf.tab, 96 | model.pts = model.pts, priors = priors, 97 | ratio.priority = ratio.priority), SIMPLIFY = FALSE) 98 | bafs.L <- do.call(rbind, bafs.L) 99 | colnames(bafs.L) <- c("CNt", "A", "B", "LPP") 100 | bafs.L 101 | } 102 | 103 | baf.model.fit <- function(cellularity = seq(0.3, 1, by = 0.01), 104 | ploidy = seq(1, 7, by = 0.1), mc.cores = getOption("mc.cores", 2L), ...) { 105 | 106 | result <- expand.grid(ploidy = ploidy, cellularity = cellularity, 107 | KEEP.OUT.ATTRS = FALSE) 108 | 109 | fit.cp <- function(ii) { 110 | L.model <- baf.bayes(cellularity = result$cellularity[ii], 111 | ploidy = result$ploidy[ii], ...) 112 | sum(L.model[,4]) 113 | } 114 | bayes.res <- pblapply(X = 1:nrow(result), FUN = fit.cp, 115 | cl = mc.cores) 116 | result$LPP <- unlist(bayes.res) 117 | z <- tapply(result$LPP, list(result$ploidy, result$cellularity), mean) 118 | x <- as.numeric(rownames(z)) 119 | y <- as.numeric(colnames(z)) 120 | max.lik <- max(result$LPP, na.rm = TRUE) 121 | LogSumLik <- log(sum(exp(result$LPP - max.lik))) + max.lik 122 | znorm <- exp(z - LogSumLik) 123 | list(ploidy = x, cellularity = y, lpp = znorm) 124 | } 125 | 126 | mufreq.bayes <- function(mufreq, depth.ratio, cellularity, 127 | ploidy, avg.depth.ratio, weight.mufreq = 100, weight.ratio = 100, 128 | CNt.min = 1, CNt.max = 7, CNn = 2, 129 | priors.table = data.frame(CN = CNt.min:CNt.max, value = 1)) { 130 | mufreq.tab <- data.frame(F = mufreq, ratio = depth.ratio, 131 | weight.mufreq = weight.mufreq, weight.ratio = weight.ratio) 132 | mufreq_types <- mufreq.types.matrix(CNt.min = CNt.min, 133 | CNt.max = CNt.max, CNn = CNn) 134 | model.pts <- mufreq.model.points(cellularity = cellularity, 135 | ploidy = ploidy, mufreq_types = mufreq_types, 136 | avg.depth.ratio = avg.depth.ratio) 137 | model.pts <- cbind(mufreq_types, model.pts) 138 | rows.x <- 1:nrow(mufreq.tab) 139 | priors <- rep(1, nrow(model.pts)) 140 | for (i in 1:nrow(priors.table)) { 141 | priors[model.pts$CNt == 142 | priors.table$CN[i]] <- priors.table$value[i] 143 | } 144 | priors <- priors / sum(priors) 145 | bayes.fit <- function (x, mat, model.pts, priors) { 146 | test.ratio <- model.pts$depth.ratio 147 | test.mufrq <- model.pts$mufreqs 148 | min.offset <- 1e-323 149 | score.r <- depth.ratio.dbinom(size = mat[x, ]$weight.ratio, 150 | depth.ratio = mat[x, ]$ratio, test.ratio) 151 | score.m <- mufreq.dbinom(mufreq = mat[x, ]$F, 152 | depth.t = mat[x, ]$weight.mufreq, test.mufrq) 153 | score.r <- score.r * priors 154 | score.m <- score.m 155 | post.model <- score.r * score.m 156 | post.model[post.model == 0] <- min.offset 157 | res.cn <- model.pts$CNt[which.max(score.r)] 158 | idx.pts <- model.pts$CNt == res.cn 159 | model.lik <- cbind(model.pts[idx.pts, 1:3], log(post.model[idx.pts])) 160 | if (is.null(dim(model.lik))) { 161 | max.post <- model.lik 162 | } else { 163 | max.post <- model.lik[which.max(model.lik[,4]),] 164 | } 165 | max.post 166 | } 167 | types.L <- mapply(FUN = bayes.fit, rows.x, MoreArgs = list( 168 | mat = mufreq.tab, model.pts = model.pts, 169 | priors = priors), SIMPLIFY = FALSE) 170 | types.L <- do.call(rbind, types.L) 171 | colnames(types.L) <- c("CNn", "CNt", "Mt", "LPP") 172 | types.L 173 | } 174 | 175 | mufreq.model.fit <- function(cellularity = seq(0.3, 1, by = 0.01), 176 | ploidy = seq(1, 7, by = 0.1), mc.cores = getOption("mc.cores", 2L), ...) { 177 | result <- expand.grid(ploidy = ploidy, cellularity = cellularity, 178 | KEEP.OUT.ATTRS = FALSE) 179 | fit.cp <- function(ii) { 180 | L.model <- mufreq.bayes(cellularity = result$cellularity[ii], 181 | ploidy = result$ploidy[ii], ...) 182 | sum(L.model[, 4]) 183 | } 184 | bayes.res <- pblapply(X = 1:nrow(result), FUN = fit.cp, 185 | cl = mc.cores) 186 | result$LPP <- unlist(bayes.res) 187 | z <- tapply(result$LPP, list(result$ploidy, result$cellularity), mean) 188 | x <- as.numeric(rownames(z)) 189 | y <- as.numeric(colnames(z)) 190 | max.lik <- max(result$LPP, na.rm = TRUE) 191 | LogSumLik <- log(sum(exp(result$LPP - max.lik))) + max.lik 192 | znorm <- exp(z - LogSumLik) 193 | list(ploidy = x, cellularity = y, lpp = znorm) 194 | } 195 | -------------------------------------------------------------------------------- /R/extract.R: -------------------------------------------------------------------------------- 1 | sequenza.extract <- function(file, window = 1e6, overlap = 1, 2 | gamma = 80, kmin = 10, gamma.pcf = 140, kmin.pcf = 40, 3 | mufreq.treshold = 0.10, min.reads = 40, min.reads.normal = 10, 4 | min.reads.baf = 1, max.mut.types = 1, min.type.freq = 0.9, 5 | min.fw.freq = 0, verbose = TRUE, chromosome.list = NULL, 6 | breaks = NULL, breaks.method = "het", assembly = "hg19", 7 | weighted.mean = TRUE, normalization.method = "mean", 8 | ignore.normal = FALSE, parallel = 1, gc.stats = NULL, 9 | segments.samples = FALSE){ 10 | 11 | if (is.null(gc.stats)) { 12 | gc.stats <- gc.sample.stats(file, verbose = verbose, 13 | parallel = parallel) 14 | } 15 | if (normalization.method == "mean") { 16 | gc.normal.vect <- mean_gc(gc.stats$normal) 17 | gc.tumor.vect <- mean_gc(gc.stats$tumor) 18 | avg_tum_depth <- weighted.mean(x = gc.stats$tumor$depth, 19 | w = colSums(gc.stats$tumor$n)) 20 | avg_nor_depth <- weighted.mean(x = gc.stats$normal$depth, 21 | w = colSums(gc.stats$normal$n)) 22 | } else { 23 | gc.normal.vect <- median_gc(gc.stats$normal) 24 | gc.tumor.vect <- median_gc(gc.stats$tumor) 25 | avg_tum_depth <- weighted.median(x = gc.stats$tumor$depth, 26 | w = colSums(gc.stats$tumor$n)) 27 | avg_nor_depth <- weighted.median(x = gc.stats$normal$depth, 28 | w = colSums(gc.stats$normal$n)) 29 | } 30 | windows.baf <- list() 31 | windows.ratio <- list() 32 | windows.raw_ratio <- list() 33 | windows.normal <- list() 34 | windows.tumor <- list() 35 | windows.n_normal <- list() 36 | windows.n_tumor <- list() 37 | mutation.list <- list() 38 | segments.list <- list() 39 | segments_samples.list <- list() 40 | norm.gc.list <- list() 41 | if (is.null(dim(breaks))) { 42 | breaks <- NULL 43 | } 44 | chr.vect <- as.character(gc.stats$file.metrics$chr) 45 | if (is.null(chromosome.list)) { 46 | chromosome.list <- chr.vect 47 | } else { 48 | chromosome.list <- chromosome.list[chromosome.list %in% chr.vect] 49 | } 50 | for (chr in chromosome.list) { 51 | if (verbose) { 52 | message("Processing ", chr, ":", appendLF = TRUE) 53 | } 54 | tbi <- file.exists(paste0(file, ".tbi")) 55 | if (tbi) { 56 | seqz.data <- read.seqz(file, chr_name = chr) 57 | } else { 58 | file.lines <- gc.stats$file.metrics[which(chr.vect == chr), ] 59 | seqz.data <- read.seqz(file, n_lines = c(file.lines$start, 60 | file.lines$end)) 61 | } 62 | 63 | norm_tumor_depth <- seqz.data$depth.tumor / 64 | gc.tumor.vect[as.character(seqz.data$GC.percent)] 65 | norm_normal_depth <- seqz.data$depth.normal / 66 | gc.normal.vect[as.character(seqz.data$GC.percent)] 67 | norm.gc.stats <- depths_gc( 68 | depth_n = round(norm_normal_depth * avg_nor_depth, 0), 69 | depth_t = round(norm_tumor_depth * avg_tum_depth, 0), 70 | gc = seqz.data$GC.percent) 71 | if (ignore.normal) { 72 | seqz.data$adjusted.ratio <- round(norm_tumor_depth, 3) 73 | } else { 74 | seqz.data$adjusted.ratio <- round( 75 | norm_tumor_depth / norm_normal_depth, 3) 76 | } 77 | if (segments.samples == TRUE) { 78 | breaks_normal_chr <- breaks_full( 79 | data = data.frame(chromosome = seqz.data$chromosome, 80 | position = seqz.data$position, 81 | adjusted.ratio = norm_normal_depth, 82 | singsAsFactors = FALSE), 83 | gamma = gamma.pcf, kmin = kmin.pcf, assembly = assembly, 84 | breaks.het = NULL) 85 | breaks_tumor_chr <- breaks_full( 86 | data = data.frame(chromosome = seqz.data$chromosome, 87 | position = seqz.data$position, 88 | adjusted.ratio = norm_tumor_depth, 89 | singsAsFactors = FALSE), 90 | gamma = gamma.pcf, kmin = kmin.pcf, assembly = assembly, 91 | breaks.het = NULL) 92 | } else { 93 | breaks_normal_chr <- NULL 94 | breaks_tumor_chr <- NULL 95 | } 96 | seqz.r.win <- windowValues(x = seqz.data$adjusted.ratio, 97 | positions = seqz.data$position, 98 | chromosomes = seqz.data$chromosome, 99 | window = window, overlap = overlap, 100 | weight = seqz.data$depth.normal) 101 | seqz.n.win <- windowValues(x = seqz.data$depth.normal / avg_nor_depth, 102 | positions = seqz.data$position, 103 | chromosomes = seqz.data$chromosome, 104 | window = window, overlap = overlap) 105 | seqz.t.win <- windowValues(x = seqz.data$depth.tumor / avg_tum_depth, 106 | positions = seqz.data$position, 107 | chromosomes = seqz.data$chromosome, 108 | window = window, overlap = overlap) 109 | seqz.r_r.win <- windowValues(x = seqz.data$depth.ratio / ( 110 | avg_tum_depth / avg_nor_depth), 111 | positions = seqz.data$position, 112 | chromosomes = seqz.data$chromosome, 113 | window = window, overlap = overlap, 114 | weight = seqz.data$depth.normal) 115 | seqz.n_n.win <- windowValues(x = norm_normal_depth, 116 | positions = seqz.data$position, 117 | chromosomes = seqz.data$chromosome, 118 | window = window, overlap = overlap) 119 | seqz.n_t.win <- windowValues(x = norm_tumor_depth, 120 | positions = seqz.data$position, 121 | chromosomes = seqz.data$chromosome, 122 | window = window, overlap = overlap) 123 | 124 | 125 | seqz.hom <- seqz.data$zygosity.normal == "hom" 126 | seqz.het <- seqz.data[!seqz.hom, ] 127 | het.filt <- seqz.het$good.reads >= min.reads.baf 128 | seqz.het <- seqz.het[het.filt, ] 129 | het_ok <- nrow(seqz.het) > 0 130 | if (is.null(breaks)) { 131 | breaks_chr <- NULL 132 | } else { 133 | breaks_chr <- breaks[breaks$chrom == chr, ] 134 | } 135 | if (het_ok) { 136 | seqz.b.win <- windowBf(Af = seqz.het$Af, Bf = seqz.het$Bf, 137 | good.reads = seqz.het$good.reads, 138 | chromosomes = seqz.het$chromosome, 139 | positions = seqz.het$position, conf = 0.95, 140 | window = window, overlap = overlap) 141 | } else { 142 | seqz.b.win <- list() 143 | seqz.b.win[[1]] <- data.frame(start = min(seqz.data$position, 144 | na.rm = TRUE), end = max(seqz.data$position, na.rm = TRUE), 145 | mean = 0, q0 = 0, q1 = 0, N = 1) 146 | } 147 | if (het_ok) { 148 | breaks_chr <- extract_breaks(data = seqz.data, data_het = seqz.het, 149 | ratio = seqz.r.win, baf = seqz.b.win, 150 | gamma = gamma, kmin = kmin, breaks = breaks_chr, 151 | gamma.pcf = gamma.pcf, kmin.pcf = kmin.pcf, 152 | assembly = assembly, chromosome = chr, 153 | method = breaks.method) 154 | } else { 155 | if (breaks.method == "full") { 156 | breaks_chr <- extract_breaks(data = seqz.data, 157 | data_het = seqz.het, ratio = seqz.r.win, baf = seqz.b.win, 158 | gamma = gamma, kmin = kmin, 159 | gamma.pcf = gamma.pcf, kmin.pcf = kmin.pcf, 160 | assembly = assembly, chromosome = chr, 161 | method = breaks.method) 162 | } 163 | } 164 | if (class(breaks_chr) == "try-error") { 165 | breaks_chr <- NULL 166 | } 167 | if (is.null(breaks_chr) || nrow(breaks_chr) == 0 || 168 | length(breaks_chr) == 0) { 169 | breaks_chr <- data.frame(chrom = chr, 170 | start.pos = min(seqz.data$position, na.rm = TRUE), 171 | end.pos = max(seqz.data$position, na.rm = TRUE)) 172 | } 173 | seg.s1 <- segment.breaks(seqz.tab = seqz.data, breaks = breaks_chr, 174 | min.reads.baf = min.reads.baf, weighted.mean = weighted.mean) 175 | 176 | mut.tab <- mutation.table(seqz.data, 177 | mufreq.treshold = mufreq.treshold, 178 | min.reads = min.reads, min.reads.normal = min.reads.normal, 179 | max.mut.types = max.mut.types, min.type.freq = min.type.freq, 180 | min.fw.freq = min.fw.freq, segments = seg.s1) 181 | 182 | windows.ratio[[which(chromosome.list == chr)]] <- seqz.r.win[[1]] 183 | windows.raw_ratio[[which(chromosome.list == chr)]] <- seqz.r_r.win[[1]] 184 | windows.normal[[which(chromosome.list == chr)]] <- seqz.n.win[[1]] 185 | windows.tumor[[which(chromosome.list == chr)]] <- seqz.t.win[[1]] 186 | windows.n_normal[[which(chromosome.list == chr)]] <- seqz.n_n.win[[1]] 187 | windows.n_tumor[[which(chromosome.list == chr)]] <- seqz.n_t.win[[1]] 188 | windows.baf[[which(chromosome.list == chr)]] <- seqz.b.win[[1]] 189 | segments.list[[which(chromosome.list == chr)]] <- seg.s1 190 | mutation.list[[which(chromosome.list == chr)]] <- mut.tab 191 | norm.gc.list[[which(chromosome.list == chr)]] <- norm.gc.stats 192 | segments_samples.list[[which(chromosome.list == chr)]] <- list( 193 | normal = breaks_normal_chr, tumor = breaks_tumor_chr) 194 | 195 | if (verbose) { 196 | message(' ', nrow(mut.tab), ' variant calls.', appendLF = TRUE) 197 | message(' ', nrow(seg.s1), ' copy-number segments.', appendLF = TRUE) 198 | message(' ', nrow(seqz.het), ' heterozygous positions.', appendLF = TRUE) 199 | message(' ', sum(seqz.hom), ' homozygous positions.', appendLF = TRUE) 200 | } 201 | } 202 | names(windows.baf) <- chromosome.list 203 | names(windows.ratio) <- chromosome.list 204 | names(windows.raw_ratio) <- chromosome.list 205 | names(windows.normal) <- chromosome.list 206 | names(windows.tumor) <- chromosome.list 207 | names(windows.n_normal) <- chromosome.list 208 | names(windows.n_tumor) <- chromosome.list 209 | names(mutation.list) <- chromosome.list 210 | names(segments.list) <- chromosome.list 211 | names(segments_samples.list) <- chromosome.list 212 | 213 | 214 | gc_norm <- unfold_gc(do.call(rbind, norm.gc.list), stats = FALSE) 215 | 216 | if (normalization.method == "mean") { 217 | avg_tum_ndepth <- weighted.mean(x = gc_norm$tumor$depth, 218 | w = colSums(gc_norm$tumor$n)) 219 | avg_nor_ndepth <- weighted.mean(x = gc_norm$normal$depth, 220 | w = colSums(gc_norm$normal$n)) 221 | } else { 222 | avg_tum_ndepth <- weighted.median(x = gc_norm$tumor$depth, 223 | w = colSums(gc_norm$tumor$n)) 224 | avg_nor_ndepth <- weighted.median(x = gc_norm$normal$depth, 225 | w = colSums(gc_norm$normal$n)) 226 | } 227 | if (ignore.normal) { 228 | avg_depth_ratio <- avg_tum_ndepth / avg_tum_depth 229 | } else { 230 | avg_depth_ratio <- (avg_tum_ndepth / avg_tum_depth) / 231 | (avg_nor_ndepth / avg_nor_depth) 232 | } 233 | 234 | list(BAF = windows.baf, ratio = windows.ratio, 235 | raw_ratio = windows.raw_ratio, 236 | depths = list( 237 | raw = list(normal = windows.normal, tumor = windows.tumor), 238 | norm = list(normal = windows.n_normal, tumor = windows.n_tumor)), 239 | mutations = mutation.list, segments = segments.list, 240 | chromosomes = chromosome.list, gc = gc.stats, 241 | gc_norm = gc_norm, avg.depth.ratio = avg_depth_ratio, 242 | avg.depth.tumor = avg_tum_depth, avg.depth.normal = avg_nor_depth, 243 | segments_samples = segments_samples.list) 244 | } 245 | -------------------------------------------------------------------------------- /R/fit.R: -------------------------------------------------------------------------------- 1 | sequenza.fit <- function(sequenza.extract, female = TRUE, 2 | N.ratio.filter = 10, N.BAF.filter = 1, segment.filter = 3e6, 3 | mufreq.treshold = 0.10, XY = c(X = "X", Y = "Y"), 4 | cellularity = seq(0.1, 1, 0.01), ploidy = seq(1, 7, 0.1), 5 | ratio.priority = FALSE, method = "baf", 6 | priors.table = data.frame(CN = 2, value = 2), chromosome.list = 1:24, 7 | mc.cores = getOption("mc.cores", 2L)){ 8 | 9 | if (is.null(chromosome.list)) { 10 | mut.all <- do.call(rbind, sequenza.extract$mutations) 11 | mut.all <- na.exclude(mut.all) 12 | segs.all <- do.call(rbind, sequenza.extract$segments) 13 | } else { 14 | mut.all <- do.call(rbind, 15 | sequenza.extract$mutations[chromosome.list]) 16 | mut.all <- na.exclude(mut.all) 17 | segs.all <- do.call(rbind, 18 | sequenza.extract$segments[chromosome.list]) 19 | } 20 | segs.len <- segs.all$end.pos - segs.all$start.pos 21 | avg.depth.ratio <- sequenza.extract$avg.depth.ratio 22 | 23 | if (method == "baf") { 24 | 25 | avg.sd.ratio <- sum(segs.all$sd.ratio * segs.all$N.ratio, 26 | na.rm = TRUE) / sum(segs.all$N.ratio, na.rm = TRUE) 27 | avg.sd.Bf <- sum(segs.all$sd.BAF * segs.all$N.BAF, na.rm = TRUE) / 28 | sum(segs.all$N.BAF, na.rm = TRUE) 29 | segs.all$sd.BAF[segs.all$sd.BAF == 0] <- max( 30 | segs.all$sd.BAF, na.rm = TRUE) 31 | segs.all$sd.ratio[segs.all$sd.ratio == 0] <- max( 32 | segs.all$sd.ratio, na.rm = TRUE) 33 | 34 | segs.filt <- segs.all$N.ratio > N.ratio.filter & 35 | segs.all$N.BAF > N.BAF.filter 36 | segs.filt <- segs.len >= segment.filter & segs.filt 37 | if (female){ 38 | segs.is.xy <- segs.all$chromosome == XY["Y"] 39 | } else{ 40 | segs.is.xy <- segs.all$chromosome %in% XY 41 | } 42 | filt.test <- segs.filt & !segs.is.xy 43 | seg.test <- segs.all[filt.test, ] 44 | seg.len.mb <- segs.len[filt.test] / 1e6 45 | baf.model.fit(Bf = seg.test$Bf, depth.ratio = seg.test$depth.ratio, 46 | sd.ratio = seg.test$sd.ratio, weight.ratio = seg.len.mb, 47 | sd.Bf = seg.test$sd.BAF, weight.Bf = seg.len.mb, 48 | avg.depth.ratio = avg.depth.ratio, cellularity = cellularity, 49 | ploidy = ploidy, priors.table = priors.table, 50 | mc.cores = mc.cores, ratio.priority = ratio.priority) 51 | } else if (method == "mufreq") { 52 | mut.filt <- mut.all$F >= mufreq.treshold 53 | if (female){ 54 | mut.is.xy <- mut.all$chromosome == XY["Y"] 55 | } else{ 56 | mut.is.xy <- mut.all$chromosome %in% XY 57 | } 58 | filt.test <- mut.filt & !mut.is.xy 59 | mut.test <- mut.all[filt.test, ] 60 | w.mufreq <- round(mut.test$good.reads, 0) 61 | mufreq.model.fit(mufreq = mut.test$F, 62 | depth.ratio = mut.test$adjusted.ratio, 63 | weight.ratio = 2 * w.mufreq, weight.mufreq = w.mufreq, 64 | avg.depth.ratio = avg.depth.ratio, cellularity = cellularity, 65 | ploidy = ploidy, priors.table = priors.table, mc.cores = mc.cores) 66 | } else { 67 | stop("The only available methods are \"baf\" and \"mufreq\"") 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /R/gcstats.R: -------------------------------------------------------------------------------- 1 | gc.sample.stats <- function(file, col_types = "c--dd----d----", 2 | buffer = 33554432, parallel = 2L, verbose = TRUE) { 3 | 4 | con <- gzfile(file, "rb") 5 | 6 | suppressWarnings(skip_line <- readLines(con, n = 1)) 7 | remove(skip_line) 8 | parse_chunck <- function(x, col_types) { 9 | x <- read_tsv(file = paste(mstrsplit(x), collapse = "\n"), 10 | col_types = col_types, col_names = FALSE, 11 | skip = 0, n_max = Inf, progress = FALSE) 12 | u_chr <- unique(x[, 1]) 13 | n_chr <- table(x[, 1]) 14 | gc1 <- lapply(split(x[, 2], x[, 4]), table) 15 | gc2 <- lapply(split(x[, 3], x[, 4]), table) 16 | if (verbose){ 17 | message(".", appendLF = FALSE) 18 | } 19 | list(unique = u_chr, lines = n_chr, gc_nor = gc1, gc_tum = gc2) 20 | } 21 | if (verbose){ 22 | message("Collecting GC information ", appendLF = FALSE) 23 | } 24 | res <- chunk.apply(input = con, FUN = parse_chunck, col_types = col_types, 25 | CH.MAX.SIZE = buffer, parallel = parallel) 26 | close(con) 27 | if (verbose){ 28 | message(" done\n") 29 | } 30 | unfold_gc(res, stats = TRUE) 31 | } 32 | 33 | unfold_gc <- function(x, stats = TRUE) { 34 | gc_norm <- get_gc(x[, "gc_nor"]) 35 | gc_tum <- get_gc(x[, "gc_tum"]) 36 | if (stats) { 37 | ord_chrom <- unique(Reduce("c", Reduce("c", x[, "unique"]))) 38 | stats_chrom <- Reduce("c", x[, "lines"]) 39 | stats_chrom <- sapply(splash_table(x[, "lines"]), sum) 40 | stats_chrom <- stats_chrom[ord_chrom] 41 | stats_start <- cumsum(c(1, stats_chrom[-length(stats_chrom)])) 42 | stats_end <- stats_start + stats_chrom - 1 43 | stats_chrom <- data.frame(chr = ord_chrom, n_lines = stats_chrom, 44 | start = stats_start, end = stats_end) 45 | 46 | list(file.metrics = stats_chrom, normal = gc_norm, tumor = gc_tum) 47 | } else { 48 | list(normal = gc_norm, tumor = gc_tum) 49 | } 50 | } 51 | 52 | splash_table <- function(lis_obj){ 53 | lis_obj <- Reduce("c", lis_obj) 54 | split(lis_obj, names(lis_obj)) 55 | } 56 | 57 | get_gc <- function(gc_col) { 58 | sort_char <- function(x) { 59 | as.character(sort(as.numeric(x))) 60 | } 61 | all_depths <- splash_table(gc_col) 62 | all_depths <- lapply(all_depths, FUN = function(x) { 63 | sapply(splash_table(x), sum) 64 | }) 65 | names_gc <- sort_char(names(all_depths)) 66 | all_depths <- all_depths[names_gc] 67 | names_depths <- sort_char(unique(Reduce("c", lapply(all_depths, names)))) 68 | n <- do.call(rbind, lapply(all_depths, FUN = function(x, names_depths) { 69 | res <- x[names_depths] 70 | names(res) <- names_depths 71 | res 72 | }, 73 | names_depths = names_depths)) 74 | n[is.na(n)] <- 0 75 | list(gc = as.numeric(names_gc), depth = as.numeric(names_depths), n = n) 76 | } 77 | 78 | median_gc <- function(gc_list) { 79 | apply(gc_list$n, 1, FUN = function(x, w) { 80 | weighted.median(x = w, w = x, na.rm = TRUE) 81 | }, 82 | w = gc_list$depth) 83 | } 84 | 85 | mean_gc <- function(gc_list) { 86 | apply(gc_list$n, 1, FUN = function(x, w) { 87 | weighted.mean(x = w, w = x, na.rm = TRUE) 88 | }, 89 | w = gc_list$depth) 90 | } 91 | 92 | depths_gc <- function(depth_n, depth_t, gc) { 93 | gc_nor <- lapply(split(depth_n, gc), table) 94 | gc_tum <- lapply(split(depth_t, gc), table) 95 | list(gc_nor = gc_nor, gc_tum = gc_tum) 96 | } 97 | -------------------------------------------------------------------------------- /R/graphics.R: -------------------------------------------------------------------------------- 1 | plotWindows <- function(seqz.window, m.lty = 1, m.lwd = 3, 2 | m.col = "black", q.bg = "lightblue", 3 | log2.plot = FALSE, n.min = 1, xlim, ylim, 4 | add = FALSE, ...) { 5 | if (log2.plot) { 6 | seqz.window[, c(3, 4, 5)] <- log2(seqz.window[, c(3, 4, 5)]) 7 | } 8 | if (!add) { 9 | if (missing(xlim)) 10 | xlim <- c(seqz.window$start[1], seqz.window$end[nrow(seqz.window)]) 11 | if (missing(ylim)) 12 | ylim <- c(min(seqz.window$q0, na.rm = TRUE), 13 | max(seqz.window$q1, na.rm = TRUE)) 14 | plot(xlim, ylim, type = "n", ...) 15 | } 16 | seqz.window <- seqz.window[seqz.window$N >= n.min, ] 17 | rect(xleft = seqz.window$start, ybottom = seqz.window$q0, 18 | xright = seqz.window$end, ytop = seqz.window$q1, 19 | col = q.bg, border = NA) 20 | segments(y0 = seqz.window$mean, x0 = seqz.window$start, 21 | x1 = seqz.window$end, lty = m.lty, lwd = m.lwd, col = m.col) 22 | 23 | } 24 | 25 | gc.plot <- function(gc_list, range.gc = NULL, range.depth = NULL, ...) { 26 | n <- gc_list$n 27 | n[n == 0] <- NA 28 | gc <- gc_list$gc 29 | depth <- gc_list$depth 30 | if (length(range.gc) == 2) { 31 | range.gc <- sort(range.gc) 32 | gc.select <- gc <= range.gc[2] & gc >= range.gc[1] 33 | gc <- gc[gc.select] 34 | n <- n[gc.select, ] 35 | } 36 | if (length(range.depth) == 2) { 37 | range.depth <- sort(range.depth) 38 | depth.select <- depth <= range.depth[2] & depth >= range.depth[1] 39 | depth <- depth[depth.select] 40 | n <- n[, depth.select] 41 | } 42 | colorgram(x = gc, y = depth, z = n, ...) 43 | } 44 | 45 | gc.summary.plot <- function(gc_list, mean.col = 1, median.col = 2, 46 | scale.subset = 1.5, ...){ 47 | mengc <- mean_gc(gc_list) 48 | medgc <- median_gc(gc_list) 49 | max_depth <- round(max(c(mengc, medgc)) * scale.subset, 0) 50 | gc.plot(gc_list, range.depth = c(0, max_depth), ...) 51 | lines(x = gc_list$gc, y = mengc, lwd = 3, col = mean.col) 52 | lines(x = gc_list$gc, y = medgc, lwd = 3, col = median.col) 53 | legend("topright", c("Mean depth", "Median depth"), 54 | col = c(mean.col, median.col), bg = "white", lty = 1, lwd = 3) 55 | } 56 | 57 | cp.plot <- function (cp.table, xlab = "Ploidy", ylab = "Cellularity", 58 | zlab = "Scaled rank LPP", 59 | colFn = colorRampPalette(c("white", "lightblue")), ...) { 60 | z <- matrix(rank(cp.table$lpp), nrow = nrow(cp.table$lpp)) / 61 | length(cp.table$lpp) 62 | map <- makecmap(c(0, 1), colFn = colFn, include.lowest = TRUE) 63 | colorgram(x = cp.table$ploidy, y = cp.table$cellularity, z = z, 64 | map = map, las = 1, xlab = xlab, ylab = ylab, zlab = zlab, ...) 65 | } 66 | 67 | cp.plot.contours <- function(cp.table, likThresh = c(0.95), 68 | alternative = TRUE, col = palette(), legend.pos = "bottomright", 69 | pch = 18, alt.pch = 3, ...) { 70 | znormsort <- sort(cp.table$lpp, decreasing = TRUE) 71 | znormcumLik <- cumsum(znormsort) 72 | n <- sapply(likThresh, function(x) sum(znormcumLik < x) + 1) 73 | LikThresh <- znormsort[n] 74 | names(LikThresh) <- paste0(likThresh * 100, "%") 75 | contour(x = cp.table$ploidy, y = cp.table$cellularity, z = cp.table$lpp, 76 | levels = znormsort[n], col = col, drawlabels = FALSE, 77 | xlab = "Ploidy", ylab = "Cellularity", ...) 78 | max.xy <- which(cp.table$lpp == max(cp.table$lpp), arr.ind = TRUE) 79 | points(x = cp.table$ploidy[max.xy[, "row"]], 80 | y = cp.table$cellularity[max.xy[, "col"]], pch = pch) 81 | if (alternative == TRUE){ 82 | alt.sol <- alternative.cp.solutions(cp.table) 83 | alt.sol <- alt.sol[-1, ] 84 | points(x = alt.sol$ploidy, y = alt.sol$cellularity, pch = alt.pch) 85 | } 86 | if (!is.na(legend.pos)) { 87 | if (alternative == FALSE) { 88 | legend(legend.pos, legend = c(paste("C.R.", names(LikThresh), 89 | sep = " "), "Point estimate"), 90 | col = c(col[1:length(LikThresh)], "black"), 91 | lty = c(rep(1, length(LikThresh)), NA), 92 | pch = c(rep(NA, length(LikThresh)), pch), 93 | border = NA, bty = "n") 94 | } else { 95 | legend(legend.pos, legend = c(paste("C.R.", 96 | names(LikThresh), sep = " "), 97 | "Point estimate", "Alternative solutions"), 98 | col = c(col[1:length(LikThresh)], "black", "black"), 99 | lty = c(rep(1, length(LikThresh)), NA, NA), 100 | pch = c(rep(NA, length(LikThresh)), pch, alt.pch), 101 | border = NA, bty = "n") 102 | } 103 | } 104 | invisible(LikThresh) 105 | } 106 | 107 | chromosome.view <- function(baf.windows, ratio.windows, mut.tab = NULL, 108 | segments = NULL, min.N.baf = 1, min.N.ratio = 1e4, main = "", 109 | vlines = FALSE, legend.inset = c(-20 * strwidth("a", units = "figure"), 0), 110 | CNn = 2, cellularity = NULL, ploidy = NULL, 111 | avg.depth.ratio = NULL, model.lwd = 1, model.lty = "24", model.col = 1, 112 | x.chr.space = 10) { 113 | if (is.null(segments)) { 114 | data.model <- NULL 115 | } else { 116 | if ("CNt" %in% colnames(segments)) { 117 | if (length(c(cellularity, ploidy, avg.depth.ratio)) != 3) { 118 | data.model <- NULL 119 | } else { 120 | data.model <- list() 121 | CNt.max <- max(segments$CNt, na.rm = TRUE) + 1 122 | CNt.min <- 0 123 | baf_types <- baf.types.matrix(CNt.min = CNt.min, 124 | CNt.max = CNt.max, CNn = 2) 125 | data.model$baf <- baf.model.points(cellularity = cellularity, 126 | ploidy = ploidy, baf_types = baf_types, 127 | avg.depth.ratio = avg.depth.ratio) 128 | data.model$baf <- data.frame(CNt = baf_types$CNt, 129 | A = baf_types$CNt - baf_types$B, B = baf_types$B, 130 | data.model$baf) 131 | mufreq_types <- mufreq.types.matrix(CNt.min = CNt.min, 132 | CNt.max = CNt.max, CNn = CNn) 133 | data.model$muf <- cbind(mufreq_types, 134 | mufreq.model.points(cellularity = cellularity, 135 | ploidy = ploidy, mufreq_types = mufreq_types, 136 | avg.depth.ratio = avg.depth.ratio)) 137 | } 138 | } else { 139 | data.model <- NULL 140 | } 141 | } 142 | if (is.null(mut.tab)) { 143 | par(mar = c(0, 4, 0, 3), oma = c(5, 0, 4, 0), 144 | mfcol = c(2, 1), xaxt = "n") 145 | min.x <- min(c(min(baf.windows$start), min(ratio.windows$start))) 146 | max.x <- max(c(max(baf.windows$end), max(ratio.windows$end))) 147 | xlim <- c(min.x, max.x) 148 | } else { 149 | min.x <- min(c(min(baf.windows$start), min(ratio.windows$start), 150 | min(mut.tab$position))) 151 | max.x <- max(c(max(baf.windows$end), max(ratio.windows$end), 152 | max(mut.tab$position))) 153 | xlim <- c(min.x, max.x) 154 | par(mar = c(0, 4, 0, 10), oma = c(5, 0, 4, 0), 155 | mfcol = c(3, 1), xaxt = "n", xpd = TRUE) 156 | mutation.colors <- c( 157 | "A>C" = rgb(red = 0, green = 178, blue = 238, 158 | alpha = 120, maxColorValue = 255), 159 | "T>G" = rgb(red = 0, green = 178, blue = 238, 160 | alpha = 120, maxColorValue = 255), 161 | "A>G" = rgb(red = 255, green = 64, blue = 64, 162 | alpha = 120, maxColorValue = 255), 163 | "T>C" = rgb(red = 255, green = 64, blue = 64, 164 | alpha = 120, maxColorValue = 255), 165 | "A>T" = rgb(red = 34, green = 139, blue = 34, 166 | alpha = 120, maxColorValue = 255), 167 | "T>A" = rgb(red = 34, green = 139, blue = 34, 168 | alpha = 120, maxColorValue = 255), 169 | "C>A" = rgb(red = 139, green = 90, blue = 0, 170 | alpha = 120, maxColorValue = 255), 171 | "G>T" = rgb(red = 139, green = 90, blue = 0, 172 | alpha = 120, maxColorValue = 255), 173 | "C>G" = rgb(red = 127, green = 0, blue = 255, 174 | alpha = 120, maxColorValue = 255), 175 | "G>C" = rgb(red = 127, green = 0, blue = 255, 176 | alpha = 120, maxColorValue = 255), 177 | "C>T" = rgb(red = 255, green = 215, blue = 0, 178 | alpha = 120, maxColorValue = 255), 179 | "G>A" = rgb(red = 255, green = 215, blue = 0, 180 | alpha = 120, maxColorValue = 255)) 181 | plot(x = mut.tab$position, y = mut.tab$F, 182 | ylab = "Mutant allele frequency", las = 1, pch = 19, 183 | col = c(mutation.colors, "NA" = NA)[as.character(mut.tab$mutation)], 184 | ylim = c(min(mut.tab$F, na.rm = TRUE), 1), xlim = xlim) 185 | unique.colors <- unique(mutation.colors) 186 | labels <- sapply(unique.colors, function(a) { 187 | paste(names(mutation.colors)[mutation.colors == a], 188 | collapse = ", ") 189 | }) 190 | legend(y = "center", x = "right", legend = labels, 191 | inset = legend.inset, pch = 19, col = unique.colors, 192 | pt.bg = unique.colors, border = NA, bty = "n") 193 | if (!is.null(segments)){ 194 | if (vlines) { 195 | abline(v = segments$end.pos, lwd = 1, lty = 2) 196 | } 197 | if (!is.null(data.model)) { 198 | for (i in 1:nrow(segments)) { 199 | segments(x0 = segments$start.pos[i], 200 | x1 = segments$end.pos[i], 201 | y0 = unique(data.model$muf$mufreqs[ 202 | data.model$muf$CNt == segments$CNt[i]]), 203 | lwd = model.lwd, lty = model.lty, col = model.col) 204 | } 205 | } 206 | } 207 | } 208 | if (!is.null(segments)){ 209 | plot(ylab = "B allele frequency", type = "n", 210 | x = xlim, y = c(0, 0.5), las = 1) 211 | plotWindows(baf.windows, ylab = "B allele frequency", 212 | xlim = xlim, ylim = c(0, 0.5), las = 1, 213 | n.min = min.N.baf, add = TRUE) 214 | if (vlines) { 215 | abline(v = segments$end.pos, lwd = 1, lty = 2) 216 | } 217 | segments(x0 = segments$start.pos, y0 = segments$Bf, 218 | x1 = segments$end.pos, y1 = segments$Bf, col = "red", lwd = 3) 219 | if (!is.null(data.model)) { 220 | for (i in 1:nrow(segments)) { 221 | segments(x0 = segments$start.pos[i], x1 = segments$end.pos[i], 222 | y0 = unique(data.model$baf$BAF[ 223 | data.model$baf$CNt == segments$CNt[i]]), 224 | lwd = model.lwd, lty = model.lty, col = model.col) 225 | } 226 | } 227 | } else { 228 | plotWindows(baf.windows, ylab = "B allele frequency", 229 | xlim = xlim, ylim = c(0, 0.5), las = 1, n.min = min.N.baf) 230 | } 231 | plotWindows(ratio.windows, ylab = "Depth ratio", 232 | las = 1, n.min = min.N.ratio, ylim = c(0, 2.5)) 233 | if (!is.null(segments)){ 234 | if (vlines) { 235 | abline(v = segments$end.pos, lwd = 1, lty = 2) 236 | } 237 | segments(x0 = segments$start.pos, y0 = segments$depth.ratio, 238 | x1 = segments$end.pos, y1 = segments$depth.ratio, 239 | col = "red", lwd = 3) 240 | if (!is.null(data.model)) { 241 | ratios.theoric <- unique(data.model$muf[, c("CNt", "depth.ratio")]) 242 | segments(x0 = rep(min(segments$start.pos, na.rm = TRUE), 243 | times = nrow(ratios.theoric)), 244 | x1 = rep(max(segments$end.pos, na.rm = TRUE), 245 | times = nrow(ratios.theoric)), 246 | y0 = ratios.theoric$depth.ratio, lwd = model.lwd, 247 | lty = model.lty, col = model.col) 248 | 249 | axis(labels = as.character(ratios.theoric$CNt), side = 4, 250 | line = 0, las = 1, at = ratios.theoric$depth.ratio) 251 | mtext(text = "Copy number", side = 4, line = 2, 252 | cex = par("cex.lab") * par("cex")) 253 | } 254 | } 255 | par(xaxt = "s") 256 | axis(labels = as.character(round(seq(xlim[1] / 1e6, xlim[2] / 1e6, 257 | by = x.chr.space), 0)), 258 | side = 1, line = 0, at = seq(xlim[1], xlim[2], by = 1e6 * x.chr.space), 259 | outer = FALSE, cex = par("cex.axis") * par("cex")) 260 | mtext("Position (Mb)", side = 1, line = 3, outer = FALSE, 261 | cex = par("cex.lab") * par("cex")) 262 | mtext(main, 3, outer = TRUE, cex = par("cex.main") * par("cex"), line = 2) 263 | } 264 | 265 | genome.view <- function(seg.cn, info.type = "AB", ...) { 266 | chr.order <- unique(seg.cn$chromosome) 267 | seg.list <- split(x = seg.cn[, 268 | c("chromosome", "start.pos", "end.pos", "A", "B", "CNt")], 269 | f = seg.cn$chromosome) 270 | 271 | seg.list <- seg.list[order(order(chr.order))] 272 | seg.max <- lapply(X = seg.list, FUN = function(x) x[nrow(x), "end.pos" ]) 273 | seg.pos <- lapply(seg.list, "[", TRUE, c("start.pos", "end.pos")) 274 | seg.max <- cumsum(as.numeric(do.call(rbind, seg.max))) 275 | chr.offset <- 0 276 | for (i in 1:length(seg.pos)){ 277 | seg.pos[[i]] <- seg.pos[[i]] + chr.offset 278 | colnames(seg.pos[[i]]) <- c("abs.start", "abs.end") 279 | chr.offset <- seg.max[i] 280 | } 281 | seg.max <- sapply(X = seg.pos, FUN = function(x) x[nrow(x), "abs.end" ]) 282 | abs.list <- mapply(cbind, seg.list, seg.pos, SIMPLIFY = FALSE) 283 | abs.segments <- do.call(rbind, abs.list) 284 | if (info.type == "AB") { 285 | na_As <- is.na(abs.segments$A) 286 | max_A <- max(abs.segments$A, na.rm = TRUE) 287 | abs.segments$A[na_As] <- abs.segments$CNt[na_As] 288 | plot(x = c(min(abs.segments$abs.start), max(abs.segments$abs.end)), 289 | y = c(-0.1, (max_A + 0.1)), type = "n", 290 | ylab = "Copy number", xlab = "Position (Mb)", 291 | xaxt = "n", yaxt = "n", xaxs = "i", ...) 292 | axis(labels = 0:max_A, at = 0:max_A, side = 2, line = 0, las = 1) 293 | segments(x0 = abs.segments$abs.start, x1 = abs.segments$abs.end, 294 | y0 = (abs.segments$B - 0.1), y1 = (abs.segments$B - 0.1), 295 | col = "blue", lwd = 5, lend = 1) 296 | segments(x0 = abs.segments$abs.start, x1 = abs.segments$abs.end, 297 | y0 = (abs.segments$A + 0.1), y1 = (abs.segments$A + 0.1), 298 | col = "red", lwd = 5, lend = 1) 299 | } else { 300 | min_CNt <- min(abs.segments$CNt, na.rm = TRUE) 301 | max_CNt <- max(abs.segments$CNt, na.rm = TRUE) 302 | plot(x = c(min(abs.segments$abs.start), max(abs.segments$abs.end)), 303 | y = c(min_CNt, max_CNt), type = "n", 304 | ylab = "Copy number", xlab = "Position (Mb)", 305 | xaxt = "n", yaxt = "n", xaxs = "i", ...) 306 | axis(labels = min_CNt:max_CNt, 307 | at = min_CNt:max_CNt, 308 | side = 2, line = 0, las = 1) 309 | segments(x0 = abs.segments$abs.start, x1 = abs.segments$abs.end, 310 | y0 = abs.segments$CNt, y1 = abs.segments$CNt, col = "red", 311 | lwd = 5, lend = 1) 312 | } 313 | abline(v = c(0, seg.max), lty = 3) 314 | for (i in 1:length(abs.list)){ 315 | max.pos <- nrow(abs.list[[i]]) 316 | mtext(chr.order[i], side = 3, line = 0, 317 | at = sum(abs.list[[i]]$abs.start[1], 318 | abs.list[[i]]$abs.end[max.pos]) / 2) 319 | } 320 | axis(labels = as.character(round(seq(abs.list[[1]]$start.pos[1] / 1e6, 321 | abs.list[[1]]$end.pos[nrow(abs.list[[1]])] / 1e6, by = 50), 0)), 322 | at = seq(abs.list[[1]]$abs.start[1], 323 | abs.list[[1]]$abs.end[nrow(abs.list[[1]])], by = 5e7), 324 | outer = FALSE, cex = par("cex.axis") * par("cex"), side = 1, 325 | line = 1) 326 | } 327 | 328 | plotRawGenome <- function(sequenza.extract, cellularity, 329 | ploidy, CNt.max = 7, main = "", mirror.BAF = TRUE, ...){ 330 | max.end <- sapply(sequenza.extract$ratio, FUN = function(x) { 331 | max(x$end, na.rm = TRUE) 332 | }) 333 | max.end <- c(0, cumsum(as.numeric(max.end))) 334 | chrs <- names(sequenza.extract$ratio) 335 | coords.names <- (max.end + c(diff(max.end) / 2, 0))[1:length(chrs)] 336 | new.coords <- function(win.list, max.end){ 337 | lapply(1:length(win.list), FUN = function(x) { 338 | y <- win.list[[x]] 339 | y$start <- y$start + max.end[x] 340 | y$end <- y$end + max.end[x] 341 | y 342 | } 343 | )} 344 | new.coords.segs <- function(segs, max.end){ 345 | lapply(1:length(segs), FUN = function(x) { 346 | y <- segs[[x]] 347 | y$start.pos <- y$start.pos + max.end[x] 348 | y$end.pos <- y$end.pos + max.end[x] 349 | y 350 | } 351 | )} 352 | 353 | ratio.new <- new.coords(sequenza.extract$ratio, max.end) 354 | BAF.new <- new.coords(sequenza.extract$BAF, max.end) 355 | 356 | segs.new <- do.call(rbind, 357 | new.coords.segs(sequenza.extract$segments, max.end)) 358 | avg.depth.ratio <- 1 359 | 360 | par(mar = c(1, 4, 0, 3), oma = c(5, 0, 4, 0), mfcol = c(2, 1), ...) 361 | 362 | if (mirror.BAF) { 363 | AAF.new <- lapply(BAF.new, function(x) { 364 | x[, 3:5] <- 1 - x[, 3:5] 365 | x 366 | }) 367 | plot(x = c(min(max.end), max(max.end)), y = c(0, 1), main = main, 368 | xlab = NA, ylab = "Allele frequency", type = "n", las = 1, 369 | xaxs = "i", yaxs = "i", xaxt = "n" ) 370 | plotWindows(seqz.window = do.call(rbind, AAF.new), 371 | q.bg = "lightblue", m.col = "black", add = T) 372 | segments(x0 = segs.new$start.pos, x1 = segs.new$end.pos, 373 | y0 = 1 - (segs.new$Bf), y1 = 1 - (segs.new$Bf), 374 | col = "red", lwd = 2, lend = 1) 375 | } else { 376 | plot(x = c(min(max.end), max(max.end)), y = c(0, 0.5), main = main, 377 | xlab = NA, ylab = "B allele frequency", type = "n", las = 1, 378 | xaxs = "i", yaxs = "i", xaxt = "n" ) 379 | } 380 | plotWindows(seqz.window = do.call(rbind, BAF.new), q.bg = "lightblue", 381 | m.col = "black", add = T) 382 | segments(x0 = segs.new$start.pos, x1 = segs.new$end.pos, 383 | y0 = segs.new$Bf, y1 = segs.new$Bf, col = "red", lwd = 2, lend = 1) 384 | abline(v = max.end, lty = 1) 385 | plot(x = c(min(max.end), max(max.end)), y = c(0, 2.5), main = "", 386 | xlab = NA, ylab = "Depth ratio", type = "n", las = 1, xaxs = "i", 387 | yaxs = "i", xaxt = "n") 388 | plotWindows(seqz.window = do.call(rbind, ratio.new), q.bg = "lightblue", 389 | m.col = "black", add = T) 390 | segments(x0 = segs.new$start.pos, x1 = segs.new$end.pos, 391 | y0 = (segs.new$depth.ratio), y1 = (segs.new$depth.ratio), 392 | col = "red", lwd = 2, lend = 1) 393 | if (!missing(ploidy) & !missing(cellularity)){ 394 | types <- baf.types.matrix(CNt.min = 0, CNt.max = CNt.max, CNn = 2) 395 | depth.ratios <- baf.model.points(cellularity = cellularity, 396 | ploidy = ploidy, avg.depth.ratio = avg.depth.ratio, 397 | baf_types = types)[, "depth.ratio"] 398 | depth.ratios <- unique(data.frame(CNt = types$CNt, 399 | ratio = depth.ratios)) 400 | abline(h = depth.ratios$ratio, lty = 2) 401 | axis(labels = as.character(depth.ratios$CNt), side = 4, 402 | line = 0, las = 1, at = depth.ratios$ratio) 403 | mtext(text = "Copy number", side = 4, line = 2, 404 | cex = par("cex.lab") * par("cex")) 405 | } 406 | abline(v = max.end, lty = 1) 407 | axis(labels = chrs, at = coords.names, side = 1, cex.axis = 1) 408 | } 409 | 410 | baf.model.view <- function(cellularity, ploidy, segs, 411 | BAF.space = seq(0.001, 0.5, 0.005), ratio.space = seq(0.01, 2.5, 0.05), 412 | avg.depth.ratio = 1, CNt.max = 7, segment.filter = 3e6, col = "black") { 413 | s.b <- mean(segs$sd.BAF, na.rm = TRUE) 414 | s.r <- mean(segs$sd.ratio, na.rm = TRUE) 415 | l.s <- segs$end.pos - segs$start.pos 416 | s.big <- l.s >= segment.filter 417 | test.values <- expand.grid(Bf = BAF.space, ratio = ratio.space, 418 | KEEP.OUT.ATTRS = FALSE) 419 | both.space <- baf.bayes(Bf = test.values$Bf, CNt.max = CNt.max, 420 | CNt.min = 0, depth.ratio = test.values$ratio, 421 | cellularity = cellularity, ploidy = ploidy, 422 | avg.depth.ratio = avg.depth.ratio, sd.Bf = s.b, weight.Bf = 10, 423 | sd.ratio = s.r, weight.ratio = 10, ratio.priority = F, CNn = 2) 424 | both.space <- as.data.frame(both.space) 425 | z <- tapply(both.space$LPP, list(test.values$Bf, test.values$ratio), mean) 426 | x <- as.numeric(rownames(z)) 427 | y <- as.numeric(colnames(z)) 428 | t <- baf.types.matrix(CNt.min = 0, CNt.max = CNt.max, CNn = 2) 429 | mpts <- cbind(t, baf.model.points(cellularity = cellularity, 430 | ploidy = ploidy, baf_types = t, avg.depth.ratio = avg.depth.ratio) 431 | ) 432 | mpts <- unique(mpts[, c("CNt", "depth.ratio")]) 433 | par(mar = c(5.1, 4.1, 4.1, 4.1)) 434 | rev.heat <- function(...){rev(heat.colors(...))} 435 | suppressWarnings(colorgram(x, y, z, key = NA, nz = 1000, 436 | xlab = "B allele frequency", ylab = "Depth ratio", 437 | main = paste("cellularity:", cellularity, "ploidy:", ploidy, 438 | "sd.BAF:", round(s.b, 2), sep = " "), 439 | map = makecmap(z, breaks = unique(quantile(z, seq(0.25, 1, 0.0001))), 440 | right = TRUE, n = 1000, colFn = rev.heat), outlier = "white", 441 | las = 1, xlim = c(0, 0.5))) 442 | axis(side = 4, at = mpts$depth.ratio, labels = mpts$CNt, las = 1) 443 | mtext(text = "Copy number", side = 4, line = 2) 444 | segs$col <- col 445 | points(x = segs$Bf[s.big], y = segs$depth.ratio[s.big], 446 | pch = 1, cex = 1, col = segs$col[s.big]) 447 | points(x = segs$Bf[!s.big], y = segs$depth.ratio[!s.big], pch = ".", 448 | cex = 1, col = segs$col[!s.big]) 449 | } 450 | -------------------------------------------------------------------------------- /R/misc.R: -------------------------------------------------------------------------------- 1 | dt2 <- function(x, df, ncp, log = FALSE, mean, sd) { 2 | x2 <- (x - mean) / sd 3 | dt(x2, df = df, ncp = ncp, log = log) 4 | } 5 | 6 | split_chr_coord <- function (x) { 7 | # Ensure that there is a start and a end coordinate 8 | split_chr <- strsplit(x, split = ":")[[1]] 9 | chromosome <- split_chr[1] 10 | start_end <- split_chr[2] 11 | split_coors <- strsplit(start_end, split = "-")[[1]] 12 | start <- split_coors[1] 13 | end <- split_coors[2] 14 | if (is.na(start)) { 15 | start <- 1 16 | } 17 | if (is.na(end)){ 18 | end <- 2147483647 19 | } 20 | paste0(chromosome, ":", start, "-", end) 21 | } 22 | 23 | weighted.median <- function(x, w, na.rm=TRUE, ties=NULL) { 24 | if (missing(w)) { 25 | w <- rep(1, length(x)) 26 | } 27 | if (na.rm == TRUE) { 28 | keep <- !(is.na(x) | is.na(w)); 29 | x <- x[keep] 30 | w <- w[keep] 31 | } else if (any(is.na(x))) { 32 | return(NA) 33 | } 34 | 35 | if (any(w < 0)) { 36 | stop("The weight vactor can only contains positive numbers") 37 | } 38 | n <- length(w) 39 | keep <- (w > 0) 40 | nkeep <- sum(keep) 41 | if (nkeep < n) { 42 | x <- x[keep] 43 | w <- w[keep] 44 | n <- nkeep 45 | } 46 | wInfs <- is.infinite(w) 47 | if (any(wInfs)) { 48 | x <- x[wInfs] 49 | n <- length(x) 50 | w <- rep(1, n) 51 | } 52 | 53 | if (n == 0) { 54 | return(NA) 55 | } 56 | ord <- order(x) 57 | x <- x[ord] 58 | w <- w[ord] 59 | wcum <- cumsum(w) 60 | wsum <- wcum[n] 61 | wmid <- wsum / 2 62 | lows <- (wcum <= wmid) 63 | k <- sum(lows) 64 | 65 | if (k == 0) { 66 | return(x[1]) 67 | } 68 | if (k == n) { 69 | return(x[n]) 70 | } 71 | 72 | wlow <- wcum[k] 73 | whigh <- wsum - wlow 74 | if (whigh > wmid) { 75 | return(x[k + 1]) 76 | } 77 | (wlow * x[k] + whigh * x[k + 1]) / wsum 78 | } 79 | -------------------------------------------------------------------------------- /R/model.R: -------------------------------------------------------------------------------- 1 | theoretical.depth.ratio <- function(CNt, cellularity, ploidy, CNn = 2, 2 | normal.ploidy = 2, avg.depth.ratio = 1) { 3 | cellu_copy_term <- (1 - cellularity) + (CNt / CNn * cellularity) 4 | ploidy_cellu_term <- (ploidy / normal.ploidy * cellularity) + 5 | 1 - cellularity 6 | avg.depth.ratio * cellu_copy_term / ploidy_cellu_term 7 | } 8 | 9 | theoretical.baf <- function(CNt, B, cellularity, CNn = 2) { 10 | baf <- ( (B * cellularity) + ( 1 - cellularity) ) / 11 | ( (CNt * cellularity) + CNn * ( 1 - cellularity) ) 12 | baf[CNn <= 1] <- NA 13 | baf 14 | } 15 | 16 | theoretical.mufreq <- function(CNt, Mt, cellularity, CNn = 2) { 17 | normal_alleles <- (CNt - Mt) * cellularity + CNn * (1 - cellularity) 18 | all_alleles <- (CNt * cellularity) + CNn * (1 - cellularity) 19 | 1 - (normal_alleles / all_alleles) 20 | } 21 | 22 | baf.types.matrix <- function(CNt.min, CNt.max, CNn = 2) { 23 | cn_ratio_vect <- seq(from = CNt.min / CNn, to = CNt.max / CNn, 24 | by = 1 / CNn) 25 | CNt <- cn_ratio_vect * CNn 26 | if (CNn < 2) { 27 | b_comb <- lapply(CNt, FUN = function(x) 0) 28 | } else { 29 | b_comb <- lapply(CNt, FUN = function(x) { 30 | seq(from = 0, to = trunc(x / 2)) 31 | }) 32 | } 33 | times_b <- sapply(b_comb, length) 34 | CNt <- rep(CNt, times = times_b) 35 | B <- unlist(b_comb) 36 | data.frame(CNn = CNn, CNt = CNt, B = B) 37 | } 38 | 39 | mufreq.types.matrix <- function(CNt.min, CNt.max, CNn = 2) { 40 | cn_ratio_vect <- seq(from = CNt.min / CNn, 41 | to = CNt.max / CNn, by = 1 / CNn) 42 | CNt <- cn_ratio_vect * CNn 43 | mut_comb <- lapply(CNt, FUN = function(x) seq(from = 0, to = x)) 44 | times_muts <- sapply(mut_comb, length) 45 | data.frame(CNn = CNn, CNt = rep(CNt, times = times_muts), 46 | Mt = unlist(mut_comb)) 47 | } 48 | 49 | baf.model.points <- function(cellularity, ploidy, baf_types, avg.depth.ratio) { 50 | depth_ratio <- theoretical.depth.ratio(cellularity = cellularity, 51 | ploidy = ploidy, CNn = baf_types[, "CNn"], CNt = baf_types[, "CNt"], 52 | avg.depth.ratio = avg.depth.ratio) 53 | baf <- theoretical.baf(cellularity = cellularity, CNn = baf_types[, "CNn"], 54 | CNt = baf_types[, "CNt"], B = baf_types[, "B"]) 55 | data.frame(BAF = baf, depth.ratio = depth_ratio) 56 | } 57 | 58 | mufreq.model.points <- function(cellularity, ploidy, mufreq_types, 59 | avg.depth.ratio) { 60 | mufreqs <- theoretical.mufreq(cellularity = cellularity, 61 | CNn = mufreq_types[, "CNn"], CNt = mufreq_types[, "CNt"], 62 | Mt = mufreq_types[, "Mt"]) 63 | depth_ratio <- theoretical.depth.ratio(cellularity = cellularity, 64 | ploidy = ploidy, CNn = mufreq_types[, "CNn"], 65 | CNt = mufreq_types[, "CNt"], avg.depth.ratio = avg.depth.ratio) 66 | data.frame(mufreqs = mufreqs, depth.ratio = depth_ratio) 67 | } 68 | 69 | b_allele_freq <- function(Af, Bf, good.reads, conf = 0.95) { 70 | if (length(Bf) > 1) { 71 | dd <- density(c(Bf, Af), weight = c(good.reads, good.reads) / 72 | (2 * sum(good.reads))) 73 | points.max <- which(diff(sign(diff(dd$y))) == -2) + 1 74 | if (length(points.max) < 1) { 75 | points.max <- which(diff(sign(diff(dd$y))) == -1) + 1 76 | } 77 | l.max <- dd$x[points.max] 78 | d.max <- dd$y[points.max] 79 | b.val <- l.max[which.max(dd$y[dd$x %in% l.max])] 80 | if (length(b.val) < 1) { 81 | message('WARNING', l.max, d.max, b.val) 82 | b.val <- min(l.max) 83 | } 84 | d.val <- d.max[which(l.max == b.val)] 85 | b.range <- range(dd$x[dd$y >= d.val - (d.val * (1 - conf))]) 86 | if (b.val > 0.5) { 87 | b.val <- 1 - b.val 88 | } 89 | max_diff <- max(b.range) - b.val 90 | min_diff <- b.val - min(b.range) 91 | min_diff <- min(c(max_diff, min_diff)) 92 | c(b.val - min_diff, b.val, b.val + min_diff) 93 | } else if (length(Bf) == 1) { 94 | c(Bf, Bf, Bf) 95 | } else { 96 | c(NA, NA, NA) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /R/mutations.R: -------------------------------------------------------------------------------- 1 | mut.fractions <- function(AB.tumor, Af, tumor.strand) { 2 | F = 1 - Af 3 | base.mut <- lapply(X = AB.tumor, FUN = function(x) { 4 | unlist(strsplit(as.character(x), split = "[:]")) 5 | }) 6 | base.fw <- lapply(X = tumor.strand, FUN = function(x) { 7 | unlist(strsplit(as.character(x), split = "[:]")) 8 | }) 9 | frequencify <- function (x) { 10 | base.name <- substr(unlist(x), 1, 1) 11 | base.val <- as.numeric(substr(unlist(x), 2, nchar(x))) 12 | setNames(base.val, base.name) 13 | } 14 | base.freqs <- lapply(X = base.mut, FUN = frequencify) 15 | fw.freqs <- lapply(X = base.fw, FUN = frequencify) 16 | n.base.mut <- do.call(c, lapply(X = base.mut, FUN = length)) 17 | max.fq <- function (x) { 18 | freq.rel <- base.freqs[[x]] / F[x] 19 | f.max <- which.max(freq.rel) 20 | c(freq.rel[f.max], names(base.freqs[[x]])[f.max], 21 | base.freqs[[x]][f.max], fw.freqs[[x]][f.max]) 22 | } 23 | max.freqs <- do.call(rbind, lapply(1:length(F), max.fq)) 24 | data.frame(base.count = as.integer(n.base.mut), 25 | maj.base.freq = as.numeric(max.freqs[, 1]), 26 | base = as.character(max.freqs[, 2]), 27 | freq = as.numeric(max.freqs[, 3]), 28 | fw.freq = as.numeric(max.freqs[, 4])) 29 | } 30 | 31 | mutation.table <- function(seqz.tab, mufreq.treshold = 0.15, 32 | min.reads = 40, min.reads.normal = 10, max.mut.types = 3, 33 | min.type.freq = 0.9, min.fw.freq = 0, segments = NULL) { 34 | chroms <- unique(seqz.tab$chromosome) 35 | hom.filt <- seqz.tab$zygosity.normal == "hom" & 36 | seqz.tab$AB.tumor != "." 37 | seqz.tab <- seqz.tab[hom.filt, ] 38 | reads.filt <- seqz.tab$good.reads >= min.reads & 39 | seqz.tab$depth.normal >= min.reads.normal 40 | seqz.tab <- seqz.tab[reads.filt, ] 41 | mufreq.filt <- seqz.tab$Af <= (1 - mufreq.treshold) 42 | seqz.tab <- seqz.tab[mufreq.filt, ] 43 | if (!is.null(segments)) { 44 | for (i in 1:nrow(segments)) { 45 | pos.filt <- seqz.tab$chromosome == segments$chromosome[i] & 46 | seqz.tab$position >= segments$start.pos[i] & 47 | seqz.tab$position <= segments$end.pos[i] 48 | seqz.tab$adjusted.ratio[pos.filt] <- segments$depth.ratio[i] 49 | } 50 | } 51 | seqz.dummy <- data.frame(chromosome = chroms, position = 1, 52 | GC.percent = NA, good.reads = NA, adjusted.ratio = NA, 53 | F = 0, mutation = "NA", stringsAsFactors = FALSE) 54 | if (nrow(seqz.tab) >= 1) { 55 | mu.fracts <- mut.fractions(AB.tumor = seqz.tab$AB.tumor, 56 | Af = seqz.tab$Af, tumor.strand = seqz.tab$tumor.strand) 57 | mufreq.filt <- mu.fracts$freq >= mufreq.treshold 58 | type.filt <- mu.fracts$base.count <= max.mut.types 59 | prop.filt <- mu.fracts$maj.base.freq >= min.type.freq 60 | if (!is.na(min.fw.freq)) { 61 | fw.2 <- 1 - min.fw.freq 62 | fw.2 <- sort(c(fw.2, min.fw.freq)) 63 | fw.filt <- mu.fracts$fw.freq > fw.2[1] & 64 | mu.fracts$fw.freq < fw.2[2] 65 | mufreq.filt <- mufreq.filt & type.filt & prop.filt & fw.filt 66 | } else { 67 | mufreq.filt <- mufreq.filt & type.filt & prop.filt 68 | } 69 | mut.type <- paste(seqz.tab$AB.normal, mu.fracts$base, sep = ">") 70 | seqz.tab <- seqz.tab[, c("chromosome", "position", "GC.percent", 71 | "good.reads", "adjusted.ratio")] 72 | seqz.tab <- cbind(seqz.tab, F = mu.fracts$freq, 73 | mutation = mut.type) 74 | rbind(seqz.tab[mufreq.filt, ], seqz.dummy) 75 | } else { 76 | seqz.dummy 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /R/read.R: -------------------------------------------------------------------------------- 1 | read.seqz <- function(file, n_lines = NULL, col_types = "ciciidddcddccc", 2 | chr_name = NULL, buffer = 33554432, parallel = 1, 3 | col_names = c("chromosome", "position", "base.ref", "depth.normal", 4 | "depth.tumor", "depth.ratio", "Af", "Bf", "zygosity.normal", 5 | "GC.percent", "good.reads", "AB.normal", "AB.tumor", 6 | "tumor.strand"), ...) { 7 | 8 | if (is.null(n_lines)) { 9 | skip <- 1 10 | n_max <- Inf 11 | } else { 12 | n_lines <- round(sort(n_lines), 0) 13 | skip <- n_lines[1] 14 | 15 | n_max <- n_lines[2] - skip + 1 16 | } 17 | if (!is.null(chr_name)) { 18 | chr_name <- as.character(chr_name) 19 | tbi <- file.exists(paste(file, "tbi", sep = ".")) 20 | if (tbi) { 21 | read.seqz.tbi(file, split_chr_coord(chr_name), 22 | col_names, col_types) 23 | } else { 24 | read.seqz.chr(file, chr_name = chr_name, col_types = col_types, 25 | col_names = col_names, buffer = buffer, parallel = parallel) 26 | } 27 | } else { 28 | read_tsv(file = file, col_types = col_types, skip = skip, 29 | n_max = n_max, col_names = col_names, progress = FALSE, ...) 30 | } 31 | } 32 | 33 | read.seqz.chr <- function(file, chr_name, col_names, 34 | col_types, buffer, parallel) { 35 | con <- gzfile(file, "rb") 36 | suppressWarnings(skip_line <- readLines(con, n = 1)) 37 | remove(skip_line) 38 | parse_chunck <- function(x, chr_name, col_names, col_types) { 39 | x <- read_tsv(file = paste(mstrsplit(x), collapse = "\n"), 40 | col_types = col_types, skip = 0, n_max = Inf, 41 | col_names = col_names, progress = FALSE) 42 | x[x$chromosome == chr_name, ] 43 | } 44 | res <- chunk.apply(input = con, FUN = parse_chunck, chr_name = chr_name, 45 | col_names = col_names, col_types = col_types, CH.MAX.SIZE = buffer, 46 | parallel = parallel) 47 | close(con) 48 | res 49 | } 50 | 51 | read.seqz.tbi <- function(file, chr_name, col_names, col_types) { 52 | res <- tabix.read(file, chr_name) 53 | res <- read_tsv(file = paste(mstrsplit(res), collapse = "\n"), 54 | col_types = col_types, skip = 0, n_max = Inf, 55 | col_names = col_names, progress = FALSE) 56 | } 57 | -------------------------------------------------------------------------------- /R/results.R: -------------------------------------------------------------------------------- 1 | sequenza.results <- function(sequenza.extract, cp.table = NULL, 2 | sample.id, out.dir = getwd(), cellularity = NULL, ploidy = NULL, 3 | female = TRUE, CNt.max = 20, ratio.priority = FALSE, 4 | XY = c(X = "X", Y = "Y"), chromosome.list = 1:24){ 5 | if(!file.exists(out.dir)) { 6 | dir.ok <- dir.create(path = out.dir, recursive = TRUE) 7 | if(!dir.ok) { 8 | stop("Directory does not exist and cannot be created: ", out.dir) 9 | } 10 | } 11 | make_filename <- function(x){ 12 | file.path(out.dir, paste(sample.id, x, sep = "_")) 13 | } 14 | cp.file <- make_filename("CP_contours.pdf") 15 | cint.file <- make_filename("confints_CP.txt") 16 | chrw.file <- make_filename("chromosome_view.pdf") 17 | depths.file <- make_filename("chromosome_depths.pdf") 18 | gc.file <- make_filename("gc_plots.pdf") 19 | geno.file <- make_filename("genome_view.pdf") 20 | cn.file <- make_filename("CN_bars.pdf") 21 | fit.file <- make_filename("model_fit.pdf") 22 | alt.file <- make_filename("alternative_solutions.txt") 23 | afit.file <- make_filename("alternative_fit.pdf") 24 | muts.file <- make_filename("mutations.txt") 25 | segs.file <- make_filename("segments.txt") 26 | robj.extr <- make_filename("sequenza_extract.RData") 27 | robj.fit <- make_filename("sequenza_cp_table.RData") 28 | log.file <- make_filename("sequenza_log.txt") 29 | seg.tab <- do.call(rbind, sequenza.extract$segments[chromosome.list]) 30 | seg.len <- (seg.tab$end.pos - seg.tab$start.pos) / 1e6 31 | 32 | avg.depth.ratio <- sequenza.extract$avg.depth.ratio 33 | assign(x = paste0(sample.id, "_sequenza_extract"), 34 | value = sequenza.extract) 35 | save(list = paste0(sample.id, "_sequenza_extract"), file = robj.extr) 36 | if (is.null(cp.table) && (is.null(cellularity) || is.null(ploidy))){ 37 | stop("cp.table and/or cellularity and ploidy argument are required.") 38 | } 39 | 40 | pdf(gc.file, width = 10, height = 5) 41 | par(mfrow=c(1, 2)) 42 | gc.summary.plot(sequenza.extract$gc$normal, mean.col = "lightsalmon", 43 | median.col = "lightgreen", las = 1, xlab = "GC %", ylab = "Depth", 44 | zlab = "N", main = "GC vs raw depth in the normal sample") 45 | gc.summary.plot(sequenza.extract$gc_norm$normal, mean.col = "lightsalmon", 46 | median.col = "lightgreen", las = 1, xlab = "GC %", ylab = "Depth", 47 | zlab = "N", main = "GC vs normalized depth in the normal sample") 48 | gc.summary.plot(sequenza.extract$gc$tumor, mean.col = "lightsalmon", 49 | median.col = "lightgreen", las = 1, xlab = "GC %", ylab = "Depth", 50 | zlab = "N", main = "GC vs raw depth in the tumor sample") 51 | gc.summary.plot(sequenza.extract$gc_norm$tumor, mean.col = "lightsalmon", 52 | median.col = "lightgreen", las = 1, xlab = "GC %", ylab = "Depth", 53 | zlab = "N", main = "GC vs normalized depth in the tumor sample") 54 | dev.off() 55 | pdf(depths.file, height = 10, width = 15) 56 | for (i in unique(seg.tab$chromosome)) { 57 | max_coord_chr_i <- max(sequenza.extract$ratio[[i]]$end) 58 | par(mfcol = c(3, 2), xaxt = "n", mar = c(0, 4, 3, 0), oma = c(5, 0, 4, 0)) 59 | plotWindows(sequenza.extract$depths$raw$normal[[i]], 60 | ylab = "normal depth", ylim = c(0, 2.5), 61 | main = paste("raw", i, sep = " ")) 62 | plotWindows(sequenza.extract$depths$raw$tumor[[i]], 63 | ylab = "tumor depth", ylim = c(0, 2.5)) 64 | plotWindows(sequenza.extract$raw_ratio[[i]], 65 | ylab = "depth ratio", ylim = c(0, 2.5)) 66 | par(xaxt = "s") 67 | axis(labels = as.character(round(seq(0, max_coord_chr_i / 1e6, 68 | by = 10), 0)), 69 | side = 1, line = 0, at = seq(0, max_coord_chr_i, by = 1e7), 70 | outer = FALSE, cex = par("cex.axis") * par("cex")) 71 | mtext("Position (Mb)", side = 1, line = 3, outer = FALSE, 72 | cex = par("cex.lab") * par("cex")) 73 | par(xaxt = "n") 74 | plotWindows(sequenza.extract$depths$norm$normal[[i]], 75 | ylab = "normal depth", ylim = c(0, 2.5), 76 | main = paste("normalized", i, sep = " ")) 77 | plotWindows(sequenza.extract$depths$norm$tumor[[i]], 78 | ylab = "tumor depth", ylim = c(0, 2.5)) 79 | plotWindows(sequenza.extract$ratio[[i]], 80 | ylab = "depth ratio", ylim = c(0, 2.5)) 81 | par(xaxt = "s") 82 | axis(labels = as.character(round(seq(0, max_coord_chr_i / 1e6, 83 | by = 10), 0)), 84 | side = 1, line = 0, at = seq(0, max_coord_chr_i, by = 1e7), 85 | outer = FALSE, cex = par("cex.axis") * par("cex")) 86 | mtext("Position (Mb)", side = 1, line = 3, outer = FALSE, 87 | cex = par("cex.lab") * par("cex")) 88 | } 89 | dev.off() 90 | 91 | if (!is.null(cp.table)){ 92 | assign(x = paste0(sample.id, "_sequenza_cp_table"), value = cp.table) 93 | save(list = paste0(sample.id, "_sequenza_cp_table"), file = robj.fit) 94 | cint <- get.ci(cp.table) 95 | pdf(cp.file) 96 | cp.plot(cp.table) 97 | cp.plot.contours(cp.table, add = TRUE, 98 | likThresh = c(0.95), col = "red", pch = 20) 99 | if (!is.null(cellularity) || !is.null(ploidy)) { 100 | if (is.null(cellularity)) { 101 | cellularity <- cint$max.cellularity 102 | } 103 | if (is.null(ploidy)) { 104 | ploidy <- cint$max.ploidy 105 | } 106 | points(x = ploidy, y = cellularity, pch = 5) 107 | text(x = ploidy, y = cellularity, labels = "User selection", 108 | pos = 3, offset = 0.5) 109 | } else { 110 | cellularity <- cint$max.cellularity 111 | ploidy <- cint$max.ploidy 112 | } 113 | dev.off() 114 | } 115 | mut.tab <- na.exclude(do.call(rbind, 116 | sequenza.extract$mutations[chromosome.list])) 117 | if (female){ 118 | segs.is.xy <- seg.tab$chromosome == XY["Y"] 119 | mut.is.xy <- mut.tab$chromosome == XY["Y"] 120 | } else{ 121 | segs.is.xy <- seg.tab$chromosome %in% XY 122 | mut.is.xy <- mut.tab$chromosome %in% XY 123 | } 124 | avg.sd.ratio <- sum(seg.tab$sd.ratio * seg.tab$N.ratio, na.rm = TRUE) / 125 | sum(seg.tab$N.ratio, na.rm = TRUE) 126 | avg.sd.Bf <- sum(seg.tab$sd.BAF * seg.tab$N.BAF, na.rm = TRUE) / 127 | sum(seg.tab$N.BAF, na.rm = TRUE) 128 | cn.alleles <- baf.bayes(Bf = seg.tab$Bf[!segs.is.xy], CNt.max = CNt.max, 129 | depth.ratio = seg.tab$depth.ratio[!segs.is.xy], 130 | cellularity = cellularity, ploidy = ploidy, 131 | avg.depth.ratio = avg.depth.ratio, 132 | sd.ratio = seg.tab$sd.ratio[!segs.is.xy], 133 | weight.ratio = seg.len[!segs.is.xy], 134 | sd.Bf = seg.tab$sd.BAF[!segs.is.xy], 135 | weight.Bf = 1, ratio.priority = ratio.priority, CNn = 2) 136 | seg.res <- cbind(seg.tab[!segs.is.xy, ], cn.alleles) 137 | if (!female){ 138 | if (sum(segs.is.xy) >= 1) { 139 | cn.alleles <- baf.bayes(Bf = NA, CNt.max = CNt.max, 140 | depth.ratio = seg.tab$depth.ratio[segs.is.xy], 141 | cellularity = cellularity, ploidy = ploidy, 142 | avg.depth.ratio = avg.depth.ratio, 143 | sd.ratio = seg.tab$sd.ratio[segs.is.xy], 144 | weight.ratio = seg.len[segs.is.xy], sd.Bf = NA, 145 | weight.Bf = NA, ratio.priority = ratio.priority, CNn = 1) 146 | seg.xy <- cbind(seg.tab[segs.is.xy, ], cn.alleles) 147 | seg.res <- rbind(seg.res, seg.xy) 148 | } 149 | } 150 | write.table(seg.res, file = segs.file, col.names = TRUE, 151 | row.names = FALSE, sep = "\t", quote = FALSE) 152 | if (nrow(mut.tab) > 0) { 153 | mut.alleles <- mufreq.bayes(mufreq = mut.tab$F[!mut.is.xy], 154 | CNt.max = CNt.max, 155 | depth.ratio = mut.tab$adjusted.ratio[!mut.is.xy], 156 | cellularity = cellularity, ploidy = ploidy, 157 | avg.depth.ratio = avg.depth.ratio, CNn = 2) 158 | mut.res <- cbind(mut.tab[!mut.is.xy, ], mut.alleles) 159 | if (!female){ 160 | if (sum(mut.is.xy) >= 1) { 161 | mut.alleles <- mufreq.bayes(mufreq = mut.tab$F[mut.is.xy], 162 | CNt.max = CNt.max, 163 | depth.ratio = mut.tab$adjusted.ratio[mut.is.xy], 164 | cellularity = cellularity, ploidy = ploidy, 165 | avg.depth.ratio = avg.depth.ratio, CNn = 1) 166 | mut.xy <- cbind(mut.tab[mut.is.xy, ], mut.alleles) 167 | mut.res <- rbind(mut.res, mut.xy) 168 | } 169 | } 170 | write.table(mut.res, file = muts.file, col.names = TRUE, 171 | row.names = FALSE, sep = "\t", quote = FALSE) 172 | } 173 | pdf(chrw.file) 174 | for (i in unique(seg.res$chromosome)) { 175 | if (!female && i %in% XY){ 176 | CNn <- 1 177 | } else { 178 | CNn <- 2 179 | } 180 | chromosome.view(mut.tab = sequenza.extract$mutations[[i]], 181 | baf.windows = sequenza.extract$BAF[[i]], 182 | ratio.windows = sequenza.extract$ratio[[i]], 183 | cellularity = cellularity, ploidy = ploidy, main = i, 184 | segments = seg.res[seg.res$chromosome == i, ], 185 | avg.depth.ratio = avg.depth.ratio, CNn = CNn, min.N.ratio = 1) 186 | } 187 | dev.off() 188 | pdf(geno.file, height = 5, width = 15) 189 | if (sum(!is.na(seg.res$A)) > 0) { 190 | genome.view(seg.res) 191 | } 192 | genome.view(seg.res, "CN") 193 | plotRawGenome(sequenza.extract, cellularity = cellularity, ploidy = ploidy, 194 | mirror.BAF = TRUE) 195 | dev.off() 196 | barscn <- data.frame(size = seg.res$end.pos - seg.res$start.pos, 197 | CNt = seg.res$CNt) 198 | cn.sizes <- split(barscn$size, barscn$CNt) 199 | cn.sizes <- sapply(cn.sizes, "sum") 200 | pdf(cn.file) 201 | barplot(round(cn.sizes / sum(cn.sizes) * 100), names = names(cn.sizes), 202 | las = 1, ylab = "Percentage (%)", xlab = "Copy number") 203 | dev.off() 204 | 205 | ## Write down the results.... ploidy etc... 206 | if (!is.null(cp.table)){ 207 | res.tab <- data.frame(cellularity = c(cint$confint.cellularity[1], 208 | cint$max.cellularity[1], cint$confint.cellularity[2]), 209 | ploidy.estimate = c(cint$confint.ploidy[1], cint$max.ploidy[1], 210 | cint$confint.ploidy[2]), 211 | ploidy.mean.cn = weighted.mean(x = as.integer(names(cn.sizes)), 212 | w = cn.sizes)) 213 | write.table(res.tab, cint.file, col.names = TRUE, 214 | row.names = FALSE, sep = "\t", quote = FALSE) 215 | } 216 | pdf(fit.file, width = 6, height = 6) 217 | baf.model.view(cellularity = cellularity, ploidy = ploidy, 218 | segs = seg.res[!segs.is.xy, ]) 219 | dev.off() 220 | if (!is.null(cp.table)){ 221 | alt.sol <- alternative.cp.solutions(cp.table) 222 | write.table(alt.sol, file = alt.file, col.names = TRUE, 223 | row.names = FALSE, sep = "\t", quote = FALSE) 224 | pdf(afit.file) 225 | for (sol in 1:nrow(alt.sol)){ 226 | baf.model.view(cellularity = alt.sol$cellularity[sol], 227 | ploidy = alt.sol$ploidy[sol], segs = seg.res[!segs.is.xy, ]) 228 | } 229 | dev.off() 230 | } 231 | file_conn <- file(log.file) 232 | writeLines(c(date(), paste("Sequenza version:", 233 | packageVersion("sequenza"), sep = " ")), file_conn) 234 | close(file_conn) 235 | } 236 | -------------------------------------------------------------------------------- /R/segments.R: -------------------------------------------------------------------------------- 1 | extract_breaks <- function(data, data_het, ratio, baf, breaks, gamma, kmin, 2 | gamma.pcf, kmin.pcf, assembly, chromosome, 3 | method = c("het", "full", "fast")) { 4 | method_list <- c("het", "full", "fast") 5 | if (is.null(breaks)) { 6 | if (method %in% method_list) { 7 | if (method == "fast"){ 8 | breaks <- breaks_fast(ratio_win = ratio, baf_win = baf, 9 | gamma = gamma, kmin = kmin, chr = chromosome) 10 | } else { 11 | breaks <- breaks_het(data = data_het, gamma = gamma, 12 | kmin = kmin, assembly = assembly) 13 | } 14 | if (method == "full") { 15 | breaks <- breaks_full(data = data, gamma = gamma.pcf, 16 | kmin = kmin.pcf, assembly, breaks.het = breaks) 17 | } 18 | } else { 19 | stop("Available methods are \'full\', \'het\' and \'fast\'.") 20 | } 21 | } 22 | breaks 23 | } 24 | 25 | 26 | breaks_het <- function(data, gamma, kmin, assembly){ 27 | try( 28 | find.breaks(data, gamma = gamma, assembly = assembly, 29 | kmin = kmin, baf.thres = c(0, 0.5)), 30 | silent = FALSE) 31 | } 32 | 33 | breaks_full <- function(data, gamma, kmin, 34 | assembly, breaks.het = NULL) { 35 | merge.breaks <- function (breaks, breaks.het) { 36 | merged.breaks <- unique(sort(c(breaks$start.pos, 37 | breaks$end.pos, breaks.het$start.pos, breaks.het$end.pos))) 38 | merged.breaks <- merged.breaks[diff(merged.breaks) > 1] 39 | merged.start <- merged.breaks 40 | merged.start[-1] <- merged.start[-1] + 1 41 | breaks <- data.frame(chrom = unique(breaks$chrom), 42 | start.pos = merged.start[- (length(merged.start))], 43 | end.pos = merged.breaks[-1]) 44 | } 45 | breaks <- find.breaks(data, gamma = gamma, kmin = kmin, 46 | assembly = assembly, seg.algo = "pcf") 47 | if (!is.null(breaks.het)) { 48 | chr.p <- merge.breaks(breaks[breaks$arm == "p", ], 49 | breaks.het[breaks.het$arm == "p", ]) 50 | chr.q <- merge.breaks(breaks[breaks$arm == "q", ], 51 | breaks.het[breaks.het$arm == "q", ]) 52 | breaks <- rbind(chr.p, chr.q) 53 | } 54 | breaks 55 | } 56 | 57 | breaks_fast <- function(ratio_win, baf_win, chr, gamma, kmin) { 58 | BAF <- data.frame(chrom = chr, 59 | pos = c(baf_win[[1]]$start, tail(baf_win[[1]]$end, n = 1)), 60 | s1 = c(baf_win[[1]]$mean, tail(baf_win[[1]]$mean, n = 1))) 61 | logR <- data.frame(chrom = chr, 62 | pos = c(ratio_win[[1]]$start, tail(ratio_win[[1]]$end, n = 1)), 63 | s1 = c(log2(ratio_win[[1]]$mean), 64 | log2(tail(ratio_win[[1]]$mean, n = 1)))) 65 | cat(nrow(BAF), nrow(logR), "\n") 66 | not.cover <- is.na(logR$s1) & is.na(BAF$s1) 67 | BAF <- BAF[!not.cover, ] 68 | logR <- logR[!not.cover, ] 69 | logR.wins <- copynumber::winsorize(logR, verbose = FALSE) 70 | allele.seg <- copynumber::aspcf(logR = logR.wins, BAF = BAF, 71 | baf.thres = c(0, 0.5), verbose = FALSE, 72 | gamma = gamma, kmin = kmin) 73 | if (length(grep("^chr", chr)) > 0) { 74 | allele.seg$chrom <- paste0("chr", allele.seg$chrom) 75 | } 76 | breaks <- allele.seg[, c("chrom", "start.pos", "end.pos")] 77 | not.uniq <- which(breaks$end.pos == c(breaks$start.pos[-1], 0)) 78 | breaks$end.pos[not.uniq] <- breaks$end.pos[not.uniq] - 1 79 | breaks 80 | } 81 | 82 | 83 | find.breaks <- function(seqz.baf, gamma = 80, kmin = 10, 84 | baf.thres = c(0, 0.5), verbose = FALSE, seg.algo = "aspcf", ...) { 85 | chromosome <- gsub(x = seqz.baf$chromosome, 86 | pattern = "chr", replacement = "") 87 | logR = data.frame(chrom = chromosome, 88 | pos = seqz.baf$position, 89 | s1 = log2(seqz.baf$adjusted.ratio)) 90 | logR.wins <- copynumber::winsorize(logR, verbose = verbose) 91 | if (seg.algo == "aspcf"){ 92 | BAF = data.frame(chrom = chromosome, 93 | pos = seqz.baf$position, 94 | s1 = seqz.baf$Bf) 95 | allele.seg <- copynumber::aspcf(logR = logR.wins, 96 | BAF = BAF, baf.thres = baf.thres, verbose = verbose, 97 | gamma = gamma, kmin = kmin, ...) 98 | } else if (seg.algo == "pcf") { 99 | allele.seg <- copynumber::pcf(data = logR.wins, verbose = verbose, 100 | gamma = gamma, kmin = kmin, ...) 101 | } else { 102 | stop("Segmentation algorithm must be either \'aspcf\' or \'pcf\'.") 103 | } 104 | if (length(grep("chr", seqz.baf$chromosome)) > 0) { 105 | allele.seg$chrom <- paste("chr", allele.seg$chrom, sep = "") 106 | } 107 | breaks <- allele.seg[, c("chrom", "start.pos", "end.pos", "arm")] 108 | not.uniq <- which(breaks$end.pos == c(breaks$start.pos[-1],0)) 109 | breaks$end.pos[not.uniq] <- breaks$end.pos[not.uniq] - 1 110 | breaks 111 | } 112 | 113 | segment.breaks <- function(seqz.tab, breaks, min.reads.baf = 1, 114 | weighted.mean = TRUE) { 115 | if (weighted.mean){ 116 | w.r <- sqrt(seqz.tab$depth.normal) 117 | rw <- seqz.tab$adjusted.ratio * w.r 118 | w.b <- sqrt(seqz.tab$good.reads) 119 | bw <- seqz.tab$Bf * w.b 120 | seqz.tab <- cbind(seqz.tab[, c("chromosome", "position", 121 | "zygosity.normal", "good.reads", "Af", "Bf")], 122 | rw = rw, w.r = w.r, bw = bw, w.b = w.b) 123 | } 124 | chr.order <- unique(seqz.tab$chromosome) 125 | seqz.tab <- split(seqz.tab, f = seqz.tab$chromosome) 126 | segments <- list() 127 | for (i in 1:length(seqz.tab)) { 128 | seqz.b.i <- seqz.tab[[i]][seqz.tab[[i]]$zygosity.normal == "het", ] 129 | seqz.b.i <- seqz.b.i[seqz.b.i$good.reads >= min.reads.baf, ] 130 | breaks.i <- breaks[breaks$chrom == names(seqz.tab)[i], ] 131 | nb <- nrow(breaks.i) 132 | breaks.vect <- do.call(cbind, split.data.frame(breaks.i[, 133 | c("start.pos", "end.pos")], f = 1:nb)) 134 | unique.breaks <- function(b, offset = 1) { 135 | while(any(diff(b) == 0)) { 136 | b[which(diff(b) == 0) + 1] <- b[diff(b) == 0] + offset 137 | } 138 | b 139 | } 140 | breaks.vect <- unique.breaks(b = as.numeric(breaks.vect), offset = 1) 141 | fact.r.i <- cut(seqz.tab[[i]]$position, breaks.vect) 142 | fact.b.i <- cut(seqz.b.i$position, breaks.vect) 143 | seg.i.s.r <- sapply(X = split(seqz.tab[[i]]$chromosome, 144 | f = fact.r.i), FUN = length) 145 | seg.i.s.b <- sapply(X = split(seqz.b.i$chromosome, 146 | f = fact.b.i), FUN = length) 147 | 148 | if (weighted.mean) { 149 | seg.i.rw <- sapply(X = split(seqz.tab[[i]]$rw, f = fact.r.i), 150 | FUN = function(a) sum(a, na.rm = TRUE)) 151 | seg.i.w.r <- sapply(X = split(seqz.tab[[i]]$w.r, f = fact.r.i), 152 | FUN = function(a) sum(a, na.rm = TRUE)) 153 | seg.i.r.sd <- sapply(X = split(seqz.tab[[i]]$rw / 154 | seqz.tab[[i]]$w.r, f = fact.r.i), 155 | FUN = function(a) sd(a, na.rm = TRUE)) 156 | seg.i.b.sd <- sapply(X = split(seqz.b.i$bw / 157 | seqz.b.i$w.b, f = fact.b.i), 158 | FUN = function(a) sd(a, na.rm = TRUE)) 159 | A.split <- split(seqz.b.i$Af, f = fact.b.i) 160 | B.split <- split(seqz.b.i$Bf, f = fact.b.i) 161 | d.split <- split(seqz.b.i$good.reads, f = fact.b.i) 162 | window.quantiles <- mapply(b_allele_freq, Af = A.split, 163 | Bf = B.split, good.reads = d.split, conf = 0.95) 164 | segments.i <- data.frame(chromosome = names(seqz.tab)[i], 165 | start.pos = as.numeric(breaks.vect[-length(breaks.vect)]), 166 | end.pos = as.numeric(breaks.vect[-1]), 167 | Bf = window.quantiles[2, ], N.BAF = seg.i.s.b, 168 | sd.BAF = seg.i.b.sd, depth.ratio = seg.i.rw / seg.i.w.r, 169 | N.ratio = seg.i.s.r, sd.ratio = seg.i.r.sd, 170 | stringsAsFactors = FALSE) 171 | } else { 172 | seg.i.r <- sapply(X = split(seqz.tab[[i]]$adjusted.ratio, 173 | f = fact.r.i), FUN = function(a) mean(a, na.rm = TRUE)) 174 | A.split <- split(seqz.b.i$Af, f = fact.b.i) 175 | B.split <- split(seqz.b.i$Bf, f = fact.b.i) 176 | d.split <- split(seqz.b.i$good.reads, f = fact.b.i) 177 | window.quantiles <- mapply(b_allele_freq, Af = A.split, 178 | Bf = B.split, good.reads = d.split, conf = 0.95) 179 | seg.i.r.sd <- sapply(X = split(seqz.tab[[i]]$adjusted.ratio, 180 | f = fact.r.i), FUN = function(a) sd(a, na.rm = TRUE)) 181 | seg.i.b.sd <- sapply(X = split(seqz.b.i$Bf, f = fact.b.i), 182 | FUN = function(a) sd(a, na.rm = TRUE)) 183 | segments.i <- data.frame(chromosome = names(seqz.tab)[i], 184 | start.pos = as.numeric(breaks.vect[-length(breaks.vect)]), 185 | end.pos = as.numeric(breaks.vect[-1]), 186 | Bf = window.quantiles[2, ], N.BAF = seg.i.s.b, 187 | sd.BAF = seg.i.b.sd, depth.ratio = seg.i.r, 188 | N.ratio = seg.i.s.r, sd.ratio = seg.i.r.sd, 189 | stringsAsFactors = FALSE) 190 | } 191 | segments[[i]] <- segments.i[seq(from = 1, 192 | to = nrow(segments.i), by = 2),] 193 | } 194 | segments <- do.call(rbind, segments[as.factor(chr.order)]) 195 | row.names(segments) <- 1:nrow(segments) 196 | len.seg <- (segments$end.pos - segments$start.pos) / 1e6 197 | segments[(segments$N.ratio / len.seg) >= 2, ] 198 | } 199 | -------------------------------------------------------------------------------- /R/solutions.R: -------------------------------------------------------------------------------- 1 | get.ci <- function(cp.table, level = 0.95) { 2 | znormsort <- sort(cp.table$lpp, decreasing = TRUE) 3 | znormcumLik <- cumsum(znormsort) 4 | n <- sapply(level, function(x) sum(znormcumLik < x) + 1) 5 | LikThresh <- znormsort[n] 6 | values.x <- data.frame(x = cp.table$ploidy, 7 | y = apply(cp.table$lpp, 1, max)) 8 | values.y <- data.frame(x = apply(cp.table$lpp, 2, max), 9 | y = cp.table$cellularity) 10 | up.x <- max(values.x$x[values.x$y >= LikThresh]) 11 | low.x <- min(values.x$x[values.x$y >= LikThresh]) 12 | max.x <- values.x$x[which.max(values.x$y)] 13 | up.y <- max(values.y$y[values.y$x >= LikThresh]) 14 | low.y <- min(values.y$y[values.y$x >= LikThresh]) 15 | max.y <- values.y$y[which.max(values.y$x)] 16 | values.x$y <- values.x$y / sum(values.x$y) 17 | values.y$x <- values.y$x / sum(values.y$x) 18 | results <- list() 19 | results$values.ploidy <- values.x 20 | results$confint.ploidy <- c(low.x, up.x) 21 | results$max.ploidy <- max.x 22 | results$values.cellularity <- values.y 23 | results$confint.cellularity <- c(low.y, up.y) 24 | results$max.cellularity <- max.y 25 | results 26 | } 27 | 28 | alternative.cp.solutions <- function(cp.table) { 29 | ci <- get.ci(cp.table) 30 | p.alt <- which(diff(sign(diff(ci$values.ploidy$y))) == -2) + 1 31 | get.alt <- function(idx.p, cp.table) { 32 | idx.c <- which.max(cp.table$lpp[idx.p, ]) 33 | c(cellularity = cp.table$cellularity[idx.c], 34 | ploidy = cp.table$ploidy[idx.p], 35 | SLPP = cp.table$lpp[idx.p, idx.c]) 36 | } 37 | res <- lapply(p.alt, FUN = function (x) get.alt(x, cp.table)) 38 | res <- as.data.frame(do.call(rbind, res)) 39 | if (nrow(res) > 0 ){ 40 | res[order(res$SLPP, decreasing = TRUE), ] 41 | } else { 42 | data.frame(cellularity = ci$max.cellularity, 43 | ploidy = ci$max.ploidy, 44 | SLPP = cp.table$lpp[which(cp.table$ploidy == ci$max.ploidy), 45 | which(cp.table$cellularity == ci$max.cellularity)]) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /R/windows.R: -------------------------------------------------------------------------------- 1 | windowValues <- function(x, positions, chromosomes, window = 1e6, 2 | overlap = 0, weight = rep.int(x = 1, 3 | times = length(x)), start.coord = 1) { 4 | weight <- sqrt(weight) 5 | overlap <- as.integer(overlap) 6 | window.offset <- as.integer(window - 7 | round(window * (overlap / (overlap + 1)))) 8 | chr.ordered <- unique(chromosomes) 9 | data.splitByChr <- split(data.frame(pos = positions, x = x, 10 | weight = weight), 11 | f = factor(chromosomes, levels = chr.ordered)) 12 | lapply(data.splitByChr, function(x) { 13 | range.pos <- range(x$pos, na.rm = TRUE) 14 | if (!is.null(start.coord)) { 15 | range.pos[1] <- as.integer(start.coord) 16 | } 17 | beam.coords <- seq.int(range.pos[1], range.pos[2], by = window.offset) 18 | if (max(beam.coords) != range.pos[2]) { 19 | beam.coords <- c(beam.coords, range.pos[2]) 20 | } 21 | nWindows <- length(beam.coords) - overlap - 1 22 | pos.cut <- cut(x$pos, breaks = beam.coords) 23 | x.split <- split(x$x, f = pos.cut) 24 | weight.split <- split(x$weight, f = pos.cut) 25 | window.starts <- beam.coords[1:nWindows] 26 | window.ends <- beam.coords[(1:nWindows) + 1 + overlap] 27 | idx.list <- lapply(1:nWindows, function(ii) ii + (0:overlap)) 28 | x.window <- lapply(idx.list, function(idx) { 29 | unlist(x.split[idx], use.names = FALSE) 30 | }) 31 | weight.window <- lapply(idx.list, function(idx) { 32 | unlist(weight.split[idx], use.names = FALSE) 33 | }) 34 | window.means <- mapply(weighted.mean, x = x.window, w = weight.window) 35 | window.quantiles <- sapply(x.window, quantile, 36 | probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE) 37 | window.counts <- sapply(x.window, length) 38 | data.frame(start = window.starts, end = window.ends, 39 | mean = window.means, q0 = window.quantiles[1, ], 40 | q1 = window.quantiles[2, ], N = window.counts) 41 | }) 42 | } 43 | 44 | windowBf <- function(Af, Bf, good.reads, positions, chromosomes, 45 | window = 1e6, overlap = 0, start.coord = 1, conf = 0.95) { 46 | overlap <- as.integer(overlap) 47 | window.offset <- as.integer(window - round(window * (overlap / 48 | (overlap + 1)))) 49 | chr.ordered <- unique(chromosomes) 50 | data.splitByChr <- split(data.frame(pos = positions, Bf, Af, good.reads), 51 | f = factor(chromosomes, levels = chr.ordered)) 52 | lapply(data.splitByChr, function(data.oneChr) { 53 | range.pos <- range(data.oneChr$pos, na.rm = TRUE) 54 | if (!is.null(start.coord)) { 55 | range.pos[1] <- as.integer(start.coord) 56 | } 57 | beam.coords <- seq.int(range.pos[1], range.pos[2], by = window.offset) 58 | if (max(beam.coords) != range.pos[2] ) { 59 | beam.coords <- c(beam.coords, range.pos[2]) 60 | } 61 | nWindows <- length(beam.coords) - overlap - 1 62 | pos.cut <- cut(data.oneChr$pos, breaks = beam.coords) 63 | A.split <- split(data.oneChr$Af, f = pos.cut) 64 | B.split <- split(data.oneChr$Bf, f = pos.cut) 65 | d.split <- split(data.oneChr$good.reads, f = pos.cut) 66 | window.starts <- beam.coords[1:nWindows] 67 | window.ends <- beam.coords[(1:nWindows) + 1 + overlap] 68 | idx.list <- lapply(1:nWindows, function(ii) ii + (0:overlap)) 69 | A.window <- lapply(idx.list, function(idx) { 70 | unlist(A.split[idx], use.names = FALSE) 71 | }) 72 | B.window <- lapply(idx.list, function(idx) { 73 | unlist(B.split[idx], use.names = FALSE) 74 | }) 75 | d.window <- lapply(idx.list, function(idx) { 76 | unlist(d.split[idx], use.names = FALSE) 77 | }) 78 | window.quantiles <- mapply(b_allele_freq, Af = A.window, Bf = B.window, 79 | good.reads = d.window, conf = conf) 80 | window.counts <- sapply(B.window, length) 81 | data.frame(start = window.starts, end = window.ends, 82 | mean = window.quantiles[2, ], q0 = window.quantiles[1, ], 83 | q1 = window.quantiles[3, ], N = window.counts) 84 | }) 85 | } 86 | -------------------------------------------------------------------------------- /build/vignette.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/sequenza/6b83b882f114d4927311388d16513e0d472d4077/build/vignette.rds -------------------------------------------------------------------------------- /data/CP.example.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/sequenza/6b83b882f114d4927311388d16513e0d472d4077/data/CP.example.RData -------------------------------------------------------------------------------- /data/example.seqz.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/sequenza/6b83b882f114d4927311388d16513e0d472d4077/data/example.seqz.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite package 'sequenza' in publications use:") 2 | 3 | bibentry(bibtype = 'Article', 4 | title = "Sequenza: allele-specific copy number and mutation profiles from tumor sequencing data", 5 | author = personList(as.person("Francesco Favero"), 6 | as.person("Tejal Joshi"), 7 | as.person("Andrea M Marquard"), 8 | as.person("Nicolai J Birkbak"), 9 | as.person("Marcin Krzystanek"), 10 | as.person("Qiyuan Li"), 11 | as.person("Zoltan Szallasi"), 12 | as.person("Aron C Eklund")), 13 | journal = "Annals of Oncology", 14 | year = 2015, 15 | volume = 26, 16 | issue = 1, 17 | pages = "64-70", 18 | doi = "10.1093/annonc/mdu479", 19 | url = "http://annonc.oxfordjournals.org/content/26/1/64" 20 | ) 21 | -------------------------------------------------------------------------------- /inst/doc/sequenza.R: -------------------------------------------------------------------------------- 1 | ## ----load---------------------------------------------------------------- 2 | library(sequenza) 3 | 4 | ## ----data---------------------------------------------------------------- 5 | data.file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 6 | 7 | ## ----cp_data, echo=FALSE, message=FALSE---------------------------------- 8 | library(sequenza) 9 | data(CP.example) 10 | CP <- CP.example 11 | 12 | ## ----extract, message=FALSE, warning=FALSE, results="hide"--------------- 13 | test <- sequenza.extract(data.file, verbose = FALSE) 14 | 15 | ## ----fit, eval=FALSE----------------------------------------------------- 16 | # CP <- sequenza.fit(test) 17 | 18 | ## ----results------------------------------------------------------------- 19 | sequenza.results(sequenza.extract = test, 20 | cp.table = CP, sample.id = "Test", 21 | out.dir="TEST") 22 | 23 | ## ----res_dir, echo=FALSE------------------------------------------------- 24 | 25 | # create results list 26 | # 27 | res_list <- c("alternative_fit.pdf", "alternative_solutions.txt", 28 | "chromosome_depths.pdf", "chromosome_view.pdf", 29 | "CN_bars.pdf", "confints_CP.txt", 30 | "CP_contours.pdf", "gc_plots.pdf", 31 | "genome_view.pdf", "model_fit.pdf", 32 | "mutations.txt", "segments.txt", 33 | "sequenza_cp_table.RData", "sequenza_extract.RData", 34 | "sequenza_log.txt") 35 | res_list <- paste("Test", res_list, sep = "_") 36 | 37 | description_list <- c( 38 | "Alternative solution fir to the segments. One solution per slide", 39 | "List of all ploidy/cellularity alternative solution", 40 | "Visualization of sequencing coverage in the normal and in the tumor samples, before and after normalization", 41 | "Visualization per chromosome of depth.ratio, B-allele frequency and mutations, using the selected or estimated solution. One chromosome per slide", 42 | "Bar plot representing the percentage of genome in the detected copy number states", 43 | "Table of the confidence inerval of the best solution from the model", 44 | "Visualization of the likelihood density for each pair of cellularity/ploidy solution. The local maximum-likelihood points and confidence interval of the best estimate are also visualized", 45 | "Visualization of the GC correction in the normal and in the tumor sample", 46 | "Genome-whide visualization of the allele-specific and absolute copy number results, and raw profile of the depth ratio and allele frequency", 47 | "model_fit.pdf", 48 | "Table with mutation and estimated number of mutated alleles (Mt)", 49 | "Table listing the detected segments, with estimated copy number state at each sement", 50 | "RData object dump of the maxima a posteriori computation", 51 | "RData object dump of all the sample information", 52 | "Log with version and time information") 53 | knitr::kable(data.frame(Files = res_list, Description = description_list)) 54 | #dir("TEST", pattern = "Test") 55 | 56 | 57 | ## ----read_segs, echo=FALSE----------------------------------------------- 58 | seg.tab <- read.table("TEST/Test_segments.txt", 59 | header = TRUE, sep ="\t") 60 | 61 | alt_res <- read.table("TEST/Test_alternative_solutions.txt", 62 | header = TRUE, sep ="\t") 63 | seg.tab <- seg.tab[seg.tab$CNt <= 4, ] 64 | is.num <- sapply(seg.tab, is.numeric) 65 | seg.tab[is.num] <- lapply(seg.tab[is.num], round, 3) 66 | 67 | ## ----head_segs, echo=FALSE----------------------------------------------- 68 | knitr::kable(head(seg.tab)) 69 | 70 | 71 | ## ----g_view, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'---- 72 | sequenza:::genome.view(seg.tab) 73 | 74 | ## ----g_view_tot, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'---- 75 | sequenza:::genome.view(seg.tab, info.type = "CNt") 76 | 77 | ## ----g_view_raw, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'---- 78 | sequenza:::plotRawGenome(test, cellularity = alt_res$cellularity[1], 79 | ploidy = alt_res$ploidy[1]) 80 | 81 | ## ----CPplot, echo=TRUE, fig.height=5, fig.width=5, fig.align='center'---- 82 | cp.plot(CP) 83 | cp.plot.contours(CP, add = TRUE, 84 | likThresh = c(0.999, 0.95), 85 | col = c("lightsalmon", "red"), pch = 20) 86 | 87 | ## ----c_view, echo=TRUE, fig.height=6, fig.width=8, fig.align='center'---- 88 | chromosome.view(mut.tab = test$mutations[[1]], baf.windows = test$BAF[[1]], 89 | ratio.windows = test$ratio[[1]], min.N.ratio = 1, 90 | segments = test$segments[[1]], 91 | main = test$chromosomes[1], 92 | cellularity = 0.89, ploidy = 1.9, 93 | avg.depth.ratio = 1) 94 | 95 | -------------------------------------------------------------------------------- /inst/doc/sequenza.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sequenza User Guide" 3 | date: "`r Sys.Date()`" 4 | author: "[Francesco Favero](mailto:favero.francesco@gmail.com)" 5 | output: 6 | rmdformats::readthedown: 7 | self_contained: true 8 | vignette: > 9 | %\VignetteIndexEntry{Sequenza User Guide} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | # About 15 | 16 | > Sequenza: Copy Number Estimation from Tumor Genome Sequencing Data 17 | 18 | 19 | ![](https://bytebucket.org/sequenzatools/icons/raw/324bd43ac4d10546b64b04c38d8c513e8420346d/svg/sequenza_tools/sequenzaalpha_150.svg) 20 | 21 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/sequenza)](https://cran.r-project.org/package=sequenza) 22 | [![CRAN_Downloads_Badge](http://cranlogs.r-pkg.org/badges/sequenza)](https://cran.r-project.org/package=sequenza) 23 | [![CRAN_licence](https://img.shields.io/cran/l/sequenza.svg)](https://www.gnu.org/licenses/gpl-3.0.txt) 24 | 25 | 26 | 27 | Sequenza is a tool to analyze genomic sequencing data from paired normal-tumor samples, including cellularity and ploidy estimation; mutation and copy number (allele-specific and total copy number) detection, quantification and visualization. 28 | 29 | # Introduction 30 | 31 | Deep sequence of tumor DNA along with corresponding normal DNA can provide a 32 | valuable perspective on the mutations and aberrations that characterize the 33 | tumor. However, analysis of this data can be impeded by tumor cellularity and 34 | heterogeneity and by unwieldy data. Here we describe *Sequenza*, an R package 35 | that enables the efficient estimation of tumor cellularity and ploidy, and 36 | generation of copy number, loss-of-heterozygosity, and mutation frequency 37 | profiles. 38 | 39 | This document details a typical analysis of matched tumor-normal exome sequence 40 | data using *sequenza*. 41 | 42 | # Getting started 43 | 44 | ## Minimum requirements 45 | - Software: R, Python, SAMtools, tabix 46 | - Operating system: Linux, OS X, Windows 47 | - Memory: Minimum 4 GB of RAM. Recommended >8 GB. 48 | - Disk space: 1.5 GB for sample (depending on sequencing depth) 49 | - R version: 3.2.0 50 | - Python version: 2.7, 3.4, 3.5, 3.6 (or PyPy) 51 | 52 | ## Installation 53 | 54 | The R package can be installed by: 55 | 56 | ```r 57 | setRepositories(graphics = FALSE, ind = 1:6) 58 | install.packages("sequenza") 59 | ``` 60 | 61 | To install the Python companion package *sequenza-utils* to preprocess BAM 62 | files, refer to the [*sequenza-utils*](https://pypi.org/project/sequenza-utils) 63 | project page, or simply use the python package manager from the command prompt: 64 | 65 | ```bash 66 | pip install sequenza-utils 67 | ``` 68 | 69 | # Running sequenza 70 | 71 | ## Preprocessing of input files 72 | 73 | In order to obtain precise mutational and aberration patterns in a tumor sample, 74 | Sequenza requires a matched normal sample from the same patient. Typically, the 75 | following files are needed to get started with Sequenza: 76 | 77 | - A BAM file (or a derived pileup file) from the tumor specimen. 78 | - A BAM file (or a derived pileup file) from the normal specimen. 79 | - A FASTA reference genomic sequence file 80 | 81 | 82 | The normal and tumor BAM files are processed together to generate a *seqz* file, which 83 | is the required input for the analysis. 84 | It is possible to generate a *seqz* starting from other processed data, such as 85 | pileup, or VCF files. The available options are described in the 86 | [*sequenza-utils*](http://sequenza-utils.readthedocs.io/) manual pages. 87 | 88 | The *sequenza-utils* command provides various tools; here we highlight only the 89 | basic usage: 90 | 91 | - Process a FASTA file to produce a GC [Wiggle](https://genome.ucsc.edu/goldenpath/help/wiggle.html) 92 | track file: 93 | 94 | ```bash 95 | sequenza−utils gc_wiggle −w 50 --fasta hg19.fa -o hg19.gc50Base.wig.gz 96 | ``` 97 | 98 | - Process BAM and Wiggle files to produce a *seqz* file: 99 | 100 | ```bash 101 | sequenza−utils bam2seqz -n normal.bam -t tumor.bam --fasta hg19.fa \ 102 | -gc hg19.gc50Base.wig.gz -o out.seqz.gz 103 | ``` 104 | 105 | - Post-process by binning the original *seqz* file: 106 | 107 | ```bash 108 | sequenza−utils seqz_binning --seqz out.seqz.gz -w 50 -o out small.seqz.gz 109 | ``` 110 | 111 | ## Sequenza analysis (in R) 112 | 113 | ```{r load} 114 | library(sequenza) 115 | ``` 116 | 117 | In the package is provided a small *seqz* file 118 | 119 | ```{r data} 120 | data.file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 121 | ``` 122 | 123 | ```{r cp_data, echo=FALSE, message=FALSE} 124 | library(sequenza) 125 | data(CP.example) 126 | CP <- CP.example 127 | ``` 128 | 129 | 130 | 131 | The main interface consists of 3 functions: 132 | 133 | - sequenza.extract: process seqz data, normalization and segmentation 134 | ```{r extract, message=FALSE, warning=FALSE, results="hide"} 135 | test <- sequenza.extract(data.file, verbose = FALSE) 136 | ``` 137 | 138 | - sequenza.fit: run grid-search approach to estimate cellularity and ploidy 139 | ```{r fit, eval=FALSE} 140 | CP <- sequenza.fit(test) 141 | ``` 142 | 143 | - sequenza.results: write files and plots using suggested or selected solution 144 | ```{r results} 145 | sequenza.results(sequenza.extract = test, 146 | cp.table = CP, sample.id = "Test", 147 | out.dir="TEST") 148 | ``` 149 | 150 | # Plots and Results 151 | 152 | The function _sequenza.results_ outputs various files in the specified path. 153 | The resulting files are either output in pdf of in plain text. The files include 154 | quality control assessments (eg evaluate GC-correction), visualization of the 155 | data and files such as segmentation with copy number calling and mutation lists. 156 | 157 | ## Result files 158 | 159 | Each generated file is briefly explained in the following table 160 | 161 | ```{r res_dir, echo=FALSE} 162 | 163 | # create results list 164 | # 165 | res_list <- c("alternative_fit.pdf", "alternative_solutions.txt", 166 | "chromosome_depths.pdf", "chromosome_view.pdf", 167 | "CN_bars.pdf", "confints_CP.txt", 168 | "CP_contours.pdf", "gc_plots.pdf", 169 | "genome_view.pdf", "model_fit.pdf", 170 | "mutations.txt", "segments.txt", 171 | "sequenza_cp_table.RData", "sequenza_extract.RData", 172 | "sequenza_log.txt") 173 | res_list <- paste("Test", res_list, sep = "_") 174 | 175 | description_list <- c( 176 | "Alternative solution fir to the segments. One solution per slide", 177 | "List of all ploidy/cellularity alternative solution", 178 | "Visualization of sequencing coverage in the normal and in the tumor samples, before and after normalization", 179 | "Visualization per chromosome of depth.ratio, B-allele frequency and mutations, using the selected or estimated solution. One chromosome per slide", 180 | "Bar plot representing the percentage of genome in the detected copy number states", 181 | "Table of the confidence inerval of the best solution from the model", 182 | "Visualization of the likelihood density for each pair of cellularity/ploidy solution. The local maximum-likelihood points and confidence interval of the best estimate are also visualized", 183 | "Visualization of the GC correction in the normal and in the tumor sample", 184 | "Genome-whide visualization of the allele-specific and absolute copy number results, and raw profile of the depth ratio and allele frequency", 185 | "model_fit.pdf", 186 | "Table with mutation and estimated number of mutated alleles (Mt)", 187 | "Table listing the detected segments, with estimated copy number state at each sement", 188 | "RData object dump of the maxima a posteriori computation", 189 | "RData object dump of all the sample information", 190 | "Log with version and time information") 191 | knitr::kable(data.frame(Files = res_list, Description = description_list)) 192 | #dir("TEST", pattern = "Test") 193 | 194 | ``` 195 | 196 | ```{r read_segs, echo=FALSE} 197 | seg.tab <- read.table("TEST/Test_segments.txt", 198 | header = TRUE, sep ="\t") 199 | 200 | alt_res <- read.table("TEST/Test_alternative_solutions.txt", 201 | header = TRUE, sep ="\t") 202 | seg.tab <- seg.tab[seg.tab$CNt <= 4, ] 203 | is.num <- sapply(seg.tab, is.numeric) 204 | seg.tab[is.num] <- lapply(seg.tab[is.num], round, 3) 205 | ``` 206 | 207 | ## Segments results 208 | 209 | The segmentation file with the allele-specific copy number calling is one of 210 | the main result of the analysis. A sample of the file is shown in the table below: 211 | 212 | ```{r head_segs, echo=FALSE} 213 | knitr::kable(head(seg.tab)) 214 | 215 | ``` 216 | 217 | 218 | The columns represents: 219 | 220 | 1. **chromosome**: Chromosome 221 | 2. **start.pos**: Start position of the segment 222 | 3. **end.pos**: End position of the segment 223 | 4. **Bf**: B-allele frequency value 224 | 5. **N.BAF**: Number of observation to compute _Bf_ in the segment 225 | 6. **sd.BAF**: Standard deviation of _Bf_ 226 | 7. **depth.ratio**: Adjusted and normalized depth ratio tumor / normal 227 | 8. **N.ratio**: Number of observation to compute _depth.ratio_ in the segment 228 | 9. **sd.ratio**: Standard deviation of _depth.rati_ 229 | 10. **CNt**: Estimated total copy number value 230 | 11. **A**: Estimated number of A-alleles 231 | 12. **B**: Estimated number of B-alleles (minor allele) 232 | 13. **LPP**: Log-posterior probability of the segment 233 | 234 | ## Gene wide overview 235 | 236 | ### Allele-specific copy number 237 | 238 | ```{r g_view, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'} 239 | sequenza:::genome.view(seg.tab) 240 | ``` 241 | 242 | ### Total copy number 243 | 244 | ```{r g_view_tot, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'} 245 | sequenza:::genome.view(seg.tab, info.type = "CNt") 246 | ``` 247 | 248 | ### Raw profile 249 | 250 | ```{r g_view_raw, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'} 251 | sequenza:::plotRawGenome(test, cellularity = alt_res$cellularity[1], 252 | ploidy = alt_res$ploidy[1]) 253 | ``` 254 | 255 | 256 | ## Grid search maximum likelihood 257 | 258 | 259 | ```{r CPplot, echo=TRUE, fig.height=5, fig.width=5, fig.align='center'} 260 | cp.plot(CP) 261 | cp.plot.contours(CP, add = TRUE, 262 | likThresh = c(0.999, 0.95), 263 | col = c("lightsalmon", "red"), pch = 20) 264 | ``` 265 | 266 | ## Chromosome view 267 | 268 | _Chromosome view_ is the visualization that displays chromosome by crhosome, nutations, 269 | B-allele frequency and depth-ratio. 270 | The visualization makes it easier to ispect the segmentation results, comparing to 271 | a binned profile of the raw data. 272 | It also visualize the copy number calling using the _cellularity_ and _ploidy_ solution, 273 | making useful to asses if the copy number calling is acurate. 274 | In addition it provides a visualization of the mutation frequency that can also help to 275 | corroborate the solution. 276 | 277 | ```{r c_view, echo=TRUE, fig.height=6, fig.width=8, fig.align='center'} 278 | chromosome.view(mut.tab = test$mutations[[1]], baf.windows = test$BAF[[1]], 279 | ratio.windows = test$ratio[[1]], min.N.ratio = 1, 280 | segments = test$segments[[1]], 281 | main = test$chromosomes[1], 282 | cellularity = 0.89, ploidy = 1.9, 283 | avg.depth.ratio = 1) 284 | ``` 285 | -------------------------------------------------------------------------------- /inst/extdata/example.seqz.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/sequenza/6b83b882f114d4927311388d16513e0d472d4077/inst/extdata/example.seqz.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example.seqz.txt.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/sequenza/6b83b882f114d4927311388d16513e0d472d4077/inst/extdata/example.seqz.txt.gz.tbi -------------------------------------------------------------------------------- /man/CP.data.Rd: -------------------------------------------------------------------------------- 1 | \name{CP.example} 2 | \docType{data} 3 | \alias{CP.example} 4 | 5 | \title{Example of cellularity and ploidy results} 6 | 7 | \description{ 8 | Examples of results from the maximum a posteriori estimation from a set 9 | of cellularity and ploidy values, as returned by the functions 10 | \code{\link{baf.model.fit}} and \code{\link{mufreq.model.fit}}. 11 | } 12 | 13 | \usage{ 14 | data(CP.example) 15 | } 16 | 17 | \format{ 18 | A list containing three items: 19 | \describe{ 20 | \item{ploidy}{numeric vector of tested ploidy values.} 21 | \item{cellularity}{numeric vector of tested cellularity values.} 22 | \item{lpp}{numeric matrix of log-posterior probability for each 23 | \var{(ploidy, cellularity)} pair.} 24 | } 25 | } 26 | 27 | \examples{ 28 | data(CP.example) 29 | str(CP.example) 30 | 31 | ## Visualization of the object 32 | image(x = CP.example$ploidy, 33 | y = CP.example$cellularity, 34 | z = CP.example$lpp) 35 | 36 | ## A better plot 37 | cp.plot(CP.example) 38 | cp.plot.contours(CP.example, add = TRUE) 39 | } 40 | 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/baf.model.fit.Rd: -------------------------------------------------------------------------------- 1 | \name{baf.model.fit} 2 | \alias{baf.model.fit} 3 | \alias{mufreq.model.fit} 4 | \title{Model fitting using maximum a posteriori inference} 5 | 6 | \description{ 7 | Computes the log-posterior probability distribution for the specified range 8 | of cellularity and ploidy parameters 9 | } 10 | 11 | \usage{ 12 | mufreq.model.fit(cellularity = seq(0.3, 1, by = 0.01), 13 | ploidy = seq(1, 7, by = 0.1), mc.cores = getOption("mc.cores", 2L), 14 | ...) 15 | baf.model.fit(cellularity = seq(0.3, 1, by = 0.01), 16 | ploidy = seq(1, 7, by = 0.1), mc.cores = getOption("mc.cores", 2L), 17 | ...) 18 | } 19 | 20 | \arguments{ 21 | \item{cellularity}{vector of cellularity values to be tested.} 22 | \item{ploidy}{vector of ploidy values to be tested.} 23 | \item{mc.cores}{number of cores to use, defined as in 24 | \code{\link{pblapply}}.} 25 | \item{...}{any argument accepted by \code{\link{mufreq.bayes}} or 26 | \code{\link{baf.bayes}}.} 27 | } 28 | 29 | \value{ 30 | A list of three items: 31 | \item{ploidy}{tested values of the ploidy parameter} 32 | \item{cellularity}{tested values of the cellularity parameter} 33 | \item{lpp}{log-posterior probability of each pair of 34 | cellularity/ploidy parameters.} 35 | } 36 | 37 | \details{ 38 | \code{baf.model.fit} uses the function \code{\link{baf.bayes}} to infer 39 | the log-posterior probability of the model fit using the possible 40 | combinations of cellularity and ploidy values provided in the arguments. 41 | Similarly \code{mufreq.model.fit} fits the mutation/depth ratio model using 42 | the function \code{\link{mufreq.bayes}}. 43 | \code{baf.model.fit} is the defalt method used to infer cellularity and 44 | ploidy on segmented chromosomes. The \code{mufreq.model.fit} function 45 | estimates cellularity and ploidy using mutation frequency and depth ratio, 46 | however, the mutation data is more affected to background noise 47 | compared to the segmented B-allele frequency, hence it may give less 48 | accurate results. 49 | } 50 | 51 | \seealso{ 52 | \code{\link{cp.plot}} for visualization of the resulting object, 53 | and \code{\link{get.ci}} to extract confidence intervals. 54 | } 55 | 56 | \examples{ 57 | \dontrun{ 58 | 59 | data.file <- system.file("extdata", "example.seqz.txt.gz", 60 | package = "sequenza") 61 | # read all the chromosomes: 62 | seqz.data <- read.seqz(data.file) 63 | # Gather genome wide GC-stats from raw file: 64 | gc.stats <- gc.sample.stats(data.file) 65 | gc.normal.vect <- mean_gc(gc.stats$normal) 66 | gc.tumor.vect <- mean_gc(gc.stats$tumor) 67 | # Read only one chromosome: 68 | seqz.data <- read.seqz(data.file, chr_name = "1") 69 | 70 | # Correct the coverage of the loaded chromosome: 71 | seqz.data$adjusted.ratio <- round((seqz.data$depth.tumor / 72 | gc.tumor.vect[as.character(seqz.data$GC.percent)]) / 73 | (seqz.data$depth.normal / 74 | gc.normal.vect[as.character(seqz.data$GC.percent)]), 3) 75 | # Select the heterozygous positions 76 | seqz.hom <- seqz.data$zygosity.normal == 'hom' 77 | seqz.het <- seqz.data[!seqz.hom, ] 78 | # Detect breakpoints 79 | breaks <- find.breaks(seqz.het, gamma = 80, kmin = 10, 80 | baf.thres = c(0, 0.5)) 81 | # use heterozygous and homozygous position to measure segment values 82 | seg.s1 <- segment.breaks(seqz.data, breaks = breaks) 83 | 84 | # filter out small ambiguous segments, and conveniently weight 85 | # the segments by size: 86 | seg.filtered <- seg.s1[(seg.s1$end.pos - seg.s1$start.pos) > 3e6, ] 87 | weights.seg <- (seg.filtered$end.pos - seg.filtered$start.pos) / 1e6 88 | # Set the average depth ratio to 1: 89 | avg.depth.ratio <- 1 90 | # run the BAF model fit 91 | CP <- baf.model.fit(Bf = seg.filtered$Bf, 92 | depth.ratio = seg.filtered$depth.ratio, weight.ratio = weights.seg, 93 | weight.Bf = weights.seg, sd.ratio = seg.filtered$sd.ratio, 94 | sd.Bf = seg.filtered$sd.BAF, avg.depth.ratio = avg.depth.ratio, 95 | cellularity = seq(0.1, 1, 0.01), ploidy = seq(0.5, 3, 0.05)) 96 | 97 | confint <- get.ci(CP) 98 | ploidy <- confint$max.ploidy 99 | cellularity <- confint$max.cellularity 100 | 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /man/bayes.Rd: -------------------------------------------------------------------------------- 1 | \name{baf.bayes} 2 | \alias{baf.bayes} 3 | \alias{mufreq.bayes} 4 | \title{Model allele-specific copy numbers with specified cellularity and ploidy parameters} 5 | 6 | \description{ 7 | Given a pair of cellularity and ploidy parameters, the function returns the most likely allele-specific copy numbers with the corresponding log-posterior probability of the fit, for given values of B-allele frequency and depth ratio. 8 | } 9 | 10 | \usage{ 11 | baf.bayes(Bf, depth.ratio, cellularity, ploidy, avg.depth.ratio, 12 | sd.Bf = 0.1, sd.ratio = 0.5, weight.Bf = 1, weight.ratio = 1, 13 | CNt.min = 0, CNt.max = 7, CNn = 2, 14 | priors.table = data.frame(CN = CNt.min:CNt.max, value = 1), 15 | ratio.priority = FALSE) 16 | mufreq.bayes(mufreq, depth.ratio, cellularity, ploidy, avg.depth.ratio, 17 | weight.mufreq = 100, weight.ratio = 100, CNt.min = 1, CNt.max = 7, CNn = 2, 18 | priors.table = data.frame(CN = CNt.min:CNt.max, value = 1)) 19 | } 20 | 21 | \arguments{ 22 | \item{Bf}{vector of B-allele frequencies (values can range from 0 to 0.5).} 23 | \item{mufreq}{vector of mutation frequencies (values can range from 0 to 1).} 24 | \item{depth.ratio}{vector of depth ratios.} 25 | \item{sd.ratio}{standard deviation observed in the depth ratio measures in a segment} 26 | \item{sd.Bf}{standard deviation observed in the B-allele frequency measures in a segment} 27 | \item{weight.Bf}{vector of weights for B-allele frequency values.} 28 | \item{weight.mufreq}{vector of weights for the mutation frequency values.} 29 | \item{weight.ratio}{vector of weights for the depth ratio values.} 30 | \item{cellularity}{fraction of tumor cells in the sample.} 31 | \item{ploidy}{2 * ratio between total DNA content in a tumor cell and a normal cell.} 32 | \item{avg.depth.ratio}{average normalized depth ratio.} 33 | \item{CNt.min}{minimum copy number to consider in the model.} 34 | \item{CNt.max}{maximum copy number to consider in the model.} 35 | \item{CNn}{copy number of the normal genome.} 36 | \item{priors.table}{data frame with columns \code{CN} and \code{value}, containing the copy numbers and the corresponding weights. To every copy number is assigned the value 1 as default, so any values different from 1 will change the corresponding weight.} 37 | \item{ratio.priority}{logical, if TRUE only the depth ratio will be used to determine the copy number state, while the Bf value will be used to determine the number of B-alleles.} 38 | } 39 | 40 | \value{ 41 | \item{CNt}{copy number of the tumor cell at the tested point.} 42 | \item{A}{number of A-alleles at the tested point.} 43 | \item{B}{number of B-alleles at the tested point.} 44 | \item{CNn}{copy number of the normal cell at the tested point (equal to CNn given as argument).} 45 | \item{Mt}{number of mutated alleles at the tested point.} 46 | \item{LPP}{log-posterior probability of model fitting at the given point/segment.} 47 | } 48 | \details{ 49 | \code{baf.bayes} and \code{mufreq.bayes} use a naive Bayesian approach to calculate the posterior probability of fitness of the data point with the model point resulting from the given values of cellularity and DNA-content. 50 | } 51 | 52 | \seealso{ 53 | \code{baf.model.fit}, \code{mufreq.model.fit}. 54 | } 55 | 56 | \examples{ 57 | \dontrun{ 58 | data.file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 59 | # read all the chromosomes: 60 | seqz.data <- read.seqz(data.file) 61 | # Gather genome wide GC-stats from raw file: 62 | gc.stats <- gc.sample.stats(data.file) 63 | gc.vect <- setNames(gc.stats$raw.mean, gc.stats$gc.values) 64 | # Read only one chromosome: 65 | seqz.data <- read.seqz(data.file, chr.name = 1) 66 | 67 | # Correct the coverage of the loaded chromosome: 68 | seqz.data$adjusted.ratio <- seqz.data$depth.ratio / 69 | gc.vect[as.character(seqz.data$GC.percent)] 70 | # Select the heterozygous positions 71 | seqz.hom <- seqz.data$zygosity.normal == 'hom' 72 | seqz.het <- seqz.data[!seqz.hom, ] 73 | # Detect breakpoints 74 | breaks <- find.breaks(seqz.het, gamma = 80, kmin = 10, baf.thres = c(0, 0.5)) 75 | # use heterozygous and homozygous position to measure segment values 76 | seg.s1 <- segment.breaks(seqz.data, breaks = breaks) 77 | 78 | # filter out small ambiguous segments, and conveniently weight the segments by size: 79 | seg.filtered <- seg.s1[(seg.s1$end.pos - seg.s1$start.pos) > 10e6, ] 80 | weights.seg <- 150 + round((seg.filtered$end.pos - 81 | seg.filtered$start.pos) / 1e6, 0) 82 | # get the genome wide mean of the normalized depth ratio: 83 | avg.depth.ratio <- mean(gc.stats$adj[,2]) 84 | # run the BAF model fit 85 | 86 | CP <- baf.model.fit(Bf = seg.filtered$Bf, depth.ratio = seg.filtered$depth.ratio, 87 | weight.ratio = weights.seg, 88 | weight.Bf = weights.seg, 89 | avg.depth.ratio = avg.depth.ratio, 90 | cellularity = seq(0.1,1,0.01), 91 | ploidy = seq(0.5,3,0.05)) 92 | 93 | confint <- get.ci(CP) 94 | ploidy <- confint$max.ploidy 95 | cellularity <- confint$max.cellularity 96 | 97 | #detect copy number alteration on the segments: 98 | 99 | cn.alleles <- baf.bayes(Bf = seg.s1$Bf, depth.ratio = seg.s1$depth.ratio, 100 | cellularity = cellularity, ploidy = ploidy, 101 | avg.depth.ratio = 1) 102 | 103 | head(cbind(seg.s1, cn.alleles)) 104 | 105 | # create mutation table: 106 | mut.tab <- mutation.table(seqz.data, mufreq.treshold = 0.15, 107 | min.reads = 40, max.mut.types = 1, 108 | min.type.freq = 0.9, segments = seg.s1) 109 | 110 | mut.tab.clean <- na.exclude(mut.tab) 111 | 112 | # Detect mutated alleles: 113 | mut.alleles <- mufreq.bayes(mufreq = mut.tab.clean$F, 114 | depth.ratio = mut.tab.clean$adjusted.ratio, 115 | cellularity = cellularity, ploidy = ploidy, 116 | avg.depth.ratio = avg.depth.ratio) 117 | head(cbind(mut.tab.clean[,c("chromosome","position","F", 118 | "adjusted.ratio", "mutation")], 119 | mut.alleles)) 120 | 121 | 122 | } 123 | } -------------------------------------------------------------------------------- /man/breaks.Rd: -------------------------------------------------------------------------------- 1 | \name{find.breaks} 2 | \alias{find.breaks} 3 | \alias{segment.breaks} 4 | \title{Segmentation of sequencing data using an allele-specific copy number algorithm} 5 | 6 | \description{ 7 | This function uses \code{\link{aspcf}} or \code{\link{pcf}} from the package \pkg{copynumber} to segment depth ratio and B-allele frequency obtained from sequencing data. 8 | } 9 | 10 | \usage{ 11 | find.breaks(seqz.baf, gamma = 80, kmin = 10, baf.thres = c(0, 0.5), 12 | verbose = FALSE, seg.algo = "aspcf", ...) 13 | segment.breaks(seqz.tab, breaks, min.reads.baf = 1, weighted.mean = TRUE) 14 | } 15 | 16 | \arguments{ 17 | \item{seqz.baf}{an seqz file containing only the heterozygous positions.} 18 | \item{seqz.tab}{a complete seqz file.} 19 | \item{gamma, kmin, baf.thres, verbose}{arguments passed to the segmentation algorithm.} 20 | \item{breaks}{breaks as output by \code{find.breaks}.} 21 | \item{min.reads.baf}{threshold on the depth of the positions included to calculate the average BAF for segment.} 22 | \item{weighted.mean}{boolean to select if the segments have to calculated using the read depth as a weights to calculate depth ratio and B-allele frequency means.} 23 | \item{seg.algo}{Selects the algorithm used for the segmentation. Available options are \code{\link{aspcf}} of \code{\link{pcf}}.} 24 | \item{...}{additional arguments passed to \code{\link{aspcf}}.} 25 | } 26 | 27 | \details{ 28 | \pkg{copynumber} is a package to perform efficient segmentation of SNP-array data. The function \code{find.breaks} uses the algorithms from the \pkg{copynumber} package to find break points, where the default parameters have been optimized for sequencing data, but a careful choice of an optimal \code{gamma} value is advised. 29 | } 30 | 31 | \examples{ 32 | 33 | \dontrun{ 34 | 35 | data.file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 36 | # read all the chromosomes: 37 | seqz.data <- read.seqz(data.file) 38 | # Gather genome wide GC-stats from raw file: 39 | gc.stats <- gc.sample.stats(data.file) 40 | gc.vect <- setNames(gc.stats$raw.mean, gc.stats$gc.values) 41 | # Read only one chromosome: 42 | seqz.data <- read.seqz(data.file, chr.name = 12) 43 | 44 | # Correct the coverage of the loaded chromosome: 45 | seqz.data$adjusted.ratio <- seqz.data$depth.ratio / 46 | gc.vect[as.character(seqz.data$GC.percent)] 47 | # Select the heterozygous positions 48 | seqz.hom <- seqz.data$zygosity.normal == 'hom' 49 | seqz.het <- seqz.data[!seqz.hom, ] 50 | # Detect breakpoints 51 | breaks <- find.breaks(seqz.het, gamma = 80, kmin = 10, baf.thres = c(0, 0.5)) 52 | # use heterozygous and homozygous position to measure segment values 53 | segment.breaks(seqz.data, breaks = breaks) 54 | } 55 | } -------------------------------------------------------------------------------- /man/chromosome.view.Rd: -------------------------------------------------------------------------------- 1 | \name{chromosome.view} 2 | \alias{chromosome.view} 3 | \alias{genome.view} 4 | 5 | \title{A graphical representation of multiple chromosomal 6 | features} 7 | 8 | \description{ 9 | A graphical representation of depth ratio, allele frequency and mutation 10 | frequency in multiple panels allineated by the coordinate of 11 | the same chromosome. 12 | } 13 | 14 | \usage{ 15 | chromosome.view(baf.windows, ratio.windows, mut.tab = NULL, 16 | segments = NULL, min.N.baf = 1, min.N.ratio = 10000, main = "", 17 | vlines = FALSE, legend.inset = c(-20 * strwidth("a", units = "figure"), 18 | 0), CNn = 2, cellularity = NULL, ploidy = NULL, avg.depth.ratio = NULL, 19 | model.lwd = 1, model.lty = "24", model.col = 1, x.chr.space = 10) 20 | genome.view(seg.cn, info.type = "AB", ...) 21 | } 22 | 23 | \arguments{ 24 | \item{baf.windows}{matrix containing the windowed B-allele frequency values 25 | for one chromosome.} 26 | \item{ratio.windows}{matrix containing the windowed depth ratio values for 27 | one chromosome.} 28 | \item{mut.tab}{mutation table of one chromosome. If specified, the 29 | mutations will be drawn in a top panel. \code{mut.tab} must be output 30 | from the \code{\link{mutation.table}} function.} 31 | \item{segments}{segmentation for one chromosome. If specified, the 32 | segmented B-allele frequency and depth ratio values will be shown 33 | as red lines.} 34 | \item{min.N.baf}{minimum number of observations required in a BAF window 35 | for plotting.} 36 | \item{min.N.ratio}{minimum number of observations required in a depth ratio 37 | window for plotting.} 38 | \item{CNn}{copy number of the germline genome. } 39 | \item{vlines}{logical, if TRUE the plot will include dotted vertical lines 40 | corresponding to segment breaks.} 41 | \item{cellularity}{fraction of tumor cells in the sample.} 42 | \item{ploidy}{value of the estimated \code{ploidy} parameter.} 43 | \item{avg.depth.ratio}{the average value of the normalized depth ratio.} 44 | \item{main}{main title of the plot.} 45 | \item{legend.inset}{the inset argument to pass to the \code{\link{legend}} 46 | function. Defines the distance between the mutation legend and the 47 | plot border.} 48 | \item{model.lwd}{width of the theoretical lines, if the segments matrix 49 | contains the columns A, B and CNt.} 50 | \item{model.lty}{line type of the theoretical lines, if the segments matrix 51 | contains the columns A, B and CNt.} 52 | \item{model.col}{color of the theoretical lines, if the segments matrix 53 | contains the columns A, B and CNt.} 54 | \item{x.chr.space}{step in megabase on the positions to visualize on 55 | the x-axis.} 56 | \item{seg.cn}{genome wide segments, with the columns A, B and CNt.} 57 | \item{info.type}{information to plot in \code{genome.view}. Available 58 | options are "CNt" for total copy numbers and "AB" (default) for 59 | the alleles specific copy number.} 60 | \item{...}{optional arguments passed to \code{\link{plot}}.} 61 | } 62 | 63 | \details{ 64 | \code{chromosome.view} is a plotting function based on the default 65 | \code{\link{plot}} function and \code{\link{par}} to display multiple 66 | panels. The plotting function \code{\link{plotWindows}} is used to plot 67 | the binned data of \code{depth-ratio} and \code{b-allele frequency}. 68 | The function displays the observations reulting from the sequencing 69 | post-procssing as well the results of the model. 70 | } 71 | 72 | \seealso{ 73 | \code{\link{windowValues}}, \code{\link{find.breaks}}. 74 | } 75 | 76 | \examples{ 77 | \dontrun{ 78 | 79 | data.file <- system.file("extdata", "example.seqz.txt.gz", 80 | package = "sequenza") 81 | # read all the chromosomes: 82 | seqz.data <- read.seqz(data.file) 83 | # Gather genome wide GC-stats from raw file: 84 | gc.stats <- gc.sample.stats(data.file) 85 | gc.vect <- setNames(gc.stats$raw.mean, gc.stats$gc.values) 86 | # Read only one chromosome: 87 | seqz.data <- read.seqz(data.file, chr.name = 1) 88 | 89 | # Correct the coverage of the loaded chromosome: 90 | seqz.data$adjusted.ratio <- seqz.data$depth.ratio / 91 | gc.vect[as.character(seqz.data$GC.percent)] 92 | # Select the heterozygous positions 93 | seqz.hom <- seqz.data$zygosity.normal == 'hom' 94 | seqz.het <- seqz.data[!seqz.hom, ] 95 | # Detect breakpoints 96 | breaks <- find.breaks(seqz.het, gamma = 80, kmin = 10, baf.thres = c(0, 0.5)) 97 | # use heterozygous and homozygous position to measure segment values 98 | seg.s1 <- segment.breaks(seqz.data, breaks = breaks) 99 | 100 | # Binning the values of depth ratio and B allele frequency 101 | seqz.r.win <- windowValues(x = seqz.data$adjusted.ratio, 102 | positions = seqz.data$position, chromosomes = seqz.data$chromosome, 103 | window = 1e6, overlap = 1, weight = seqz.data$depth.normal) 104 | 105 | seqz.b.win <- windowValues(x = seqz.het$Bf, 106 | positions = seqz.het$position, chromosomes = seqz.het$chromosome, 107 | window = 1e6, overlap = 1, weight = round(x = seqz.het$good.reads, 108 | digits = 0)) 109 | # create mutation table: 110 | mut.tab <- mutation.table(seqz.data, mufreq.treshold = 0.15, 111 | min.reads = 40, max.mut.types = 1, min.type.freq = 0.9, 112 | segments = seg.s1) 113 | # chromosome view without parametes: 114 | chromosome.view(mut.tab = mut.tab[mut.tab$chromosome == "1",], 115 | baf.windows = seqz.b.win[[1]], ratio.windows = seqz.r.win[[1]], 116 | min.N.ratio = 1, segments = seg.s1[seg.s1$chromosome == "1",], 117 | main = "Chromosome 1") 118 | 119 | # filter out small ambiguous segments, and weight the segments by size: 120 | seg.filtered <- seg.s1[(seg.s1$end.pos - seg.s1$start.pos) > 10e6, ] 121 | weights.seg <- 150 + round((seg.filtered$end.pos - 122 | seg.filtered$start.pos) / 1e6, 0) 123 | # get the genome wide mean of the normalized depth ratio: 124 | avg.depth.ratio <- mean(gc.stats$adj[,2]) 125 | # run the BAF model fit 126 | 127 | CP <- baf.model.fit(Bf = seg.filtered$Bf, depth.ratio = seg.filtered$depth.ratio, 128 | weight.ratio = weights.seg, weight.Bf = weights.seg, 129 | avg.depth.ratio = avg.depth.ratio, cellularity = seq(0.1,1,0.01), 130 | ploidy = seq(0.5,3,0.05)) 131 | 132 | confint <- get.ci(CP) 133 | ploidy <- confint$max.ploidy 134 | cellularity <- confint$max.cellularity 135 | #detect copy number alteration on the segments: 136 | cn.alleles <- baf.bayes(Bf = seg.s1$Bf, depth.ratio = seg.s1$depth.ratio, 137 | cellularity = cellularity, ploidy = ploidy, avg.depth.ratio = 1) 138 | 139 | seg.s1 <- cbind(seg.s1, cn.alleles) 140 | 141 | # Chromosome view with estimated paramenters: 142 | chromosome.view(mut.tab = mut.tab[mut.tab$chromosome == "1",], 143 | baf.windows = seqz.b.win[[1]], ratio.windows = seqz.r.win[[1]], 144 | min.N.ratio = 1, segments = seg.s1[seg.s1$chromosome == "1",], 145 | main = "Chromosome 1", cellularity = cellularity, ploidy = ploidy, 146 | avg.depth.ratio = 1, BAF.style = "lines") 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /man/cp.plot.Rd: -------------------------------------------------------------------------------- 1 | \name{cp.plot} 2 | \alias{cp.plot} 3 | \alias{cp.plot.contours} 4 | \alias{get.ci} 5 | 6 | \title{Plot log-posterior probability for the output of the 7 | \code{\link{sequenza.fit}} function} 8 | 9 | \description{ 10 | This function uses the \code{\link{colorgram}} function from the package 11 | \pkg{squash} to plot log-posterior probability for the tested combinations of 12 | cellularity and ploidy 13 | } 14 | 15 | \usage{ 16 | cp.plot(cp.table, xlab = "Ploidy", ylab = "Cellularity", 17 | zlab = "Scaled rank LPP", 18 | colFn = colorRampPalette(c('white', 'lightblue')), ...) 19 | cp.plot.contours(cp.table, likThresh = c(0.95), alternative = TRUE, 20 | col = palette(), legend.pos = "bottomright", pch = 18, 21 | alt.pch = 3, ...) 22 | get.ci(cp.table, level = 0.95) 23 | } 24 | 25 | \arguments{ 26 | \item{cp.table}{list, as output from \code{\link{baf.model.fit}} or \code{\link{mufreq.model.fit}}.} 27 | \item{xlab}{xlab parameter as in the function \code{\link{colorgram}}.} 28 | \item{ylab}{ylab parameter as in the function \code{\link{colorgram}}.} 29 | \item{zlab}{zlab parameter as in the function \code{\link{colorgram}}.} 30 | \item{colFn}{colFn parameter as in the function \code{\link{colorgram}}.} 31 | \item{likThresh}{vector of quantiles to define tresholds for the confindent regions.} 32 | \item{alternative}{boolean parameter, if \code{TRUE} the alternative solutions are computed and plotted.} 33 | \item{col}{vector of colors.} 34 | \item{legend.pos}{position for placing the legend.} 35 | \item{pch}{character used to indicate the point estimate.} 36 | \item{alt.pch}{if \code{alternative} is set to \code{TRUE} defines the character to indicate alternative solutions.} 37 | \item{...}{additional arguments accepted by the function \code{\link{colorgram}} for \code{cp.plot}, or \code{\link{contour}} for \code{cp.plot.contours}.} 38 | \item{level}{decimal value of the confidence interval} 39 | } 40 | 41 | \value{ 42 | The \code{get.ci} function returns a list with 6 items: 43 | \item{values.ploidy}{matrix of ploidy values with respective posterior probability.} 44 | \item{confint.ploidy}{boundaries of the confidence interval of the estimated ploidy.} 45 | \item{max.ploidy}{point estimate of the ploidy value that has the maximum posterior probability.} 46 | \item{values.cellularity}{matrix of cellularity values with respective posterior probability.} 47 | \item{confint.cellularity}{boundaries of the confidence interval of the estimated cellularity.} 48 | \item{max.cellularity}{point estimate of the cellularity value that has the maximum posterior probability.} 49 | } 50 | 51 | \examples{ 52 | 53 | data(CP.example) 54 | cp.plot(CP.example) 55 | cp.plot.contours(CP.example, add = TRUE) 56 | 57 | # Plot more contours 58 | cp.plot(CP.example) 59 | cp.plot.contours(CP.example, likThresh = c(0.95, 0.9999), add = TRUE) 60 | 61 | # Return the 95% confidence interval 62 | CP.example.ci <- get.ci(CP.example) 63 | str(CP.example.ci) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /man/example.seqz.Rd: -------------------------------------------------------------------------------- 1 | \name{example.seqz} 2 | \docType{data} 3 | \alias{example.seqz} 4 | 5 | \title{Example \dQuote{seqz} data} 6 | 7 | \description{ 8 | The \dQuote{seqz} file is produced by \command{sequenza-utils} and 9 | typically has the file extension \file{.seqz}. The data here is 10 | representative of a seqz file derived from an exome-sequenced tumor sample, 11 | such as could be obtained from TCGA. 12 | } 13 | 14 | \usage{ 15 | data(example.seqz) 16 | } 17 | 18 | \format{ 19 | A data frame with 53937 rows and 14 columns: 20 | \tabular{rll}{ 21 | [,1] \tab chromosome \tab Chromosome name \cr 22 | [,2] \tab position \tab Base position \cr 23 | [,3] \tab base.ref \tab Base in the reference genome\cr 24 | [,4] \tab depth.normal \tab Read depth in the normal sample\cr 25 | [,5] \tab depth.tumor \tab Read depth in the tumor sample\cr 26 | [,6] \tab depth.ratio \tab Ratio of \code{depth.tumor} and \code{depth.normal}\cr 27 | [,7] \tab Af \tab A-allele frequency in the tumor sample\cr 28 | [,8] \tab Bf \tab B-allele frequency in the tumor sample, 29 | in heterozygous positions only\cr 30 | [,9] \tab zygosity.normal\tab Zygosity of the normal sample: 31 | "hom" for homozygous or "het" for heterozygous\cr 32 | [,10] \tab GC.percent \tab \% GC content\cr 33 | [,11] \tab good.reads \tab Number of reads from the tumor sample 34 | which pass the quality threshold \cr 35 | [,12] \tab AB.normal \tab Base(s) found in the normal sample, 36 | sorted by allele frequency if more than one\cr 37 | [,13] \tab AB.tumor \tab Base(s) found in the tumor sample \emph{but not} in 38 | the normal specimen, with their observed 39 | frequencies, separated by colons \cr 40 | [,14] \tab tumor.strand \tab Identical to \code{AB.tumor} but indicating, for 41 | each variant base, the fraction of reads 42 | oriented in the forward direction \cr 43 | } 44 | } 45 | 46 | \source{ 47 | This is derived from a TCGA specimen, but has been scrambled to anonymize the source. 48 | The reference genome is hg19. The GC content was calculated in 50-base windows. 49 | } 50 | 51 | \details{ 52 | \code{example.seqz} can be loaded in the standard R way via 53 | \code{data(example.seqz)}, or it can be read from a text file using 54 | \code{\link{read.seqz}}. The former is useful for examples and testing, 55 | whereas the latter is representative of the standard workflow. 56 | } 57 | 58 | \keyword{datasets} 59 | -------------------------------------------------------------------------------- /man/gc.Rd: -------------------------------------------------------------------------------- 1 | \name{gc.sample.stats} 2 | \alias{gc.sample.stats} 3 | \alias{gc.summary.plot} 4 | \alias{mean_gc} 5 | \alias{median_gc} 6 | 7 | 8 | \title{Collect display and correct GC-content related coverage bias} 9 | 10 | \description{ 11 | Collect information and perform statistics of depth of coverage in 12 | relation with GC-content. 13 | } 14 | 15 | \usage{ 16 | gc.sample.stats(file, col_types = "c--dd----d----", buffer = 33554432, 17 | parallel = 2L, verbose = TRUE) 18 | gc.summary.plot(gc_list, mean.col = 1, median.col = 2, 19 | scale.subset = 1.5, ...) 20 | mean_gc(gc_list) 21 | median_gc(gc_list) 22 | } 23 | 24 | \arguments{ 25 | \item{file}{name of a file in the seqz format.} 26 | \item{col_types}{a string describing the classes of each columns of the 27 | input file (see \code{\link{read_tsv}}). The default value corresponds 28 | to the columns of a seqz file used for carculating GC statistics.} 29 | \item{buffer}{maximal size of each chunk in bytes(see 30 | \code{\link{chunk.apply}}).} 31 | \item{parallel}{integer, number of threads used to process a seqz file 32 | (see \code{\link{chunk.apply}}).} 33 | \item{verbose}{logical. If TRUE (the default) the function retuns information 34 | in the console.} 35 | \item{gc_list}{a normal or tumor list resulting from the 36 | \code{\link{gc.sample.stats}} function.} 37 | \item{mean.col}{color for the mean in the summary plot.} 38 | \item{median.col}{color for the median in the summary plot.} 39 | \item{scale.subset}{scale the depth values to sho in the plot. A value of 1 40 | will show the average depth at the center of the plot.} 41 | \item{...}{additional parametrers from \code{\link{colorgram}}.} 42 | } 43 | 44 | \details{ 45 | \code{gc.sample.stats} extracts depths and GC-content inforation for the 46 | tumor and the control samples from an seqz file 47 | it returns a list with 3 elements: \code{file.metrics}, \code{normal} and 48 | \code{tumor}. 49 | 50 | \code{file.metrics} is a \code{data.frame} serving as index of the seqz 51 | file; the \code{normal} and \code{tumor} objects contains each 3 ojects: 52 | \code{gc}, \code{depth} and \code{n}. 53 | 54 | \code{gc} and \code{depth} are vectors containing the recorded values of, 55 | respectively, GC and coverage depth. the \code{n} object is a matrix 56 | \code{gc}x\code{depth}, recording the number of time a certain 57 | \code{gc}/\code{depth} pairs is observed in the data. 58 | 59 | } 60 | 61 | \value{ 62 | A list with the following elements: 63 | \item{file.metrics}{index of the seqz file.} 64 | \item{tumor}{GC and coverage depth observations in the tumor sample.} 65 | \item{normal}{GC and coverage depth observations in the control sample.} 66 | } 67 | 68 | 69 | \examples{ 70 | 71 | \dontrun{ 72 | 73 | data.file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 74 | # read all the chromosomes: 75 | gc_info <- gc.sample.stats(data.file) 76 | 77 | # mean values of depth coverage vs GC content 78 | 79 | mean_gc(gc_info$normal) 80 | 81 | # plot the information for the tumor and normal samples 82 | par(mfrow=c(1, 2)) 83 | gc.summary.plot(gc_info$normal, main = "Normal GC stats") 84 | gc.summary.plot(gc_info$tumor, main = "Tumor GC stats") 85 | } 86 | } -------------------------------------------------------------------------------- /man/model_points.Rd: -------------------------------------------------------------------------------- 1 | \name{model.points} 2 | 3 | \alias{model.points} 4 | \alias{baf.model.points} 5 | \alias{mufreq.model.points} 6 | 7 | \title{Generate B-allele frequency, mutation frequency and depth ratios 8 | at given model points, cellularity and ploidy values} 9 | 10 | \description{ 11 | The \code{baf.model.points} and \code{mufreq.model.points} functions 12 | combine \code{theoretical_baf}, \code{theoretical_mufreq} and 13 | \code{theoretical_depth_ratio} to model the theoretical respective values 14 | at known values of cellularity and ploidy. 15 | } 16 | 17 | \usage{ 18 | baf.model.points(cellularity, ploidy, baf_types, avg.depth.ratio) 19 | mufreq.model.points(cellularity, ploidy, mufreq_types, avg.depth.ratio) 20 | } 21 | 22 | \arguments{ 23 | \item{cellularity}{fraction of tumor cells in the sample.} 24 | \item{ploidy}{2 * ratio between total DNA content in a tumor cell 25 | and a normal cell.} 26 | \item{baf_types}{matrix with the sets of copy numbers and number of 27 | mutated alleles over which to model mutation frequency and depth ratio. 28 | The matrix can be generated with \code{\link{baf.types.matrix}}.} 29 | \item{mufreq_types}{matrix with the sets of copy numbers and number of 30 | mutated alleles over which to model mutation frequency and depth ratio. 31 | The matrix can be generated with \code{\link{mufreq.types.matrix}}.} 32 | \item{avg.depth.ratio}{average normalized depth ratio.} 33 | } 34 | \value{ 35 | For \code{baf.model.points} a data.frame with two columns: 36 | \item{BAF}{modelled values of B-allele frequency.} 37 | \item{depth_ratio}{modelled values of depth ratio.} 38 | For \code{mufreq.model.points} a data.frame with two columns: 39 | \item{mufreqs}{modelled values of mutation frequency.} 40 | \item{depth_ratio}{modelled values of depth ratio.} 41 | } 42 | 43 | \details{ 44 | The \code{baf.model.points} and \code{mufreq.model.points} functions 45 | generate the theoretical values of B-allele frequency, mutation frequency 46 | and depth ratio for the given type tags. To learn more about type tags 47 | see \code{\link{types.matrix}}. 48 | } 49 | 50 | \seealso{ 51 | \code{\link{types.matrix}}, \code{\link{theoretical.depth.ratio}}, 52 | \code{\link{theoretical.baf}} \code{\link{theoretical.mufreq}}. 53 | } 54 | 55 | \examples{ 56 | # Simulate a cellularity of 0.5, ploidy of 2 and types from min CNt 0 57 | # and max = 4 on an originally diploid genome: 58 | types <- baf.types.matrix(CNt.min = 0, CNt.max = 4, CNn = 2) 59 | cbind(types, baf.model.points(cellularity = 0.5, ploidy = 2, 60 | baf_types = types, avg.depth.ratio = 1)) 61 | # Simulate a cellularity of 0.5, ploidy of 2 and types from min CNt 0 62 | # and max = 4 on an originally monoallelic genome: 63 | types <- mufreq.types.matrix(CNt.min = 0, CNt.max = 4, CNn = 1) 64 | cbind(types, mufreq.model.points(cellularity = 0.5, ploidy = 2, 65 | mufreq_types = types, avg.depth.ratio = 1)) 66 | } 67 | -------------------------------------------------------------------------------- /man/mutations.Rd: -------------------------------------------------------------------------------- 1 | \name{mutation.table} 2 | \alias{mutation.table} 3 | \title{Identify mutations} 4 | 5 | \description{ 6 | This function extracts positions from an seqz file that differ from the normal genome, applying various filters. 7 | } 8 | 9 | \usage{ 10 | mutation.table(seqz.tab, mufreq.treshold = 0.15, min.reads = 40, min.reads.normal = 10, 11 | max.mut.types = 3, min.type.freq = 0.9, min.fw.freq = 0, segments = NULL) 12 | } 13 | 14 | \arguments{ 15 | \item{seqz.tab}{an seqz table, as output from \code{\link{read.seqz}}.} 16 | \item{mufreq.treshold}{mutation frequency threshold.} 17 | \item{min.reads}{minimum number of reads above the quality threshold to accept the mutation call.} 18 | \item{min.reads.normal}{minimum number of reads used to determine the genotype in the normal sample.} 19 | \item{max.mut.types}{maximum number of different base substitutions per position. Integer from 1 to 3 (since there are only 4 different bases). Default is 3, to accept \dQuote{noisy} mutation calls.} 20 | \item{min.type.freq}{minimum frequency of aberrant types.} 21 | \item{min.fw.freq}{minimum frequency of variant reads detected in the forward strand. Setting it to 0, all the variant calls with strand frequency in the interval outside 0 and 1, margin not comprised, would be discarded.} 22 | \item{segments}{if specified, the values of depth ratio would be taken from the segments rather than from the raw data.} 23 | } 24 | 25 | \details{ 26 | Calling mutations in impure tumor samples is a difficult task, because the degree of contamination by normal cells affects the measured mutation frequency. In highly impure samples, where the normal cells comprise the major component of the sample, mutations can be so diluted that it can be difficult to distinguish them from sequencing errors. 27 | 28 | The function \code{mutation.table} tries to separate true mutations from sequencing errors, based on the given threshold. In samples with low contamination, it should even be possible to catch sub-clonal mutations using this function. 29 | 30 | This function identifes only those mutations occuring in positions that are homozygous in the normal genome. 31 | } 32 | 33 | \value{ 34 | A data frame, which in addition to some of the columns of the seqz table, contains the following two columns: 35 | \item{F}{the mutation frequency} 36 | \item{mutation}{a character representation of the mutation. For example, a mutation from \samp{A} in the normal to \samp{G} in the tumor is annotated as \samp{A>G}.} 37 | 38 | } 39 | \examples{ 40 | 41 | \dontrun{ 42 | 43 | data.file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 44 | seqz.data <- read.seqz(data.file) 45 | 46 | ## Normalize coverage by GC-content 47 | gc.stats <- gc.norm(x = seqz.data$depth.ratio, 48 | gc = seqz.data$GC.percent) 49 | gc.vect <- setNames(gc.stats$raw.mean, gc.stats$gc.values) 50 | seqz.data$adjusted.ratio <- seqz.data$depth.ratio / 51 | gc.vect[as.character(seqz.data$GC.percent)] 52 | 53 | ## Extract mutations 54 | mut.tab <- mutation.table(seqz.data, mufreq.treshold = 0.15, 55 | min.reads = 40, max.mut.types = 1, 56 | min.type.freq = 0.9) 57 | mut.tab <- na.exclude(mut.tab) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /man/plotWindows.Rd: -------------------------------------------------------------------------------- 1 | \name{plotWindows} 2 | \alias{plotWindows} 3 | \title{Plot a binned values of a chromosome} 4 | 5 | \description{ 6 | The \code{plotWindows} function visualizes a \code{data.frame} produced by 7 | the \code{windowValues} or \code{windowBf} functions. 8 | } 9 | 10 | \usage{ 11 | plotWindows(seqz.window, m.lty = 1, m.lwd = 3, m.col = "black", 12 | q.bg = "lightblue", log2.plot = FALSE, n.min = 1, xlim, ylim, 13 | add = FALSE, ...) 14 | } 15 | 16 | \arguments{ 17 | \item{seqz.window}{data frame of base-pair windows and corresponding 18 | quartiles to be plotted. A list of such data frames can be output 19 | from \code{\link{windowValues}} or \code{\link{windowBf}}.} 20 | \item{m.lty}{line type used for plotting mean values.} 21 | \item{m.lwd}{line width used for plotting mean values.} 22 | \item{m.col}{line color used for plotting mean values.} 23 | \item{q.bg}{background color for the area between the 0.25 and 0.75 24 | quartiles.} 25 | \item{log2.plot}{logical, if TRUE values are log2 scaled.} 26 | \item{n.min}{minimum number of data points required for a binned window 27 | to be plotted.} 28 | \item{xlim}{limits of the x axis.} 29 | \item{ylim}{limits of the y axis.} 30 | \item{add}{logical, if TRUE the plot will be added to an existing opened 31 | device.} 32 | \item{...}{any other arguments accepted by \code{\link{plot}}.} 33 | } 34 | 35 | \seealso{ 36 | \code{\link{chromosome.view}}, 37 | } 38 | 39 | \examples{ 40 | data.file <- system.file("extdata", "example.seqz.txt.gz", 41 | package = "sequenza") 42 | seqz.data <- read.seqz(data.file) 43 | # 1Mb windows, each window is overlapping with 1 other adjacent 44 | # window: depth ratio 45 | seqz.ratio <- windowValues(x = seqz.data$depth.ratio, 46 | positions = seqz.data$position, chromosomes = seqz.data$chromosome, 47 | window = 1e6, weight = seqz.data$depth.normal, start.coord = 1, 48 | overlap = 1) 49 | 50 | plotWindows(seqz.ratio[[1]], log2.plot = FALSE, ylab = "Depth ratio", 51 | xlab = "Position (bases)", main = names(seqz.ratio)[1], las = 1, 52 | n.min = 1, ylim = c(0, 2.5)) 53 | 54 | plotWindows(seqz.ratio[[17]], log2.plot = FALSE, ylab = "Depth ratio", 55 | xlab = "Position (bases)", main = names(seqz.ratio)[1], las = 1, 56 | n.min = 1, ylim = c(0, 2.5)) 57 | } 58 | -------------------------------------------------------------------------------- /man/read_seqz.Rd: -------------------------------------------------------------------------------- 1 | \name{read.seqz} 2 | \alias{read.seqz} 3 | \alias{read.acgt} 4 | \title{Read a seqz or acgt format file} 5 | 6 | \description{ 7 | Efficiently reads a seqz file into R. 8 | } 9 | 10 | \usage{ 11 | read.seqz(file, n_lines = NULL, col_types = "ciciidddcddccc", chr_name = NULL, 12 | buffer = 33554432, parallel = 1, 13 | col_names = c("chromosome", "position", "base.ref", "depth.normal", 14 | "depth.tumor", "depth.ratio", "Af", "Bf", "zygosity.normal", 15 | "GC.percent", "good.reads", "AB.normal", "AB.tumor", 16 | "tumor.strand"),...) 17 | } 18 | 19 | \arguments{ 20 | \item{file}{file name} 21 | \item{col_types}{a string describing the classes of each columns of the 22 | input file (see \code{\link{read_tsv}}). The default value corresponds 23 | to the columns of a seqz file.} 24 | \item{chr_name}{if specified, only the selected chromosome will be 25 | extracted instead of the entire file. For \code{tabix}-indexed files 26 | this argument can also be used to extract coordinated-selected genomic 27 | regions. E.g. \code{chr_name="5:1-1000000"} will select the first 28 | megabase of chromosome 5.} 29 | \item{n_lines}{vector of length 2 specifying the first and last line to 30 | read from the file. If specified, only the selected portion of the 31 | file will be used.} 32 | \item{buffer}{maximal size of each chunk in bytes(see 33 | \code{\link{chunk.apply}}).} 34 | \item{parallel}{integer, number of threads used to process a seqz file 35 | (see \code{\link{chunk.apply}}).} 36 | \item{col_names}{names of the columns of the seqz file. The default 37 | corresponds to the column names of a seqz file.} 38 | \item{...}{any arguments accepted by \code{read_tsv}.} 39 | } 40 | 41 | \details{ 42 | \code{read.seqz} is a function that allows to efficiently access a 43 | \code{seqz} file by chromosome or by line numbers. The function can also 44 | access coordinate specific regions with \code{tabix}-indexed \code{seqz} 45 | files. 46 | The specific content of a \code{seqz} file is explained in the \code{value} 47 | section. 48 | } 49 | 50 | \format{ 51 | A seqz file is a tab-separated text file with 14 columns and a header row. 52 | The first 3 columns are derived from the original \code{pileup} 53 | file and contain: 54 | \describe{ 55 | \item{chromosome}{the chromosome name} 56 | \item{position}{the base position} 57 | \item{base.ref}{the base in the reference genome. 58 | Note that this is NOT necessarily the same base as in 59 | the normal specimen.} 60 | The remaining 10 columns contain the following information: 61 | \item{depth.normal}{read depth observed in the normal sample} 62 | \item{depth.tumor}{read depth observed in the tumor sample} 63 | \item{depth.ratio}{ratio of \code{depth.tumor} and \code{depth.normal}} 64 | \item{Af}{A-allele frequency observed in the tumor sample} 65 | \item{Bf}{B-allele frequency observed in the tumor sample in 66 | heterozygous positions} 67 | \item{zygosity.normal}{zygosity of the reference sample. "hom" 68 | corresponds to AA or BB, whereas "het" corresponds to AB or BA} 69 | \item{GC.percent}{GC-content (percent), calculated from the reference 70 | genome in fixed nucleotide windows } 71 | \item{good.reads}{number of reads that passed the quality threshold 72 | (threshold specified in the pre-processing software), in the 73 | tumor specimen} 74 | \item{AB.normal}{base(s) found in the germline sample; for heterozygous 75 | positions AB are sorted using the values of Af and Bf respectively} 76 | \item{AB.tumor}{base(s) found in the tumor sample not present in the 77 | normal specimen. The field include all the variants found in the 78 | tumor alignment, separated by a colon. Each variant contains the 79 | base and the observed frequency} 80 | \item{tumor.strand}{frequency of the variant nucleotides detected on 81 | the forward orientation. The field have a consistent structure with 82 | \code{AB.tumor}, indicating the fraction, relative to the total 83 | number of reads presenting the specific variant, orientated 84 | in the forward direction} 85 | } 86 | } 87 | 88 | \seealso{ 89 | \code{read_delim}. 90 | } 91 | 92 | \examples{ 93 | \dontrun{ 94 | 95 | data_file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 96 | 97 | ## read chromosome 1 from an seqz file. 98 | seqz_data <- read.seqz(data_file, chr_name = 1) 99 | 100 | ## Fast access to chromosome X using the file metrics 101 | gc.stats <- gc.sample.stats(data_file) 102 | chrX <- gc.stats$file.metrics[gc.stats$file.metrics$chr == "X", ] 103 | seqz.data <- read.seqz(data_file, n_lines = c(chrX$start, chrX$end)) 104 | 105 | ## Compare the running time of the two different methods. 106 | system.time(seqz.data <- read.seqz(data_file, n_lines = c(chrX$start, chrX$end))) 107 | system.time(seqz.data <- read.seqz(data_file, chr_name = "X")) 108 | 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /man/theoretical.Rd: -------------------------------------------------------------------------------- 1 | \name{theoretical.baf} 2 | \alias{theoretical.baf} 3 | \alias{theoretical.mufreq} 4 | \alias{theoretical.depth.ratio} 5 | 6 | \title{Calculates cellularity and ploidy dependent model points} 7 | 8 | \description{ 9 | Calculates the theoretically expected values of BAF, mutation frequency 10 | or depth ratio for given values of cellularity, ploidy and copy number. 11 | } 12 | 13 | \usage{ 14 | theoretical.depth.ratio(CNt, cellularity, ploidy, CNn = 2, 15 | normal.ploidy = 2, avg.depth.ratio = 1) 16 | theoretical.baf(CNt, B, cellularity, CNn = 2) 17 | theoretical.mufreq(CNt, Mt, cellularity, CNn = 2) 18 | 19 | } 20 | 21 | \arguments{ 22 | \item{CNn}{copy number in the normal sample.} 23 | \item{CNt}{copy number in the tumor sample.} 24 | \item{B}{number of B-alleles in the tumor sample.} 25 | \item{Mt}{number of alleles carrying a mutation in the tumor sample.} 26 | \item{cellularity}{fraction of tumor cells in the sample.} 27 | \item{ploidy}{2 * ratio between total DNA content in a tumor cell 28 | and a normal cell.} 29 | \item{normal.ploidy}{ploidy value in the normal sample. Default 30 | is 2 for a diploid cell.} 31 | \item{avg.depth.ratio}{average normalized depth ratio.} 32 | } 33 | 34 | \details{ 35 | The observed B-allele frequency, depth ratio and mutation frequency are 36 | affected by the cellularity of the tumor sample, which is the inverse of 37 | the degree of contamination by normal cells. 38 | Three functions are included, which for know values of cellularity and 39 | ploidy they produce the expected values of B-allele frequency, mutation 40 | frequency or depth ratio. 41 | 42 | \code{theoretical.baf} returns a dataframe with the possible copy numbers 43 | of A and B alleles, along with their corresponding B-allele frequency and 44 | the total copy number state (always the sum of A+B). 45 | 46 | \code{theoretical.depth.ratio} returns the theoretical depth ratio at a 47 | single specific position, given values of cellularity, ploidy, the ratio 48 | between the tumor copy number and the normal copy number at that position, 49 | and the average depth ratio of the sample. 50 | 51 | \code{theoretical.mufreq} returns the theoretical mutation frequency at a 52 | single specific position, given values of cellularity, copy number in the 53 | normal and tumor samples at that position, and the number of mutated 54 | alleles. 55 | } 56 | 57 | \seealso{ 58 | \code{\link{model.points}} 59 | } 60 | -------------------------------------------------------------------------------- /man/type_matrix.Rd: -------------------------------------------------------------------------------- 1 | \name{types.matrix} 2 | \alias{types.matrix} 3 | \alias{baf.types.matrix} 4 | \alias{mufreq.types.matrix} 5 | \title{Creates a matrix of type tags} 6 | 7 | \description{ 8 | Type tags are a utensil to distinguish genomic positions by their copy 9 | number state, number A and B alleles and the number of mutated alleles. 10 | This function creates a matrix of all possible type tags, given the copy 11 | number of the normal sample and the range of possible copy numbers in 12 | the tumor sample. 13 | } 14 | 15 | \usage{ 16 | baf.types.matrix(CNt.min, CNt.max, CNn = 2) 17 | mufreq.types.matrix(CNt.min, CNt.max, CNn = 2) 18 | } 19 | 20 | \arguments{ 21 | \item{CNt.min}{minimum copy number in the tumor.} 22 | \item{CNt.max}{maximum copy number in the tumor.} 23 | \item{CNn}{copy number of the normal sample.} 24 | } 25 | 26 | \value{ 27 | \code{baf.types.matrix} returns a data.frame with the 3 columns: 28 | \item{CNn}{number of alleles in the normal sample.} 29 | \item{CNt}{numbers of alleles in the tumor sample.} 30 | \item{B}{number of B alleles in the tumor sample.} 31 | 32 | \code{mufreq.types.matrix} returns a data.frame with the 3 columns: 33 | \item{CNn}{number of alleles in the normal sample.} 34 | \item{CNt}{numbers of alleles in the tumor sample.} 35 | \item{Mt}{number of mutated alleles in the tumor sample.} 36 | } 37 | 38 | \details{ 39 | A type consists of 3 integers signifying the copy number in the normal 40 | and tumor samples and the number of B alleles (\code{baf.types.matrix}) or 41 | mutated alleles (\code{mufreq.types.matrix}). 42 | The two functions return all the possible types combination within the 43 | range of tumor copy numbers in the arguments (\code{CNt.min:CNt.max}). 44 | } 45 | 46 | \seealso{ 47 | \code{theoretical_mufreq}, \code{theoretical_depth_ratio}, 48 | \code{theoretical_baf}, \code{model_points}. 49 | } 50 | 51 | \examples{ 52 | ## Generate matrix types from 0 to 4 copy number, being the 53 | ## non-tumor chromosome diploid. 54 | baf.types.matrix(CNt.min = 0, CNt.max = 4, CNn = 2 ) 55 | 56 | ## Generate matrix types from 0 to 4 copy number, being the 57 | ## non-tumor chromosome monoploid. 58 | mufreq.types.matrix(CNt.min = 0, CNt.max = 4, CNn = 1 ) 59 | } 60 | -------------------------------------------------------------------------------- /man/windowValues.Rd: -------------------------------------------------------------------------------- 1 | \name{windowValues} 2 | \alias{windowValues} 3 | \alias{windowBf} 4 | \title{Bins sequencing data for plotting} 5 | 6 | \description{ 7 | Given a variable with corresponding genomic positions, the function bins the 8 | values in windows of a specified size and calculates weighted mean and 25th 9 | and 75th percentile for each window. The resulting object are visualized by 10 | the function \code{plotWindows}. 11 | } 12 | 13 | \usage{ 14 | windowValues(x, positions, chromosomes, window = 1e6, overlap = 0, 15 | weight = rep.int( x = 1, times = length(x)), start.coord = 1) 16 | windowBf(Af, Bf, good.reads, positions, chromosomes, window = 1e6, 17 | overlap = 0, start.coord = 1, conf = 0.95) 18 | } 19 | \arguments{ 20 | \item{x}{variable to be windowed.} 21 | \item{positions}{base-pair positions.} 22 | \item{chromosomes}{names or numbers of the chromosomes.} 23 | \item{window}{size of windows used for binning data. Smaller windows will 24 | take more time to compute.} 25 | \item{overlap}{integer defining the number of overlapping windows. Default 26 | is 0, no overlap.} 27 | \item{weight}{weights to be assigned to each value of \code{x}, usually 28 | related to the read depth.} 29 | \item{start.coord}{coordinate at which to start computing the windows. If 30 | NULL, will start at the first position available.} 31 | \item{Af}{A-allele frequency for the \code{Bf} calculation.} 32 | \item{Bf}{B-allele frequency for the \code{Bf} calculation.} 33 | \item{good.reads}{number of reads passing filter for the \code{Bf} 34 | calculation.} 35 | \item{conf}{confidence intervals of the binned \code{Bf} value.} 36 | } 37 | 38 | \details{ 39 | DNA sequencing produces an amount of data too large to be handled by 40 | standard graphical devices. In addition, for samples analyzed with older 41 | machines and with low or middle coverage (20x to 50x), measures such as 42 | read depth are subject to big variations due to technical noise. 43 | Using \code{windowValues} prior to plotting reduces the noise and the 44 | amount of data to be plotted. 45 | 46 | The binning of the B-allele frequency requires a separate function, 47 | \code{windowBf}, as the B-allele frequency calculation uses multiple 48 | values: \code{Af}, \code{Bf} and \code{good.reads}. 49 | 50 | The output of \code{windowValues} and \code{windowBf} can be used as input 51 | for \code{\link{plotWindows}}. 52 | } 53 | 54 | \value{ 55 | a list of data.frame, one per chromosome. Each data.frame contains 56 | base-pair windows covering the chromosome. Each row of the data.frame 57 | correspond to a window and its weighted mean, 25th and 75th percentiles of 58 | the input values, and the number of data points within each window. 59 | } 60 | 61 | \seealso{ 62 | \code{plotWindows} 63 | } 64 | 65 | \examples{ 66 | 67 | \dontrun{ 68 | data.file <- system.file("extdata", "example.seqz.txt.gz", 69 | package = "sequenza") 70 | seqz.data <- read.seqz(data.file) 71 | # 1Mb windows, each window is overlapping with 1 other 72 | # adjacent window: depth ratio 73 | seqz.ratio <- windowValues(x = seqz.data$depth.ratio, 74 | positions = seqz.data$position, chromosomes = seqz.data$chromosome, 75 | window = 1e6, weight = seqz.data$depth.normal, start.coord = 1, 76 | overlap = 1) 77 | 78 | seqz.hom <- seqz.data$zygosity.normal == 'hom' 79 | seqz.het <- seqz.data[!seqz.hom, ] 80 | # 1Mb windows, each window is overlapping with 1 other adjacent window: 81 | # B-allele frequency 82 | seqz.bafs <- windowValues(x = seqz.het$Bf, positions = seqz.het$position, 83 | chromosomes = seqz.het$chromosome, window = 1e6, 84 | weight = seqz.het$depth.tumor, start.coord = 1, overlap = 1) 85 | # Repeat the same operation using windowBf 86 | seqz.bafs <- windowBf(Af = seqz.het$Bf, Bf = seqz.het$Bf, 87 | good.reads = seqz.het$good.reads, positions = seqz.het$position, 88 | chromosomes = seqz.het$chromosome, window = 1e6, 89 | start.coord = 1, overlap = 1, conf = 0.95) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /man/workflow.Rd: -------------------------------------------------------------------------------- 1 | \name{sequenza} 2 | \alias{sequenza.extract} 3 | \alias{sequenza.fit} 4 | \alias{sequenza.results} 5 | \title{Sequenza convenience functions for standard analysis} 6 | 7 | \description{ 8 | These three functions are intended to be the main user interface of the package, to run several of the functions of \code{sequenza} in a standardized pipeline. 9 | } 10 | 11 | \usage{ 12 | sequenza.extract(file, window = 1e6, overlap = 1, 13 | gamma = 80, kmin = 10, gamma.pcf = 140, kmin.pcf = 40, 14 | mufreq.treshold = 0.10, min.reads = 40, min.reads.normal = 10, 15 | min.reads.baf = 1, max.mut.types = 1, min.type.freq = 0.9, 16 | min.fw.freq = 0, verbose = TRUE, chromosome.list = NULL, 17 | breaks = NULL, breaks.method = "het", assembly = "hg19", 18 | weighted.mean = TRUE, normalization.method = "mean", 19 | ignore.normal = FALSE, parallel = 1, gc.stats = NULL, 20 | segments.samples = FALSE) 21 | 22 | sequenza.fit(sequenza.extract, female = TRUE, N.ratio.filter = 10, 23 | N.BAF.filter = 1, segment.filter = 3e6, 24 | mufreq.treshold = 0.10, XY = c(X = "X", Y = "Y"), 25 | cellularity = seq(0.1,1,0.01), ploidy = seq(1, 7, 0.1), 26 | ratio.priority = FALSE, method = "baf", 27 | priors.table = data.frame(CN = 2, value = 2), 28 | chromosome.list = 1:24, mc.cores = getOption("mc.cores", 2L)) 29 | 30 | sequenza.results(sequenza.extract, cp.table = NULL, sample.id, out.dir = getwd(), 31 | cellularity = NULL, ploidy = NULL, female = TRUE, CNt.max = 20, 32 | ratio.priority = FALSE, XY = c(X = "X", Y = "Y"), 33 | chromosome.list = 1:24) 34 | } 35 | 36 | \arguments{ 37 | \item{file}{the name of the seqz file to read.} 38 | \item{window}{size of windows used when plotting mean and quartile ranges of depth ratios and B-allele frequencies. Smaller windows will take more time to compute.} 39 | \item{overlap}{integer specifying the number of overlapping windows.} 40 | \item{gamma, kmin}{arguments passed to \code{\link{aspcf}} from the \pkg{copynumber} package.} 41 | \item{gamma.pcf, kmin.pcf}{arguments passed to \code{\link{pcf}} from the \pkg{copynumber} package. The arguments are effective only when \code{breaks.method} is set to "full".} 42 | \item{mufreq.treshold}{mutation frequency threshold.} 43 | \item{min.reads}{minimum number of reads above the quality threshold to accept the mutation call.} 44 | \item{min.reads.normal}{minimum number of reads used to determine the genotype in the normal sample.} 45 | \item{min.reads.baf}{threshold on the depth of the positions included to calculate the average BAF for segment.} 46 | \item{max.mut.types}{maximum number of different base substitutions per position. Integer from 1 to 3 (since there are only 4 bases). Default is 3, to accept "noisy" mutation calls.} 47 | \item{min.type.freq}{minimum frequency of aberrant types.} 48 | \item{min.fw.freq}{minimum frequency of variant reads detected in the forward strand. Setting it to 0, all the variant calls with strand frequency in the interval outside 0 and 1, margin not comprised, would be discarded.} 49 | \item{verbose}{logical, indicating whether to print information about the chromosome being processed.} 50 | \item{chromosome.list}{vector containing the index or the names of the chromosome to include in the model fitting.} 51 | \item{breaks}{Optional data.frame in the format chrom, start.pos, end.pos, defining a pre-existing segmentation. When the argument is set the built-in segmentation will be skipped in favor of the suggested breaks.} 52 | \item{breaks.method}{Argument indicating the resolution of the segmentation. Possible values are \code{fast}, \code{het} and \code{full}, where \code{fast} allows the lower resolution and \code{full} the higher. Custom values of \code{gamma} and \code{kmin} need to be adjusted to have optimal results.} 53 | \item{assembly}{assembly version of the genome, see \code{\link{aspcf}} or \code{\link{pcf}}.} 54 | \item{weighted.mean}{boolean to select if the segments should be calculated using the read depth as weights to calculate depth ratio and B-allele frequency means.} 55 | \item{normalization.method}{string defining the operation to perform during the GC-normalization process. Possible values are \code{mean} (default) and \code{median}. A \code{median} normalization is preferable with noisy data.} 56 | \item{ignore.normal}{boolean, when set to TRUE the process will ignore the normal coverage and perform the analysis by using the normalized tumor coverage.} 57 | \item{parallel}{integer, number of threads used to process a seqz file 58 | (see \code{\link{chunk.apply}}).} 59 | \item{gc.stats}{object returned from the function \code{\link{gc.sample.stats}}. If \code{NULL} the object will be computed from the input file.} 60 | \item{segments.samples}{EXPERIMENTAL. Segment both tumor and normal samples separately, and add it to the QC plots.} 61 | \item{sequenza.extract}{a list of objects as output from the \code{sequenza.extract} function.} 62 | \item{method}{method to use to fit the data; possible values are \code{baf} to use \code{\link{baf.model.fit}} or \code{mufreq} to use the \code{\link{mufreq.model.fit}} function to fit the data.} 63 | \item{cp.table}{a list of objects as output from the \code{sequenza.fit} function.} 64 | \item{female}{logical, indicating whether the sample is male or female, to properly handle the X and Y chromosomes. Implementation only works for the human normal karyotype.} 65 | \item{CNt.max}{maximum copy number to consider in the model.} 66 | \item{N.ratio.filter}{threshold of minimum number of observation of depth ratio in a segment.} 67 | \item{N.BAF.filter}{threshold of minimum number of observation of B-allele frequency in a segment.} 68 | \item{segment.filter}{threshold segment length (in base pairs) to filter out short segments, that can cause noise when fitting the cellularity and ploidy parameters. The threshold will not affect the allele-specific segmentation.} 69 | \item{XY}{character vector of length 2 specifying the labels used for the X and Y chromosomes.} 70 | \item{cellularity}{vector of candidate cellularity parameters.} 71 | \item{ploidy}{vector candidate ploidy parameters.} 72 | \item{priors.table}{data frame with the columns \code{CN} and \code{value}, containing the copy numbers and the corresponding weights. To every copy number is assigned the value 1 as default, so every values different then 1 will change the corresponding weight.} 73 | \item{ratio.priority}{logical, if TRUE only the depth ratio will be used to determine the copy number state, while the Bf value will be used to determine the number of B-alleles.} 74 | \item{sample.id}{identifier of the sample, to be used as a prefix for saved objects.} 75 | \item{out.dir}{output directory where the files and objects will be saved.} 76 | \item{mc.cores}{legacy argument to set the number of cores, but it refers to the \code{cl} of \code{\link{pblapply}}. It uses \code{\link{mclapply}} when set to an integer.} 77 | 78 | } 79 | 80 | \details{ 81 | The first function, \code{sequenza.extract}, utilizes a range of functions from the sequenza package to read the raw data, normalize the depth.ratio for GC-content bias, perform allele-specific segmentation, filter for noisy mutations and bin the raw data for plotting. The computed objects are returned as a single list object. 82 | 83 | The segmentation by default is performed using only the heterozygous position and the \code{\link{aspcf}} function from \pkg{copynumber} package. The \code{full} option in the \code{breaks.method} argument allow to combine results of the segmentation of all the data available, using the \code{\link{pcf}} function, and the default \code{\link{aspcf}} using only the heterozygous positions. 84 | 85 | The second function, \code{sequenza.fit}, accepts the output from \code{sequenza.extract} and calls \code{\link{baf.model.fit}} to calculate the log-posterior probability for all pairs of the candidate ploidy and cellularity parameters. 86 | 87 | The third function, \code{sequenza.results}, saves a number of objects in a specified directory (default is the working directory). The objects are: 88 | \itemize{ 89 | \item The list of segments with resulting copy numbers and major and minor alleles. 90 | \item The candidate mutation list with variant allele frequency, and copy number and number of mutated allele, in relation of the clonal population (for sub-clonal population it needs to be processed with further methods). 91 | \item A plot of all the chromosomes in one image, representing the major and minor alleles and the absolute copy number changes (genome_view). 92 | \item Multiple plots with one chromosome per image, representing copy-number, B-allele frequency and mutation in parallel (chromosome_view). 93 | \item Results of the model fitting (CP_contours and confints_CP) 94 | \item A summary of the copy number state of the sample (CN_bars). 95 | } 96 | } 97 | 98 | \seealso{ 99 | \code{\link{genome.view}}, \code{\link{baf.bayes}}, \code{\link{cp.plot}}, \code{\link{get.ci}}. 100 | } 101 | 102 | \examples{ 103 | \dontrun{ 104 | 105 | data.file <- system.file("extdata", "example.seqz.txt.gz", 106 | package = "sequenza") 107 | test <- sequenza.extract(data.file) 108 | test.CP <- sequenza.fit(test) 109 | sequenza.results(test, test.CP, out.dir = "example", 110 | sample.id = "example") 111 | 112 | } 113 | } -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(sequenza) 3 | 4 | test_check("sequenza") 5 | -------------------------------------------------------------------------------- /tests/testthat/test.model.R: -------------------------------------------------------------------------------- 1 | context("Test sequenza models") 2 | 3 | 4 | test_that("Testing theoretical depth ratio", { 5 | r1 <- theoretical.depth.ratio(CNt = 3, cellularity = 0.5, ploidy = 2) 6 | r2 <- theoretical.depth.ratio(CNt = 3, cellularity = 0.8, ploidy = 2) 7 | r3 <- theoretical.depth.ratio(CNt = 4, cellularity = 0.8, ploidy = 4) 8 | expect_equal(r1, 1.25) 9 | expect_equal(r2, 1.4) 10 | expect_equal(r3, 1) 11 | }) 12 | 13 | test_that("Testing theoretical BAF", { 14 | r1 <- theoretical.baf(CNt = 4, B = 2, cellularity = 0.5) 15 | r2 <- theoretical.baf(CNt = 3, B = 1, cellularity = 0.8) 16 | r3 <- theoretical.baf(CNt = 4, B = 0, cellularity = 0.8) 17 | expect_equal(r1, 0.5) 18 | expect_equal(round(r2, 3), 0.357) 19 | expect_equal(round(r3, 3), 0.056) 20 | }) 21 | 22 | test_that("Testing theoretical mufreq", { 23 | r1 <- theoretical.mufreq(CNt = 4, Mt = 2, cellularity = 0.5) 24 | r2 <- theoretical.mufreq(CNt = 3, Mt = 1, cellularity = 0.8) 25 | r3 <- theoretical.mufreq(CNt = 4, Mt = 0, cellularity = 0.8) 26 | expect_equal(round(r1, 3), 0.333) 27 | expect_equal(round(r2, 3), 0.286) 28 | expect_equal(r3, 0) 29 | }) 30 | 31 | test_that("Testing baf types matrix", { 32 | x <- 3 33 | CNts <- do.call(c, sapply(1:x, FUN = function(x) rep(x, x))) 34 | Bs <- do.call(c, sapply(1:x, FUN = function(x) { 35 | y <- x:1 36 | z <- rep(x, x) 37 | z - y 38 | } 39 | )) 40 | mat_b <- data.frame(CNn = rep(2, length(Bs) + 1), 41 | CNt = c(0, CNts), B = c(0, Bs)) 42 | mat_b <- mat_b[mat_b[, "B"] <= mat_b[, "CNt"] / 2, ] 43 | r1 <- baf.types.matrix(CNt.min = 0, CNt.max = 3, CNn = 2) 44 | expect_equal(r1, mat_b) 45 | }) 46 | 47 | test_that("Testing mufreq types matrix", { 48 | x <- 3 49 | CNts <- do.call(c, sapply(1:x, FUN = function(x) rep(x, x + 1))) 50 | Mts <- do.call(c, sapply(1:x, FUN = function(x) 0:x)) 51 | mat_m <- data.frame(CNn = rep(2, length(Mts) + 1), 52 | CNt = c(0, CNts), Mt = c(0, Mts)) 53 | r1 <- mufreq.types.matrix(CNt.min = 0, CNt.max = 3, CNn = 2) 54 | expect_equal(r1, mat_m) 55 | }) 56 | -------------------------------------------------------------------------------- /tests/testthat/test.read.R: -------------------------------------------------------------------------------- 1 | context("Test read seqz data") 2 | seqz_file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 3 | col_names <- c("chromosome", "position", "base.ref", "depth.normal", 4 | "depth.tumor", "depth.ratio", "Af", "Bf", "zygosity.normal", 5 | "GC.percent", "good.reads", "AB.normal", "AB.tumor", 6 | "tumor.strand") 7 | 8 | test_that("Testing read seqz data", { 9 | full_file <- read.seqz(seqz_file) 10 | expect_equal(colnames(full_file), col_names) 11 | }) 12 | 13 | test_that("Testing tbi presence", { 14 | tbi <- file.exists(paste(seqz_file, "tbi", sep = ".")) 15 | expect_equal(tbi, TRUE) 16 | }) 17 | 18 | test_that("Reading single chromosome", { 19 | chr_17 <- read.seqz(seqz_file, chr_name = "17") 20 | expect_equal(dim(chr_17), c(2988, 14)) 21 | }) 22 | 23 | test_that("Reading selected region", { 24 | chr_17_p <- read.seqz(seqz_file, chr_name = "17:7500000-7600000") 25 | expect_equal(dim(chr_17_p), c(13, 14)) 26 | }) 27 | -------------------------------------------------------------------------------- /vignettes/sequenza.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sequenza User Guide" 3 | date: "`r Sys.Date()`" 4 | author: "[Francesco Favero](mailto:favero.francesco@gmail.com)" 5 | output: 6 | rmdformats::readthedown: 7 | self_contained: true 8 | vignette: > 9 | %\VignetteIndexEntry{Sequenza User Guide} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | # About 15 | 16 | > Sequenza: Copy Number Estimation from Tumor Genome Sequencing Data 17 | 18 | 19 | ![](https://bytebucket.org/sequenzatools/icons/raw/324bd43ac4d10546b64b04c38d8c513e8420346d/svg/sequenza_tools/sequenzaalpha_150.svg) 20 | 21 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/sequenza)](https://cran.r-project.org/package=sequenza) 22 | [![CRAN_Downloads_Badge](http://cranlogs.r-pkg.org/badges/sequenza)](https://cran.r-project.org/package=sequenza) 23 | [![CRAN_licence](https://img.shields.io/cran/l/sequenza.svg)](https://www.gnu.org/licenses/gpl-3.0.txt) 24 | 25 | 26 | 27 | Sequenza is a tool to analyze genomic sequencing data from paired normal-tumor samples, including cellularity and ploidy estimation; mutation and copy number (allele-specific and total copy number) detection, quantification and visualization. 28 | 29 | # Introduction 30 | 31 | Deep sequence of tumor DNA along with corresponding normal DNA can provide a 32 | valuable perspective on the mutations and aberrations that characterize the 33 | tumor. However, analysis of this data can be impeded by tumor cellularity and 34 | heterogeneity and by unwieldy data. Here we describe *Sequenza*, an R package 35 | that enables the efficient estimation of tumor cellularity and ploidy, and 36 | generation of copy number, loss-of-heterozygosity, and mutation frequency 37 | profiles. 38 | 39 | This document details a typical analysis of matched tumor-normal exome sequence 40 | data using *sequenza*. 41 | 42 | # Getting started 43 | 44 | ## Minimum requirements 45 | - Software: R, Python, SAMtools, tabix 46 | - Operating system: Linux, OS X, Windows 47 | - Memory: Minimum 4 GB of RAM. Recommended >8 GB. 48 | - Disk space: 1.5 GB for sample (depending on sequencing depth) 49 | - R version: 3.2.0 50 | - Python version: 2.7, 3.4, 3.5, 3.6 (or PyPy) 51 | 52 | ## Installation 53 | 54 | The R package can be installed by: 55 | 56 | ```r 57 | setRepositories(graphics = FALSE, ind = 1:6) 58 | install.packages("sequenza") 59 | ``` 60 | 61 | To install the Python companion package *sequenza-utils* to preprocess BAM 62 | files, refer to the [*sequenza-utils*](https://pypi.org/project/sequenza-utils) 63 | project page, or simply use the python package manager from the command prompt: 64 | 65 | ```bash 66 | pip install sequenza-utils 67 | ``` 68 | 69 | # Running sequenza 70 | 71 | ## Preprocessing of input files 72 | 73 | In order to obtain precise mutational and aberration patterns in a tumor sample, 74 | Sequenza requires a matched normal sample from the same patient. Typically, the 75 | following files are needed to get started with Sequenza: 76 | 77 | - A BAM file (or a derived pileup file) from the tumor specimen. 78 | - A BAM file (or a derived pileup file) from the normal specimen. 79 | - A FASTA reference genomic sequence file 80 | 81 | 82 | The normal and tumor BAM files are processed together to generate a *seqz* file, which 83 | is the required input for the analysis. 84 | It is possible to generate a *seqz* starting from other processed data, such as 85 | pileup, or VCF files. The available options are described in the 86 | [*sequenza-utils*](http://sequenza-utils.readthedocs.io/) manual pages. 87 | 88 | The *sequenza-utils* command provides various tools; here we highlight only the 89 | basic usage: 90 | 91 | - Process a FASTA file to produce a GC [Wiggle](https://genome.ucsc.edu/goldenpath/help/wiggle.html) 92 | track file: 93 | 94 | ```bash 95 | sequenza−utils gc_wiggle −w 50 --fasta hg19.fa -o hg19.gc50Base.wig.gz 96 | ``` 97 | 98 | - Process BAM and Wiggle files to produce a *seqz* file: 99 | 100 | ```bash 101 | sequenza−utils bam2seqz -n normal.bam -t tumor.bam --fasta hg19.fa \ 102 | -gc hg19.gc50Base.wig.gz -o out.seqz.gz 103 | ``` 104 | 105 | - Post-process by binning the original *seqz* file: 106 | 107 | ```bash 108 | sequenza−utils seqz_binning --seqz out.seqz.gz -w 50 -o out small.seqz.gz 109 | ``` 110 | 111 | ## Sequenza analysis (in R) 112 | 113 | ```{r load} 114 | library(sequenza) 115 | ``` 116 | 117 | In the package is provided a small *seqz* file 118 | 119 | ```{r data} 120 | data.file <- system.file("extdata", "example.seqz.txt.gz", package = "sequenza") 121 | ``` 122 | 123 | ```{r cp_data, echo=FALSE, message=FALSE} 124 | library(sequenza) 125 | data(CP.example) 126 | CP <- CP.example 127 | ``` 128 | 129 | 130 | 131 | The main interface consists of 3 functions: 132 | 133 | - sequenza.extract: process seqz data, normalization and segmentation 134 | ```{r extract, message=FALSE, warning=FALSE, results="hide"} 135 | test <- sequenza.extract(data.file, verbose = FALSE) 136 | ``` 137 | 138 | - sequenza.fit: run grid-search approach to estimate cellularity and ploidy 139 | ```{r fit, eval=FALSE} 140 | CP <- sequenza.fit(test) 141 | ``` 142 | 143 | - sequenza.results: write files and plots using suggested or selected solution 144 | ```{r results} 145 | sequenza.results(sequenza.extract = test, 146 | cp.table = CP, sample.id = "Test", 147 | out.dir="TEST") 148 | ``` 149 | 150 | # Plots and Results 151 | 152 | The function _sequenza.results_ outputs various files in the specified path. 153 | The resulting files are either output in pdf of in plain text. The files include 154 | quality control assessments (eg evaluate GC-correction), visualization of the 155 | data and files such as segmentation with copy number calling and mutation lists. 156 | 157 | ## Result files 158 | 159 | Each generated file is briefly explained in the following table 160 | 161 | ```{r res_dir, echo=FALSE} 162 | 163 | # create results list 164 | # 165 | res_list <- c("alternative_fit.pdf", "alternative_solutions.txt", 166 | "chromosome_depths.pdf", "chromosome_view.pdf", 167 | "CN_bars.pdf", "confints_CP.txt", 168 | "CP_contours.pdf", "gc_plots.pdf", 169 | "genome_view.pdf", "model_fit.pdf", 170 | "mutations.txt", "segments.txt", 171 | "sequenza_cp_table.RData", "sequenza_extract.RData", 172 | "sequenza_log.txt") 173 | res_list <- paste("Test", res_list, sep = "_") 174 | 175 | description_list <- c( 176 | "Alternative solution fir to the segments. One solution per slide", 177 | "List of all ploidy/cellularity alternative solution", 178 | "Visualization of sequencing coverage in the normal and in the tumor samples, before and after normalization", 179 | "Visualization per chromosome of depth.ratio, B-allele frequency and mutations, using the selected or estimated solution. One chromosome per slide", 180 | "Bar plot representing the percentage of genome in the detected copy number states", 181 | "Table of the confidence inerval of the best solution from the model", 182 | "Visualization of the likelihood density for each pair of cellularity/ploidy solution. The local maximum-likelihood points and confidence interval of the best estimate are also visualized", 183 | "Visualization of the GC correction in the normal and in the tumor sample", 184 | "Genome-whide visualization of the allele-specific and absolute copy number results, and raw profile of the depth ratio and allele frequency", 185 | "model_fit.pdf", 186 | "Table with mutation and estimated number of mutated alleles (Mt)", 187 | "Table listing the detected segments, with estimated copy number state at each sement", 188 | "RData object dump of the maxima a posteriori computation", 189 | "RData object dump of all the sample information", 190 | "Log with version and time information") 191 | knitr::kable(data.frame(Files = res_list, Description = description_list)) 192 | #dir("TEST", pattern = "Test") 193 | 194 | ``` 195 | 196 | ```{r read_segs, echo=FALSE} 197 | seg.tab <- read.table("TEST/Test_segments.txt", 198 | header = TRUE, sep ="\t") 199 | 200 | alt_res <- read.table("TEST/Test_alternative_solutions.txt", 201 | header = TRUE, sep ="\t") 202 | seg.tab <- seg.tab[seg.tab$CNt <= 4, ] 203 | is.num <- sapply(seg.tab, is.numeric) 204 | seg.tab[is.num] <- lapply(seg.tab[is.num], round, 3) 205 | ``` 206 | 207 | ## Segments results 208 | 209 | The segmentation file with the allele-specific copy number calling is one of 210 | the main result of the analysis. A sample of the file is shown in the table below: 211 | 212 | ```{r head_segs, echo=FALSE} 213 | knitr::kable(head(seg.tab)) 214 | 215 | ``` 216 | 217 | 218 | The columns represents: 219 | 220 | 1. **chromosome**: Chromosome 221 | 2. **start.pos**: Start position of the segment 222 | 3. **end.pos**: End position of the segment 223 | 4. **Bf**: B-allele frequency value 224 | 5. **N.BAF**: Number of observation to compute _Bf_ in the segment 225 | 6. **sd.BAF**: Standard deviation of _Bf_ 226 | 7. **depth.ratio**: Adjusted and normalized depth ratio tumor / normal 227 | 8. **N.ratio**: Number of observation to compute _depth.ratio_ in the segment 228 | 9. **sd.ratio**: Standard deviation of _depth.rati_ 229 | 10. **CNt**: Estimated total copy number value 230 | 11. **A**: Estimated number of A-alleles 231 | 12. **B**: Estimated number of B-alleles (minor allele) 232 | 13. **LPP**: Log-posterior probability of the segment 233 | 234 | ## Gene wide overview 235 | 236 | ### Allele-specific copy number 237 | 238 | ```{r g_view, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'} 239 | sequenza:::genome.view(seg.tab) 240 | ``` 241 | 242 | ### Total copy number 243 | 244 | ```{r g_view_tot, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'} 245 | sequenza:::genome.view(seg.tab, info.type = "CNt") 246 | ``` 247 | 248 | ### Raw profile 249 | 250 | ```{r g_view_raw, echo=FALSE, fig.height=5, fig.width=10, fig.align='center'} 251 | sequenza:::plotRawGenome(test, cellularity = alt_res$cellularity[1], 252 | ploidy = alt_res$ploidy[1]) 253 | ``` 254 | 255 | 256 | ## Grid search maximum likelihood 257 | 258 | 259 | ```{r CPplot, echo=TRUE, fig.height=5, fig.width=5, fig.align='center'} 260 | cp.plot(CP) 261 | cp.plot.contours(CP, add = TRUE, 262 | likThresh = c(0.999, 0.95), 263 | col = c("lightsalmon", "red"), pch = 20) 264 | ``` 265 | 266 | ## Chromosome view 267 | 268 | _Chromosome view_ is the visualization that displays chromosome by crhosome, nutations, 269 | B-allele frequency and depth-ratio. 270 | The visualization makes it easier to ispect the segmentation results, comparing to 271 | a binned profile of the raw data. 272 | It also visualize the copy number calling using the _cellularity_ and _ploidy_ solution, 273 | making useful to asses if the copy number calling is acurate. 274 | In addition it provides a visualization of the mutation frequency that can also help to 275 | corroborate the solution. 276 | 277 | ```{r c_view, echo=TRUE, fig.height=6, fig.width=8, fig.align='center'} 278 | chromosome.view(mut.tab = test$mutations[[1]], baf.windows = test$BAF[[1]], 279 | ratio.windows = test$ratio[[1]], min.N.ratio = 1, 280 | segments = test$segments[[1]], 281 | main = test$chromosomes[1], 282 | cellularity = 0.89, ploidy = 1.9, 283 | avg.depth.ratio = 1) 284 | ``` 285 | --------------------------------------------------------------------------------