├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── getCounts.R ├── plot_mostdepth.R ├── plot_mostdepth_tn.R ├── prepAscat.R ├── prepAscat_t.R └── segment_logR.R ├── README.md ├── ezASCAT.Rproj ├── inst └── extdata │ ├── GRCh37_SNP6.tsv.gz │ └── GRCh38_SNP6.tsv.gz ├── man ├── get_counts.Rd ├── plot_mosdepth.Rd ├── plot_mosdepth_tn.Rd ├── prep_ascat.Rd ├── prep_ascat_t.Rd └── segment_logR.Rd ├── scripts └── compile_snp6.R └── src ├── Makevars └── ntcounts.c /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: ezASCAT 2 | Title: Conveniently perform ASCAT copy-number analysis in R 3 | Version: 0.9.00 4 | Authors@R: 5 | person(given = "Anand", 6 | family = "Mayakonda", 7 | role = c("aut", "cre"), 8 | email = "anand_mt@hotmail.com", 9 | comment = c(ORCID = "0000-0003-1162-687X")) 10 | Description: This package attempts to make it easier to perform ASCAT analysis completely in R from tumor-normal pairs. 
11 | License: MIT + file LICENSE 12 | Encoding: UTF-8 13 | LazyData: true 14 | Roxygen: list(markdown = TRUE) 15 | RoxygenNote: 7.1.1 16 | LinkingTo: 17 | Rhtslib, 18 | zlibbioc 19 | SystemRequirements: GNU make 20 | depends: 21 | data.table, DNAcopy, ASCAT 22 | Imports: 23 | data.table, DNAcopy, ASCAT 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: ezASCAT authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 ezASCAT authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(get_counts)
4 | export(plot_mosdepth)
5 | export(plot_mosdepth_tn)
6 | export(prep_ascat)
7 | export(prep_ascat_t)
8 | export(segment_logR)
9 | import(DNAcopy)
10 | import(data.table)
11 | useDynLib(ezASCAT, .registration = TRUE)
12 |
--------------------------------------------------------------------------------
/R/getCounts.R:
--------------------------------------------------------------------------------
1 | #' Extract nucleotide counts from targeted loci (SNPs)
2 | #'
3 | #' Writes one `<output basename>_nucleotide_counts.tsv` file per BAM file. If all of these files
4 | #' already exist, they are read back (with a warning) instead of being regenerated.
5 | #' @param t_bam Tumor BAM file. Required
6 | #' @param n_bam Normal BAM file. Recommended
7 | #' @param build Default hg19. Mutually exclusive with `loci`. Currently supported `hg19` and `hg38` and includes ~900K SNPs from Affymetrix Genome-Wide Human SNP 6.0 Array. SNP file has no `chr` prefix.
8 | #' @param prefix Prefix to add or remove from contig names in loci file. For example, in case BAM files have no `chr` prefix.
9 | #' @param add If prefix is used, default is to add prefix to contig names in loci file. If false prefix will be removed from contig names.
10 | #' @param mapq Map quality. Default 10
11 | #' @param sam_flag SAM FLAG to filter reads. Default 1024
12 | #' @param loci A tab separated file with chr and position. If not available use `build` argument.
13 | #' @param zerobased are coordinates zero-based. Default FALSE. Use only if `loci` is used.
14 | #' @param op Output file basename. Default parses from BAM file
15 | #' @param fa Indexed fasta file. If provided, extracts and adds reference base to the output tsv.
16 | #' @param nthreads Number of threads to use. Default 4. Each chromosome will be launched on a separate thread. Works only on Unix and macOS.
17 | #' @param verbose Default TRUE
18 | #' @return A list of `data.table`s with the counts, one per BAM file, named by output basename.
19 | #' @export
20 | #' @useDynLib ezASCAT, .registration = TRUE
21 | #' @import data.table
22 |
23 | get_counts = function(t_bam = NULL, n_bam = NULL, build = "hg19", prefix = NULL, add = TRUE,
24 |                       mapq = 10, sam_flag = 1024, loci = NULL, fa = NULL, op = NULL,
25 |                       zerobased = FALSE, nthreads = 4, verbose = TRUE){
26 |
27 |   if(is.null(t_bam)) stop("Missing tumor BAM file!")
28 |   bam = c(t_bam, n_bam) # n_bam is optional; c() drops a NULL
29 |
30 |   if(is.null(loci)){
31 |     # Any build other than "hg19" falls back to the GRCh38 marker file
32 |     snp_file = if(build == "hg19") "GRCh37_SNP6.tsv.gz" else "GRCh38_SNP6.tsv.gz"
33 |     loci = system.file("extdata", snp_file, package = "ezASCAT")
34 |   }
35 |
36 |   loci = data.table::fread(input = loci)
37 |   colnames(loci)[1:2] = c("Chr", "start")
38 |
39 |   if(!is.null(prefix)){
40 |     if(add){
41 |       loci$Chr = paste0(prefix, loci$Chr)
42 |     }else{
43 |       loci$Chr = gsub(pattern = prefix, replacement = '', x = loci$Chr, fixed = TRUE)
44 |     }
45 |   }
46 |
47 |   data.table::setDF(x = loci)
48 |
49 |   if(zerobased){
50 |     loci$start = as.numeric(loci$start) + 1
51 |   }
52 |
53 |   # Validate the BAM files and derive the default output basenames from their names.
54 |   # The extension test is case-insensitive, like the extension stripping below.
55 |   bam_names = vapply(bam, function(x){
56 |     if(!grepl(pattern = "\\.bam$", x = basename(x), ignore.case = TRUE)){
57 |       stop("Input file is not a BAM file: ", x)
58 |     }
59 |
60 |     if(!file.exists(x)){
61 |       stop("BAM file does not exist: ", x)
62 |     }
63 |     gsub(pattern = "\\.bam$", replacement = "", x = basename(x), ignore.case = TRUE)
64 |   }, FUN.VALUE = character(1), USE.NAMES = FALSE)
65 |
66 |   if(is.null(op)){
67 |     op = bam_names
68 |   }else if(length(op) != length(bam)){
69 |     stop("No. of output file names must be equal to no. of BAM files.")
70 |   }
71 |   op_files = paste0(op, "_nucleotide_counts.tsv")
72 |
73 |   # Re-use existing results. The check must use the same ".tsv" paths that are written below.
74 |   if(all(file.exists(op_files))){
75 |     warning("Counts are already generated!")
76 |     res = lapply(op_files, function(x){
77 |       # skip = 1: the first line of each file is the library-size line written before the table
78 |       data.table::fread(file = x, sep = "\t", header = TRUE, skip = 1)
79 |     })
80 |     names(res) = op
81 |     return(res)
82 |   }
83 |
84 |   if(is.null(fa)){
85 |     fa = "NULL" # placeholder handed to the C routine when no fasta file is supplied
86 |   }
87 |
88 |   loci = split(loci, loci$Chr)
89 |
90 |   # One temporary loci file per chromosome, removed again when the function exits
91 |   loci_files = lapply(seq_along(loci), function(idx){
92 |     chrname = names(loci)[idx]
93 |     lfile = tempfile(pattern = paste0("chr",chrname, "_"), fileext = "_loci.tsv")
94 |     data.table::fwrite(x = loci[[idx]][,c(1:2)], file = lfile, col.names = FALSE, sep = "\t", row.names = FALSE)
95 |     lfile
96 |   })
97 |   on.exit(unlink(unlist(loci_files)), add = TRUE)
98 |
99 |   if(verbose){
100 |     cat("Fetching readcounts from BAM files..\n")
101 |   }
102 |
103 |   res = vector(mode = "list", length = length(bam))
104 |   bam_idxstats = vector(mode = "list", length = length(bam)) #Store samtools idxstats
105 |
106 |   for(i in seq_along(bam)){
107 |     b = bam[[i]]
108 |
109 |     if(verbose){
110 |       cat("Processing", basename(b), ":\n")
111 |     }
112 |
113 |     bam_counts = parallel::mclapply(loci_files, function(lfile){
114 |       chr = unlist(data.table::tstrsplit(basename(path = lfile), split = "_", keep = 1))
115 |
116 |       if(verbose){
117 |         system(paste("echo ' current chromosome:",chr,"'"))
118 |       }
119 |
120 |       opcount = tempfile(pattern = paste0(chr, "_", basename(b)), fileext = ".tsv")
121 |
122 |       suppressWarnings(invisible(.Call("ntc", b, lfile, mapq, sam_flag, fa, opcount, PACKAGE = "ezASCAT")))
123 |
124 |       # NOTE(review): presumably the C routine appends ".tsv" to `opcount` - confirm in src/ntcounts.c
125 |       paste0(opcount, ".tsv")
126 |     }, mc.cores = nthreads)
127 |
128 |     # First line of the first per-chromosome file is kept as the header line of the final output
129 |     idxstat = apply(data.table::fread(file = bam_counts[[1]], nrows = 1, sep = "\t"), 1, paste, collapse = " ")
130 |     bam_idxstats[[i]] = idxstat
131 |     res[[i]] = data.table::rbindlist(lapply(bam_counts, data.table::fread), use.names = TRUE, fill = TRUE)
132 |     lapply(bam_counts, unlink)
133 |   }
134 |
135 |   names(res) = op
136 |
137 |   for(idx in seq_along(res)){
138 |     cat(paste0(bam_idxstats[[idx]], "\n"), file = op_files[idx])
139 |     data.table::fwrite(x = res[[idx]], file = op_files[idx], append = TRUE, sep = "\t", na = "NA", quote = FALSE, col.names = TRUE)
140 |   }
141 |
142 |   res
143 | }
144 |
--------------------------------------------------------------------------------
/R/plot_mostdepth.R:
--------------------------------------------------------------------------------
1 | #' Plot results from mosdepth output
2 | #'
3 | #' Draws two panels: depth of coverage, and log2 depth centered on the autosomal median.
4 | #' @param bed mosdepth output
5 | #' @param col Colors. Default c("#95a5a6", "#7f8c8d")
6 | #' @return Invisibly returns `NULL`. Called for its plotting side effect.
7 | #' @export
8 |
9 | plot_mosdepth = function(bed = NULL, col = c("#95a5a6", "#7f8c8d")){
10 |
11 |   if(is.null(bed)) stop("Missing mosdepth output file!")
12 |
13 |   tum_cov = data.table::fread(input = bed)
14 |   colnames(tum_cov) = c("chr", "start", "end", "doc")
15 |
16 |   sex_contigs = c('X', 'Y', 'chrX', 'chrY')
17 |   contigs = c(1:22, paste0("chr", 1:22), sex_contigs)
18 |   tum_cov = tum_cov[chr %in% contigs]
19 |
20 |   # Autosomes are sorted numerically; sex chromosomes are appended afterwards
21 |   tnxy = tum_cov[chr %in% sex_contigs]
22 |   tn = tum_cov[!chr %in% sex_contigs]
23 |   tn[, chr := gsub(pattern = "chr", replacement = "", x = chr)]
24 |   tn[, chr := as.numeric(as.character(chr))]
25 |   tn = tn[order(chr, start)]
26 |   med_cov = median(tn[,doc], na.rm = TRUE)
27 |
28 |   all_depth = rbind(tn, tnxy)
29 |   colnames(all_depth)[1:3] = c("Chromosome", "Start_Position", "End_Position")
30 |
31 |   # Chromosome length = largest end coordinate seen.
32 |   # Stored as double because genome-wide sums exceed the integer range.
33 |   chr.lens.dt = all_depth[,max(End_Position, na.rm = TRUE), .(Chromosome)]
34 |   chr.lens = as.numeric(chr.lens.dt$V1)
35 |   names(chr.lens) = chr.lens.dt$Chromosome
36 |
37 |   # Offset of every chromosome on the genome-wide x-axis (0 for the first one)
38 |   chr.lens.sumsum = cumsum(chr.lens)
39 |   chr.offsets = c(0, chr.lens.sumsum[-length(chr.lens.sumsum)])
40 |   names(chr.offsets) = names(chr.lens)
41 |
42 |   #log2 median centered
43 |   all_depth[, doc_norm := log2(doc) - log2(med_cov)]
44 |
45 |   chr.shift = unname(chr.offsets[as.character(all_depth$Chromosome)])
46 |   all_depth[, Start_Position_updated := Start_Position + chr.shift]
47 |   all_depth[, End_Position_updated := End_Position + chr.shift]
48 |
49 |   cols = rep(x = col, length(chr.lens))
50 |
51 |   all_depth_spl = split(all_depth, all_depth$Chromosome)[names(chr.lens)]
52 |
53 |   cat("Plotting..")
54 |
55 |   #png(filename = paste0(basename(bed), ".png"), width = 1024, height = 600, bg = "white")
56 |
57 |   # Restore the caller's graphical parameters when done
58 |   old_par = par(mfrow = c(2, 1), mar = c(3, 3, 2, 1))
59 |   on.exit(par(old_par), add = TRUE)
60 |
61 |   plot(NA, xlim = c(0, sum(chr.lens)), ylim = c(med_cov-50, med_cov+50), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
62 |   for(idx in seq_along(all_depth_spl)){
63 |     x = all_depth_spl[[idx]]
64 |     points(x$Start_Position_updated, x$doc, pch = 19, cex = 0.5, col = cols[idx])
65 |     rect(xleft = x$Start_Position_updated[1], ybottom = med_cov,
66 |          xright = x$End_Position_updated[nrow(x)], ytop = med_cov)
67 |   }
68 |   abline(v = chr.lens.sumsum, lty = 2)
69 |   axis(side = 1, at = chr.lens.sumsum, labels = names(chr.lens))
70 |   axis(side = 2, at = pretty(c(med_cov-50, med_cov+50)), las = 2)
71 |   title(main = "DOC")
72 |
73 |   plot(NA, xlim = c(0, sum(chr.lens)), ylim = c(-3, 3), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
74 |   for(idx in seq_along(all_depth_spl)){
75 |     x = all_depth_spl[[idx]]
76 |     points(x$Start_Position_updated, x$doc_norm, pch = 19, cex = 0.5, col = cols[idx])
77 |     # After median centering the baseline is 0, not log2(med_cov)
78 |     rect(xleft = x$Start_Position_updated[1], ybottom = 0,
79 |          xright = x$End_Position_updated[nrow(x)], ytop = 0)
80 |   }
81 |   abline(v = chr.lens.sumsum, lty = 2)
82 |   axis(side = 1, at = chr.lens.sumsum, labels = names(chr.lens))
83 |   axis(side = 2, at = seq(-3, 3, 1), las = 2)
84 |   title(main = "DOC Median centered")
85 |
86 |   #dev.off()
87 |
88 |   invisible(NULL)
89 | }
--------------------------------------------------------------------------------
/R/plot_mostdepth_tn.R:
--------------------------------------------------------------------------------
1 | #' Plot results from mosdepth output for Tumor/Normal pairs
2 | #' @param t_bed mosdepth output from tumor
3 | #' @param n_bed mosdepth output from normal
4 | #' @param segment Whether to perform CBS segmentation. Default TRUE
5 | #' @param sample_name sample name. Default parses from `t_bed`
6 | #' @param col Colors. Default c("#95a5a6", "#7f8c8d")
7 | #' @return A `data.table` with the CBS segments (empty when `segment = FALSE`)
8 | #' @export
9 |
10 | plot_mosdepth_tn = function(t_bed = NULL, n_bed = NULL, segment = TRUE, sample_name = NULL, col = c("#95a5a6", "#7f8c8d")){
11 |
12 |   if(is.null(t_bed) || is.null(n_bed)) stop("Missing tumor or normal mosdepth output!")
13 |
14 |   contigs = c(1:22, "X", "Y", paste0("chr", 1:22), "chrX", "chrY")
15 |
16 |   if(is.null(sample_name)){
17 |     sample_name = gsub(x = basename(t_bed), pattern = "\\.regions\\.bed\\.gz$", replacement = "")
18 |   }
19 |
20 |   dat = lapply(X = c(t_bed, n_bed), function(x){
21 |     x = data.table::fread(input = x)
22 |     colnames(x) = c("chr", "start", "end", "doc")
23 |     x[, chr := gsub(pattern = "chr", replacement = "", x = chr)]
24 |     colnames(x)[1:3] = c("Chromosome", "Start_Position", "End_Position")
25 |     x = x[Chromosome %in% contigs]
26 |     x
27 |   })
28 |
29 |   names(dat) = c("tumor", "normal")
30 |   dat = merge(dat$tumor, dat$normal, by = c("Chromosome", "Start_Position", 'End_Position'), suffixes = c("_t", "_n"))
31 |
32 |   # Autosomes are sorted numerically; sex chromosomes are appended afterwards
33 |   dat_xy = dat[Chromosome %in% c('X', 'Y', 'chrX', 'chrY')]
34 |   datn = dat[!Chromosome %in% c('X', 'Y', 'chrX', 'chrY')]
35 |   datn[, Chromosome := as.numeric(as.character(Chromosome))]
36 |   datn = datn[order(Chromosome, Start_Position)]
37 |   dat = rbind(datn, dat_xy)
38 |
39 |   #Get chr lengths (as double: genome-wide sums exceed the integer range)
40 |   chr.lens.dt = dat[,max(End_Position, na.rm = TRUE), .(Chromosome)]
41 |   chr.lens = as.numeric(chr.lens.dt$V1)
42 |   names(chr.lens) = chr.lens.dt$Chromosome
43 |
44 |   # Offset of every chromosome on the genome-wide x-axis (0 for the first one)
45 |   chr.lens.sumsum = cumsum(chr.lens)
46 |   chr.offsets = c(0, chr.lens.sumsum[-length(chr.lens.sumsum)])
47 |   names(chr.offsets) = names(chr.lens)
48 |
49 |   # Scale the normal coverage to the tumor coverage before computing logR
50 |   map_ratio = sum(dat$doc_t, na.rm = TRUE)/sum(dat$doc_n, na.rm = TRUE)
51 |   message("Coverage ratio T/N: ", round(map_ratio, digits = 3))
52 |   dat$doc_n = dat$doc_n * map_ratio
53 |
54 |   dat[, logR := log2(doc_t+1) - log2(doc_n+1)]
55 |
56 |   chr.shift = unname(chr.offsets[as.character(dat$Chromosome)])
57 |   dat[, Start_Position_updated := Start_Position + chr.shift]
58 |   dat[, End_Position_updated := End_Position + chr.shift]
59 |
60 |   cols = rep(x = col, length(chr.lens))
61 |
62 |   all_depth_spl = split(dat, dat$Chromosome)[names(chr.lens)]
63 |
64 |   cn_segs = NULL
65 |   seg.spl.transformed = NULL
66 |   if(segment){
67 |     message("Running CBS segmentation:")
68 |     probes = data.table::rbindlist(l = all_depth_spl)
69 |     cn = DNAcopy::CNA(genomdat = probes[,logR], chrom = probes[,Chromosome], maploc = probes[,Start_Position],
70 |                       data.type = "logratio", sampleid = sample_name, presorted = TRUE)
71 |
72 |     cn = DNAcopy::smooth.CNA(cn)
73 |     cn = DNAcopy::segment(cn, alpha = 0.01, nperm = 10000, p.method = 'hybrid', min.width = 5, kmax = 25, nmin = 210,
74 |                           eta = 0.05, trim = 0.025, undo.SD = 3, undo.prune = 0.05, undo.splits = 'sdundo', verbose = 2)
75 |     cn_segs = DNAcopy::segments.p(x = cn)
76 |     colnames(cn_segs)[1:4] = c("Sample_Name", "Chromosome", "Start_Position", "End_Position")
77 |     data.table::setDT(cn_segs)
78 |     cn_segs = split(cn_segs, cn_segs$Chromosome)[names(all_depth_spl)]
79 |
80 |     # Genome-wide coordinates for plotting. rbindlist() skips chromosomes without any segment.
81 |     seg_dt = data.table::rbindlist(l = cn_segs, use.names = TRUE, fill = TRUE)
82 |     seg.shift = unname(chr.offsets[as.character(seg_dt$Chromosome)])
83 |     seg_dt[, Start_Position_updated := Start_Position + seg.shift]
84 |     seg_dt[, End_Position_updated := End_Position + seg.shift]
85 |     seg.spl.transformed = split(seg_dt, seg_dt$Chromosome)[names(chr.lens)]
86 |   }
87 |
88 |   message("Plotting")
89 |   #png(filename = paste0(sample_name, ".png"), width = 1024, height = 600, bg = "white")
90 |
91 |   # Restore the caller's graphical parameters when done
92 |   old_par = par(mar = c(4, 4, 3, 1))
93 |   on.exit(par(old_par), add = TRUE)
94 |
95 |   plot(NA, xlim = c(0, sum(chr.lens)), ylim = c(-3, 3), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
96 |   for(idx in seq_along(all_depth_spl)){
97 |     message(" Chromosome: ", names(all_depth_spl)[idx])
98 |     x = all_depth_spl[[idx]]
99 |     points(x$Start_Position_updated, x$logR, pch = 19, cex = 0.5, col = cols[idx])
100 |     if(segment){
101 |       xs = seg.spl.transformed[[idx]]
102 |       if(!is.null(xs) && nrow(xs) > 0){
103 |         rect(xleft = xs$Start_Position_updated, ybottom = xs$seg.mean,
104 |              xright = xs$End_Position_updated, ytop = xs$seg.mean, col = "maroon", lwd = 1, border = "maroon")
105 |       }
106 |     }
107 |   }
108 |   abline(v = chr.lens.sumsum, lty = 2, col = "gray70")
109 |   axis(side = 1, at = chr.lens.sumsum, labels = names(chr.lens))
110 |   axis(side = 2, at = seq(-3, 3, 1), las = 2)
111 |   mtext(text = "logR", side = 2, line = 2)
112 |   title(main = sample_name, adj = 0)
113 |   #dev.off()
114 |
115 |   data.table::rbindlist(l = cn_segs, use.names = TRUE, fill = TRUE)
116 | }
117 |
--------------------------------------------------------------------------------
/R/prepAscat.R:
--------------------------------------------------------------------------------
1 | #' Prepare input files for ASCAT
2 | #'
3 | #' Writes `<sample_name>.tumour.BAF.txt`, `<sample_name>.tumour.logR.txt`, `<sample_name>.normal.BAF.txt`
4 | #' and `<sample_name>.normal.logR.txt` to the working directory and loads them into ASCAT.
5 | #' BAF values are mirrored around 0.5 at random (`runif`); call `set.seed()` first for reproducible output.
6 | #' @param t_counts read counts from tumor generated by `get_counts`
7 | #' @param n_counts read counts from normal generated by `get_counts`
8 | #' @param sample_name Sample name. Used as a basename for output files. Default NA, parses from t_counts file.
9 | #' @param min_depth Min read depth required to consider a marker. Default 30
10 | #' @param normalize If TRUE, normalizes for library size
11 | #' @return An \code{\link[ASCAT]{ascat.loadData}} object; ascat data structure. `NULL` if ASCAT is not installed.
12 | #' @export
13 | prep_ascat = function(t_counts = NULL, n_counts = NULL, sample_name = NA, min_depth = 30, normalize = FALSE){
14 |
15 |   if(is.null(t_counts) || is.null(n_counts)) stop("Missing tumor or normal read counts!")
16 |
17 |   if(is.na(sample_name)){
18 |     sample_name = gsub(pattern = "\\.tsv$", replacement = "", x = basename(path = t_counts))
19 |   }
20 |
21 |   counts = c(t_counts, n_counts)
22 |
23 |   #library sizes (second field of the first line of each counts file)
24 |   tot_map_reads = lapply(counts, function(x){
25 |     x = data.table::fread(input = x, nrows = 1)
26 |     as.numeric(x$V2)
27 |   })
28 |   names(tot_map_reads) = c("tumor", "normal")
29 |   map_ratio = tot_map_reads$tumor/tot_map_reads$normal
30 |   message("Library sizes:")
31 |   message("Tumor: ", tot_map_reads$tumor)
32 |   message("Normal: ", tot_map_reads$normal)
33 |   message("Library size difference: ", round(map_ratio, digits = 3))
34 |   message("------")
35 |
36 |   counts = lapply(counts, function(x){
37 |     message("Counts file: ", x)
38 |     x = data.table::fread(input = x)
39 |     x[,loci := gsub(pattern = "^chr", replacement = "", x = loci)]
40 |     message("Markers: ", nrow(x))
41 |     n_dup = sum(duplicated(x$loci))
42 |     if(n_dup > 0){
43 |       message("Removed ", n_dup, " duplicated loci")
44 |       x = x[!duplicated(loci)]
45 |     }
46 |     x[, tot_depth := rowSums(x[,.(A, T, G, C)])]
47 |     x = x[tot_depth > min_depth]
48 |     message("Markers > ", min_depth, ": ", nrow(x))
49 |     message("------")
50 |     # Fraction of the most frequent base among the two most frequent bases
51 |     x[, baf := apply(x[,.(A, T, G, C)], 1, function(r) {r = sort(r); r[4]/sum(r[4], r[3])})]
52 |     # Mirror around 0.5 at random, since the value above is always >= 0.5
53 |     x$baf = ifelse(runif(length(x$baf))<0.5,x$baf,1-x$baf)
54 |     x = data.frame(data.table::tstrsplit(x = x$loci, split = ":"), x$baf, x$tot_depth, row.names = x$loci)
55 |     colnames(x) = c("chr", "pos", "BAF", "depth")
56 |     x$pos = as.numeric(as.character(x$pos))
57 |     x
58 |   })
59 |   names(counts) = c("tumor", "normal")
60 |   com_loci = intersect(rownames(counts[[1]]), rownames(counts[[2]]))
61 |   counts = lapply(counts, function(x) x[com_loci,, drop = FALSE])
62 |   message("Final number SNPs: ", nrow(counts[[1]]))
63 |
64 |   #normalize for sequencing depth (might not be the best way) and estimate logR
65 |   if(normalize){
66 |     counts[[2]][,"depth"] <- counts[[2]][,"depth"] * map_ratio
67 |   }
68 |
69 |   counts[[2]][,"logR"] <- 0
70 |   counts[[1]][,"logR"] <- round(log2(counts[[1]][,"depth"]/counts[[2]][,"depth"]), digits = 3)
71 |
72 |   # NaN and +/-Inf are written out as NA
73 |   counts = lapply(counts, function(x){
74 |     x[is.na(x[,"BAF"]), "BAF"] = NA
75 |     x[!is.finite(x[,"logR"]), "logR"] = NA
76 |     x
77 |   })
78 |
79 |   t_baf_file = paste0(sample_name, ".tumour.BAF.txt")
80 |   t_logr_file = paste0(sample_name, ".tumour.logR.txt")
81 |   n_baf_file = paste0(sample_name, ".normal.BAF.txt")
82 |   n_logr_file = paste0(sample_name, ".normal.logR.txt")
83 |
84 |   data.table::fwrite(x = counts[[1]][,c("chr", "pos", "BAF")], file = t_baf_file, sep = "\t", row.names = TRUE)
85 |   data.table::fwrite(x = counts[[1]][,c("chr", "pos", "logR")], file = t_logr_file, sep = "\t", row.names = TRUE)
86 |   data.table::fwrite(x = counts[[2]][,c("chr", "pos", "BAF")], file = n_baf_file, sep = "\t", row.names = TRUE)
87 |   data.table::fwrite(x = counts[[2]][,c("chr", "pos", "logR")], file = n_logr_file, sep = "\t", row.names = TRUE)
88 |
89 |   message("Generated following files:")
90 |   message(t_baf_file)
91 |   message(t_logr_file)
92 |   message(n_baf_file)
93 |   message(n_logr_file)
94 |   message("------")
95 |
96 |   ascat_obj = NULL
97 |   # requireNamespace() only checks this one package instead of listing every installed package
98 |   if(requireNamespace("ASCAT", quietly = TRUE)){
99 |     message("Running ASCAT::ascat.loadData()")
100 |     ascat_obj = ASCAT::ascat.loadData(Tumor_LogR_file = t_logr_file,
101 |                                       Tumor_BAF_file = t_baf_file,
102 |                                       Germline_LogR_file = n_logr_file,
103 |                                       Germline_BAF_file = n_baf_file,
104 |                                       chrs = c(1:22, "X", "Y"), sexchromosomes = c("X", "Y"))
105 |     message("Running ASCAT::ascat.plotRawData()")
106 |     ASCAT::ascat.plotRawData(ASCATobj = ascat_obj, img.prefix = sample_name)
107 |     message("Returned ASCAT object!")
108 |   }
109 |
110 |   ascat_obj
111 | }
112 |
--------------------------------------------------------------------------------
/R/prepAscat_t.R:
--------------------------------------------------------------------------------
1 | #' Prepare input files for ASCAT tumor only samples
2 | #'
3 | #' Writes `<sample_name>.tumour.BAF.txt` and `<sample_name>.tumour.logR.txt` to the working directory
4 | #' and loads them into ASCAT. logR is the log2 depth centered on the median autosomal depth.
5 | #' BAF values are mirrored around 0.5 at random (`runif`); call `set.seed()` first for reproducible output.
6 | #' @param t_counts read counts from tumor generated by `get_counts`
7 | #' @param sample_name Sample name. Used as a basename for output files. Default NA, parses from t_counts file.
8 | #' @param min_depth Min read depth required to consider a marker. Default 30
9 | #' @param PLATFORM Default AffySNP6. Only change if you have used custom loci. See here for available options https://www.crick.ac.uk/research/labs/peter-van-loo/software. Currently not used by the function body.
10 | #' @return An \code{\link[ASCAT]{ascat.loadData}} object; ascat data structure. `NULL` if ASCAT is not installed.
11 | #' @export
12 |
13 | prep_ascat_t = function(t_counts = NULL, sample_name = NA, min_depth = 30, PLATFORM = "AffySNP6"){
14 |
15 |   if(is.null(t_counts)) stop("Missing tumor read counts!")
16 |
17 |   if(is.na(sample_name)){
18 |     sample_name = gsub(pattern = "\\.tsv$", replacement = "", x = basename(path = t_counts))
19 |   }
20 |
21 |   #library size (second field of the first line of the counts file)
22 |   tot_map_reads = as.numeric(data.table::fread(input = t_counts, nrows = 1)$V2)
23 |   message("Library sizes:")
24 |   message("Tumor: ", tot_map_reads)
25 |
26 |   message("Counts file: ", basename(t_counts))
27 |   x = data.table::fread(file = t_counts)
28 |   x[,loci := gsub(pattern = "^chr", replacement = "", x = loci)]
29 |   message("Markers: ", nrow(x))
30 |   n_dup = sum(duplicated(x$loci))
31 |   if(n_dup > 0){
32 |     message("Removed ", n_dup, " duplicated loci")
33 |     x = x[!duplicated(loci)]
34 |   }
35 |   x[, tot_depth := rowSums(x[,.(A, T, G, C)])]
36 |   x = x[tot_depth > min_depth]
37 |   message("Markers > ", min_depth, ": ", nrow(x))
38 |   # Fraction of the most frequent base among the two most frequent bases
39 |   x[, baf := apply(x[,.(A, T, G, C)], 1, function(r) {r = sort(r); r[4]/sum(r[4], r[3])})]
40 |   # Mirror around 0.5 at random, since the value above is always >= 0.5
41 |   x$baf = ifelse(runif(length(x$baf))<0.5,x$baf,1-x$baf)
42 |   counts = data.frame(data.table::tstrsplit(x = x$loci, split = ":"), x$baf, x$tot_depth, row.names = x$loci)
43 |   colnames(counts) = c("chr", "pos", "BAF", "depth")
44 |   counts$pos = as.numeric(as.character(counts$pos))
45 |
46 |   med_cov = median(counts[counts$chr %in% c(1:22), "depth"], na.rm = TRUE)
47 |   message("Median depth of coverage (autosomes): ", med_cov)
48 |   message("------")
49 |
50 |   counts$logR = round(log2(counts$depth) - log2(med_cov), digits = 3)
51 |
52 |   t_baf_file = paste0(sample_name, ".tumour.BAF.txt")
53 |   t_logr_file = paste0(sample_name, ".tumour.logR.txt")
54 |
55 |   data.table::fwrite(x = counts[,c("chr", "pos", "BAF")], file = t_baf_file, sep = "\t", row.names = TRUE)
56 |   data.table::fwrite(x = counts[,c("chr", "pos", "logR")], file = t_logr_file, sep = "\t", row.names = TRUE)
57 |
58 |   message("Generated following files:")
59 |   message(t_baf_file)
60 |   message(t_logr_file)
61 |   message("------")
62 |
63 |   ascat_obj = NULL
64 |   # requireNamespace() only checks this one package instead of listing every installed package
65 |   if(requireNamespace("ASCAT", quietly = TRUE)){
66 |     message("Running ASCAT::ascat.loadData:")
67 |     ascat_obj = ASCAT::ascat.loadData(Tumor_LogR_file = t_logr_file,
68 |                                       Tumor_BAF_file = t_baf_file,
69 |                                       chrs = c(1:22, "X", "Y"), sexchromosomes = c("X", "Y"))
70 |     message("Running ASCAT::ascat.plotRawData()")
71 |     ASCAT::ascat.plotRawData(ASCATobj = ascat_obj, img.prefix = sample_name)
72 |     #message("Running ASCAT::ascat.predictGermlineGenotypes() for tumor only")
73 |     #ascat.gg = ASCAT::ascat.predictGermlineGenotypes(ASCATobj = ascat.bc, platform = PLATFORM)
74 |
75 |     message("Returned ASCAT object!")
76 |   }
77 |
78 |   ascat_obj
79 | }
80 |
--------------------------------------------------------------------------------
/R/segment_logR.R:
--------------------------------------------------------------------------------
1 | #' Segment log ratio values with DNACopy
2 | #' @description The function takes tumor logR file generated by \code{\link{prep_ascat}} and performs segmentation with \code{\link{DNAcopy}}.
3 | #' The segments are plotted and written to `<sample_name>_cbs.seg` in the working directory.
4 | #' @param tumor_logR sample.tumour.logR.txt file generated by \code{\link{prep_ascat}}
5 | #' @param sample_name Default "tumor"
6 | #' @return Invisibly returns \code{\link{DNAcopy}} object
7 | #' @export
8 | #' @import DNAcopy
9 | segment_logR = function(tumor_logR = NULL, sample_name = "tumor"){
10 |
11 |   if(is.null(tumor_logR)) stop("Missing tumor logR file!")
12 |
13 |   tn = data.table::fread(input = tumor_logR)
14 |   colnames(tn)[1:4] = c('SNP', 'contig', 'pos', 'logR')
15 |   tnxy = tn[contig %in% c('X', 'Y')]
16 |   tn = tn[!contig %in% c('X', 'Y')]
17 |   # CNA() is called with presorted = TRUE, so order by position within each contig as well
18 |   tn = tn[order(as.numeric(contig), pos)]
19 |   tnxy = tnxy[order(contig, pos)]
20 |   tn = rbind(tn, tnxy)
21 |
22 |   cn = DNAcopy::CNA(genomdat = tn[,logR], chrom = tn[,contig], maploc = tn[,pos],
23 |                     data.type = "logratio", sampleid = sample_name, presorted = TRUE)
24 |
25 |   cn = DNAcopy::smooth.CNA(cn)
26 |   cn = DNAcopy::segment(cn, alpha = 0.01, nperm = 10000, p.method = 'hybrid', min.width = 5, kmax = 25, nmin = 210,
27 |                         eta = 0.05, trim = 0.025, undo.SD = 3, undo.prune = 0.05, undo.splits = 'sdundo', verbose = 2)
28 |
29 |   .dnaCopy_plotter(dc = cn)
30 |
31 |   segs = cn$output
32 |   colnames(segs) = c("Sample",'Chromosome','Start','End','Num_Probes','Segment_Mean')
33 |   seg_file = paste0(sample_name, '_cbs.seg')
34 |   write.table(segs, seg_file, quote = FALSE, row.names = FALSE, sep='\t')
35 |   message("Segments are written to: ", seg_file)
36 |
37 |   invisible(cn)
38 |   #save(cn, file = paste(sample_name, '_cbs.RData', sep=''))
39 | }
40 |
41 |
42 | # Plots probe-level log ratios (gray) with the segment means (maroon) on top,
43 | # with chromosomes 1-22, X and Y laid end to end along the x-axis.
44 | # dc: object returned by DNAcopy::segment(); its `data` and `output` elements are read, never modified.
45 | .dnaCopy_plotter = function(dc){
46 |
47 |   chr.levels = c(as.character(1:22), "X", "Y")
48 |
49 |   # NOTE(review): hard-coded chromosome lengths; they look like GRCh37/hg19 values - confirm before
50 |   # relying on exact x positions for other builds
51 |   chr.lens = c(249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663,
52 |                146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540,
53 |                102531392, 90354753, 81195210, 78077248, 59128983, 63025520, 48129895, 51304566,
54 |                155270560, 59373566)
55 |
56 |   # Offset of every chromosome on the genome-wide x-axis (0 for chromosome 1)
57 |   chr.lens.sumsum = cumsum(chr.lens)
58 |   chr.offsets = c(0, chr.lens.sumsum[-length(chr.lens.sumsum)])
59 |   names(chr.offsets) = chr.levels
60 |
61 |   # Genome-wide coordinate. Contigs other than 1-22, X and Y give NA and are therefore not drawn.
62 |   genome_pos = function(contig, pos){
63 |     contig = gsub(pattern = 'chr', replacement = '', x = as.character(contig), fixed = TRUE)
64 |     as.numeric(pos) + unname(chr.offsets[contig])
65 |   }
66 |
67 |   probe.x = genome_pos(contig = dc$data$chrom, pos = dc$data$maploc)
68 |   probe.y = dc$data[[3]]
69 |
70 |   # Columns of dc$output by position: 2 = chromosome, 3 = start, 4 = end, 6 = segment mean
71 |   seg = dc$output
72 |   seg.start = genome_pos(contig = seg[[2]], pos = seg[[3]])
73 |   seg.end = genome_pos(contig = seg[[2]], pos = seg[[4]])
74 |   seg.mean = seg[[6]]
75 |
76 |   #pdf(file = paste0(samp.name, ".pdf"), width = 8, height = 4, bg = "white", paper = "special")
77 |   ylims = c(-2, 2)
78 |
79 |   # Restore the caller's graphical parameters when done
80 |   old_par = par(mar = c(3, 3, 2, 2))
81 |   on.exit(par(old_par), add = TRUE)
82 |
83 |   plot(probe.x, probe.y, xlim = c(0, max(chr.lens.sumsum)), pch = 16,
84 |        cex = 0.1, frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA, ylim = ylims, col = "gray70")
85 |   abline(v = chr.lens.sumsum, lty = 2, col = "gray70")
86 |   abline(h = 0, lwd = 1, lty = 2, col = "black")
87 |   segments(x0 = seg.start, y0 = seg.mean,
88 |            x1 = seg.end, y1 = seg.mean, col = "maroon")
89 |   axis(side = 1, at = c(0, chr.lens.sumsum), labels = c(0, 1:22, "X", "Y"), font = 2, cex.axis = 0.7, las = 2)
90 |   axis(side = 2, at = c(ylims[1], -1, 0, 1, ylims[2]), font = 2, las = 2)
91 |   #dev.off()
92 |
93 |   invisible(NULL)
94 | }
95 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # ezASCAT
3 |
4 |
5 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
6 |
7 |
8 | The goal of `ezASCAT` is to make life simpler while using [ASCAT](https://github.com/VanLoo-lab/ascat) with tumor-normal pairs from WGS.
9 | Although there exists [ascatNgs](https://github.com/cancerit/ascatNgs), it requires installation of perl and C modules. `ezASCAT` bypasses these requirements entirely within R with the C code baked in.
10 | 11 | ## Installation 12 | 13 | ``` r 14 | remotes::install_github(repo = "CompEpigen/ezASCAT") 15 | ``` 16 | 17 | ## Usage 18 | 19 | ### Step-1: Get nucleotide counts at the marker loci with `get_counts` 20 | 21 | The command below generates two TSV files, `tumor_nucleotide_counts.tsv` and `normal_nucleotide_counts.tsv`, that can be used for downstream analysis. Note that the function will process ~900K SNPs from [Affymetrix Genome-Wide Human SNP 6.0 Array](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6801). The process can be sped up by increasing `nthreads`, which will launch each chromosome on a separate thread. 22 | Currently `hg19` and `hg38` are supported. 23 | 24 | ```r 25 | library("ezASCAT") 26 | #Matched normal BAM files are strongly recommended 27 | counts = ezASCAT::get_counts(t_bam = "tumor.bam", n_bam = "normal.bam", build = "hg19") 28 | ``` 29 | 30 | ### Step-2: Prepare input files for ASCAT with `prep_ascat()` 31 | 32 | #### Tumor-Normal pair 33 | 34 | The command below filters SNPs with low coverage (default <30), estimates BAF and logR, and generates the input files for ASCAT. 35 | In addition, it runs `ASCAT::ascat.loadData()` and `ASCAT::ascat.plotRawData()` for you and returns the ASCAT object, which can be further processed with ASCAT functions. 36 | 37 | ```r 38 | ascat.bc = prep_ascat(t_counts = "tumor_nucleotide_counts.tsv", n_counts = "normal_nucleotide_counts.tsv", sample_name = "tumor") 39 | 40 | # Markers: 901235 41 | # Removed 3072 duplicated loci 42 | # Markers > 30: 25246 43 | # ------ 44 | # Counts file: normal_nucleotide_counts.tsv 45 | # Markers: 901235 46 | # Removed 3072 duplicated loci 47 | # Markers > 30: 31387 48 | # ------ 49 | # Final number SNPs: 23765 50 | # Generated following files: 51 | # tumor.tumour.BAF.txt 52 | # tumor.tumour.logR.txt 53 | # tumor.normal.BAF.txt 54 | # tumor.normal.logR.txt 55 | # ------ 56 | # Running ASCAT::ascat.loadData: 57 | # [1] Reading Tumor LogR data...
58 | # [1] Reading Tumor BAF data... 59 | # [1] Reading Germline LogR data... 60 | # [1] Reading Germline BAF data... 61 | # [1] Registering SNP locations... 62 | # [1] Splitting genome in distinct chunks... 63 | # Running ASCAT::ascat.plotRawData: 64 | # [1] Plotting tumor data 65 | # [1] Plotting germline data 66 | # Returned ASCAT object 67 | ``` 68 | 69 | The returned `ASCAT` object can be passed to downstream ASCAT functions: 70 | 71 | ```r 72 | ascat.bc = ASCAT::ascat.aspcf(ascat.bc) 73 | ASCAT::ascat.plotSegmentedData(ascat.bc) 74 | ascat.output = ASCAT::ascat.runAscat(ascat.bc) 75 | ``` 76 | 77 | #### Tumor only 78 | 79 | ```r 80 | > ascat.bc = ezASCAT::prep_ascat_t(t_counts = "tumor_nucleotide_counts.tsv", sample_name = "tumoronly") 81 | 82 | # Library sizes: 83 | # Tumor: 1239964831 84 | # Counts file: tumor_nucleotide_counts.tsv 85 | # Markers: 930104 86 | # Removed 15 duplicated loci 87 | # Markers > 30: 829579 88 | # ------ 89 | # Median depth of coverage: 59 90 | # Generated following files: 91 | # tumoronly.tumour.BAF.txt 92 | # tumoronly.tumour.logR.txt 93 | # ------ 94 | # Running ASCAT::ascat.loadData: 95 | # [1] Reading Tumor LogR data... 96 | # [1] Reading Tumor BAF data... 97 | # [1] Registering SNP locations... 98 | # [1] Splitting genome in distinct chunks... 99 | # Running ASCAT::ascat.plotRawData() 100 | # [1] Plotting tumor data 101 | # Returned ASCAT object! 
102 | ``` 103 | 104 | The returned `ASCAT` object can be processed with _ASCAT without matched normal data protocol_: 105 | 106 | ```r 107 | ascat.gg = ASCAT::ascat.predictGermlineGenotypes(ascat.bc) 108 | ascat.bc = ASCAT::ascat.aspcf(ascat.bc,ascat.gg=ascat.gg) 109 | ASCAT::ascat.plotSegmentedData(ascat.bc) 110 | ascat.output = ASCAT::ascat.runAscat(ascat.bc) 111 | ``` 112 | 113 | ### CBS segmentation 114 | 115 | Alternatively, tumor logR files generated by `prep_ascat()` can be processed with `segment_logR()` function which performs circular binary segmentation using [DNAcopy](https://bioconductor.org/packages/release/bioc/html/DNAcopy.html) and plots the results 116 | 117 | ```r 118 | > ezASCAT::segment_logR(tumor_logR = "tumor.tumour.logR.txt", sample_name = "tumor") 119 | 120 | # Analyzing: tumor 121 | # current chromosome: 1 122 | # current chromosome: 2 123 | # current chromosome: 3 124 | # current chromosome: 4 125 | # current chromosome: 5 126 | # current chromosome: 6 127 | # current chromosome: 7 128 | # current chromosome: 8 129 | # current chromosome: 9 130 | # current chromosome: 10 131 | # current chromosome: 11 132 | # current chromosome: 12 133 | # current chromosome: 13 134 | # current chromosome: 14 135 | # current chromosome: 15 136 | # current chromosome: 16 137 | # current chromosome: 17 138 | # current chromosome: 18 139 | # current chromosome: 19 140 | # current chromosome: 20 141 | # current chromosome: 21 142 | # current chromosome: 22 143 | # current chromosome: MT 144 | # current chromosome: X 145 | # current chromosome: Y 146 | # Segments are written to: tumor_cbs.seg 147 | ``` 148 | -------------------------------------------------------------------------------- /ezASCAT.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | 
NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /inst/extdata/GRCh37_SNP6.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CompEpigen/ezASCAT/4c0dfbd23da86c6363028aa7908ccff142820724/inst/extdata/GRCh37_SNP6.tsv.gz -------------------------------------------------------------------------------- /inst/extdata/GRCh38_SNP6.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CompEpigen/ezASCAT/4c0dfbd23da86c6363028aa7908ccff142820724/inst/extdata/GRCh38_SNP6.tsv.gz -------------------------------------------------------------------------------- /man/get_counts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getCounts.R 3 | \name{get_counts} 4 | \alias{get_counts} 5 | \title{Extract nucleotide counts from targeted loci (SNPs)} 6 | \usage{ 7 | get_counts( 8 | t_bam = NULL, 9 | n_bam = NULL, 10 | build = "hg19", 11 | prefix = NULL, 12 | add = TRUE, 13 | mapq = 10, 14 | sam_flag = 1024, 15 | loci = NULL, 16 | fa = NULL, 17 | op = NULL, 18 | zerobased = FALSE, 19 | nthreads = 4, 20 | verbose = TRUE 21 | ) 22 | } 23 | \arguments{ 24 | \item{t_bam}{Tumor BAM file. Required} 25 | 26 | \item{n_bam}{Normal BAM file. Recommended} 27 | 28 | \item{build}{Default hg19. Mutually exclusive with \code{loci}. Currently supported \code{hg19} and \code{hg38} and includes ~900K SNPs from Affymetrix Genome-Wide Human SNP 6.0 Array. SNP file has no \code{chr} prefix.} 29 | 30 | \item{prefix}{Prefix to add or remove from contig names in loci file. 
For example, in case BAM files have no \code{chr} prefix.} 31 | 32 | \item{add}{If prefix is used, default is to add prefix to contig names in loci file. If false prefix will be removed from contig names.} 33 | 34 | \item{mapq}{Map quality. Default 10} 35 | 36 | \item{sam_flag}{SAM FLAG to filter reads. Default 1024} 37 | 38 | \item{loci}{A tab separated file with chr and position. If not available use \code{build} argument.} 39 | 40 | \item{fa}{Indexed fasta file. If provided, extracts and adds reference base to the output tsv.} 41 | 42 | \item{op}{Output file basename. Default parses from BAM file} 43 | 44 | \item{zerobased}{are coordinates zero-based. Default FALSE. Use only if \code{loci} is used.} 45 | 46 | \item{nthreads}{Number of threads to use. Default 4. Each chromosome will be launched on a separate thread. Works only on Unix and macOS.} 47 | 48 | \item{verbose}{Default TRUE} 49 | } 50 | \description{ 51 | Extract nucleotide counts from targeted loci (SNPs) 52 | } 53 | -------------------------------------------------------------------------------- /man/plot_mosdepth.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_mostdepth.R 3 | \name{plot_mosdepth} 4 | \alias{plot_mosdepth} 5 | \title{Plot results from mosdepth output} 6 | \usage{ 7 | plot_mosdepth(bed = NULL, col = c("#95a5a6", "#7f8c8d")) 8 | } 9 | \arguments{ 10 | \item{bed}{mosdepth output} 11 | 12 | \item{col}{Colors. 
Default c("#95a5a6", "#7f8c8d")} 13 | } 14 | \description{ 15 | Plot results from mosdepth output 16 | } 17 | -------------------------------------------------------------------------------- /man/plot_mosdepth_tn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_mostdepth_tn.R 3 | \name{plot_mosdepth_tn} 4 | \alias{plot_mosdepth_tn} 5 | \title{Plot results from mosdepth output for Tumor/Normal pairs} 6 | \usage{ 7 | plot_mosdepth_tn( 8 | t_bed = NULL, 9 | n_bed = NULL, 10 | segment = TRUE, 11 | sample_name = NULL, 12 | col = c("#95a5a6", "#7f8c8d") 13 | ) 14 | } 15 | \arguments{ 16 | \item{t_bed}{mosdepth output from tumor} 17 | 18 | \item{n_bed}{mosdepth output from normal} 19 | 20 | \item{segment}{Whether to perform CBS segmentation. Default TRUE} 21 | 22 | \item{sample_name}{sample name. Default parses from \code{t_bed}} 23 | 24 | \item{col}{Colors. Default c("#95a5a6", "#7f8c8d")} 25 | } 26 | \description{ 27 | Plot results from mosdepth output for Tumor/Normal pairs 28 | } 29 | -------------------------------------------------------------------------------- /man/prep_ascat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prepAscat.R 3 | \name{prep_ascat} 4 | \alias{prep_ascat} 5 | \title{Prepare input files for ASCAT} 6 | \usage{ 7 | prep_ascat( 8 | t_counts = NULL, 9 | n_counts = NULL, 10 | sample_name = NA, 11 | min_depth = 30, 12 | normalize = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{t_counts}{read counts from tumor generated by \code{get_counts}} 17 | 18 | \item{n_counts}{read counts from normal generated by \code{get_counts}} 19 | 20 | \item{sample_name}{Sample name. Used as a basename for output files. 
Default NA, parses from t_counts file.} 21 | 22 | \item{min_depth}{Min read depth required to consider a marker. Default 30} 23 | 24 | \item{normalize}{If TRUE, normalizes for library size} 25 | } 26 | \value{ 27 | An \code{\link{ascat.loadData}} object; ascat data structure 28 | } 29 | \description{ 30 | Prepare input files for ASCAT 31 | } 32 | -------------------------------------------------------------------------------- /man/prep_ascat_t.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prepAscat_t.R 3 | \name{prep_ascat_t} 4 | \alias{prep_ascat_t} 5 | \title{Prepare input files for ASCAT tumor only samples} 6 | \usage{ 7 | prep_ascat_t( 8 | t_counts = NULL, 9 | sample_name = NA, 10 | min_depth = 30, 11 | PLATFORM = "AffySNP6" 12 | ) 13 | } 14 | \arguments{ 15 | \item{t_counts}{read counts from tumor generated by \code{get_counts}} 16 | 17 | \item{sample_name}{Sample name. Used as a basename for output files. Default NA, parses from t_counts file.} 18 | 19 | \item{min_depth}{Min read depth required to consider a marker. Default 30} 20 | 21 | \item{PLATFORM}{Default AffySNP6. Only change if you have used custom loci. 
See here for available options https://www.crick.ac.uk/research/labs/peter-van-loo/software} 22 | } 23 | \value{ 24 | An \code{\link{ascat.loadData}} object; ascat data structure 25 | } 26 | \description{ 27 | Prepare input files for ASCAT tumor only samples 28 | } 29 | -------------------------------------------------------------------------------- /man/segment_logR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/segment_logR.R 3 | \name{segment_logR} 4 | \alias{segment_logR} 5 | \title{Segment log ratio values with DNACopy} 6 | \usage{ 7 | segment_logR(tumor_logR = NULL, sample_name = "tumor") 8 | } 9 | \arguments{ 10 | \item{tumor_logR}{sample.tumour.logR.txt file generated by \code{\link{prep_ascat}}} 11 | 12 | \item{sample_name}{Default "tumor"} 13 | } 14 | \value{ 15 | Invisibly returns \code{\link{DNAcopy}} object 16 | } 17 | \description{ 18 | The function takes tumor logR file generated by \code{\link{prep_ascat}} and performs segmentation with \code{\link{DNAcopy}} 19 | } 20 | -------------------------------------------------------------------------------- /scripts/compile_snp6.R: -------------------------------------------------------------------------------- 1 | #Code to create SNP6 loci in extdata directory 2 | 3 | library(data.table) 4 | 5 | #hg38 6 | download.file(url = "https://api.gdc.cancer.gov/data/77fbfff6-2acc-47ca-a5f6-c488beb46879", destfile = "snp6.na35.liftoverhg38.txt.zip") 7 | unzip(zipfile = "snp6.na35.liftoverhg38.txt.zip") 8 | snp6_hg38 = data.table::fread(input = "snp6.na35.liftoverhg38.txt") 9 | snp6_hg38 = snp6_hg38[!type %in% "CN"][order(as.character(chr), as.numeric(pos))][,.(chr, pos, probeid)] 10 | nrow(snp6_hg38) #932,148 loci 11 | 12 | #hg19: Get `GPL6801-4019.txt` from GEO annotations https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6801 13 | ## On build 32 (ucsc=hg19; ncbi=GRCh37) 14 | snp6_hg19 = 
data.table::fread(input = "GPL6801-4019.txt", skip = 10) 15 | snp6_hg19 = snp6_hg19[grepl(pattern = "^SNP", x = snp6_hg19$ID)][!Chromosome %in% "---"][order(Chromosome, as.numeric(`Physical Position`))] 16 | nrow(snp6_hg19) #930,104 loci 17 | 18 | length(intersect(x = snp6_hg19$ID, snp6_hg38$probeid)) #be 929,132 probes common 19 | 20 | data.table::fwrite(x = snp6_hg19[,.(Chromosome, `Physical Position`)], file = "inst/extdata/GRCh37_SNP6.tsv.gz", sep = "\t", col.names = FALSE) 21 | data.table::fwrite(x = snp6_hg38[,.(chr, pos)], file = "inst/extdata/GRCh38_SNP6.tsv.gz", sep = "\t", col.names = FALSE) 22 | 23 | system(command = "rm snp6.na35.liftoverhg38.txt") 24 | system(command = "rm snp6.na35.liftoverhg38.txt.zip") 25 | system(command = "rm GPL6801-4019.txt") 26 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | RHTSLIB_LIBS=$(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript" -e \ 2 | 'Rhtslib::pkgconfig("PKG_LIBS")') 3 | RHTSLIB_CPPFLAGS=$(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript" -e \ 4 | 'Rhtslib::pkgconfig("PKG_CPPFLAGS")') 5 | PKG_LIBS=$(RHTSLIB_LIBS) 6 | PKG_CPPFLAGS=$(RHTSLIB_CPPFLAGS) 7 | #PKG_CPPFLAGS=-I "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/" 8 | #https://github.com/Rdatatable/data.table/issues/4907#issuecomment-806131096 Makevars for macOS big sur 9 | -------------------------------------------------------------------------------- /src/ntcounts.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | int countlines(char *filename){ 14 | // count the number of lines in the file called filename 15 | FILE *fp = fopen(filename,"r"); 16 | int ch=0; 17 | int lines=0; 18 | 19 | if (fp == NULL){ 20 | return 0; 21 | } 22 | 23 | 
while(!feof(fp)){ 24 | ch = fgetc(fp); 25 | if(ch == '\n'){ 26 | lines++; 27 | } 28 | } 29 | fclose(fp); 30 | return lines; 31 | } 32 | 33 | #define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||" 34 | #define PBWIDTH 60 35 | 36 | // Progress bar modified from: https://stackoverflow.com/a/36315819 37 | void printProgress(double percentage, int done, int tot) { 38 | int val = (int) (percentage * 100); 39 | int lpad = (int) (percentage * PBWIDTH); 40 | int rpad = PBWIDTH - lpad; 41 | fprintf(stderr, "\r%3d%% [%.*s%*s] %d/%d", val, lpad, PBSTR, rpad, "", done, tot); 42 | fflush(stdout); 43 | } 44 | 45 | void ntcounts(const char *bam, const char *bedfile, uint32_t q, uint32_t F, const char *fafile, const char *op){ 46 | 47 | int vars_gt = 1; //No. of BED entries 48 | 49 | hts_verbose = 0; //suppresses htslib warnings 50 | 51 | char tsv_file[1000]; 52 | strcpy(tsv_file, op); 53 | strcat(tsv_file, ".tsv"); 54 | 55 | //Open bed file 56 | int nloci = countlines(bedfile); 57 | 58 | FILE *bed_fp; 59 | bed_fp = fopen(bedfile, "r"); 60 | char buff[1000]; 61 | 62 | //Open TSV report file 63 | FILE *tsv_fp; 64 | tsv_fp = fopen(tsv_file, "w" ); 65 | 66 | //fasta file 67 | char *seq; 68 | faidx_t *fa = fai_load(fafile); 69 | 70 | //BAM file 71 | samFile *fp_in = hts_open(bam,"r"); //open bam file 72 | hts_idx_t *fp_idx = sam_index_load(fp_in, bam); 73 | bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header 74 | bam1_t *aln = bam_init1(); //initialize an alignment 75 | 76 | uint64_t n_mapped = 0; 77 | uint64_t n_unmapped = 0; 78 | uint64_t tot_mapped = 0; 79 | int res = 0; 80 | 81 | int32_t n_contigs =bamHdr->n_targets; 82 | for(int i = 0; i < n_contigs; i++){ 83 | res = hts_idx_get_stat(fp_idx, i, &n_mapped, &n_unmapped); 84 | if(res == 0){ 85 | tot_mapped = tot_mapped + n_mapped; 86 | } 87 | } 88 | 89 | fprintf(tsv_fp, "#idxstats_mapped_reads\t%llu\n", tot_mapped); 90 | fprintf(tsv_fp, "loci\tfa_ref\tA\tT\tG\tC\tIns\tDel\n"); 91 | 92 | //For every loci in 
the BED file 93 | while(fgets(buff,1000,bed_fp) != NULL){ 94 | 95 | //Remove trailing new line chars 96 | int len = strlen(buff); 97 | if(buff[len-1] == '\n' ){ 98 | buff[len-1] = 0; 99 | } 100 | 101 | char *chrom = strtok(buff,"\t"); 102 | char *start = strtok(NULL,"\t"); 103 | 104 | char loci[250] = ""; 105 | strcat(loci, chrom); strcat(loci, ":"); strcat(loci, start); strcat(loci, "-"); strcat(loci, start); 106 | 107 | //Fetch base at target loci from fasta file 108 | if(fa != NULL){ 109 | int templen = 100; 110 | seq = fai_fetch(fa, loci, &templen); 111 | fprintf(tsv_fp, "%s:%s\t%s", chrom, start, seq); 112 | free(seq); 113 | }else{ 114 | fprintf(tsv_fp, "%s:%s\tNA", chrom, start); 115 | } 116 | 117 | int32_t target_pos = atoi(start) -1; //input position are 1 based 118 | 119 | //load reads in target loci 120 | hts_itr_t *samitr = sam_itr_querys(fp_idx, bamHdr, loci); 121 | 122 | //Keep track of total reads and nt counts per loci 123 | int32_t tot_reads = 0; 124 | float nt[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 125 | 126 | //printProgress(vars_gt/(double)nloci, vars_gt, nloci); 127 | vars_gt = vars_gt + 1; 128 | 129 | 130 | //if(vars_gt % 1000 == 0){ 131 | // Rprintf("%d | ", vars_gt); //verbose processed loci 132 | //} 133 | 134 | //For every read in the BAM file of target region 135 | while(sam_itr_next(fp_in, samitr, aln) > 0){ 136 | 137 | int32_t pos = aln->core.pos ; //left most position of alignment in zero based coordinate (0-based) 138 | uint32_t len = aln->core.l_qseq; //length of the read. 
139 | uint32_t* cig = bam_get_cigar(aln); 140 | uint8_t *qs = bam_get_seq(aln); //quality string 141 | //char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) 142 | //uint16_t samflag = aln->core.flag; // flag 143 | //uint32_t q2 = aln->core.qual ; //mapping quality 144 | 145 | 146 | //MAPQ and FLAG filter 147 | if(aln->core.qual <= q){ 148 | continue; 149 | } 150 | 151 | if(aln->core.flag >= F){ 152 | continue; 153 | } 154 | 155 | tot_reads = tot_reads +1; 156 | //char ins_seq[100]; char del_seq[100]; 157 | 158 | //get nucleotide id and converts them into IUPAC id. 159 | char *qseq = (char *)malloc(len); 160 | int i = 0; 161 | for(i=0; i< len ; i++){ 162 | qseq[i] = seq_nt16_str[bam_seqi(qs,i)]; 163 | } 164 | 165 | //target position on the read 166 | int32_t pos_onread = 0; 167 | 168 | //For every CIGAR string 169 | int k = 0; 170 | for(k=0;k< aln->core.n_cigar ;++k){ 171 | int cop =cig[k] & BAM_CIGAR_MASK; // CIGAR string 172 | int cl = cig[k] >> BAM_CIGAR_SHIFT; // CIGAR length 173 | 174 | if(BAM_CIGAR_STR[cop] == 'M'){ 175 | pos_onread = pos_onread + cl; 176 | pos = pos + cl; 177 | }else if(BAM_CIGAR_STR[cop] == 'S'){ 178 | pos_onread = pos_onread + cl; 179 | }else if(BAM_CIGAR_STR[cop] == 'I'){ 180 | pos_onread = pos_onread + cl; 181 | }else if(BAM_CIGAR_STR[cop] == 'D'){ 182 | pos = pos + cl; 183 | } 184 | 185 | if(pos > target_pos){ 186 | if(BAM_CIGAR_STR[cop] == 'M'){ 187 | pos_onread = pos_onread - (pos - target_pos); 188 | if(qseq[pos_onread] == 'A'){ 189 | nt[0] = nt[0] + 1; 190 | }else if(qseq[pos_onread] == 'T'){ 191 | nt[1] = nt[1] + 1; 192 | }else if(qseq[pos_onread] == 'G'){ 193 | nt[2] = nt[2] + 1; 194 | }else if(qseq[pos_onread] == 'C'){ 195 | nt[3] = nt[3] + 1; 196 | } 197 | break; 198 | } 199 | }else if(pos == target_pos){ 200 | if(BAM_CIGAR_STR[cop] == 'I'){ 201 | nt[4] = nt[4] + 1; 202 | // insertion sequence 203 | // for(int i = 0; i < cl; i++){ 204 | // //strcat(ins_seq, &qseq[pos_onread+i]); 205 | // } 206 | break; 207 
| }else if(BAM_CIGAR_STR[cop] == 'D'){ 208 | nt[5] = nt[5] + 1; 209 | // deletion sequence 210 | // for(int i = 0; i < cl; i++){ 211 | // strcat(del_seq, qseq[pos_onread+i]); 212 | // } 213 | break; 214 | } 215 | } 216 | } 217 | 218 | free(qseq); 219 | } 220 | 221 | hts_itr_destroy(samitr); 222 | fprintf(tsv_fp, "\t%.f\t%.f\t%.f\t%.f\t%.f\t%.f\n", nt[0], nt[1], nt[2], nt[3], nt[4], nt[5]); 223 | } 224 | 225 | //fprintf(tsv_fp, "\t%.f\t%.f\t%.f\t%.f\t%.f\t%.f\n", nt[0], nt[1], nt[2], nt[3], nt[4], nt[5]); 226 | 227 | bam_destroy1(aln); 228 | bam_hdr_destroy(bamHdr); 229 | fai_destroy(fa); 230 | sam_close(fp_in); 231 | fclose(bed_fp); 232 | fclose(tsv_fp); 233 | //Rprintf("\n Done!"); 234 | } 235 | 236 | SEXP ntc(SEXP filename, SEXP bedname, SEXP qual, SEXP flag, SEXP fa, SEXP op_file){ 237 | //SEXP tbl = PROTECT(Rf_allocVector(STRSXP, 1)); 238 | //char *parse_bam (const char *bam, const char *bedfile, int d, int t, float v, const char *fafile, const char *op, uint32_t q, uint32_t F){ 239 | ntcounts(Rf_translateChar(Rf_asChar(filename)), Rf_translateChar(Rf_asChar(bedname)),Rf_asInteger(qual), Rf_asInteger(flag), 240 | Rf_translateChar(Rf_asChar(fa)), Rf_translateChar(Rf_asChar(op_file))); 241 | //nt[0] = 1; 242 | return 0; 243 | } 244 | --------------------------------------------------------------------------------