├── R └── RCNV_seq-helper.R ├── README.md ├── image ├── cn.mops-segplot.png ├── cna-norm.png ├── cnv-countOverlaps.png ├── cnv-seq-plot.png ├── count2.png ├── normalised-count2.png ├── plotBias.png ├── plotCorrection.png ├── plotDefaultSegments.png ├── plotSegment_case1.png └── plotSegment_case2.png ├── inst └── script │ ├── CNAnorm.R │ ├── CNV-seq.R │ ├── HMMcopy.R │ ├── RCNV_seq.R │ ├── TitanCNA-helper.R │ ├── TitanCNA.R │ ├── cn.mops.R │ ├── countOverlaps.R │ ├── seqCNA.R │ └── testCounts.R ├── result files ├── countOverlapresult.tsv ├── tumor.hits-vs-normal.hits.window-10000.minw-4.cnv └── tumor.hits-vs-normal.hits.window-10000.minw-4.count └── vignettes └── seqCNA.Rnw /R/RCNV_seq-helper.R: -------------------------------------------------------------------------------- 1 | ## samtools view -F 4 tumorA.chr4.bam |\ 2 | ## perl -lane 'print "$F[2]\t$F[3]"' >tumor.hits 3 | ## samtools view -F 4 normalA.chr4.bam |\ 4 | ## perl -lane 'print "$F[2]\t$F[3]"' >normal.hits 5 | ## perl cnv-seq/cnv-seq.pl --test tumor.hits --ref normal.hits \ 6 | ## --genome human --Rexe "~/bin/R-devel/bin/R" 7 | ## total length (bp) of the sequences declared in the header of the first BAM file 8 | genomeSize <- function(files) 9 | sum(as.numeric(seqlengths(BamFile(files[[1]])))) 10 | ## window size computed from total read counts; 'bam_files' must carry the names "test" and "ref" 11 | windowSize <- 12 | function(bam_files, pvalue=0.001, log2=0.6, bigger=1.5, 13 | genome_size, param) 14 | { 15 | if (missing(genome_size)) 16 | genome_size <- genomeSize(bam_files) 17 | if (missing(param)) 18 | param <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE)) 19 | ## one record count per BAM file, names of 'bam_files' are kept 20 | total <- vapply(bam_files, function(...) 
21 | countBam(...)$records, 22 | numeric(1), param=param) 23 | ## normal quantiles for the two-sided p-value; ratios 2^log2 (brp) and 2^-log2 (srp) 24 | bt <- qnorm(1 - pvalue / 2) 25 | st <- qnorm(pvalue / 2) 26 | log2 <- abs(log2) 27 | brp <- 2^log2 28 | srp <- 1 / (2^log2) 29 | ## candidate window sizes for the larger ratio (bw) and the smaller ratio (sw) 30 | bw <- (total[["test"]] * brp^2 + total[["ref"]]) * genome_size * bt^2 / 31 | ((1-brp)^2 * total[["test"]] * total[["ref"]]) 32 | sw <- (total[["test"]] * srp^2 + total[["ref"]]) * genome_size * st^2 / 33 | ((1-srp)^2 * total[["test"]] * total[["ref"]]) 34 | ## return the larger candidate, scaled by 'bigger', as a visible value 35 | floor(max(bw, sw) * bigger) 36 | } 37 | ## full tiling of the genome plus a second set of full-width tiles shifted by half a tile 38 | tileGenomeOverlap <- function(file, tilewidth) { 39 | ## overlapping tiles 40 | lengths <- seqlengths(BamFile(file[[1]])) 41 | tile0 <- tileGenome(lengths, tilewidth=tilewidth, 42 | cut.last.tile.in.chrom=TRUE) 43 | tile1 <- tile0[width(tile0) >= tilewidth] 44 | tile1 <- shift(tile1[-cumsum(runLength(seqnames(tile1)))], tilewidth / 2) 45 | sort(c(tile0, tile1)) 46 | } 47 | ## count mode for summarizeOverlaps(): every read is reduced to a single position before counting 48 | binCounter <- function(features, reads, ignore.strand, ...) { 49 | countOverlaps(features, resize(granges(reads), 1), 50 | ignore.strand=ignore.strand) 51 | } 52 | ## write chromosome/start/end plus the count columns of 'hits' to 'file'; returns the file path 53 | as.countsfile <- function(hits, file=tempfile()) { 54 | rng <- rowRanges(hits) ## bin coordinates; rowData() only holds metadata columns in current SummarizedExperiment 55 | df <- cbind(data.frame(chromosome=as.character(seqnames(rng)), 56 | start=start(rng), end=end(rng)), 57 | assay(hits)) 58 | ## tab-delimited, one row per bin, no quoting 59 | write.table(df, file, quote=FALSE, row.names=FALSE, sep="\t") 60 | file 61 | } 62 | 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Copy Number Analysis 2 | ===================== 3 | 4 | Explore, compare, and evaluate Bioconductor packages related to genomic copy number analysis 5 | 6 | Genomic amplifications and deletions are found in most (all?) tumor genomes. A common practice today is to do low coverage DNA sequencing (0.5x, for instance) of a tumor genome, and a matched normal genome (from the same subject). 
Judicious comparison of the two sequenced genomes illuminates structural changes in the tumor. 7 | 8 | Copy number changes in tumors vary from broad (an entire chromosome arm) to focal (e.g., a 10kb amplification, loss of heterozygosity or gain). Detection methods should be sensitive enough to detect these very different phenomena in noisy low-coverage data. 9 | 10 | Our purpose here is to provide 11 | 12 | * A tumor/normal single chromosome pair of bam files (with accompanying index files) 13 | * A reference analysis, using the popular SegSeq MATLAB program from the Broad Institute 14 | * A tutorial on the exploratory data analysis of these files using "native" Bioconductor capabilities 15 | * A demonstration (and evaluation) of the capabilities of many of the Bioconductor copy number analysis packages 16 | 17 | List of Tools used 18 | =================== 19 | Bioconductor Packages 20 | * countOverlaps 21 | * cn.mops 22 | * CNAnorm 23 | * seqCNA 24 | * HMMcopy 25 | * TitanCNA 26 |
27 | 28 | Non-Bioconductor packages 29 | * CNV-seq 30 | * SegSeq 31 | 32 | 33 | Literature Resources 34 | ========================= 35 | * Alkan, C., et al. (2011). "Genome structural variation discovery and genotyping." Nat Rev Genet 12(5): 363-376. 36 | * Duan J, Zhang J-G, Deng H-W, Wang Y-P (2013) Comparative Studies of Copy Number Variation Detection Methods for Next-Generation Sequencing Technologies. PLoS ONE 8(3): e59128. doi:10.1371/journal.pone.0059128 37 | 38 | Sample Data 39 | =========== 40 | * http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam 41 | * http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam.bai 42 | * http://s3.amazonaws.com/copy-number-analysis/normalA.chr4.bam 43 | * http://s3.amazonaws.com/copy-number-analysis/normalA.chr4.bam.bai 44 | 45 | Use, e.g., 46 |
 
47 | download.file(url="http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam.bai",
48 |               destfile="tumorA.chr4.bam.bai")
49 | 
50 | 51 | 52 | Exploratory Data Analysis 53 | ========================== 54 | We have done some primary Exploratory Data Analysis on the Normal and Tumor Sample Datasets. 55 | 56 | -------------------------------------------------------------------------------- /image/cn.mops-segplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cn.mops-segplot.png -------------------------------------------------------------------------------- /image/cna-norm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cna-norm.png -------------------------------------------------------------------------------- /image/cnv-countOverlaps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cnv-countOverlaps.png -------------------------------------------------------------------------------- /image/cnv-seq-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cnv-seq-plot.png -------------------------------------------------------------------------------- /image/count2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/count2.png -------------------------------------------------------------------------------- /image/normalised-count2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/normalised-count2.png -------------------------------------------------------------------------------- /image/plotBias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotBias.png -------------------------------------------------------------------------------- /image/plotCorrection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotCorrection.png -------------------------------------------------------------------------------- /image/plotDefaultSegments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotDefaultSegments.png -------------------------------------------------------------------------------- /image/plotSegment_case1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotSegment_case1.png -------------------------------------------------------------------------------- /image/plotSegment_case2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotSegment_case2.png -------------------------------------------------------------------------------- /inst/script/CNAnorm.R: -------------------------------------------------------------------------------- 1 | library(CNAnorm) 2 | library(RUnit) 3 | 4 | 
#outside R session - convert perl file 5 | ##perl bam2windows.pl "tumorA.chr4.bam" "normalA.chr4.bam" > perloutput.txt 6 | ## compare column 3 of 'chr4data' at five window starts (column 2) with the counts obtained from samtools 7 | test_counts <- function(chr4data) 8 | { 9 | test_indices <- c(8000001, 8010001,10000001 , 10010001, 1000001) 10 | test_res <- c(67,62,67,74,47) #from samtools 11 | res <- sapply( test_indices, function(x) chr4data[which(chr4data[,2]==x),3]) 12 | checkEquals(test_res,res) 13 | } 14 | 15 | data<- read.table("perloutput.txt",sep="\t",header=TRUE) 16 | 17 | #subset to chromosome 4 18 | chr4data <- data[which(data[,1]=="chr4"),] 19 | 20 | #check if the raw counts are similar to counts from samtools 21 | test_counts(chr4data) #FALSE! 22 | 23 | #create an object of class CNAnorm 24 | cn <- dataFrame2object(chr4data) 25 | 26 | #smooth the signal to decrease noise without losing resolution. 27 | cn <- addSmooth(cn,lambda=7) 28 | 29 | #estimate peaks and ploidy 30 | cn <- peakPloidy(cn) 31 | 32 | #produce a plot 33 | png("cna-norm.png") 34 | plotPeaks(cn) 35 | dev.off() -------------------------------------------------------------------------------- /inst/script/CNV-seq.R: -------------------------------------------------------------------------------- 1 | #Here we show an implementation of CNV-seq 2 | 3 | ##step 1 - includes generating best-hit location files for each mapped 4 | ##sequence read. The authors provide a perl script for BLAT psl file 5 | ## and SOLiD matching pipeline. 
For BAM files, they suggest to extract 6 | ##locations using the following command 7 | 8 | #~/copynumber$ samtools view -F 4 tumorA.chr4.bam |perl -lane 9 | #'print "$F[2]\t$F[3]"' >tumor.hits 10 | 11 | #~/copynumber$ samtools view -F 4 normalA.chr4.bam |perl 12 | #-lane 'print "$F[2]\t$F[3]"' >normal.hits 13 | 14 | 15 | ##cnv-seq.pl is used to calculate sliding window size, to count number of 16 | ##mapped hits in each window, and to call cnv R package to calculate log2 17 | ## ratios and annotate CNV 18 | 19 | # perl cnv-seq.pl --test tumor.hits --ref normal.hits --genome human 20 | 21 | 22 | ##two output files are produced. They can be found under "result files": 23 | ##tumor.hits-vs-normal.hits.window-10000.minw-4.cnv 24 | ##tumor.hits-vs-normal.hits.window-10000.minw-4.count 25 | 26 | #One can visualize the cnv inside R using the following code snippet 27 | ## the plot can be found under "image" folder 28 | 29 | library(cnv) 30 | data <- read.delim("tumor.hits-vs-normal.hits.window-10000.minw-4.cnv") 31 | cnv.summary(data) 32 | png("cnv-seq-plot.png") 33 | plot.cnv(data) 34 | dev.off() -------------------------------------------------------------------------------- /inst/script/HMMcopy.R: -------------------------------------------------------------------------------- 1 | ## This file shows an implementation of copy number using HMMcopy. 2 | ## This is run on our chosen dataset 3 | ## tumor file : tumorA.chr4.bam 4 | ## normal file : normalA.chr4.bam 5 | 6 | ## outside R - generate readCounts file 7 | ## bin/readCounter tumorA.chr4.bam > chr4_tumor_reads.wig 8 | ## bin/readCounter normalA.chr4.bam > chr4_normal_reads.wig 9 | 10 | ## currently the wig files for gc content and mappability have NCBI style 11 | ## of seqnames/ chromosomes. so convert tumor and normal wig files to same. 
12 | ## sed s/=chr/=/ chr4_tumor_reads.wig > chr4_tumor_reads_ncbi.wig 13 | ## sed s/=chr/=/ chr4_normal_reads.wig > chr4_normal_reads_ncbi.wig 14 | library(HMMcopy) 15 | tum_readfile <-"chr4_tumor_reads_ncbi.wig" 16 | nor_readfile <-"chr4_normal_reads_ncbi.wig" 17 | 18 | ## Note - these files are distributed along with TitanCNA 19 | ## the files distributed along with HMMcopy had inconsistent seqname style 20 | mapfile <-"GRCh37-lite.map.ws_1000.wig" 21 | gcfile <-"GRCh37-lite.gc.ws_1000.wig" 22 | 23 | ## create a RangedData object. 24 | tum_uncorrected_reads <- wigsToRangedData(tum_readfile, gcfile, mapfile) 25 | norm_uncorrected_reads <- wigsToRangedData(nor_readfile, gcfile, mapfile) 26 | 27 | ## subset to have reads only from chr4 28 | tum_uc_reads<- tum_uncorrected_reads["4"] 29 | norm_uc_reads<- norm_uncorrected_reads["4"] 30 | 31 | ##correct read counts 32 | tum_corrected_copy <- correctReadcount(tum_uc_reads) 33 | norm_corrected_copy <- correctReadcount(norm_uc_reads) 34 | 35 | ## Normalizing Tumour by Normal 36 | tum_corrected_copy$copy <- tum_corrected_copy$copy - norm_corrected_copy$copy 37 | 38 | ## Export to SEG format for CNAseq segmentation 39 | rangedDataToSeg(tum_corrected_copy, file = "paul_tum_corrected_copy.seg") 40 | 41 | ## Segmenting 42 | ## use default segmentation 43 | seg_copy_def <- HMMsegment(tum_corrected_copy) 44 | 45 | ## get parameters 46 | realparam <- HMMsegment(tum_corrected_copy, getparam = TRUE) # retrieve converged parameters via EM 47 | 48 | ## Adjust parameters - case1 49 | param1 <- realparam 50 | param1$mu <- log(c(1, 1.4, 2, 2.7, 3, 4.5) / 2, 2) 51 | param1$m <- param1$mu 52 | segmented_copy_case1 <- HMMsegment(tum_corrected_copy, param1) # perform segmentation via Viterbi 53 | 54 | ## adjust parameters - case2 ## to decrease no of segments/ 55 | param2 <- realparam 56 | param2$strength <- 1e30 57 | param2$e <- 0.99999999999999 58 | segmented_copy_case2 <- HMMsegment(tum_corrected_copy, param2) 59 | 60 | ## adjust parameters - case3 ## to 
increase no of segments/ 61 | param3 <- realparam 62 | param3$strength <- 0.1 63 | param3$e <- 0.1 64 | segmented_copy_case3 <- HMMsegment(tum_corrected_copy, param3) 65 | 66 | ## visualization 67 | 68 | plotBias(tum_corrected_copy) 69 | plotCorrection(tum_corrected_copy) 70 | plotSegments(tum_corrected_copy, seg_copy_def) 71 | plotSegments(tum_corrected_copy, segmented_copy_case1) 72 | plotSegments(tum_corrected_copy, segmented_copy_case2) 73 | plotSegments(tum_corrected_copy, segmented_copy_case3) 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /inst/script/RCNV_seq.R: -------------------------------------------------------------------------------- 1 | ## This script and its helper functions implement an R-only version of 2 | ## the work flow from http://tiger.dbs.nus.edu.sg/cnv-seq/. It uses 3 | ## GenomicRanges utilities to perform read counts across bins, and the 4 | ## 'cnv' package available at the URL above for additional 5 | ## analysis. No intermediate hit files are generated (counts reach cnv.cal via a temporary file). 
6 | 7 | ## user arguments; provide full paths 8 | root <- "~/benchmark/copynumber" 9 | files <- file.path(root, c("tumorA.chr4.bam", "normalA.chr4.bam")) 10 | names(files) <- c("test", "ref") 11 | 12 | log2 <- .6 13 | annotate <- TRUE 14 | 15 | ## script 16 | source("../../R/RCNV_seq-helper.R") 17 | suppressPackageStartupMessages({ 18 | library(GenomicAlignments) 19 | library(cnv) 20 | }) 21 | 22 | window_size <- windowSize(files, log2=log2) 23 | tiles <- tileGenomeOverlap(files, window_size) 24 | hits <- summarizeOverlaps(tiles, files, binCounter) 25 | 26 | chr4 <- subset(hits, seqnames %in% "chr4") 27 | cnv <- cnv.cal(as.countsfile(chr4), log2=log2, annotate=annotate) 28 | plot.cnv(cnv) 29 | -------------------------------------------------------------------------------- /inst/script/TitanCNA-helper.R: -------------------------------------------------------------------------------- 1 | ################################################################### 2 | ##Utility R script to extract tumour allele read counts for input 3 | ## to TitanCNA. 4 | ## Author: Gavin Ha (gavin.ha@gmail.com) 5 | ## Date: May 12, 2014 6 | ##################################################################### 7 | ## The script makes use of the R Bioconductor package 8 | ## Rsamtools (>= 1.17.11). 9 | 10 | ## The inputs to this script are: 11 | ## 1. File path to tumour BAM file (tumbamFile) 12 | ## 2. File path to tumour BAM index file (tumIndexFile) 13 | ## 3. File path to heterozygous germline SNP positions (vcfFile) 14 | ## This file can be generated using a variety of solutions. 15 | ## One such solution is posted on 16 | ## http://compbio.bccrc.ca/software/titan/titan-running-titancna/ 17 | ## (see Step 5: Input files). 18 | ## The output to this script: 19 | ## A data.frame (countMat). 20 | ## Alternatively, this can be written to a tab-delimited text file with 21 | ## path "outFile". 
The format of this text file matches that required 22 | ## by TitanCNA ("loadAlleleCountsFromFile()"). 23 | ##################################################################### 24 | ##################################################################### 25 | 26 | ## THE FOLLOWING SCRIPT ONLY WORKS WITH Rsamtools (>= 1.17.11) 27 | library(Rsamtools) 28 | 29 | ##################################################################### 30 | ################ USERS - PLEASE MODIFY THESE PATHS ################# 31 | ##################################################################### 32 | tumbamFile <- "tumorA.chr4_sorted.bam" 33 | tumIndexFile <- paste(tumbamFile,".bai",sep = "") 34 | vcfFile <- "titanCNA_sorted_HetSNPs.vcf" ##"titanCNAHetSNPs.vcf" 35 | outFile <- "paul_tumAlleleCounts.tsv" 36 | 37 | ##################################################################### 38 | #################### LOAD HET POSITIONS VCF FILE #################### 39 | ##################################################################### 40 | ## read in vcf file of het positions 41 | vcf <- BcfFile(vcfFile) 42 | vcfPosns <- scanBcf(vcf) 43 | 44 | ##################################################################### 45 | ####################### SETUP PILEUP PARAMETERS ##################### 46 | ##################################################################### 47 | ## setup PileupParam using sequence read filters 48 | pp <- PileupParam(min_base_quality = 10, min_mapq = 20, 49 | min_nucleotide_depth = 10, max_depth = 20, 50 | distinguish_strands = FALSE, 51 | distinguish_nucleotides = TRUE) 52 | ## setup the positions of interest to generate the pileup for 53 | which <- GRanges(as.character(vcfPosns$CHROM), 54 | IRanges(vcfPosns$POS, width = 1)) 55 | ## setup addition BAM filters, such as excluding duplicate reads 56 | sbp <- ScanBamParam(flag = scanBamFlag(isDuplicate = FALSE), which = which) 57 | 58 | ##################################################################### 59 | 
########################## GENERATE PILEUP ########################## 60 | ##################################################################### 61 | ## generate pileup using function (Rsamtools >= 1.17.11) 62 | ## this step can take a while 63 | tumbamObj <- BamFile(tumbamFile, index = tumIndexFile) 64 | counts <- pileup(tumbamObj, scanBamParam = sbp, pileupParam = pp) 65 | 66 | ## set of command to manipulate the "counts" data.frame output 67 | ## by pileup() such that multiple nucleotides are in a single 68 | ## row rather than in multiple rows. 69 | countsMerge <- xtabs(count ~ which_label + nucleotide, counts) 70 | label <- do.call(rbind, strsplit(rownames(countsMerge), ":")) 71 | posn <- do.call(rbind, strsplit(label[, 2],"-")) 72 | 73 | countsMerge <- cbind( position = posn[, 1], countsMerge) 74 | mode(countsMerge) <- "numeric" 75 | countsMerge <- data.frame(chr=label[,1],countsMerge, check.names=FALSE) 76 | 77 | 78 | ##################################################################### 79 | ############### GET REFERENCE AND NON-REF READ COUNTS ############### 80 | ##################################################################### 81 | ## this block of code is used to match up the reference and 82 | ## non-reference nucleotide when assigning read counts 83 | ## final output data.frame is "countMat" 84 | ## setup output data.frame 85 | countMat <- data.frame(chr = vcfPosns$CHROM, 86 | position = as.numeric(vcfPosns$POS), 87 | ref = vcfPosns$REF, refCount = 0, 88 | Nref = vcfPosns$ALT, NrefCount = 0, 89 | stringsAsFactors = FALSE) 90 | 91 | ## match rows with vcf positions of interest 92 | countMat <- merge(countMat, countsMerge, by = c("chr","position"), 93 | sort = FALSE) 94 | 95 | ## assign the flattened table of nucleotide counts to ref, Nref 96 | ## note that non-reference (Nref) allele is sum of other bases 97 | ## that is not matching the ref. 
98 | NT <- c("A", "T", "C", "G") 99 | for (n in 1:length(NT)){ 100 | indRef <- countMat$ref == NT[n] 101 | countMat[indRef, "refCount"] <- countMat[indRef, NT[n]] 102 | countMat[indRef, "NrefCount"] <- rowSums(countMat[indRef, NT[-n]]) 103 | } 104 | 105 | countMat <- countMat[,1:6] 106 | 107 | ##################################################################### 108 | ####################### OUTPUT TO TEXT FILE ######################### 109 | ##################################################################### 110 | ## output text file will have the same format required by TitanCNA 111 | write.table(countMat, file = outFile, row.names = FALSE, 112 | col.names = TRUE, quote = FALSE, sep = "\t") 113 | 114 | -------------------------------------------------------------------------------- /inst/script/TitanCNA.R: -------------------------------------------------------------------------------- 1 | ## change seqnameStyle from UCSC to NCBI - done in unix. 2 | ## sed s/chr//g paul_tumAlleleCounts.tsv > paul_tumAlleleCounts_ncbi.tsv 3 | ## sed s/chr//g chr4_tumor_reads.wig > chr4_tumor_reads_ncbi.wig 4 | ## sed s/chr//g chr4_normal_reads.wig > chr4_normal_reads_ncbi.wig 5 | 6 | library(TitanCNA) 7 | 8 | ## load the files 9 | id <- "test" 10 | infile <-"paul_tumAlleleCounts_ncbi.tsv" 11 | tumWig <- "chr4_tumor_reads_ncbi.wig" 12 | normWig <- "chr4_normal_reads_ncbi.wig" 13 | gcWig <- "GRCh37-lite.gc.ws_1000.wig" 14 | mapWig <- "GRCh37-lite.map.ws_1000.wig" 15 | 16 | ## load the tumor allele read counts 17 | data <- loadAlleleCountsFromFile(infile) 18 | 19 | ## correct gc and mappability bias 20 | cnData <- correctReadDepth(tumWig,normWig,gcWig,mapWig) 21 | -------------------------------------------------------------------------------- /inst/script/cn.mops.R: -------------------------------------------------------------------------------- 1 | library(cn.mops) 2 | library(RUnit) 3 | 4 | tumor_gr <- getReadCountsFromBAM("tumorA.chr4.bam", 5 | 
refSeqName="chr4",WL=10000,mode="unpaired") 6 | normal_gr <- getReadCountsFromBAM("normalA.chr4.bam", 7 | refSeqName="chr4",WL=10000,mode="unpaired") 8 | 9 | # We need a special normalization because the tumor has made large CNVs 10 | X <- tumor_gr 11 | values(X) <- cbind(values(tumor_gr),values(normal_gr)) 12 | X <- normalizeGenome(X,normType="mode") 13 | 14 | # Parameter settings for tumor: 15 | # - norm=0, because we already have normalized 16 | # - integer copy numbers higher than 8 allowed 17 | # - DNAcopy as segmentation algorithm. 18 | ref_analysis_norm0 <- referencecn.mops(X[,1], X[,2], 19 | norm=0, 20 | I = c(0.025, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 8,16,32,64), 21 | classes = c("CN0", "CN1", "CN2", "CN3", "CN4", "CN5", 22 | "CN6", "CN7","CN8","CN16","CN32","CN64","CN128"), 23 | segAlgorithm="DNAcopy") 24 | 25 | 26 | resCNMOPS <- calcIntegerCopyNumbers(ref_analysis_norm0) 27 | resCNMOPS <-cn.mops:::.replaceNames(resCNMOPS, "tumorA.chr4.bam","tumor") 28 | 29 | png("cn.mops-segplot.png") 30 | segplot(resCNMOPS) 31 | dev.off() 32 | ##Here the x-axis represents the genomic position and the y-axis represents the 33 | ##log ratio of read counts and copy number call of each segment(red) 34 | 35 | cnvr(resCNMOPS) #look at CNV regions 36 | 37 | cnvs(resCNMOPS) #look at individual CNV regions 38 | 39 | 40 | #------------------------------------------------------------------------------- 41 | test_cnv_cn.mops <- 42 | function() 43 | { 44 | BamFiles <- list.files(system.file("extdata", package="cn.mops") , 45 | pattern=".bam$", full.names=TRUE) 46 | bamDataRanges <- getReadCountsFromBAM(BamFiles, 47 | sampleNames=paste("Sample",1:2), mode="unpaired") 48 | checkEquals(856, bamDataRanges$Sample.1[1]) 49 | 50 | data(cn.mops) 51 | got <- cn.mops(XRanges[,1:3]) 52 | checkEquals(6,length(cnvs(got))) 53 | checkEquals(1775001 ,start(ranges(cnvr(got))[1])) 54 | checkEquals(1850000, end(ranges(cnvr(got))[1])) 55 | checkEquals(75000, width(ranges(cnvr(got))[1])) 56 | } 57 | 
-------------------------------------------------------------------------------- /inst/script/countOverlaps.R: -------------------------------------------------------------------------------- 1 | library(GenomicAlignments) 2 | library(RUnit) 3 | ## count mode for summarizeOverlaps(): every read is reduced to a single position before counting 4 | countBinOverlaps <- 5 | function(features, reads, ...) 6 | { 7 | reads <- resize(granges(reads), width=1) 8 | countOverlaps(features, reads) 9 | } 10 | ## tile the requested sequences of the first BAM file into 'tilewidth' bins and count the reads of every file in 'file' 11 | summarizeBins <- 12 | function(file, seqnames, tilewidth=10000) 13 | { 14 | stopifnot(is(file, "character") && length(file) > 0) 15 | stopifnot(is(seqnames, "character") && length(seqnames) > 0) 16 | 17 | lens <- seqlengths(BamFile(file[[1]])) 18 | tiles <- tileGenome(lens[names(lens) %in% seqnames], 19 | tilewidth=tilewidth, 20 | cut.last.tile.in.chrom=TRUE) 21 | ## count over the files passed as argument rather than a global variable 22 | summarizeOverlaps(tiles, file, countBinOverlaps) 23 | } 24 | 25 | fls <- dir("~/benchmark/copynumber/", pattern="bam$", full.names=TRUE) 26 | counts <- summarizeBins(fls, "chr4") 27 | 28 | 29 | -------------------------------------------------------------------------------- /inst/script/seqCNA.R: -------------------------------------------------------------------------------- 1 | library(seqCNA) 2 | datafiles <- "~/benchmark/copynumber" ## user argument: folder holding the bam and seqsumm files; adjust as needed 3 | tumor <- file.path(datafiles, "tumorA.chr4.bam") 4 | normal <- file.path(datafiles, "normalA.chr4.bam") 5 | ##runSeqsumm is run only once. 6 | #runSeqsumm(summ.win=10, 7 | # file=tumor, 8 | # output.file="tumorA_chr4_seqsumm_out.txt", 9 | # samtools.path="samtools") 10 | 11 | # runSeqsumm(summ.win=10, 12 | # file=normal, 13 | # output.file="normalA_chr4_seqsumm_out.txt", 14 | # samtools.path="samtools") 15 | 16 | ##note: by default it has written the two files in the same folder 17 | ## as the bam files!! 
18 | 19 | tumordata<- read.table(file.path(datafiles,"tumorA_chr4_seqsumm_out.txt"), 20 | sep="\t", header=TRUE) 21 | normaldata<- read.table(file.path(datafiles,"normalA_chr4_seqsumm_out.txt"), 22 | sep="\t", header=TRUE) 23 | tumordata<- tumordata[which(tumordata$chrom=="chr4"),] 24 | normaldata<- normaldata[which(normaldata$chrom=="chr4"),] 25 | head(tumordata) 26 | 27 | rco = readSeqsumm(build="hg19", 28 | tumour.data=tumordata, 29 | normal.data=normaldata) 30 | 31 | ##apply the trimming and mapping quality filters 32 | rco =applyFilters(rco, trim.filter=1, mapq.filter=2) 33 | rco= runSeqnorm(rco) 34 | rco=runGLAD(rco) 35 | 36 | plotCNProfile(rco) 37 | 38 | rco = applyThresholds(rco, seq(-0.8,4,by=0.8), 1) 39 | 40 | plotCNProfile(rco) 41 | 42 | summary(rco) 43 | -------------------------------------------------------------------------------- /inst/script/testCounts.R: -------------------------------------------------------------------------------- 1 | ## Our motivation for this gist comes from the fact that we would like 2 | ## to check the counts calculated by each of the methods. 3 | 4 | ## Here we provide a simple unitTest which takes as input a GRanges 5 | ## object which contains the reads from the "tumorA.chr4.bam" file. 
6 | 7 | ## we choose 5 regions and get their counts from samtools using a simple 8 | ## command : 9 | ## samtools view tumorA.chr4.bam chr4:8000001-8010000 | wc -l 67 10 | 11 | ## we can then use this function to check if the counts coming out of 12 | ## a given method are equal to the counts given by samtools 13 | 14 | ## we assume that the metadata column containing the counts in the GRanges 15 | ## object is called as "cnt" 16 | ## NOTE(review): 'tolerance' is handed to all.equal() and is relative, so 2 is very permissive - confirm intent 17 | testCounts <- function(grTumor) 18 | { 19 | test_indices <- c(8000001, 8010001,10000001 , 10010001, 1000001) 20 | test_res <- c(67,62,67,74,47) #from samtools 21 | counts <- vapply(test_indices , 22 | function (x) grTumor[which(start(ranges(grTumor))==x)]$cnt, numeric(1)) 23 | RUnit::checkEquals(test_res,counts,tolerance=2) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /vignettes/seqCNA.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | <<style, eval=TRUE, echo=FALSE, results=tex>>= 3 | BiocStyle::latex() 4 | @ 5 | 6 | \title{Copy Number approach - seqCNA} 7 | \date{Modified: 26 March, 2014. Compiled: \today} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \section{method} 13 | Here we will review the Bioconductor package seqCNA 14 | 15 | David Mosen-Ansorena, N. T., Silvia Veganzones, Virginia De la Orden, 16 | Maria Luisa Maestro and Ana M. Aransay (2014). "seqCNA: an R package 17 | for DNA copy number analysis in cancer using high-throughput sequencing." 18 | BMC Genomics. 19 | 20 | 21 | \section{code} 22 | 23 | <<>>= 24 | library(seqCNA) 25 | datafiles <- file.path("/shared","silo_researcher","Morgan_M","BioC","sonali", 26 | "analysis","inst","extdata") 27 | @ 28 | Loading test data and running seqCNA on test data. 29 | This tumor sample and normal sample contain data only from "chr4". 30 | <<>>= 31 | tumor <- file.path(datafiles, "tumorA.chr4.bam") 32 | normal <- file.path(datafiles, "normalA.chr4.bam") 33 | ##runSeqsumm is run only once. 
34 | #runSeqsumm(summ.win=10, 35 | # file=tumor, 36 | # output.file="tumorA_chr4_seqsumm_out.txt", 37 | # samtools.path="samtools") 38 | 39 | # runSeqsumm(summ.win=10, 40 | # file=normal, 41 | # output.file="normalA_chr4_seqsumm_out.txt", 42 | # samtools.path="samtools") 43 | 44 | ##note: by default it has written the two files in the same folder 45 | ## as the bam files!! 46 | 47 | tumordata<- read.table(file.path(datafiles,"tumorA_chr4_seqsumm_out.txt"), 48 | sep="\t", header=TRUE) 49 | normaldata<- read.table(file.path(datafiles,"normalA_chr4_seqsumm_out.txt"), 50 | sep="\t", header=TRUE) 51 | tumordata<- tumordata[which(tumordata$chrom=="chr4"),] 52 | normaldata<- normaldata[which(normaldata$chrom=="chr4"),] 53 | head(tumordata) 54 | @ 55 | 56 | When we run \Rfunction{runSeqsumm} it generates the following lines in the output 57 | file for other chromosomes. Thus we have subsetted the data to look only at chr4. 58 | <<>>= 59 | #chr5 0 0 0 0 60 | #chr6 0 0 0 0 61 | @ 62 | 63 | <<>>= 64 | tumor <- file.path(datafiles, "tumor_srx036691.bam") 65 | normal <- file.path(datafiles, "normal_srx113635.bam") 66 | 67 | ##runSeqsumm is run only once. 68 | #runSeqsumm(summ.win=10, 69 | # file=tumor, 70 | # output.file="tumor_seqsumm_out.txt", 71 | # samtools.path="samtools") 72 | 73 | # runSeqsumm(summ.win=10, 74 | # file=normal, 75 | # output.file="normal_seqsumm_out.txt", 76 | # samtools.path="samtools") 77 | 78 | ##note: by default it has written the two files in the same folder 79 | ## as the bam files!! 80 | 81 | tumordata<- read.table(file.path(datafiles,"tumor_seqsumm_out.txt"), 82 | sep="\t", header=TRUE) 83 | normaldata<- read.table(file.path(datafiles,"normal_seqsumm_out.txt"), 84 | sep="\t", header=TRUE) 85 | @ 86 | Running seqCNA according to manual. 
87 | <<>>= 88 | ##read in the summarized data 89 | rco = readSeqsumm(build="hg19", 90 | tumour.data=tumordata, 91 | normal.data=normaldata) 92 | 93 | ##apply the trimming and mapping quality filters 94 | rco =applyFilters(rco, trim.filter=1, mapq.filter=2) 95 | rco= runSeqnorm(rco) 96 | rco=runGLAD(rco) 97 | 98 | plotCNProfile(rco) 99 | 100 | rco = applyThresholds(rco, seq(-0.8,4,by=0.8), 1) 101 | 102 | plotCNProfile(rco) 103 | 104 | summary(rco) 105 | 106 | #writeCNProfile(rco, dir) 107 | @ 108 | 109 | \end{document} 110 | --------------------------------------------------------------------------------