├── R
    └── RCNV_seq-helper.R
├── README.md
├── image
    ├── cn.mops-segplot.png
    ├── cna-norm.png
    ├── cnv-countOverlaps.png
    ├── cnv-seq-plot.png
    ├── count2.png
    ├── normalised-count2.png
    ├── plotBias.png
    ├── plotCorrection.png
    ├── plotDefaultSegments.png
    ├── plotSegment_case1.png
    └── plotSegment_case2.png
├── inst
    └── script
    │   ├── CNAnorm.R
    │   ├── CNV-seq.R
    │   ├── HMMcopy.R
    │   ├── RCNV_seq.R
    │   ├── TitanCNA-helper.R
    │   ├── TitanCNA.R
    │   ├── cn.mops.R
    │   ├── countOverlaps.R
    │   ├── seqCNA.R
    │   └── testCounts.R
├── result files
    ├── countOverlapresult.tsv
    ├── tumor.hits-vs-normal.hits.window-10000.minw-4.cnv
    └── tumor.hits-vs-normal.hits.window-10000.minw-4.count
└── vignettes
    └── seqCNA.Rnw


/R/RCNV_seq-helper.R:
--------------------------------------------------------------------------------
 1 | ## Helper functions for the R-only CNV-seq work flow in
 2 | ## inst/script/RCNV_seq.R. The command-line work flow they replace:
 3 | ##
 4 | ## samtools view -F 4 tumorA.chr4.bam |\
 5 | ##     perl -lane 'print "$F[2]\t$F[3]"' >tumor.hits
 6 | ## samtools view -F 4 normalA.chr4.bam |\
 7 | ##     perl -lane 'print "$F[2]\t$F[3]"' >normal.hits
 8 | ## perl cnv-seq/cnv-seq.pl --test tumor.hits --ref normal.hits \
 9 | ##     --genome human --Rexe "~/bin/R-devel/bin/R"
10 | 
11 | ## Genome size: the summed sequence lengths found in the header of the
12 | ## first BAM file in 'files'.
13 | genomeSize <- function(files)
14 |     sum(as.numeric(seqlengths(BamFile(files[[1]]))))
15 | 
16 | ## Window size used to tile the genome.
17 | ##
18 | ## bam_files: BAM files; must carry the names "test" and "ref".
19 | ## pvalue, log2: p-value and log2 ratio threshold (its absolute value
20 | ##     is used) entering the window-size formula.
21 | ## bigger: multiplier applied to the computed window size.
22 | ## genome_size: defaults to genomeSize(bam_files).
23 | ## param: ScanBamParam used for counting; the default counts only
24 | ##     mapped reads.
25 | ##
26 | ## Returns floor(max(bw, sw) * bigger), where bw and sw are computed
27 | ## from the ratios 2^log2 and 1 / 2^log2 respectively.
28 | windowSize <-
29 |     function(bam_files, pvalue=0.001, log2=0.6, bigger=1.5,
30 |              genome_size, param)
31 | {
32 |     ## fail early: the totals are looked up by these names below
33 |     stopifnot(all(c("test", "ref") %in% names(bam_files)))
34 |     if (missing(genome_size))
35 |         genome_size <- genomeSize(bam_files)
36 |     if (missing(param))
37 |         param <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE))
38 | 
39 |     ## one total per file; sum() copes with a 'param' that selects
40 |     ## several ranges, as.numeric() keeps the arithmetic in doubles
41 |     total <- vapply(bam_files, function(fl) {
42 |         sum(as.numeric(countBam(fl, param=param)$records))
43 |     }, numeric(1))
44 | 
45 |     bt <- qnorm(1 - pvalue / 2)
46 |     st <- qnorm(pvalue / 2)
47 |     log2 <- abs(log2)
48 |     brp <- 2^log2
49 |     srp <- 1 / (2^log2)
50 | 
51 |     bw <- (total[["test"]] * brp^2 + total[["ref"]]) * genome_size * bt^2 /
52 |         ((1-brp)^2 * total[["test"]] * total[["ref"]])
53 |     sw <- (total[["test"]] * srp^2 + total[["ref"]]) * genome_size * st^2 /
54 |         ((1-srp)^2 * total[["test"]] * total[["ref"]])
55 | 
56 |     floor(max(bw, sw) * bigger)
57 | }
58 | 
59 | ## Overlapping tiles over the sequences in the header of the first BAM
60 | ## file: the tiles produced by tileGenome() plus copies of the
61 | ## full-width tiles shifted by half a tile width.
62 | tileGenomeOverlap <- function(file, tilewidth) {
63 |     lengths <- seqlengths(BamFile(file[[1]]))
64 |     tile0 <- tileGenome(lengths, tilewidth=tilewidth,
65 |                         cut.last.tile.in.chrom=TRUE)
66 |     ## shifted copies: full-width tiles only, dropping the last one on
67 |     ## each sequence
68 |     tile1 <- tile0[width(tile0) >= tilewidth]
69 |     last <- cumsum(runLength(seqnames(tile1)))
70 |     ## %/% makes the integer half-width explicit for odd tile widths
71 |     tile1 <- shift(tile1[-last], tilewidth %/% 2)
72 |     sort(c(tile0, tile1))
73 | }
74 | 
75 | ## Counting function passed to summarizeOverlaps(): every read is
76 | ## resized to width 1 (resize() default anchoring) and counted against
77 | ## 'features'. Arguments in '...' are accepted but not used.
78 | binCounter <- function(features, reads, ignore.strand, ...) {
79 |     anchors <- resize(granges(reads), 1)
80 |     countOverlaps(features, anchors, ignore.strand=ignore.strand)
81 | }
82 | 
83 | ## Write the range coordinates and the assay matrix of 'hits' to a
84 | ## tab-delimited file (header row, no row names, no quoting) and
85 | ## return the file path.
86 | ## NOTE(review): the ranges are obtained with rowData(); presumably
87 | ## newer SummarizedExperiment releases need rowRanges() -- confirm.
88 | as.countsfile <- function(hits, file=tempfile()) {
89 |     tiles <- rowData(hits)
90 |     coords <- data.frame(chromosome=as.character(seqnames(tiles)),
91 |                          start=start(tiles), end=end(tiles))
92 |     df <- cbind(coords, assay(hits))
93 |     write.table(df, file, quote=FALSE, row.names=FALSE, sep="\t")
94 |     file
95 | }
96 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Copy Number Analysis 
 2 | =====================
 3 | 
 4 | Explore, compare, and evaluate Bioconductor packages related to genomic copy number analysis
 5 | 
 6 | Genomic amplifications and deletions are found in most (all?) tumor genomes.  A common practice today is to do low coverage DNA sequencing (0.5x, for instance) of a tumor genome, and a matched normal genome (from the same subject).  Judicious comparison of the two sequenced genomes illuminates structural changes in the tumor.
 7 | 
 8 | Copy number changes in tumors vary from broad (an entire chromosome arm) to focal (e.g., a 10kb amplification, loss of heterozygosity or gain).   Detection methods should be sensitive enough to detect these very different phenomena in noisy low-coverage data.
 9 | 
10 | Our purpose here is to provide
11 | 
12 | * A tumor/normal single chromosome pair of bam files (with accompanying index files)
13 | * A reference analysis, using the popular SegSeq matlab program from the Broad Institute
14 | * A tutorial on the exploratory data analysis of these files using "native" Bioconductor capabilities
15 | * A demonstration (and evaluation) of the capabilities of many of the Bioconductor copy number analysis packages
16 | 
17 | List of Tools used
18 | ===================
19 | Bioconductor Packages
20 | * <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/CountOverlaps-method-from-IRanges-Package">countOverlaps</a>
21 | * <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/cn.mops">cn.mops</a>
22 | * <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/CNAnorm">CNAnorm</a>
23 | * <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/seqCNA">seqCNA</a>
24 | * <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/HMMcopy">HMMcopy</a>  
25 | * <a href ="https://github.com/Bioconductor/copy-number-analysis/wiki/TitanCNA">TitanCNA</a>
26 | <br>
27 | 
28 | Non-Bioconductor packages
29 | * <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/CNV-seq">CNV-seq</a>
30 | * <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/SegSeq">Seg-seq</a>
31 | 
32 | 
33 | Literature Resources
34 | =========================
35 | * Alkan, C., et al. (2011). <a href="http://www.ncbi.nlm.nih.gov/pubmed/21358748">"Genome structural variation discovery and genotyping."</a> Nat Rev Genet 12(5): 363-376. 
36 | * Duan J, Zhang J-G, Deng H-W, Wang Y-P (2013) <a href="http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0059128">Comparative Studies of Copy Number Variation Detection Methods for Next-Generation Sequencing Technologies.</a> PLoS ONE 8(3): e59128. doi:10.1371/journal.pone.0059128
37 | 
38 | Sample Data
39 | ===========
40 | * http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam
41 | * http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam.bai
42 | * http://s3.amazonaws.com/copy-number-analysis/normalA.chr4.bam
43 | * http://s3.amazonaws.com/copy-number-analysis/normalA.chr4.bam.bai
44 | 
45 | Use, e.g.,
46 | <pre><code> 
47 | download.file(url="http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam.bai",
48 |               destfile="tumorA.chr4.bam.bai")
49 | </code></pre>
50 | 
51 | 
52 | Exploratory Data Analysis
53 | ==========================
54 | We have done some primary <a href="https://github.com/Bioconductor/copy-number-analysis/wiki/Exploratory-Data-Analysis">Exploratory Data Analysis</a> on the Normal and Tumor Sample Datasets.
55 | 
56 | 


--------------------------------------------------------------------------------
/image/cn.mops-segplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cn.mops-segplot.png


--------------------------------------------------------------------------------
/image/cna-norm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cna-norm.png


--------------------------------------------------------------------------------
/image/cnv-countOverlaps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cnv-countOverlaps.png


--------------------------------------------------------------------------------
/image/cnv-seq-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cnv-seq-plot.png


--------------------------------------------------------------------------------
/image/count2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/count2.png


--------------------------------------------------------------------------------
/image/normalised-count2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/normalised-count2.png


--------------------------------------------------------------------------------
/image/plotBias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotBias.png


--------------------------------------------------------------------------------
/image/plotCorrection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotCorrection.png


--------------------------------------------------------------------------------
/image/plotDefaultSegments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotDefaultSegments.png


--------------------------------------------------------------------------------
/image/plotSegment_case1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotSegment_case1.png


--------------------------------------------------------------------------------
/image/plotSegment_case2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotSegment_case2.png


--------------------------------------------------------------------------------
/inst/script/CNAnorm.R:
--------------------------------------------------------------------------------
 1 | library(CNAnorm)
 2 | library(RUnit)
 3 | 
 4 | ## outside the R session - convert the BAM files with the perl script
 5 | ## perl bam2windows.pl "tumorA.chr4.bam" "normalA.chr4.bam" > perloutput.txt
 6 | 
 7 | ## Compare the values in column 3 of 'chr4data', for the rows whose
 8 | ## column 2 equals one of five test positions, with read counts obtained
 9 | ## from samtools. checkEquals() (RUnit) signals an error on mismatch.
10 | ## NOTE(review): presumably column 2 is the window start and column 3
11 | ## the tumor count -- confirm against the bam2windows.pl output.
12 | test_counts <- function(chr4data)
13 | {
14 |     test_indices <- c(8000001, 8010001, 10000001, 10010001, 1000001)
15 |     test_res <- c(67, 62, 67, 74, 47) # from samtools
16 |     ## first row matching each test position (NA when absent)
17 |     res <- chr4data[match(test_indices, chr4data[, 2]), 3]
18 |     checkEquals(test_res, res)
19 | }
20 | 
21 | data <- read.table("perloutput.txt", sep="\t", header=TRUE)
22 | 
23 | # subset to chromosome 4
24 | chr4data <- data[which(data[, 1] == "chr4"), ]
25 | 
26 | # check if the raw counts are similar to counts from samtools.
27 | # The authors recorded that this check does not hold ("FALSE!"), so the
28 | # discrepancy is reported and the analysis continues.
29 | tryCatch(test_counts(chr4data),
30 |          error=function(e) message("test_counts: ", conditionMessage(e)))
31 | 
32 | # create an object of class CNAnorm
33 | cn <- dataFrame2object(chr4data)
34 | 
35 | # smooth the signal to decrease noise without losing resolution.
36 | cn <- addSmooth(cn, lambda=7)
37 | 
38 | # estimate peaks and ploidy
39 | cn <- peakPloidy(cn)
40 | 
41 | # produce a plot
42 | png("cna-norm.png")
43 | plotPeaks(cn)
44 | dev.off()


--------------------------------------------------------------------------------
/inst/script/CNV-seq.R:
--------------------------------------------------------------------------------
 1 | #Here we show an implementation of CNV-seq
 2 | 
 3 | ##step 1 - generate best-hit location files for each mapped 
 4 | ##sequence read. The authors provide perl scripts for BLAT psl files 
 5 | ## and the SOLiD matching pipeline. For BAM files, they suggest extracting 
 6 | ##locations using the following commands
 7 | 
 8 | #~/copynumber$ samtools view -F 4 tumorA.chr4.bam |perl -lane 
 9 | #'print "$F[2]\t$F[3]"' >tumor.hits
10 | 
11 | #~/copynumber$ samtools view -F 4 normalA.chr4.bam |perl 
12 | #-lane 'print "$F[2]\t$F[3]"' >normal.hits
13 | 
14 | 
15 | ##cnv-seq.pl is used to calculate sliding window size, to count number of 
16 | ##mapped hits in each window, and to call cnv R package to calculate log2 
17 | ## ratios and annotate CNV
18 | 
19 | # perl cnv-seq.pl --test tumor.hits --ref normal.hits --genome human
20 | 
21 | 
22 | ##two output files are produced. They can be found under "result files":
23 | ##tumor.hits-vs-normal.hits.window-10000.minw-4.cnv
24 | ##tumor.hits-vs-normal.hits.window-10000.minw-4.count
25 | 
26 | #One can visualize the cnv inside R using the following code snippet
27 | ## the plot can be found under "image" folder
28 | 
29 | library(cnv)
30 | 
31 | ## CNV table written by cnv-seq.pl
32 | cnv_file <- "tumor.hits-vs-normal.hits.window-10000.minw-4.cnv"
33 | data <- read.delim(cnv_file)
34 | cnv.summary(data)
35 | 
36 | ## draw the CNV plot into a png file
37 | png("cnv-seq-plot.png")
38 | plot.cnv(data)
39 | dev.off()


--------------------------------------------------------------------------------
/inst/script/HMMcopy.R:
--------------------------------------------------------------------------------
 1 | ## This file shows a copy number analysis using HMMcopy,
 2 | ## run on our chosen dataset
 3 | ## tumor file : tumorA.chr4.bam
 4 | ## normal file : normalA.chr4.bam
 5 | 
 6 | ## outside R - generate the readCounts files
 7 | ## bin/readCounter tumorA.chr4.bam > chr4_tumor_reads.wig
 8 | ## bin/readCounter normalA.chr4.bam > chr4_normal_reads.wig
 9 | 
10 | ## currently the wig files for gc content and mappability use NCBI style
11 | ## seqnames / chromosomes, so convert the tumor and normal wig files to match.
12 | ## sed s/=chr/=/ chr4_tumor_reads.wig > chr4_tumor_reads_ncbi.wig
13 | ## sed s/=chr/=/ chr4_normal_reads.wig > chr4_normal_reads_ncbi.wig
14 | 
15 | library(HMMcopy)
16 | 
17 | tum_readfile <- "chr4_tumor_reads_ncbi.wig"
18 | nor_readfile <- "chr4_normal_reads_ncbi.wig"
19 | 
20 | ## Note - these files are distributed along with TitanCNA
21 | ## the files distributed along with HMMcopy had inconsistent seqname style
22 | mapfile <- "GRCh37-lite.map.ws_1000.wig"
23 | gcfile <- "GRCh37-lite.gc.ws_1000.wig"
24 | 
25 | ## create a RangedData object.
26 | tum_uncorrected_reads <- wigsToRangedData(tum_readfile, gcfile, mapfile)
27 | norm_uncorrected_reads <- wigsToRangedData(nor_readfile, gcfile, mapfile)
28 | 
29 | ## subset to have reads only from chr4
30 | tum_uc_reads <- tum_uncorrected_reads["4"]
31 | norm_uc_reads <- norm_uncorrected_reads["4"]
32 | 
33 | ## correct read counts
34 | tum_corrected_copy <- correctReadcount(tum_uc_reads)
35 | norm_corrected_copy <- correctReadcount(norm_uc_reads)
36 | 
37 | ## Normalizing Tumour by Normal
38 | tum_corrected_copy$copy <- tum_corrected_copy$copy - norm_corrected_copy$copy
39 | 
40 | ## Export to SEG format for CNAseq segmentation
41 | rangedDataToSeg(tum_corrected_copy, file = "paul_tum_corrected_copy.seg")
42 | 
43 | ## Segmenting
44 | ## use default segmentation
45 | seg_copy_def <- HMMsegment(tum_corrected_copy)
46 | 
47 | ## get parameters: retrieve converged parameters via EM
48 | realparam <- HMMsegment(tum_corrected_copy, getparam = TRUE)
49 | 
50 | ## adjust parameters - case1
51 | param1 <- realparam
52 | param1$mu <- log(c(1, 1.4, 2, 2.7, 3, 4.5) / 2, 2)
53 | param1$m <- param1$mu
54 | ## perform segmentation via Viterbi
55 | segmented_copy_case1 <- HMMsegment(tum_corrected_copy, param1)
56 | 
57 | ## adjust parameters - case2 ## to decrease the number of segments
58 | param2 <- realparam
59 | param2$strength <- 1e30
60 | param2$e <- 0.99999999999999
61 | segmented_copy_case2 <- HMMsegment(tum_corrected_copy, param2)
62 | 
63 | ## adjust parameters - case3 ## to increase the number of segments
64 | param3 <- realparam
65 | param3$strength <- 0.1
66 | param3$e <- 0.1
67 | segmented_copy_case3 <- HMMsegment(tum_corrected_copy, param3)
68 | 
69 | ## visualization
70 | 
71 | plotBias(tum_corrected_copy)
72 | plotCorrection(tum_corrected_copy)
73 | plotSegments(tum_corrected_copy, seg_copy_def)
74 | plotSegments(tum_corrected_copy, segmented_copy_case1)
75 | plotSegments(tum_corrected_copy, segmented_copy_case2)
76 | plotSegments(tum_corrected_copy, segmented_copy_case3)
77 | 


--------------------------------------------------------------------------------
/inst/script/RCNV_seq.R:
--------------------------------------------------------------------------------
 1 | ## This script and its helper functions implement an R-only version of
 2 | ## the work flow from http://tiger.dbs.nus.edu.sg/cnv-seq/. It uses
 3 | ## GenomicRanges utilities to perform read counts across bins, and the
 4 | ## 'cnv' package available at the URL above for additional
 5 | ## analysis. No intermediate files are generated.
 6 | 
 7 | ## user arguments; provide full paths
 8 | root <- "~/benchmark/copynumber"
 9 | files <- c(test = file.path(root, "tumorA.chr4.bam"),
10 |            ref = file.path(root, "normalA.chr4.bam"))
11 | 
12 | log2 <- 0.6
13 | annotate <- TRUE
14 | 
15 | ## script: load the packages, then the helper definitions
16 | suppressPackageStartupMessages({
17 |     library(GenomicAlignments)
18 |     library(cnv)
19 | })
20 | source("../../R/RCNV_seq-helper.R")
21 | 
22 | ## count reads in overlapping tiles of the computed window size
23 | window_size <- windowSize(files, log2 = log2)
24 | tiles <- tileGenomeOverlap(files, window_size)
25 | hits <- summarizeOverlaps(tiles, files, mode = binCounter)
26 | 
27 | ## hand the chr4 counts to the 'cnv' package via a counts file
28 | chr4 <- subset(hits, seqnames %in% "chr4")
29 | counts_file <- as.countsfile(chr4)
30 | cnv <- cnv.cal(counts_file, log2 = log2, annotate = annotate)
31 | plot.cnv(cnv)
32 | 


--------------------------------------------------------------------------------
/inst/script/TitanCNA-helper.R:
--------------------------------------------------------------------------------
  1 | ###################################################################
  2 | ##Utility R script to extract tumour allele read counts for input 
  3 | ##   to TitanCNA. 
  4 | ## Author: Gavin Ha (gavin.ha@gmail.com)
  5 | ## Date: May 12, 2014
  6 | #####################################################################
  7 | ## The script makes use of the R Bioconductor package 
  8 | ##       Rsamtools (>= 1.17.11).  
  9 | 
 10 | ## The inputs to this script are:
 11 | ## 1. File path to tumour BAM file (tumbamFile)
 12 | ## 2. File path to tumour BAM index file (tumIndexFile)
 13 | ## 3. File path to heterozygous germline SNP positions (vcfFile)
 14 | ##     This file can be generated using a variety of solutions. 
 15 | ##     One such solution is posted on 
 16 | ##     http://compbio.bccrc.ca/software/titan/titan-running-titancna/
 17 | ##     (see Step 5: Input files).
 18 | ## The output to this script:
 19 | ##   A data.frame (countMat).
 20 | ##   Alternatively, this can be written to a tab-delimited text file with
 21 | ##   path "outFile". The format of this text file matches that required
 22 | ##   by TitanCNA ("loadAlleleCountsFromFile()"). 
 23 | #####################################################################
 24 | #####################################################################
 25 | 
 26 | ## THE FOLLOWING SCRIPT ONLY WORKS WITH Rsamtools (>= 1.17.11)
 27 | library(Rsamtools)
 28 | 
 29 | #####################################################################
 30 | ################ USERS - PLEASE MODIFY THESE PATHS #################
 31 | #####################################################################
 32 | tumbamFile <- "tumorA.chr4_sorted.bam"
 33 | tumIndexFile <- paste0(tumbamFile, ".bai")
 34 | vcfFile <-  "titanCNA_sorted_HetSNPs.vcf" ##"titanCNAHetSNPs.vcf" 
 35 | outFile <- "paul_tumAlleleCounts.tsv"
 36 | 
 37 | #####################################################################
 38 | #################### LOAD HET POSITIONS VCF FILE ####################
 39 | #####################################################################
 40 | ## read in vcf file of het positions
 41 | vcf <- BcfFile(vcfFile)
 42 | vcfPosns <- scanBcf(vcf)
 43 | 
 44 | #####################################################################
 45 | ####################### SETUP PILEUP PARAMETERS #####################
 46 | #####################################################################
 47 | ## setup PileupParam using sequence read filters
 48 | pp <- PileupParam(min_base_quality = 10, min_mapq = 20,
 49 |                   min_nucleotide_depth = 10, max_depth = 20,
 50 |                   distinguish_strands = FALSE,
 51 |                   distinguish_nucleotides = TRUE)
 52 | ## setup the positions of interest to generate the pileup for
 53 | which <- GRanges(as.character(vcfPosns$CHROM),
 54 |                  IRanges(vcfPosns$POS, width = 1))
 55 | ## setup additional BAM filters, such as excluding duplicate reads
 56 | sbp <- ScanBamParam(flag = scanBamFlag(isDuplicate = FALSE), which = which)
 57 | 
 58 | #####################################################################
 59 | ########################## GENERATE PILEUP ##########################
 60 | #####################################################################
 61 | ## generate pileup using function (Rsamtools >= 1.17.11)
 62 | ## this step can take a while
 63 | tumbamObj <- BamFile(tumbamFile, index = tumIndexFile)
 64 | counts <- pileup(tumbamObj, scanBamParam = sbp,  pileupParam = pp)
 65 | 
 66 | ## set of commands to manipulate the "counts" data.frame output
 67 | ##     by pileup() such that multiple nucleotides are in a single
 68 | ##     row rather than in multiple rows.
 69 | countsMerge <- xtabs(count ~ which_label + nucleotide, counts)
 70 | ## row names are split on ":" into sequence name and range, and the
 71 | ##     range on "-"; its first piece is used as the position
 72 | label <- do.call(rbind, strsplit(rownames(countsMerge), ":"))
 73 | posn <- do.call(rbind, strsplit(label[, 2], "-"))
 74 | 
 75 | countsMerge <- cbind(position = posn[, 1], countsMerge)
 76 | mode(countsMerge) <- "numeric"
 77 | countsMerge <- data.frame(chr = label[, 1], countsMerge, check.names = FALSE)
 78 | 
 79 | #####################################################################
 80 | ############### GET REFERENCE AND NON-REF READ COUNTS ###############
 81 | #####################################################################
 82 | ## this block of code is used to match up the reference and 
 83 | ##   non-reference nucleotide when assigning read counts
 84 | ##   final output data.frame is "countMat"
 85 | ## setup output data.frame
 86 | countMat <- data.frame(chr = vcfPosns$CHROM,
 87 |                        position = as.numeric(vcfPosns$POS),
 88 |                        ref = vcfPosns$REF, refCount = 0,
 89 |                        Nref = vcfPosns$ALT, NrefCount = 0,
 90 |                        stringsAsFactors = FALSE)
 91 | 
 92 | ## match rows with vcf positions of interest
 93 | countMat <- merge(countMat, countsMerge, by = c("chr", "position"),
 94 |                   sort = FALSE)
 95 | 
 96 | ## assign the flattened table of nucleotide counts to ref, Nref
 97 | ## note that non-reference (Nref) allele is sum of other bases
 98 | ##    that is not matching the ref.
 99 | NT <- c("A", "T", "C", "G")
100 | for (nt in NT) {
101 |     ## %in% never yields NA, so the mask is safe for assignment
102 |     indRef <- countMat$ref %in% nt
103 |     countMat[indRef, "refCount"] <- countMat[indRef, nt]
104 |     countMat[indRef, "NrefCount"] <- rowSums(countMat[indRef, setdiff(NT, nt)])
105 | }
106 | 
107 | ## keep the six output columns (the first six after the merge)
108 | countMat <- countMat[, c("chr", "position", "ref", "refCount",
109 |                          "Nref", "NrefCount")]
110 | 
111 | #####################################################################
112 | ####################### OUTPUT TO TEXT FILE #########################
113 | #####################################################################
114 | ## output text file will have the same format required by TitanCNA
115 | write.table(countMat, file = outFile, row.names = FALSE,
116 |             col.names = TRUE, quote = FALSE, sep = "\t")
117 | 


--------------------------------------------------------------------------------
/inst/script/TitanCNA.R:
--------------------------------------------------------------------------------
 1 | ## change seqnameStyle from UCSC to NCBI - done in unix.
 2 | ## sed s/chr//g paul_tumAlleleCounts.tsv > paul_tumAlleleCounts_ncbi.tsv
 3 | ## sed s/chr//g chr4_tumor_reads.wig > chr4_tumor_reads_ncbi.wig
 4 | ## sed s/chr//g chr4_normal_reads.wig > chr4_normal_reads_ncbi.wig
 5 | 
 6 | library(TitanCNA)
 7 | 
 8 | ## input files (NCBI-style seqnames, see the sed commands above)
 9 | id <- "test"
10 | infile <- "paul_tumAlleleCounts_ncbi.tsv"
11 | tumWig <- "chr4_tumor_reads_ncbi.wig"
12 | normWig <- "chr4_normal_reads_ncbi.wig"
13 | gcWig <- "GRCh37-lite.gc.ws_1000.wig"
14 | mapWig <- "GRCh37-lite.map.ws_1000.wig"
15 | 
16 | ## tumor allele read counts
17 | data <- loadAlleleCountsFromFile(infile)
18 | 
19 | ## read depth corrected for gc and mappability bias
20 | cnData <- correctReadDepth(tumWig, normWig, gcWig, mapWig)
21 | 


--------------------------------------------------------------------------------
/inst/script/cn.mops.R:
--------------------------------------------------------------------------------
 1 | library(cn.mops)
 2 | library(RUnit)
 3 | 
 4 | ## read counts on chr4 with WL=10000, for tumor and normal
 5 | tumor_gr <- getReadCountsFromBAM("tumorA.chr4.bam",
 6 |      refSeqName="chr4", WL=10000, mode="unpaired")
 7 | normal_gr <- getReadCountsFromBAM("normalA.chr4.bam",
 8 |      refSeqName="chr4", WL=10000, mode="unpaired")
 9 | 
10 | # We need a special normalization because the tumor has many large CNVs
11 | X <- tumor_gr
12 | values(X) <- cbind(values(tumor_gr), values(normal_gr))
13 | X <- normalizeGenome(X, normType="mode")
14 | 
15 | # Parameter settings for tumor:
16 | # - norm=0, because we already have normalized
17 | # - integer copy numbers higher than 8 allowed
18 | # - DNAcopy as segmentation algorithm.
19 | # 'I' and 'classes' are parallel vectors (13 entries each).
20 | ref_analysis_norm0 <- referencecn.mops(X[,1], X[,2],
21 |      norm=0,
22 |      I = c(0.025, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 8, 16, 32, 64),
23 |      classes = c("CN0", "CN1", "CN2", "CN3", "CN4", "CN5",
24 |                     "CN6", "CN7", "CN8", "CN16", "CN32", "CN64", "CN128"),
25 |      segAlgorithm="DNAcopy")
26 | 
27 | 
28 | resCNMOPS <- calcIntegerCopyNumbers(ref_analysis_norm0)
29 | ## NOTE(review): .replaceNames is an unexported cn.mops function,
30 | ## reached with ':::'; it may change between package versions.
31 | resCNMOPS <- cn.mops:::.replaceNames(resCNMOPS, "tumorA.chr4.bam", "tumor")
32 | 
33 | png("cn.mops-segplot.png")
34 | segplot(resCNMOPS)
35 | dev.off()
36 | ##Here the x-axis represents the genomic position and the y-axis represents the 
37 | ##log ratio of read counts and copy number call of each segment(red)
38 | 
39 | cnvr(resCNMOPS) #look at CNV regions
40 | 
41 | cnvs(resCNMOPS) #look at individual CNVs
42 | 
43 | 
44 | #-------------------------------------------------------------------------------
45 | ## RUnit checks against the example data shipped in the cn.mops
46 | ## package ("extdata" BAM files and the 'cn.mops' data set).
47 | test_cnv_cn.mops <-
48 |   function()
49 | {
50 |     ## escape the dot so that only files ending in ".bam" are listed
51 |     BamFiles <- list.files(system.file("extdata", package="cn.mops"),
52 |                      pattern="\\.bam$", full.names=TRUE)
53 |     bamDataRanges <- getReadCountsFromBAM(BamFiles,
54 |                          sampleNames=paste("Sample", 1:2), mode="unpaired")
55 |     checkEquals(856, bamDataRanges$Sample.1[1])
56 | 
57 |     data(cn.mops)
58 |     got <- cn.mops(XRanges[,1:3])
59 |     checkEquals(6, length(cnvs(got)))
60 |     ## first CNV region
61 |     first <- ranges(cnvr(got))[1]
62 |     checkEquals(1775001, start(first))
63 |     checkEquals(1850000, end(first))
64 |     checkEquals(75000, width(first))
65 | }
66 | 


--------------------------------------------------------------------------------
/inst/script/countOverlaps.R:
--------------------------------------------------------------------------------
 1 | library(GenomicAlignments)
 2 | library(RUnit)
 3 | 
 4 | ## Counting function for summarizeOverlaps(): every read is resized to
 5 | ## width 1 and counted against 'features'. Arguments in '...' are
 6 | ## accepted but not used.
 7 | countBinOverlaps <-
 8 |   function(features, reads, ...)
 9 |   {
10 |     reads <- resize(granges(reads), width=1)
11 |     countOverlaps(features, reads)
12 |   }
13 | 
14 | ## Count reads in consecutive bins of width 'tilewidth'.
15 | ##
16 | ## file: character vector of BAM file paths; the sequence lengths are
17 | ##     taken from the header of the first file.
18 | ## seqnames: names of the sequences to tile.
19 | ## Returns the result of summarizeOverlaps() over all of 'file'.
20 | summarizeBins <-
21 |   function(file, seqnames, tilewidth=10000)
22 |   {
23 |     stopifnot(is.character(file), length(file) > 0)
24 |     stopifnot(is.character(seqnames), length(seqnames) > 0)
25 | 
26 |     lens <- seqlengths(BamFile(file[[1]]))
27 |     keep <- names(lens) %in% seqnames
28 |     if (!any(keep))
29 |         stop("none of 'seqnames' found in the header of ", file[[1]])
30 |     tiles <- tileGenome(lens[keep],
31 |                         tilewidth=tilewidth,
32 |                         cut.last.tile.in.chrom=TRUE)
33 | 
34 |     ## count in the files passed by the caller, not in a global variable
35 |     summarizeOverlaps(tiles, file, countBinOverlaps)
36 |   }
37 | 
38 | fls <- dir("~/benchmark/copynumber/", pattern="\\.bam$", full.names=TRUE)
39 | counts <- summarizeBins(fls, "chr4")
40 | 


--------------------------------------------------------------------------------
/inst/script/seqCNA.R:
--------------------------------------------------------------------------------
 1 | library(seqCNA)
 2 | 
 3 | ## 'datafiles' is the directory holding the BAM files and the seqsumm
 4 | ## output; this script does not define it, so fail with a clear message.
 5 | if (!exists("datafiles"))
 6 |     stop("set 'datafiles' to the directory containing the input files")
 7 | 
 8 | tumor <- file.path(datafiles, "tumorA.chr4.bam")
 9 | normal <- file.path(datafiles, "normalA.chr4.bam")
10 | ##runSeqsumm is run only once.
11 | #runSeqsumm(summ.win=10,
12 | #           file=tumor,
13 | #           output.file="tumorA_chr4_seqsumm_out.txt",
14 | #           samtools.path="samtools")
15 | 
16 | # runSeqsumm(summ.win=10,
17 | #            file=normal,
18 | #            output.file="normalA_chr4_seqsumm_out.txt",
19 | #            samtools.path="samtools")
20 | 
21 | ##note: by default it has written the two files in the same folder
22 | ## as the bam files!!
23 | 
24 | tumordata <- read.table(file.path(datafiles, "tumorA_chr4_seqsumm_out.txt"),
25 |                         sep="\t", header=TRUE)
26 | normaldata <- read.table(file.path(datafiles, "normalA_chr4_seqsumm_out.txt"),
27 |                          sep="\t", header=TRUE)
28 | ## keep the chr4 rows only
29 | tumordata <- tumordata[which(tumordata$chrom == "chr4"), ]
30 | normaldata <- normaldata[which(normaldata$chrom == "chr4"), ]
31 | head(tumordata)
32 | 
33 | rco <- readSeqsumm(build="hg19",
34 |                    tumour.data=tumordata,
35 |                    normal.data=normaldata)
36 | 
37 | ##apply the trimming and mapping quality filters
38 | rco <- applyFilters(rco, trim.filter=1, mapq.filter=2)
39 | rco <- runSeqnorm(rco)
40 | rco <- runGLAD(rco)
41 | 
42 | plotCNProfile(rco)
43 | 
44 | rco <- applyThresholds(rco, seq(-0.8, 4, by=0.8), 1)
45 | 
46 | plotCNProfile(rco)
47 | 
48 | summary(rco)
49 | 


--------------------------------------------------------------------------------
/inst/script/testCounts.R:
--------------------------------------------------------------------------------
 1 | ## Our motivation for this gist comes from the fact that we would like
 2 | ## to check the counts calculated by each of the methods.
 3 | 
 4 | ## Here we provide a simple unitTest which takes as input a GRanges
 5 | ## object which contains the reads from the "tumorA.chr4.bam" file.
 6 | 
 7 | ## we choose 5 regions and get their counts from samtools using a simple
 8 | ## command :
 9 | ## samtools view  tumorA.chr4.bam chr4:8000001-8010000   | wc -l      67
10 | 
11 | ## we can then use this function to check if  the counts coming out of
12 | ## a given method are equal to the counts given by samtools
13 | 
14 | ## we assume that the metadata column containing the counts in the GRanges
15 | ## object is called as "cnt"
16 | 
17 | ## Compare the "cnt" metadata column of 'grTumor', at the ranges that
18 | ## start on five test positions, with read counts from samtools.
19 | ## checkEquals() comes from RUnit and signals an error on mismatch.
20 | testCounts <- function(grTumor)
21 | {
22 |     test_indices <- c(8000001, 8010001, 10000001, 10010001, 1000001)
23 |     test_res <- c(67, 62, 67, 74, 47) #from samtools
24 |     ## first range starting at each test position (NA when absent)
25 |     hit <- match(test_indices, start(ranges(grTumor)))
26 |     counts <- grTumor$cnt[hit]
27 |     ## compare the looked-up counts (previously an undefined 'indices')
28 |     checkEquals(test_res, counts, tolerance=2)
29 | }
30 | 


--------------------------------------------------------------------------------
/vignettes/seqCNA.Rnw:
--------------------------------------------------------------------------------
  1 | \documentclass{article}
  2 | <<style, eval=TRUE, echo=FALSE, results="asis">>=
  3 | BiocStyle::latex()
  4 | @
  5 | 
  6 | \title{Copy Number approach - seqCNA}
  7 | \date{Modified :26 March, 2014. Compiled: \today}
  8 | 
  9 | \begin{document}
 10 | \maketitle
 11 | 
 12 | \section{method}
 13 | Here we will review the Bioconductor package seqCNA
 14 | 
 15 | David Mosen-Ansorena, N. T., Silvia Veganzones, Virginia De la Orden, 
 16 | Maria Luisa Maestro and Ana M. Aransay (2014). "seqCNA: an R package 
 17 | for DNA copy number analysis in cancer using high-throughput sequencing."
 18 |  BMC Genomics.  
 19 | 
 20 | 
 21 | \section{code}
 22 | 
 23 | <<package-load,message=FALSE>>=
 24 | library(seqCNA)
 25 | datafiles <- file.path("/shared","silo_researcher","Morgan_M","BioC","sonali",
 26 |                       "analysis","inst","extdata")
 27 | @
 28 | Loading test data and running seqCNA on test data.
 29 | This tumor sample and normal sample contain data only from "chr4".
 30 | <<Paul-data-load>>=
 31 | tumor <- file.path(datafiles, "tumorA.chr4.bam")
 32 | normal <- file.path(datafiles, "normalA.chr4.bam")
 33 | ##runSeqsumm is run only once.
 34 | #runSeqsumm(summ.win=10, 
 35 | #           file=tumor, 
 36 | #           output.file="tumorA_chr4_seqsumm_out.txt",
 37 | #           samtools.path="samtools")
 38 | 
 39 | # runSeqsumm(summ.win=10, 
 40 | #            file=normal, 
 41 | #            output.file="normalA_chr4_seqsumm_out.txt",
 42 | #            samtools.path="samtools")
 43 | 
 44 | ##note: by default it has written the two files in the same folder
 45 | ## as the bam files!!
 46 | 
 47 | tumordata <- read.table(file.path(datafiles, "tumorA_chr4_seqsumm_out.txt"),
 48 |                         header=TRUE, sep="\t")
 49 | normaldata <- read.table(file.path(datafiles, "normalA_chr4_seqsumm_out.txt"),
 50 |                          header=TRUE, sep="\t")
 51 | tumordata <- tumordata[tumordata$chrom %in% "chr4", ]
 52 | normaldata <- normaldata[normaldata$chrom %in% "chr4", ]
 53 | head(tumordata)
 54 | @
 55 | 
 56 | When we run \Rfunction{runSeqsumm} it generates the following lines in the output
 57 | file for other chromosomes. Thus we have subsetted to look only at chr4.
 58 | <<noentry,eval=FALSE>>=
 59 | #chr5    0       0       0       0
 60 | #chr6    0       0       0       0
 61 | @
 62 | 
 63 | <<Realdata, eval=FALSE,echo=FALSE>>=
 64 | tumor <- file.path(datafiles, "tumor_srx036691.bam")
 65 | normal <- file.path(datafiles, "normal_srx113635.bam")
 66 | 
 67 | ##runSeqsumm is run only once. 
 68 | #runSeqsumm(summ.win=10, 
 69 | #           file=tumor, 
 70 | #           output.file="tumor_seqsumm_out.txt",
 71 | #           samtools.path="samtools")
 72 |  
 73 | # runSeqsumm(summ.win=10, 
 74 | #            file=normal, 
 75 | #            output.file="normal_seqsumm_out.txt",
 76 | # samtools.path="samtools")
 77 |  
 78 | ##note: by default it has written the two files in the same folder 
 79 | ## as the bam files!!
 80 |  
 81 | tumordata<- read.table(file.path(datafiles,"tumor_seqsumm_out.txt"), 
 82 |                        sep="\t", header=TRUE)
 83 | normaldata<- read.table(file.path(datafiles,"normal_seqsumm_out.txt"), 
 84 |                        sep="\t", header=TRUE)
 85 | @
 86 | Running seqCNA according to manual.
 87 | <<seqCNA>>= 
 88 | ##read in the summarized data
 89 | rco <- readSeqsumm(build="hg19",
 90 |                    tumour.data=tumordata,
 91 |                    normal.data=normaldata)
 92 |  
 93 | ##filter on trimming and mapping quality, then normalise and segment
 94 | rco <- applyFilters(rco, trim.filter=1, mapq.filter=2)
 95 | rco <- runSeqnorm(rco)
 96 | rco <- runGLAD(rco)
 97 | 
 98 | plotCNProfile(rco)
 99 |  
100 | rco <- applyThresholds(rco, seq(-0.8, 4, by=0.8), 1)
101 | 
102 | plotCNProfile(rco)
103 | 
104 | summary(rco)
105 | 
106 | #writeCNProfile(rco, dir)
107 | @
108 | 
109 | \end{document}
110 | 


--------------------------------------------------------------------------------