├── R
└── RCNV_seq-helper.R
├── README.md
├── image
├── cn.mops-segplot.png
├── cna-norm.png
├── cnv-countOverlaps.png
├── cnv-seq-plot.png
├── count2.png
├── normalised-count2.png
├── plotBias.png
├── plotCorrection.png
├── plotDefaultSegments.png
├── plotSegment_case1.png
└── plotSegment_case2.png
├── inst
└── script
│ ├── CNAnorm.R
│ ├── CNV-seq.R
│ ├── HMMcopy.R
│ ├── RCNV_seq.R
│ ├── TitanCNA-helper.R
│ ├── TitanCNA.R
│ ├── cn.mops.R
│ ├── countOverlaps.R
│ ├── seqCNA.R
│ └── testCounts.R
├── result files
├── countOverlapresult.tsv
├── tumor.hits-vs-normal.hits.window-10000.minw-4.cnv
└── tumor.hits-vs-normal.hits.window-10000.minw-4.count
└── vignettes
└── seqCNA.Rnw
/R/RCNV_seq-helper.R:
--------------------------------------------------------------------------------
1 | ## samtools view -F 4 tumorA.chr4.bam |\
2 | ## perl -lane 'print "$F[2]\t$F[3]"' >tumor.hits
3 | ## samtools view -F 4 normalA.chr4.bam |\
4 | ## perl -lane 'print "$F[2]\t$F[3]"' >normal.hits
5 | ## perl cnv-seq/cnv-seq.pl --test tumor.hits --ref normal.hits \
6 | ## --genome human --Rexe "~/bin/R-devel/bin/R"
7 | ## Genome size: sum of the sequence lengths reported for the first BAM file.
8 | genomeSize <- function(files)
9 | sum(as.numeric(seqlengths(BamFile(files[[1]]))))
10 |
11 | windowSize <-
12 |     function(bam_files, pvalue=0.001, log2=0.6, bigger=1.5,
13 |              genome_size, param)
14 | {
15 |     ## 'bam_files' must carry the names "test" and "ref"; returns the window size
16 |     stopifnot(all(c("test", "ref") %in% names(bam_files)))
17 |     if (missing(genome_size)) genome_size <- genomeSize(bam_files)
18 |     if (missing(param))
19 |         param <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE))
20 |     ## record count per file under 'param'; vapply() pins one number per file
21 |     total <- vapply(bam_files, function(fl) {
22 |         as.numeric(countBam(fl, param=param)$records)
23 |     }, numeric(1))
24 |     bt <- qnorm(1 - pvalue / 2)
25 |     st <- qnorm(pvalue / 2)
26 |     log2 <- abs(log2)
27 |     brp <- 2^log2
28 |     srp <- 1 / (2^log2)
29 |
30 |     bw <- (total[["test"]] * brp^2 + total[["ref"]]) * genome_size * bt^2 /
31 |         ((1-brp)^2 * total[["test"]] * total[["ref"]])
32 |     sw <- (total[["test"]] * srp^2 + total[["ref"]]) * genome_size * st^2 /
33 |         ((1-srp)^2 * total[["test"]] * total[["ref"]])
34 |     ## return visibly: an assignment as the last expression returns invisibly
35 |     floor(max(bw, sw) * bigger)
36 | }
37 |
38 | tileGenomeOverlap <- function(file, tilewidth) {
39 |     ## tiles of the first BAM file's sequences plus a half-tile-shifted copy
40 |     seqlens <- seqlengths(BamFile(file[[1]]))
41 |     base_tiles <- tileGenome(seqlens, tilewidth=tilewidth,
42 |                              cut.last.tile.in.chrom=TRUE)
43 |     full <- base_tiles[width(base_tiles) >= tilewidth]
44 |     last_in_run <- cumsum(runLength(seqnames(full)))  # dropped before shifting
45 |     sort(c(base_tiles, shift(full[-last_in_run], tilewidth / 2)))
46 | }
47 | ## Count, per feature, the reads after resizing each to width 1; '...' is unused.
48 | binCounter <- function(features, reads, ignore.strand, ...) {
49 |     anchors <- resize(granges(reads), 1)
50 |     countOverlaps(features, anchors, ignore.strand=ignore.strand)
51 | }
52 |
53 | as.countsfile <- function(hits, file=tempfile()) {
54 |     ## Write chromosome/start/end plus the assay counts of 'hits' to 'file'
55 |     ## (tab-delimited) and return the path. Ranges come from rowRanges():
56 |     ## on current Bioconductor rowData() holds only metadata columns.
57 |     rng <- rowRanges(hits)
58 |     df <- cbind(data.frame(chromosome=as.character(seqnames(rng)),
59 |                            start=start(rng), end=end(rng)), assay(hits))
60 |     write.table(df, file, quote=FALSE, row.names=FALSE, sep="\t")
61 |     file
62 | }
63 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Copy Number Analysis
2 | =====================
3 |
4 | Explore, compare, and evaluate Bioconductor packages related to genomic copy number analysis
5 |
6 | Genomic amplifications and deletions are found in most (all?) tumor genomes. A common practice today is to do low-coverage DNA sequencing (0.5x, for instance) of a tumor genome and a matched normal genome (from the same subject). Judicious comparison of the two sequenced genomes illuminates structural changes in the tumor.
7 |
8 | Copy number changes in tumors vary from broad (an entire chromosome arm) to focal (i.e., a 10kb amplification, loss of heterozygosity or gain). Detection methods should be sensitive enough to detect these very different phenomena in noisy low-coverage data.
9 |
10 | Our purpose here is to provide
11 |
12 | * A tumor/normal single chromosome pair of bam files (with accompanying index files)
13 | * A reference analysis, using the popular SegSeq MATLAB program from the Broad Institute
14 | * A tutorial on the exploratory data analysis of these files using "native" Bioconductor capabilities
15 | * A demonstration (and evaluation) of the capabilities of many of the Bioconductor copy number analysis packages
16 |
17 | List of Tools used
18 | ===================
19 | Bioconductor Packages
20 | * countOverlaps
21 | * cn.mops
22 | * CNAnorm
23 | * seqCNA
24 | * HMMcopy
25 | * TitanCNA
26 |
27 |
28 | Non-Bioconductor packages
29 | * CNV-seq
30 | * Seg-seq
31 |
32 |
33 | Literature Resources
34 | =========================
35 | * Alkan, C., et al. (2011). "Genome structural variation discovery and genotyping." Nat Rev Genet 12(5): 363-376.
36 | * Duan J, Zhang J-G, Deng H-W, Wang Y-P (2013) Comparative Studies of Copy Number Variation Detection Methods for Next-Generation Sequencing Technologies. PLoS ONE 8(3): e59128. doi:10.1371/journal.pone.0059128
37 |
38 | Sample Data
39 | ===========
40 | * http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam
41 | * http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam.bai
42 | * http://s3.amazonaws.com/copy-number-analysis/normalA.chr4.bam
43 | * http://s3.amazonaws.com/copy-number-analysis/normalA.chr4.bam.bai
44 |
45 | Use, e.g.,
46 |
47 | download.file(url="http://s3.amazonaws.com/copy-number-analysis/tumorA.chr4.bam.bai",
48 | destfile="tumorA.chr4.bam.bai")
49 |
50 |
51 |
52 | Exploratory Data Analysis
53 | ==========================
54 | We have done some primary Exploratory Data Analysis on the Normal and Tumor Sample Datasets.
55 |
56 |
--------------------------------------------------------------------------------
/image/cn.mops-segplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cn.mops-segplot.png
--------------------------------------------------------------------------------
/image/cna-norm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cna-norm.png
--------------------------------------------------------------------------------
/image/cnv-countOverlaps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cnv-countOverlaps.png
--------------------------------------------------------------------------------
/image/cnv-seq-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/cnv-seq-plot.png
--------------------------------------------------------------------------------
/image/count2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/count2.png
--------------------------------------------------------------------------------
/image/normalised-count2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/normalised-count2.png
--------------------------------------------------------------------------------
/image/plotBias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotBias.png
--------------------------------------------------------------------------------
/image/plotCorrection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotCorrection.png
--------------------------------------------------------------------------------
/image/plotDefaultSegments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotDefaultSegments.png
--------------------------------------------------------------------------------
/image/plotSegment_case1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotSegment_case1.png
--------------------------------------------------------------------------------
/image/plotSegment_case2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/copy-number-analysis/865ee38fcd621ef720851f11d30b965d60b8890a/image/plotSegment_case2.png
--------------------------------------------------------------------------------
/inst/script/CNAnorm.R:
--------------------------------------------------------------------------------
1 | library(CNAnorm)
2 | library(RUnit)
3 |
4 | #outside R session - convert perl file
5 | ##perl bam2windows.pl "tumorA.chr4.bam" "normalA.chr4.bam" > perloutput.txt
6 |
7 | test_counts <- function(chr4data)
8 | {
9 |     starts <- c(8000001, 8010001, 10000001, 10010001, 1000001)  # matched on column 2
10 |     expected <- c(67, 62, 67, 74, 47)  # from samtools
11 |     observed <- sapply(starts, function(s) chr4data[which(chr4data[, 2] == s), 3])
12 |     checkEquals(expected, observed)
13 | }
14 |
15 | data<- read.table("perloutput.txt",sep="\t",header=TRUE)
16 |
17 | #subset to chromosome 4
18 | chr4data <- data[which(data[,1]=="chr4"),]
19 |
20 | #check if the raw counts are similar to counts from samtools
21 | try(test_counts(chr4data)) #FALSE! -- try() lets the script continue past the failed check
22 |
23 | #create an object of class CNAnorm
24 | cn <- dataFrame2object(chr4data)
25 |
26 | #smooth the signal to decrease noise without losing resolution.
27 | cn <- addSmooth(cn,lambda=7)
28 |
29 | #estimate peaks and ploidy
30 | cn <- peakPloidy(cn)
31 |
32 | #produce a plot
33 | png("cna-norm.png")
34 | plotPeaks(cn)
35 | dev.off()
--------------------------------------------------------------------------------
/inst/script/CNV-seq.R:
--------------------------------------------------------------------------------
1 | #Here we show an implementation of CNV-seq
2 |
3 | ##step 1 - generate best-hit location files for each mapped
4 | ##sequence read. The authors provide perl scripts for BLAT psl files
5 | ## and the SOLiD matching pipeline. For BAM files, they suggest extracting
6 | ##locations using the following commands
7 |
8 | #~/copynumber$ samtools view -F 4 tumorA.chr4.bam |perl -lane
9 | #'print "$F[2]\t$F[3]"' >tumor.hits
10 |
11 | #~/copynumber$ samtools view -F 4 normalA.chr4.bam |perl
12 | #-lane 'print "$F[2]\t$F[3]"' >normal.hits
13 |
14 |
15 | ##cnv-seq.pl is used to calculate sliding window size, to count number of
16 | ##mapped hits in each window, and to call cnv R package to calculate log2
17 | ## ratios and annotate CNV
18 |
19 | # perl cnv-seq.pl --test tumor.hits --ref normal.hits --genome human
20 |
21 |
22 | ##two output files are produced. They can be found under "result files":
23 | ##tumor.hits-vs-normal.hits.window-10000.minw-4.cnv
24 | ##tumor.hits-vs-normal.hits.window-10000.minw-4.count
25 |
26 | #One can visualize the cnv inside R using the following code snippet
27 | ## the plot can be found under "image" folder
28 |
29 | library(cnv)
30 | data <- read.delim("tumor.hits-vs-normal.hits.window-10000.minw-4.cnv")
31 | cnv.summary(data)
32 | png("cnv-seq-plot.png")
33 | plot.cnv(data)
34 | dev.off()
--------------------------------------------------------------------------------
/inst/script/HMMcopy.R:
--------------------------------------------------------------------------------
1 | ## This file shows an implementation of copy number using HMMcopy.
2 | ## This is run on our chosen dataset
3 | ## tumor file : tumorA.chr4.bam
4 | ## normal file : normalA.chr4.bam
5 |
6 | ## outside R - generate readCounts files
7 | ## bin/readCounter tumorA.chr4.bam > chr4_tumor_reads.wig
8 | ## bin/readCounter normalA.chr4.bam > chr4_normal_reads.wig
9 |
10 | ## currently the wig files for gc content and mappability use NCBI-style
11 | ## seqnames/chromosomes, so convert the tumor and normal wig files to match.
12 | ## sed s/=chr/=/ chr4_tumor_reads.wig > chr4_tumor_reads_ncbi.wig
13 | ## sed s/=chr/=/ chr4_normal_reads.wig > chr4_normal_reads_ncbi.wig
14 | library(HMMcopy)  # attach the package whose functions this script calls
15 | tum_readfile <-"chr4_tumor_reads_ncbi.wig"
16 | nor_readfile <-"chr4_normal_reads_ncbi.wig"
17 |
18 | ## Note - these files are distributed along with TitanCNA
19 | ## the files distributed along with HMMcopy had inconsistent seqname style
20 | mapfile <-"GRCh37-lite.map.ws_1000.wig"
21 | gcfile <-"GRCh37-lite.gc.ws_1000.wig"
22 |
23 | ## create a RangedData object.
24 | tum_uncorrected_reads <- wigsToRangedData(tum_readfile, gcfile, mapfile)
25 | norm_uncorrected_reads <- wigsToRangedData(nor_readfile, gcfile, mapfile)
26 |
27 | ## subset to have reads only from chr4
28 | tum_uc_reads<- tum_uncorrected_reads["4"]
29 | norm_uc_reads<- norm_uncorrected_reads["4"]
30 |
31 | ##correct read counts
32 | tum_corrected_copy <- correctReadcount(tum_uc_reads)
33 | norm_corrected_copy <- correctReadcount(norm_uc_reads)
34 |
35 | ## Normalizing Tumour by Normal
36 | tum_corrected_copy$copy <- tum_corrected_copy$copy - norm_corrected_copy$copy
37 |
38 | ## Export to SEG format for CNAseq segmentation
39 | rangedDataToSeg(tum_corrected_copy, file = "paul_tum_corrected_copy.seg")
40 |
41 | ## Segmenting
42 | ## use default segmentation
43 | seg_copy_def <- HMMsegment(tum_corrected_copy)
44 |
45 | ## get parameters
46 | realparam <- HMMsegment(tum_corrected_copy, getparam = TRUE) # retrieve converged parameters via EM
47 |
48 | ## Adjust parameters - case1
49 | param1 <- realparam
50 | param1$mu <- log(c(1, 1.4, 2, 2.7, 3, 4.5) / 2, 2)
51 | param1$m <- param1$mu
52 | segmented_copy_case1 <- HMMsegment(tum_corrected_copy, param1) # perform segmentation via Viterbi
53 |
54 | ## adjust parameters - case2 ## to decrease no of segments/
55 | param2 <- realparam
56 | param2$strength <- 1e30
57 | param2$e <- 0.99999999999999
58 | segmented_copy_case2 <- HMMsegment(tum_corrected_copy, param2)
59 |
60 | ## adjust parameters - case3 ## to increase no of segments/
61 | param3 <- realparam
62 | param3$strength <- 0.1
63 | param3$e <- 0.1
64 | segmented_copy_case3 <- HMMsegment(tum_corrected_copy, param3)
65 |
66 | ## visualization
67 |
68 | plotBias(tum_corrected_copy)
69 | plotCorrection(tum_corrected_copy)
70 | plotSegments(tum_corrected_copy, seg_copy_def)
71 | plotSegments(tum_corrected_copy, segmented_copy_case1)
72 | plotSegments(tum_corrected_copy, segmented_copy_case2)
73 | plotSegments(tum_corrected_copy, segmented_copy_case3)
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/inst/script/RCNV_seq.R:
--------------------------------------------------------------------------------
1 | ## This script and its helper functions implement an R-only version of
2 | ## the work flow from http://tiger.dbs.nus.edu.sg/cnv-seq/. It uses
3 | ## GenomicRanges utilities to perform read counts across bins, and the
4 | ## 'cnv' package available at the URL above for additional
5 | ## analysis. No intermediate files are generated.
6 |
7 | ## user arguments; provide full paths
8 | root <- "~/benchmark/copynumber"
9 | files <- file.path(root, c("tumorA.chr4.bam", "normalA.chr4.bam"))
10 | names(files) <- c("test", "ref")
11 |
12 | log2 <- .6
13 | annotate <- TRUE
14 |
15 | ## script
16 | source("../../R/RCNV_seq-helper.R")
17 | suppressPackageStartupMessages({
18 | library(GenomicAlignments)
19 | library(cnv)
20 | })
21 |
22 | window_size <- windowSize(files, log2=log2)
23 | tiles <- tileGenomeOverlap(files, window_size)
24 | hits <- summarizeOverlaps(tiles, files, binCounter)
25 |
26 | chr4 <- subset(hits, seqnames %in% "chr4")
27 | cnv <- cnv.cal(as.countsfile(chr4), log2=log2, annotate=annotate)
28 | plot.cnv(cnv)
29 |
--------------------------------------------------------------------------------
/inst/script/TitanCNA-helper.R:
--------------------------------------------------------------------------------
1 | ###################################################################
2 | ##Utility R script to extract tumour allele read counts for input
3 | ## to TitanCNA.
4 | ## Author: Gavin Ha (gavin.ha@gmail.com)
5 | ## Date: May 12, 2014
6 | #####################################################################
7 | ## The script makes use of the R Bioconductor package
8 | ## Rsamtools (>= 1.17.11).
9 |
10 | ## The inputs to this script are:
11 | ## 1. File path to tumour BAM file (tumbamFile)
12 | ## 2. File path to tumour BAM index file (tumIndexFile)
13 | ## 3. File path to heterozygous germline SNP positions (vcfFile)
14 | ## This file can be generated using a variety of solutions.
15 | ## One such solution is posted on
16 | ## http://compbio.bccrc.ca/software/titan/titan-running-titancna/
17 | ## (see Step 5: Input files).
18 | ## The output to this script:
19 | ## A data.frame (countMat).
20 | ## Alternatively, this can be written to a tab-delimited text file with
21 | ## path "outFile". The format of this text file matches that required
22 | ## by TitanCNA ("loadAlleleCountsFromFile()").
23 | #####################################################################
24 | #####################################################################
25 |
26 | ## THE FOLLOWING SCRIPT ONLY WORKS WITH Rsamtools (>= 1.17.11)
27 | library(Rsamtools)
28 |
29 | #####################################################################
30 | ################ USERS - PLEASE MODIFY THESE PATHS #################
31 | #####################################################################
32 | tumbamFile <- "tumorA.chr4_sorted.bam"
33 | tumIndexFile <- paste(tumbamFile,".bai",sep = "")
34 | vcfFile <- "titanCNA_sorted_HetSNPs.vcf" ##"titanCNAHetSNPs.vcf"
35 | outFile <- "paul_tumAlleleCounts.tsv"
36 |
37 | #####################################################################
38 | #################### LOAD HET POSITIONS VCF FILE ####################
39 | #####################################################################
40 | ## read in vcf file of het positions
41 | vcf <- BcfFile(vcfFile)
42 | vcfPosns <- scanBcf(vcf)
43 |
44 | #####################################################################
45 | ####################### SETUP PILEUP PARAMETERS #####################
46 | #####################################################################
47 | ## setup PileupParam using sequence read filters
48 | pp <- PileupParam(min_base_quality = 10, min_mapq = 20,
49 | min_nucleotide_depth = 10, max_depth = 20,
50 | distinguish_strands = FALSE,
51 | distinguish_nucleotides = TRUE)
52 | ## setup the positions of interest to generate the pileup for
53 | which <- GRanges(as.character(vcfPosns$CHROM),
54 | IRanges(vcfPosns$POS, width = 1))
55 | ## setup additional BAM filters, such as excluding duplicate reads
56 | sbp <- ScanBamParam(flag = scanBamFlag(isDuplicate = FALSE), which = which)
57 |
58 | #####################################################################
59 | ########################## GENERATE PILEUP ##########################
60 | #####################################################################
61 | ## generate pileup using function (Rsamtools >= 1.17.11)
62 | ## this step can take a while
63 | tumbamObj <- BamFile(tumbamFile, index = tumIndexFile)
64 | counts <- pileup(tumbamObj, scanBamParam = sbp, pileupParam = pp)
65 |
66 | ## set of commands to manipulate the "counts" data.frame output
67 | ## by pileup() such that multiple nucleotides are in a single
68 | ## row rather than in multiple rows.
69 | countsMerge <- xtabs(count ~ which_label + nucleotide, counts)
70 | label <- do.call(rbind, strsplit(rownames(countsMerge), ":"))
71 | posn <- do.call(rbind, strsplit(label[, 2],"-"))
72 |
73 | countsMerge <- cbind( position = posn[, 1], countsMerge)
74 | mode(countsMerge) <- "numeric"
75 | countsMerge <- data.frame(chr=label[,1],countsMerge, check.names=FALSE)
76 |
77 |
78 | #####################################################################
79 | ############### GET REFERENCE AND NON-REF READ COUNTS ###############
80 | #####################################################################
81 | ## this block of code is used to match up the reference and
82 | ## non-reference nucleotide when assigning read counts
83 | ## final output data.frame is "countMat"
84 | ## setup output data.frame
85 | countMat <- data.frame(chr = vcfPosns$CHROM,
86 | position = as.numeric(vcfPosns$POS),
87 | ref = vcfPosns$REF, refCount = 0,
88 | Nref = vcfPosns$ALT, NrefCount = 0,
89 | stringsAsFactors = FALSE)
90 |
91 | ## match rows with vcf positions of interest
92 | countMat <- merge(countMat, countsMerge, by = c("chr","position"),
93 | sort = FALSE)
94 |
95 | ## assign the flattened table of nucleotide counts to ref, Nref
96 | ## note that non-reference (Nref) allele is sum of other bases
97 | ## that is not matching the ref.
98 | NT <- c("A", "T", "C", "G")
99 | for (n in 1:length(NT)){
100 | indRef <- countMat$ref == NT[n]
101 | countMat[indRef, "refCount"] <- countMat[indRef, NT[n]]
102 | countMat[indRef, "NrefCount"] <- rowSums(countMat[indRef, NT[-n]])
103 | }
104 |
105 | countMat <- countMat[,1:6]
106 |
107 | #####################################################################
108 | ####################### OUTPUT TO TEXT FILE #########################
109 | #####################################################################
110 | ## output text file will have the same format required by TitanCNA
111 | write.table(countMat, file = outFile, row.names = FALSE,
112 | col.names = TRUE, quote = FALSE, sep = "\t")
113 |
114 |
--------------------------------------------------------------------------------
/inst/script/TitanCNA.R:
--------------------------------------------------------------------------------
1 | ## change seqnameStyle from UCSC to NCBI - done in unix.
2 | ## sed s/chr//g paul_tumAlleleCounts.tsv > paul_tumAlleleCounts_ncbi.tsv
3 | ## sed s/chr//g chr4_tumor_reads.wig > chr4_tumor_reads_ncbi.wig
4 | ## sed s/chr//g chr4_normal_reads.wig > chr4_normal_reads_ncbi.wig
5 |
6 | library(TitanCNA)
7 |
8 | ## load the files
9 | id <- "test"
10 | infile <-"paul_tumAlleleCounts_ncbi.tsv"
11 | tumWig <- "chr4_tumor_reads_ncbi.wig"
12 | normWig <- "chr4_normal_reads_ncbi.wig"
13 | gcWig <- "GRCh37-lite.gc.ws_1000.wig"
14 | mapWig <- "GRCh37-lite.map.ws_1000.wig"
15 |
16 | ## load the tumor allele read counts
17 | data <- loadAlleleCountsFromFile(infile)
18 |
19 | ## correct gc and mappability bias
20 | cnData <- correctReadDepth(tumWig,normWig,gcWig,mapWig)
21 |
--------------------------------------------------------------------------------
/inst/script/cn.mops.R:
--------------------------------------------------------------------------------
1 | library(cn.mops)
2 | library(RUnit)
3 |
4 | tumor_gr <- getReadCountsFromBAM("tumorA.chr4.bam",
5 | refSeqName="chr4",WL=10000,mode="unpaired")
6 | normal_gr <- getReadCountsFromBAM("normalA.chr4.bam",
7 | refSeqName="chr4",WL=10000,mode="unpaired")
8 |
9 | # We need a special normalization because the tumor has made large CNVs
10 | X <- tumor_gr
11 | values(X) <- cbind(values(tumor_gr),values(normal_gr))
12 | X <- normalizeGenome(X,normType="mode")
13 |
14 | # Parameter settings for tumor:
15 | # - norm=0, because we already have normalized
16 | # - integer copy numbers higher than 8 allowed
17 | # - DNAcopy as segmentation algorithm.
18 | ref_analysis_norm0 <- referencecn.mops(X[,1], X[,2],
19 | norm=0,
20 | I = c(0.025, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 8,16,32,64),
21 | classes = c("CN0", "CN1", "CN2", "CN3", "CN4", "CN5",
22 | "CN6", "CN7","CN8","CN16","CN32","CN64","CN128"),
23 | segAlgorithm="DNAcopy")
24 |
25 |
26 | resCNMOPS <- calcIntegerCopyNumbers(ref_analysis_norm0)
27 | resCNMOPS <-cn.mops:::.replaceNames(resCNMOPS, "tumorA.chr4.bam","tumor")
28 |
29 | png("cn.mops-segplot.png")
30 | segplot(resCNMOPS)
31 | dev.off()
32 | ##Here the x-axis represents the genomic position and the y-axis represents the
33 | ##log ratio of read counts and copy number call of each segment(red)
34 |
35 | cnvr(resCNMOPS) #look at CNV regions
36 |
37 | cnvs(resCNMOPS) #look at individual CNVs
38 |
39 |
40 | #-------------------------------------------------------------------------------
41 | test_cnv_cn.mops <-
42 |     function()
43 | {
44 |     ## read counts for the BAM files found in cn.mops' extdata directory
45 |     extdata <- system.file("extdata", package="cn.mops")
46 |     bams <- list.files(extdata, pattern=".bam$", full.names=TRUE)
47 |     counts <- getReadCountsFromBAM(bams, sampleNames=paste("Sample",1:2), mode="unpaired")
48 |     checkEquals(856, counts$Sample.1[1])
49 |     data(cn.mops)
50 |     got <- cn.mops(XRanges[,1:3])
51 |     checkEquals(6, length(cnvs(got)))
52 |     first_region <- ranges(cnvr(got))[1]  # first CNV region, checked three ways
53 |     checkEquals(1775001, start(first_region))
54 |     checkEquals(1850000, end(first_region))
55 |     checkEquals(75000, width(first_region))
56 | }
57 |
--------------------------------------------------------------------------------
/inst/script/countOverlaps.R:
--------------------------------------------------------------------------------
1 | library(GenomicAlignments)
2 | library(RUnit)
3 |
4 | countBinOverlaps <-
5 |     function(features, reads, ...)
6 | {
7 |     ## resize each read to width 1, then count overlaps per feature
8 |     countOverlaps(features, resize(granges(reads), width=1))
9 | }
10 |
11 | summarizeBins <-
12 |     function(file, seqnames, tilewidth=10000)
13 | {
14 |     ## count reads of the BAM file(s) 'file' in 'tilewidth' bins on 'seqnames'
15 |     stopifnot(is(file, "character") && length(file) > 0)
16 |     stopifnot(is(seqnames, "character") && length(seqnames) > 0)
17 |     seqlens <- seqlengths(BamFile(file[[1]]))
18 |     tiles <- tileGenome(seqlens[names(seqlens) %in% seqnames],
19 |                         tilewidth=tilewidth,
20 |                         cut.last.tile.in.chrom=TRUE)
21 |     ## summarize the 'file' argument rather than a global variable
22 |     summarizeOverlaps(tiles, file, countBinOverlaps)
23 | }
24 |
25 | fls <- dir("~/benchmark/copynumber/", pattern="bam$", full=TRUE)
26 | counts <- summarizeBins(fls, "chr4")
27 |
28 |
29 |
--------------------------------------------------------------------------------
/inst/script/seqCNA.R:
--------------------------------------------------------------------------------
1 | library(seqCNA)
2 | datafiles <- "~/benchmark/copynumber"  # directory holding the BAM files; adjust as needed
3 | tumor <- file.path(datafiles, "tumorA.chr4.bam")
4 | normal <- file.path(datafiles, "normalA.chr4.bam")
5 | ##runSeqsumm is run only once.
6 | #runSeqsumm(summ.win=10,
7 | # file=tumor,
8 | # output.file="tumorA_chr4_seqsumm_out.txt",
9 | # samtools.path="samtools")
10 |
11 | # runSeqsumm(summ.win=10,
12 | # file=normal,
13 | # output.file="normalA_chr4_seqsumm_out.txt",
14 | # samtools.path="samtools")
15 |
16 | ##note: by default it has written the two files in the same folder
17 | ## as the bam files!!
18 |
19 | tumordata<- read.table(file.path(datafiles,"tumorA_chr4_seqsumm_out.txt"),
20 | sep="\t", header=TRUE)
21 | normaldata<- read.table(file.path(datafiles,"normalA_chr4_seqsumm_out.txt"),
22 | sep="\t", header=TRUE)
23 | tumordata<- tumordata[which(tumordata$chrom=="chr4"),]
24 | normaldata<- normaldata[which(normaldata$chrom=="chr4"),]
25 | head(tumordata)
26 |
27 | rco = readSeqsumm(build="hg19",
28 | tumour.data=tumordata,
29 | normal.data=normaldata)
30 |
31 | ##apply the trimming and mapping quality filters
32 | rco =applyFilters(rco, trim.filter=1, mapq.filter=2)
33 | rco= runSeqnorm(rco)
34 | rco=runGLAD(rco)
35 |
36 | plotCNProfile(rco)
37 |
38 | rco = applyThresholds(rco, seq(-0.8,4,by=0.8), 1)
39 |
40 | plotCNProfile(rco)
41 |
42 | summary(rco)
43 |
--------------------------------------------------------------------------------
/inst/script/testCounts.R:
--------------------------------------------------------------------------------
1 | ## Our motivation for this gist comes from the fact that we would like
2 | ## to check the counts calculated by each of the methods.
3 |
4 | ## Here we provide a simple unitTest which takes as input a GRanges
5 | ## object which contains the reads from the "tumorA.chr4.bam" file.
6 |
7 | ## we choose 5 regions and get their counts from samtools using a simple
8 | ## command :
9 | ## samtools view tumorA.chr4.bam chr4:8000001-8010000 | wc -l 67
10 |
11 | ## we can then use this function to check if the counts coming out of
12 | ## a given method are equal to the counts given by samtools
13 |
14 | ## we assume that the metadata column containing the counts in the GRanges
15 | ## object is called as "cnt"
16 |
17 | testCounts <- function(grTumor)
18 | {
19 |     test_indices <- c(8000001, 8010001,10000001 , 10010001, 1000001)
20 |     test_res <- c(67,62,67,74,47) #from samtools
21 |     counts <- sapply(test_indices ,
22 |         function (x) grTumor[which(start(ranges(grTumor))==x)]$cnt )
23 |     checkEquals(test_res, counts, tolerance=2)  # compare against the extracted counts
24 | }
25 |
26 |
--------------------------------------------------------------------------------
/vignettes/seqCNA.Rnw:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | <