├── .github
│   ├── .gitignore
│   └── ISSUE_TEMPLATE
│       └── issue-report.md
├── inst
│   ├── extdata
│   │   ├── ex2_reference.fa.fai
│   │   ├── ex3_reference.fa.fai
│   │   ├── example_normal.list
│   │   ├── ex1.bam
│   │   ├── ex1.bam.bai
│   │   ├── ex1_intervals.txt
│   │   ├── example.vcf.gz
│   │   ├── issue62.vcf.gz
│   │   ├── gatk4_pon_db.tgz
│   │   ├── issue109.vcf.gz
│   │   ├── issue184.vcf.gz
│   │   ├── issue184_2.vcf.gz
│   │   ├── issue184_2_mb.rds
│   │   ├── buggy_cnvkit.seg.gz
│   │   ├── ex2_intervals.bed
│   │   ├── ex3_intervals.bed
│   │   ├── example_vcf.vcf.gz
│   │   ├── issue62.vcf.gz.tbi
│   │   ├── normalpanel.vcf.gz
│   │   ├── ex1_gcgene.txt
│   │   ├── ex2_mappability.bigWig
│   │   ├── example_cosmic.vcf.gz
│   │   ├── example_mutect2.vcf.gz
│   │   ├── example_normal.txt.gz
│   │   ├── example_normal2.txt.gz
│   │   ├── example_normal5.hdf5
│   │   ├── example_single.vcf.gz
│   │   ├── example_tumor.txt.gz
│   │   ├── example_tumor2.txt.gz
│   │   ├── example_vcf.vcf.gz.tbi
│   │   ├── normalpanel.vcf.gz.tbi
│   │   ├── example_callable.bed.gz
│   │   ├── example_logratio.txt.gz
│   │   ├── gatk4_m2_test_pon_db.tgz
│   │   ├── ex2_intervals.txt
│   │   ├── example_cosmic.vcf.gz.tbi
│   │   ├── example_mutect2.vcf.gz.tbi
│   │   ├── example_single.vcf.gz.tbi
│   │   ├── example_gatk4_denoised_cr.tsv.gz
│   │   ├── example_gatk4_modelfinal.seg.gz
│   │   ├── example_intervals_tiny_ot.txt.gz
│   │   ├── ex3_mappability.bed
│   │   ├── ex2_mappability.bed
│   │   ├── ex1_intervals_headered.txt
│   │   ├── ex2_reptiming.bed
│   │   ├── example_normal3.cnn
│   │   ├── issue192_tumor.seg
│   │   ├── example_normal4.cnr
│   │   ├── test_coverage_overlapping_intervals.txt
│   │   ├── dist
│   │   │   ├── calculateSbdry.R
│   │   │   └── downloadCentromeres.R
│   │   ├── example_allelic_counts_empty.tsv
│   │   ├── issue192_tumor.cnr
│   │   ├── example_allelic_counts.tsv
│   │   ├── ex2_reference.fa
│   │   ├── ex3_reference.fa
│   │   └── example_seg.txt
│   └── CITATION
├── data
│   ├── centromeres.rda
│   ├── purecn.DNAcopy.bdry.rda
│   └── purecn.example.output.rda
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test_plotAbs.R
│       ├── test_getSexFromVcf.R
│       ├── test_findFocal.R
│       ├── test_setPriorVcf.R
│       ├── test_callCIN.R
│       ├── test_adjustLogRatio.R
│       ├── test_annotateTargets.R
│       ├── test_callAlterationsFromSegmentation.R
│       ├── test_callAlterations.R
│       ├── test_bootstrapResults.R
│       ├── test_poolCoverage.R
│       ├── test_calculateLogRatio.R
│       ├── test_readSegmentationFile.R
│       ├── test_calculatePowerDetectSomatic.R
│       ├── test_readLogRatioFile.R
│       ├── test_callMutationBurden.R
│       ├── test_predictSomatic.R
│       ├── test_readAllelicCountsFile.R
│       ├── test_callLOH.R
│       ├── test_getSexFromCoverage.R
│       ├── test_callAmplificationsInLowPurity.R
│       ├── test_correctCoverageBias.R
│       ├── test_calculateBamCoverageByInterval.R
│       ├── test_readCoverageFile.R
│       ├── test_createNormalDatabase.R
│       ├── test_segmentation.R
│       └── test_createCurationFile.R
├── .Rbuildignore
├── man
│   ├── purecn.DNAcopy.bdry.Rd
│   ├── purecn.example.output.Rd
│   ├── PureCN-deprecated.Rd
│   ├── PureCN-defunct.Rd
│   ├── readAllelicCountsFile.Rd
│   ├── callLOH.Rd
│   ├── centromeres.Rd
│   ├── readIntervalFile.Rd
│   ├── readLogRatioFile.Rd
│   ├── annotateTargets.Rd
│   ├── createCurationFile.Rd
│   ├── poolCoverage.Rd
│   ├── calculateLogRatio.Rd
│   ├── readSegmentationFile.Rd
│   ├── readCoverageFile.Rd
│   ├── bootstrapResults.Rd
│   ├── callCIN.Rd
│   ├── findHighQualitySNPs.Rd
│   ├── adjustLogRatio.Rd
│   ├── callAlterations.Rd
│   ├── predictSomatic.Rd
│   ├── calculateTangentNormal.Rd
│   ├── filterVcfMuTect2.Rd
│   ├── getSexFromCoverage.Rd
│   ├── findFocal.Rd
│   ├── readCurationFile.Rd
│   ├── filterVcfMuTect.Rd
│   ├── setMappingBiasVcf.Rd
│   ├── correctCoverageBias.Rd
│   ├── calculateBamCoverageByInterval.Rd
│   ├── callAmplificationsInLowPurity.Rd
│   ├── calculateMappingBiasVcf.Rd
│   ├── callAlterationsFromSegmentation.Rd
│   ├── calculateMappingBiasGatk4.Rd
│   ├── createNormalDatabase.Rd
│   ├── getSexFromVcf.Rd
│   ├── setPriorVcf.Rd
│   ├── calculatePowerDetectSomatic.Rd
│   ├── segmentationHclust.Rd
│   ├── callMutationBurden.Rd
│   ├── processMultipleSamples.Rd
│   ├── segmentationGATK4.Rd
│   ├── plotAbs.Rd
│   ├── filterIntervals.Rd
│   └── preprocessIntervals.Rd
├── R
│   ├── adjustLogRatio.R
│   ├── callCIN.R
│   ├── poolCoverage.R
│   ├── filterVcfMuTect2.R
│   ├── readIntervalFile.R
│   ├── findFocal.R
│   ├── createCurationFile.R
│   ├── processMultipleSamples.R
│   ├── bootstrapResults.R
│   ├── readLogRatioFile.R
│   ├── segmentationHclust.R
│   ├── calculateLogRatio.R
│   ├── annotateTargets.R
│   ├── setPriorVcf.R
│   ├── filterVcfMuTect.R
│   ├── readCurationFile.R
│   └── readAllelicCountsFile.R
├── DESCRIPTION
└── Dockerfile
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 
--------------------------------------------------------------------------------
/inst/extdata/ex2_reference.fa.fai:
--------------------------------------------------------------------------------
1 | seq1 800 6 80 81
2 | seq2 800 822 80 81
--------------------------------------------------------------------------------
/inst/extdata/ex3_reference.fa.fai:
--------------------------------------------------------------------------------
1 | chr1 800 6 80 81
2 | chr2 800 822 80 81
--------------------------------------------------------------------------------
/inst/extdata/example_normal.list:
--------------------------------------------------------------------------------
1 | example_normal.txt
2 | example_normal2.txt
--------------------------------------------------------------------------------
/data/centromeres.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/data/centromeres.rda
--------------------------------------------------------------------------------
/inst/extdata/ex1.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/ex1.bam
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(PureCN)
3 | 
4 | test_check("PureCN")
5 | 
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | .github
2 | .travis.yml
3 | LICENSE
4 | codecov.R
5 | ^\.github$
6 | Dockerfile
--------------------------------------------------------------------------------
/inst/extdata/ex1.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/ex1.bam.bai
--------------------------------------------------------------------------------
/inst/extdata/ex1_intervals.txt:
--------------------------------------------------------------------------------
1 | Targets
2 | seq1:1000-2000
3 | seq2:100-1000
4 | seq2:1001-2000
--------------------------------------------------------------------------------
/inst/extdata/example.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example.vcf.gz
--------------------------------------------------------------------------------
/inst/extdata/issue62.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue62.vcf.gz -------------------------------------------------------------------------------- /data/purecn.DNAcopy.bdry.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/data/purecn.DNAcopy.bdry.rda -------------------------------------------------------------------------------- /data/purecn.example.output.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/data/purecn.example.output.rda -------------------------------------------------------------------------------- /inst/extdata/gatk4_pon_db.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/gatk4_pon_db.tgz -------------------------------------------------------------------------------- /inst/extdata/issue109.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue109.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue184.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue184.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue184_2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue184_2.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue184_2_mb.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue184_2_mb.rds -------------------------------------------------------------------------------- /inst/extdata/buggy_cnvkit.seg.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/buggy_cnvkit.seg.gz -------------------------------------------------------------------------------- /inst/extdata/ex2_intervals.bed: -------------------------------------------------------------------------------- 1 | seq1 100 250 2 | seq1 300 650 3 | seq2 0 150 4 | seq2 400 550 5 | seq2 700 750 6 | -------------------------------------------------------------------------------- /inst/extdata/ex3_intervals.bed: -------------------------------------------------------------------------------- 1 | chr1 100 250 2 | chr1 300 650 3 | chr2 0 150 4 | chr2 400 550 5 | chr2 700 750 6 | -------------------------------------------------------------------------------- /inst/extdata/example_vcf.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_vcf.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue62.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue62.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/normalpanel.vcf.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/normalpanel.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/ex1_gcgene.txt: -------------------------------------------------------------------------------- 1 | Targets gc_bias 2 | seq1:1000-2000 0.45 3 | seq2:100-1000 0.55 4 | seq2:1001-2000 0.46 5 | -------------------------------------------------------------------------------- /inst/extdata/ex2_mappability.bigWig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/ex2_mappability.bigWig -------------------------------------------------------------------------------- /inst/extdata/example_cosmic.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_cosmic.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/example_mutect2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_mutect2.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/example_normal.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_normal.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_normal2.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_normal2.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_normal5.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_normal5.hdf5 -------------------------------------------------------------------------------- /inst/extdata/example_single.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_single.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/example_tumor.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_tumor.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_tumor2.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_tumor2.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_vcf.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_vcf.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/normalpanel.vcf.gz.tbi: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/normalpanel.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_callable.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_callable.bed.gz -------------------------------------------------------------------------------- /inst/extdata/example_logratio.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_logratio.txt.gz -------------------------------------------------------------------------------- /inst/extdata/gatk4_m2_test_pon_db.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/gatk4_m2_test_pon_db.tgz -------------------------------------------------------------------------------- /inst/extdata/ex2_intervals.txt: -------------------------------------------------------------------------------- 1 | Target 2 | seq1:101-250 3 | seq1:301-650 4 | seq2:1-150 5 | seq2:401-550 6 | seq2:701-750 7 | -------------------------------------------------------------------------------- /inst/extdata/example_cosmic.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_cosmic.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_mutect2.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_mutect2.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_single.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_single.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_gatk4_denoised_cr.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_gatk4_denoised_cr.tsv.gz -------------------------------------------------------------------------------- /inst/extdata/example_gatk4_modelfinal.seg.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_gatk4_modelfinal.seg.gz -------------------------------------------------------------------------------- /inst/extdata/example_intervals_tiny_ot.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_intervals_tiny_ot.txt.gz -------------------------------------------------------------------------------- /inst/extdata/ex3_mappability.bed: -------------------------------------------------------------------------------- 1 | chr1 0 250 . 1 . 2 | chr1 250 650 . 1 . 3 | chr2 0 150 . 0.7 . 4 | chr2 150 550 . 1 . 5 | chr2 550 750 . 1 . 
6 | -------------------------------------------------------------------------------- /inst/extdata/ex2_mappability.bed: -------------------------------------------------------------------------------- 1 | seq1 0 250 . 1 . 2 | seq1 250 650 . 1 . 3 | seq2 0 150 . 0.699999988079071 . 4 | seq2 150 550 . 1 . 5 | seq2 550 750 . 1 . 6 | -------------------------------------------------------------------------------- /inst/extdata/ex1_intervals_headered.txt: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 2 | @SQ SN:seq1 LN:1575 3 | @SQ SN:seq2 LN:1584 4 | Targets 5 | seq1:1000-1500 6 | seq2:100-1000 7 | seq2:1001-1500 8 | -------------------------------------------------------------------------------- /inst/extdata/ex2_reptiming.bed: -------------------------------------------------------------------------------- 1 | seq1 0 100 . 10 2 | seq1 100 200 . 15 3 | seq1 200 300 . 20 4 | seq1 300 400 . 10 5 | seq1 400 500 . 12 6 | seq2 0 150 . 50 7 | seq2 150 300 . 80 8 | seq2 300 750 . 10 9 | -------------------------------------------------------------------------------- /inst/extdata/example_normal3.cnn: -------------------------------------------------------------------------------- 1 | chromosome start end gene depth log2 2 | chr1 762097 762270 LINC00115 174.89 7.45031 3 | chr1 861281 861490 SAMD11 28.9043 4.85321 4 | chr1 865591 865791 SAMD11 51.26 5.67976 5 | chr1 866325 866498 SAMD11 14 3.80735 6 | -------------------------------------------------------------------------------- /inst/extdata/issue192_tumor.seg: -------------------------------------------------------------------------------- 1 | ID chrom loc.start loc.end num.mark seg.mean 2 | tumor 1 105930 57961455 5665 0.0372507 3 | tumor 1 61680889 80343975 668 0.163111 4 | tumor 1 81495027 136720674 2523 -0.648407 5 | tumor 1 136791606 151618297 777 0.119727 6 | tumor 1 151809054 152269420 71 0.736822 7 | tumor 1 152277886 170640583 1200 0.111717 8 | -------------------------------------------------------------------------------- /inst/extdata/example_normal4.cnr: -------------------------------------------------------------------------------- 1 | chromosome start end gene log2 depth weight 2 | chr1 10500 68590 Background 0.55584 0.70587 0.466868 3 | chr1 70509 176917 Background 0.235896 1.02411 0.482562 4 | chr1 227917 267219 Background 0.163203 0.387996 0.408305 5 | chr1 318219 367158 Background 0.375418 1.42616 0.424955 6 | chr1 367658 367893 . 
0.68569 17.617 0.310347 7 | -------------------------------------------------------------------------------- /inst/extdata/test_coverage_overlapping_intervals.txt: -------------------------------------------------------------------------------- 1 | Target total_coverage average_coverage 2 | chr1:1216042-1216047 316.551528468946 80.8786439075042 3 | chr1:1216045-1216050 316.551528468946 80.8786439075042 4 | chr1:1216606-1216678 5839.39523091608 129.022717424915 5 | chr1:1216791-1216991 26857.8564530621 220.417338871495 6 | chr1:1216991-1217991 26857.8564530621 220.417338871495 7 | -------------------------------------------------------------------------------- /tests/testthat/test_plotAbs.R: -------------------------------------------------------------------------------- 1 | context("plotAbs") 2 | 3 | test_that("Exceptions happen with wrong input", { 4 | data(purecn.example.output) 5 | expect_error( plotAbs(purecn.example.output, id = "hello", "BAF"), 6 | "No solution with id hello") 7 | expect_error( plotAbs(purecn.example.output, id = 100, "BAF"), 8 | "No solution with id 100") 9 | }) 10 | -------------------------------------------------------------------------------- /inst/extdata/dist/calculateSbdry.R: -------------------------------------------------------------------------------- 1 | library(PureCN) 2 | 3 | alpha <- formals(segmentationCBS)$alpha 4 | eta <- formals(segment)$eta 5 | nperm <- formals(segment)$nperm 6 | max.ones <- floor(nperm * alpha) + 1 7 | set.seed(123) 8 | 9 | purecn.DNAcopy.bdry <- getbdry(eta, nperm, max.ones) 10 | save(purecn.DNAcopy.bdry, file="~/git/PureCN/data/purecn.DNAcopy.bdry.rda", compress="xz") 11 | -------------------------------------------------------------------------------- /man/purecn.DNAcopy.bdry.Rd: -------------------------------------------------------------------------------- 1 | \name{purecn.DNAcopy.bdry} 2 | \docType{data} 3 | \alias{purecn.DNAcopy.bdry} 4 | \title{DNAcopy boundary data} 5 | \description{ 6 | This provides the output of the \code{DNAcopy::getbdry} call using \code{\link{segmentationCBS}} 7 | default parameters. 8 | } 9 | \usage{data(purecn.DNAcopy.bdry)} 10 | \value{Output of the \code{DNAcopy::getbdry} call.} 11 | \keyword{datasets} 12 | -------------------------------------------------------------------------------- /man/purecn.example.output.Rd: -------------------------------------------------------------------------------- 1 | \name{purecn.example.output} 2 | \docType{data} 3 | \alias{purecn.example.output} 4 | \title{Example output} 5 | \description{ 6 | This provides the output of the \code{\link{runAbsoluteCN}} call used in the 7 | vignette and examples. 
8 | } 9 | \usage{data(purecn.example.output)} 10 | \value{Output of the \code{\link{runAbsoluteCN}} call used in the vignette.} 11 | \keyword{datasets} 12 | -------------------------------------------------------------------------------- /tests/testthat/test_getSexFromVcf.R: -------------------------------------------------------------------------------- 1 | context("getSexFromVcf") 2 | 3 | test_that("Example data is called correctly", { 4 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 5 | vcf <- readVcf(vcf.file, "hg19") 6 | sex <- getSexFromVcf(vcf) 7 | expect_true(is.na(sex)) 8 | vcfs <- vcf[info(vcf)$SOMATIC] 9 | getSexFromVcf(vcfs, "LIB-02240e4") 10 | expect_true(is.na(sex)) 11 | }) 12 | -------------------------------------------------------------------------------- /tests/testthat/test_findFocal.R: -------------------------------------------------------------------------------- 1 | context("findFocal") 2 | 3 | test_that("Example data is called correctly", { 4 | data(purecn.example.output) 5 | ret <- findFocal(purecn.example.output$results[[1]]$seg) 6 | expect_equal(class(ret), "logical") 7 | expect_true(nrow(purecn.example.output$results[[1]]$seg) == 8 | length(ret)) 9 | expect_true(min(purecn.example.output$results[[1]]$seg[ret, 10 | "C"]) >= 5) 11 | }) 12 | -------------------------------------------------------------------------------- /man/PureCN-deprecated.Rd: -------------------------------------------------------------------------------- 1 | \name{PureCN-deprecated} 2 | \alias{PureCN-deprecated} 3 | \title{Deprecated functions in package \sQuote{PureCN}} 4 | 5 | \description{ 6 | These functions are provided for compatibility with older versions 7 | of \sQuote{PureCN} only, and will be defunct at the next release. 
8 | } 9 | 10 | \details{ 11 | The following functions are deprecated and will be made defunct; use 12 | the replacement indicated below: 13 | % \itemize{ 14 | % 15 | % } 16 | } 17 | -------------------------------------------------------------------------------- /tests/testthat/test_setPriorVcf.R: -------------------------------------------------------------------------------- 1 | context("setPriorVcf") 2 | 3 | test_that("Example data matches expected values", { 4 | vcf.file <- system.file("extdata", "example_vcf.vcf.gz", package = "PureCN") 5 | vcf <- readVcf(vcf.file, "hg19") 6 | vcf <- setPriorVcf(vcf) 7 | vcf.priorsomatic <- info(vcf)$PR 8 | expected <- c(2322, 9) 9 | names(expected) <- c(1e-04, 0.999) 10 | expect_equal(sort(table(vcf.priorsomatic))[2], expected[1]) 11 | expect_equal(sort(table(vcf.priorsomatic))[1], expected[2]) 12 | }) 13 | 14 | -------------------------------------------------------------------------------- /tests/testthat/test_callCIN.R: -------------------------------------------------------------------------------- 1 | context("callCIN") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | loh <- callLOH(purecn.example.output) 6 | loh$size <- loh$end - loh$start + 1 7 | idx <- loh$C == 2 8 | ret <- callCIN(purecn.example.output, reference.state = "normal", 9 | allele.specific = FALSE) 10 | expect_equal(sum(loh$size[!idx])/sum(loh$size), ret, tol = 0.001) 11 | loh <- loh[!is.na(loh$M),] 12 | ret <- callCIN(purecn.example.output) 13 | expect_equal(0.481, ret, tol = 0.02) 14 | ret <- callCIN(purecn.example.output, reference.state = "normal") 15 | idx <- loh$C == 2 & loh$M == 1 16 | expect_equal(sum(loh$size[!idx])/sum(loh$size), ret, tol = 0.001) 17 | }) 18 | -------------------------------------------------------------------------------- /tests/testthat/test_adjustLogRatio.R: -------------------------------------------------------------------------------- 1 | context("adjustLogRatio") 2 | 3 | test_that("Function returns expected values for example coverage", { 4 | data(purecn.example.output) 5 | log.ratio <- purecn.example.output$results[[1]]$seg$seg.mean 6 | purity <- purecn.example.output$results[[1]]$purity 7 | ploidy <- purecn.example.output$results[[1]]$ploidy 8 | log.ratio.adjusted <- adjustLogRatio(log.ratio, purity, ploidy) 9 | total.ploidy <- 1.73 10 | p <- 1 11 | log.ratio.offset <- 0 12 | opt.C <- (2^(log.ratio.adjusted + log.ratio.offset) * total.ploidy)/p - ((2 * (1 - p))/p) 13 | expect_lt(abs(min(log.ratio.adjusted, na.rm=TRUE) + 8), 0.001) 14 | expect_lt(median(abs(opt.C - purecn.example.output$results[[1]]$seg$C)), 0.1) 15 | }) 16 | 17 | -------------------------------------------------------------------------------- /tests/testthat/test_annotateTargets.R: -------------------------------------------------------------------------------- 1 | context("annotateTargets") 2 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 3 | library(org.Hs.eg.db) 4 | test_coverage <- readCoverageFile(system.file("extdata", "example_normal.txt.gz", 5 | package = "PureCN")) 6 | 7 | test_that("KIF1B is correctly annotated with UCSC chromosome names", { 8 | x <- head(test_coverage, 100) 9 | x <- annotateTargets(x, TxDb.Hsapiens.UCSC.hg19.knownGene, 10 | org.Hs.eg.db) 11 | expect_equal(x$Gene[67], "KIF1B") 12 | }) 13 | 14 | test_that("KIF1B is correctly annotated with NCBI chromosome names", { 15 | x <- head(test_coverage, 100) 16 | seqlevelsStyle(x) <- "Ensembl" 17 | x <- annotateTargets(x, TxDb.Hsapiens.UCSC.hg19.knownGene, 18 | org.Hs.eg.db) 
19 | expect_equal(x$Gene[67], "KIF1B") 20 | }) 21 | -------------------------------------------------------------------------------- /tests/testthat/test_callAlterationsFromSegmentation.R: -------------------------------------------------------------------------------- 1 | context("callAlterationsFromSegmentation") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | seg <- purecn.example.output$results[[1]]$seg 6 | interval.file <- system.file("extdata", "example_intervals.txt", 7 | package = "PureCN") 8 | calls <- callAlterationsFromSegmentation(sampleid = seg$ID, 9 | chr = seg$chrom, start = seg$loc.start, end = seg$loc.end, 10 | num.mark = seg$num.mark, seg.mean = seg$seg.mean, C = seg$C, 11 | interval.file = interval.file) 12 | calls2 <- callAlterations(purecn.example.output) 13 | expect_equal(sort(rownames(calls$Sample1[calls$Sample1$type == 14 | "AMPLIFICATION", ])), sort(rownames(calls2[calls2$type == 15 | "AMPLIFICATION", ]))) 16 | }) 17 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry(bibtype = "Article", 2 | author = c(person(given = "Markus", family = "Riester"), 3 | person(given = "Angad", family = "Singh"), 4 | person(given = "A. Rose", family = "Brannon"), 5 | person(given = "Kun", family = "Yu"), 6 | person(given = "Catarina D.", family = "Campbell"), 7 | person(given = "Derek Y.", family = "Chiang"), 8 | person(given = "Michael", family = "Morrissey")), 9 | title = "PureCN: Copy number calling and SNV classification using 10 | targeted short read sequencing", 11 | year="2016", 12 | volume="11", 13 | pages="13", 14 | doi="10.1186/s13029-016-0060-z", 15 | journal = "Source Code for Biology and Medicine" 16 | ) 17 | -------------------------------------------------------------------------------- /inst/extdata/example_allelic_counts_empty.tsv: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 2 | @SQ SN:chr1 LN:249250621 3 | @SQ SN:chr2 LN:243199373 4 | @SQ SN:chr3 LN:198022430 5 | @SQ SN:chr4 LN:191154276 6 | @SQ SN:chr5 LN:180915260 7 | @SQ SN:chr6 LN:171115067 8 | @SQ SN:chr7 LN:159138663 9 | @SQ SN:chr8 LN:146364022 10 | @SQ SN:chr9 LN:141213431 11 | @SQ SN:chr10 LN:135534747 12 | @SQ SN:chr11 LN:135006516 13 | @SQ SN:chr12 LN:133851895 14 | @SQ SN:chr13 LN:115169878 15 | @SQ SN:chr14 LN:107349540 16 | @SQ SN:chr15 LN:102531392 17 | @SQ SN:chr16 LN:90354753 18 | @SQ SN:chr17 LN:81195210 19 | @SQ SN:chr18 LN:78077248 20 | @SQ SN:chr19 LN:59128983 21 | @SQ SN:chr20 LN:63025520 22 | @SQ SN:chr21 LN:48129895 23 | @SQ SN:chr22 LN:51304566 24 | @SQ SN:chrX LN:155270560 25 | @SQ SN:chrY LN:59373566 26 | @SQ SN:chrM LN:16571 27 | @RG ID:PureCN SM:LIB-02240e4 28 | CONTIG POSITION REF_COUNT ALT_COUNT REF_NUCLEOTIDE ALT_NUCLEOTIDE 29 | -------------------------------------------------------------------------------- /tests/testthat/test_callAlterations.R: -------------------------------------------------------------------------------- 1 | context("callAlterations") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | calls <- callAlterations(purecn.example.output) 6 | expect_true(sum(calls$C < 6 & calls$C > 0.5) == 0) 7 | calls <- callAlterations(purecn.example.output, failed = TRUE) 8 | expect_true(sum(calls$gene.mean < 0.9 & calls$gene.mean > 9 | -0.9) == 0) 10 | esr2 <- callAlterations(purecn.example.output, all.genes = 
TRUE)["ESR2", ] 11 | expect_equal(as.character(esr2$chr), "chr14") 12 | expect_true(esr2$start > 64694600) 13 | expect_true(esr2$end < 64761128) 14 | }) 15 | 16 | test_that("issue_292 is fixed", { 17 | data(purecn.example.output) 18 | calls <- callAlterations(purecn.example.output, id = 2, all.genes = TRUE) 19 | expect_true(abs(mean(calls$C) - purecn.example.output$results[[2]]$ploidy) < 0.5) 20 | }) 21 | -------------------------------------------------------------------------------- /tests/testthat/test_bootstrapResults.R: -------------------------------------------------------------------------------- 1 | context("bootstrapResults") 2 | 3 | test_that("Bootstrapping removed solutions", { 4 | data(purecn.example.output) 5 | set.seed(123) 6 | ret <- bootstrapResults(purecn.example.output, n = 100, top = 2) 7 | expect_equal(ret$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 8 | expect_equal(ret$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 9 | expect_true(length(ret$results) < length(purecn.example.output$results)) 10 | expect_true(ret$results[[1]]$bootstrap.value >= 0.5) 11 | expect_true(ret$results[[2]]$bootstrap.value < 0.5) 12 | expect_true(length(ret$results) >= 2) 13 | ret <- bootstrapResults(purecn.example.output, n = 100, top = 3) 14 | expect_true(length(ret$results) >= 3) 15 | ret <- bootstrapResults(purecn.example.output, n = 100) 16 | expect_equal(length(purecn.example.output$results), length(ret$results)) 17 | }) 18 | 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue report 3 | about: For issues related to your PureCN output, please use this template if possible. 4 | Otherwise start with a blank issue. 5 | title: '' 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | **Describe the issue** 12 | A clear and concise description of what the issue is. 13 | 14 | **To Reproduce** 15 | Copy and paste your complete command line arguments from PureCN.R. If possible and potentially relevant, also copy the output of NormalDB.R and Coverage.R. 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | 20 | **Log file** 21 | Please copy and paste the log file (Sampleid.log) of a representative example 22 | 23 | **B-allele frequency plot** 24 | Please take a screenshot of the B-allele frequency plot of the maximum likelihood solution 25 | (Sampleid.pdf). 26 | 27 | **Session Info** 28 | Please start R, type sessionInfo() and paste the output. 29 | -------------------------------------------------------------------------------- /man/PureCN-defunct.Rd: -------------------------------------------------------------------------------- 1 | \name{PureCN-defunct} 2 | \alias{PureCN-defunct} 3 | \title{Defunct functions in package \sQuote{PureCN}} 4 | 5 | \description{ 6 | These functions are defunct and no longer available. 
7 | }
8 | 
9 | \details{
10 | The following functions are defunct; use
11 | the replacement indicated below:
12 | \itemize{
13 | \item{autoCurateResults: no replacement}
14 | \item{calculateGCContentByInterval: \code{\link{preprocessIntervals}}}
15 | \item{calculateIntervalWeights: \code{\link{createNormalDatabase}}}
16 | \item{createExonWeightFile: \code{\link{createNormalDatabase}}}
17 | \item{createSNPBlacklist: \code{\link{setMappingBiasVcf}}}
18 | \item{createTargetWeights: \code{\link{createNormalDatabase}}}
19 | \item{filterTargets: \code{\link{filterIntervals}}}
20 | \item{findBestNormal: \code{\link{calculateTangentNormal}}}
21 | \item{getDiploid: no replacement}
22 | \item{plotBestNormal: no replacement}
23 | \item{readCoverageGatk: \code{\link{readCoverageFile}}}
24 | }
25 | }
--------------------------------------------------------------------------------
/man/readAllelicCountsFile.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readAllelicCountsFile.R
3 | \name{readAllelicCountsFile}
4 | \alias{readAllelicCountsFile}
5 | \title{Read allelic counts file}
6 | \usage{
7 | readAllelicCountsFile(file, format, zero = NULL)
8 | }
9 | \arguments{
10 | \item{file}{Input file containing counts of ref and alt alleles}
11 | 
12 | \item{format}{File format. If missing, derived from the file
13 | extension. Currently only GATK4 CollectAllelicCounts (tsv)
14 | format supported.}
15 | 
16 | \item{zero}{Start position is 0-based. Default is \code{FALSE}
17 | for GATK, \code{TRUE} for BED file based intervals.}
18 | }
19 | \value{
20 | A \code{CollapsedVCF} with the parsed allelic counts.
21 | }
22 | \description{
23 | Read file containing counts of ref and alt alleles of common
24 | SNPs, generated by tools such as The Genome Analysis Toolkit 4.
25 | }
26 | \examples{
27 | 
28 | ac.file <- system.file("extdata", "example_allelic_counts.tsv",
29 |     package="PureCN")
30 | vcf_ac <- readAllelicCountsFile(ac.file)
31 | 
32 | }
33 | \author{
34 | Markus Riester
35 | }
--------------------------------------------------------------------------------
/man/callLOH.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/callLOH.R
3 | \name{callLOH}
4 | \alias{callLOH}
5 | \title{Get regions of LOH}
6 | \usage{
7 | callLOH(res, id = 1, arm.cutoff = 0.9, keep.no.snp.segments = TRUE)
8 | }
9 | \arguments{
10 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.}
11 | 
12 | \item{id}{Candidate solution to extract LOH from. \code{id=1} will use the
13 | maximum likelihood solution.}
14 | 
15 | \item{arm.cutoff}{Min fraction LOH on a chromosome arm to call whole arm
16 | events.}
17 | 
18 | \item{keep.no.snp.segments}{Segments without heterozygous SNPs
19 | have no LOH information. This defines whether these segments should
20 | be reported anyways.}
21 | }
22 | \value{
23 | Returns \code{data.frame} with LOH regions.
24 | }
25 | \description{
26 | This function provides detailed LOH information by region.
27 | } 28 | \examples{ 29 | 30 | data(purecn.example.output) 31 | head(callLOH(purecn.example.output)) 32 | 33 | } 34 | \seealso{ 35 | \code{\link{runAbsoluteCN}} 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /tests/testthat/test_poolCoverage.R: -------------------------------------------------------------------------------- 1 | context("poolCoverage") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 4 | package = "PureCN") 5 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 6 | package = "PureCN") 7 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 8 | 9 | test_that("Example coverage is averaged", { 10 | coverage <- lapply(normal.coverage.files, readCoverageFile) 11 | pool <- poolCoverage(coverage) 12 | expect_equal(coverage[[1]]$average.coverage + coverage[[2]]$average.coverage, 13 | pool$average.coverage) 14 | expect_equal(coverage[[1]]$coverage + coverage[[2]]$coverage, 15 | pool$coverage) 16 | pool2 <- poolCoverage(coverage, w = c(0.5, 0.5)) 17 | expect_equal((coverage[[1]]$coverage + coverage[[2]]$coverage) / 2, 18 | pool2$coverage) 19 | }) 20 | 21 | test_that("Exceptions happend with wrong input", { 22 | coverage <- lapply(normal.coverage.files, readCoverageFile) 23 | expect_error(poolCoverage(coverage, w = seq(3)), "different lengths") 24 | }) 25 | -------------------------------------------------------------------------------- /inst/extdata/dist/downloadCentromeres.R: -------------------------------------------------------------------------------- 1 | library(rtracklayer) 2 | library(data.table) 3 | library(PureCN) 4 | 5 | data(chr.hash) 6 | mySession <- browserSession("UCSC") 7 | genomes <- c("hg18", "hg19", "hg38") 8 | centromeres <- list() 9 | 10 | for (genome in genomes) { 11 | genome(mySession) <- genome 12 | if (genome == "hg38") { 13 | tbl.gaps <- getTable( ucscTableQuery(mySession,track="Centromeres", 14 | table="centromeres")) 15 | } else { 16 | tbl.gaps <- getTable( ucscTableQuery(mySession, track="Gap", 17 | table="gap")) 18 | tbl.gaps <- tbl.gaps[tbl.gaps$type=="centromere",] 19 | } 20 | tbl.gaps.dt <- data.table(tbl.gaps) 21 | tbl.centromeres <- as.data.frame(tbl.gaps.dt[, 22 | list(chromStart=min(chromStart),chromEnd=max(chromEnd)),by=chrom]) 23 | centromeres[[genome]] <- tbl.centromeres 24 | } 25 | 26 | centromeres <- lapply(centromeres, function(x) { 27 | x$chromNumerical <- chr.hash$number[match(x$chrom, chr.hash$chr)] 28 | x[order(x$chromNumerical),1:3] 29 | }) 30 | 31 | save(centromeres, file="data/centromeres.rda", compress="xz") 32 | -------------------------------------------------------------------------------- /tests/testthat/test_calculateLogRatio.R: -------------------------------------------------------------------------------- 1 | context("calculateLogRatio") 2 | 3 | test_that("Misaligned on- and off-target regions are aligned", { 4 | x <- readCoverageFile( 5 | system.file("extdata", "example_intervals_tiny_ot.txt.gz", 6 | package = "PureCN")) 7 | set.seed(123) 8 | l1 <- rnorm(length(x), mean = 0.25, sd=0.3) 9 | l2 <- rnorm(length(x), mean = -0.25, sd=0.3) 10 | x$log.ratio <- l1 11 | x$log.ratio[x$on.target] <- l2[x$on.target] 12 | expect_lt(t.test( x$log.ratio[x$on.target], x$log.ratio[!x$on.target])$p.value, 0.001) 13 | 14 | xc <- x 15 | xc$log.ratio <- PureCN:::.calibrate_off_target_log_ratio(x) 16 | expect_gt(t.test( xc$log.ratio[x$on.target], xc$log.ratio[!x$on.target])$p.value, 
0.001) 17 | 18 | x$log.ratio <- l2 19 | x$log.ratio[x$on.target] <- l1[x$on.target] 20 | expect_lt(t.test( x$log.ratio[x$on.target], x$log.ratio[!x$on.target])$p.value, 0.001) 21 | 22 | xc <- x 23 | xc$log.ratio <- PureCN:::.calibrate_off_target_log_ratio(x) 24 | expect_gt(t.test( xc$log.ratio[x$on.target], xc$log.ratio[!x$on.target])$p.value, 0.001) 25 | 26 | }) 27 | 28 | -------------------------------------------------------------------------------- /man/centromeres.Rd: -------------------------------------------------------------------------------- 1 | \name{centromeres} 2 | \alias{centromeres} 3 | \docType{data} 4 | \title{ 5 | A list of data.frames containing centromere positions. 6 | } 7 | \description{ 8 | A list of data.frames containing centromere positions for hg18, hg19 and hg38. 9 | Downloaded from the UCSC genome browser. 10 | } 11 | \usage{data(centromeres)} 12 | \value{ 13 | A list with three data frames, "hg18", "hg19", and "hg38". Each containes 14 | three columns 15 | \describe{ 16 | \item{\code{chrom}}{a factor with levels \code{chr1} \code{chr10} \code{chr11} \code{chr12} \code{chr13} \code{chr14} \code{chr15} \code{chr16} \code{chr17} \code{chr18} \code{chr19} \code{chr2} \code{chr20} \code{chr21} \code{chr22} \code{chr3} \code{chr4} \code{chr5} \code{chr6} \code{chr7} \code{chr8} \code{chr9} \code{chrX} \code{chrY}} 17 | \item{\code{chromStart}}{a numeric vector} 18 | \item{\code{chromEnd}}{a numeric vector} 19 | } 20 | } 21 | \references{ 22 | The script downloadCentromeres.R in the extdata directory was used to generate 23 | the data.frames. 24 | } 25 | \examples{ 26 | data(centromeres) 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /inst/extdata/issue192_tumor.cnr: -------------------------------------------------------------------------------- 1 | chromosome start end gene depth log2 weight 2 | 6 105929 106231 OR4F1P 73.2318 -0.0897448 0.739157 3 | 6 106533 106835 OR4F1P 88.4636 -0.137046 0.866497 4 | 6 203388 203765 AL035696.3,AL035696.1 104.095 -0.154834 0.880158 5 | 6 304510 304686 DUSP22 261.83 0.421809 0.735374 6 | 6 335056 335176 DUSP22 50.45 -0.122885 0.93186 7 | 6 335179 335299 DUSP22 82.7833 -0.0522508 0.789679 8 | 6 345832 346041 DUSP22 219.225 0.65042 0.724212 9 | 6 348419 348700 DUSP22 44.8505 -1.18952 0.479454 10 | 6 348735 348903 DUSP22 166.81 0.258922 0.933433 11 | 6 350872 351180 DUSP22 63.7468 -1.33098 0.157633 12 | 6 367261 367484 - 172.04 0.386116 0.805822 13 | 6 391692 391881 IRF4 54.0159 0.217291 0.162526 14 | 6 394719 394871 IRF4 113.289 0.195861 0.854984 15 | 6 394874 394994 IRF4 150.883 0.171075 0.948817 16 | 6 395754 395940 IRF4 74.6882 -0.281235 0.107269 17 | 6 398733 398924 IRF4 96.1047 -0.268284 0.811244 18 | 6 401490 401848 IRF4 253.478 0.580141 0.342134 19 | 6 404920 405216 IRF4 141.395 0.115385 0.954922 20 | 6 406554 406753 IRF4 67.0905 -0.166957 0.563356 21 | 6 406755 406875 IRF4 36 -0.465117 0.905217 22 | 6 407459 407580 IRF4 90.6446 0.0794558 0.902095 23 | -------------------------------------------------------------------------------- /man/readIntervalFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readIntervalFile.R 3 | \name{readIntervalFile} 4 | \alias{readIntervalFile} 5 | \title{Read interval file} 6 | \usage{ 7 | readIntervalFile(interval.file, strict = TRUE, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{interval.file}{A 
mapping file that assigns GC content and gene symbols
11 | to each exon in the coverage files. Used for generating gene-level calls.
12 | First column in format CHR:START-END. Second column GC content (0 to 1).
13 | Third column gene symbol. This file is generated with the
14 | \code{\link{preprocessIntervals}} function.}
15 | 
16 | \item{strict}{Error out with missing columns}
17 | 
18 | \item{verbose}{Verbose output}
19 | }
20 | \value{
21 | A \code{GRanges} object with the parsed intervals.
22 | }
23 | \description{
24 | Read file containing coordinates of on- and off-target intervals
25 | generated by \code{\link{preprocessIntervals}}.
26 | }
27 | \examples{
28 | 
29 | interval.file <- system.file("extdata", "example_intervals.txt",
30 |     package = "PureCN")
31 | x <- readIntervalFile(interval.file)
32 | 
33 | }
34 | \author{
35 | Markus Riester
36 | }
--------------------------------------------------------------------------------
/man/readLogRatioFile.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readLogRatioFile.R
3 | \name{readLogRatioFile}
4 | \alias{readLogRatioFile}
5 | \title{Read file containing interval-level log2 tumor/normal ratios}
6 | \usage{
7 | readLogRatioFile(file, format, zero = NULL)
8 | }
9 | \arguments{
10 | \item{file}{Log2 coverage file.}
11 | 
12 | \item{format}{File format. If missing, derived from the file
13 | extension. Currently GATK4 DenoiseReadCounts format supported.
14 | A simple GATK3-style format, two columns with coordinates
15 | as string in format chr:start-stop in first and log2-ratio
16 | in second is also supported.}
17 | 
18 | \item{zero}{Start position is 0-based. Default is \code{FALSE}
19 | for GATK, \code{TRUE} for BED file based intervals.}
20 | }
21 | \value{
22 | A \code{GRanges} with the log2 ratio.
23 | }
24 | \description{
25 | Read log2 ratio file produced by external tools like The Genome Analysis
26 | Toolkit version 4.
27 | }
28 | \examples{
29 | 
30 | logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz",
31 |     package = "PureCN")
32 | logratio <- readLogRatioFile(logratio.file)
33 | 
34 | }
35 | \author{
36 | Markus Riester
37 | }
--------------------------------------------------------------------------------
/man/annotateTargets.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/annotateTargets.R
3 | \name{annotateTargets}
4 | \alias{annotateTargets}
5 | \title{Annotate targets with gene symbols}
6 | \usage{
7 | annotateTargets(x, txdb, org)
8 | }
9 | \arguments{
10 | \item{x}{A \code{GRanges} object with intervals to annotate}
11 | 
12 | \item{txdb}{A \code{TxDb} database, e.g.
13 | \code{TxDb.Hsapiens.UCSC.hg19.knownGene}}
14 | 
15 | \item{org}{An \code{OrgDb} object, e.g. \code{org.Hs.eg.db}.}
16 | }
17 | \value{
18 | A \code{GRanges} object.
19 | }
20 | \description{
21 | This function can be used to add a \sQuote{Gene} meta column containing
22 | gene symbols to a \code{GRanges} object.
23 | It applies heuristics to find the protein coding genes that were
24 | likely meant to be targeted in the assay design in case transcripts
25 | overlap.
26 | } 27 | \examples{ 28 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 29 | library(org.Hs.eg.db) 30 | 31 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 32 | package = "PureCN") 33 | x <- head(readCoverageFile(normal.coverage.file), 100) 34 | x <- annotateTargets(x,TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db) 35 | 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /man/createCurationFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createCurationFile.R 3 | \name{createCurationFile} 4 | \alias{createCurationFile} 5 | \title{Create file to curate PureCN results} 6 | \usage{ 7 | createCurationFile( 8 | file.rds, 9 | overwrite.uncurated = TRUE, 10 | overwrite.curated = FALSE 11 | ) 12 | } 13 | \arguments{ 14 | \item{file.rds}{Output of the \code{\link{runAbsoluteCN}} function, 15 | serialized with \code{saveRDS}.} 16 | 17 | \item{overwrite.uncurated}{Overwrite existing files unless flagged as 18 | \sQuote{Curated}.} 19 | 20 | \item{overwrite.curated}{Overwrite existing files even if flagged as 21 | \sQuote{Curated}.} 22 | } 23 | \value{ 24 | A \code{data.frame} with the tumor purity and ploidy of the maximum 25 | likelihood solution. 26 | } 27 | \description{ 28 | Function to create a CSV file that can be used to mark the correct solution 29 | in the output of a \code{\link{runAbsoluteCN}} run. 30 | } 31 | \examples{ 32 | 33 | data(purecn.example.output) 34 | file.rds <- "Sample1_PureCN.rds" 35 | saveRDS(purecn.example.output, file = file.rds) 36 | createCurationFile(file.rds) 37 | 38 | } 39 | \seealso{ 40 | \code{\link{runAbsoluteCN}} 41 | } 42 | \author{ 43 | Markus Riester 44 | } 45 | -------------------------------------------------------------------------------- /man/poolCoverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/poolCoverage.R 3 | \name{poolCoverage} 4 | \alias{poolCoverage} 5 | \title{Pool coverage from multiple samples} 6 | \usage{ 7 | poolCoverage(all.data, remove.chrs = c(), w = NULL) 8 | } 9 | \arguments{ 10 | \item{all.data}{List of normals, read with \code{\link{readCoverageFile}}.} 11 | 12 | \item{remove.chrs}{Remove these chromosomes from the pool.} 13 | 14 | \item{w}{\code{numeric(length(all.data))} vector of weights. If \code{NULL}, 15 | weight all samples equally.} 16 | } 17 | \value{ 18 | A \code{data.frame} with the averaged coverage over all normals. 19 | } 20 | \description{ 21 | Averages the coverage of a list of samples. 
22 | } 23 | \examples{ 24 | 25 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 26 | package = "PureCN") 27 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 28 | package = "PureCN") 29 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 30 | pool <- poolCoverage(lapply(normal.coverage.files, readCoverageFile), 31 | remove.chrs = c("chrX", "chrY")) 32 | 33 | } 34 | \seealso{ 35 | \code{\link{readCoverageFile}} 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /tests/testthat/test_readSegmentationFile.R: -------------------------------------------------------------------------------- 1 | context("readSegmentationFile") 2 | 3 | test_that("Example DNAcopy data matches", { 4 | seg.file <- system.file("extdata", "example_seg.txt", 5 | package = "PureCN") 6 | seg <- readSegmentationFile(seg.file, "Sample1") 7 | offset <- -0.0033 8 | expect_equal(54, nrow(seg)) 9 | expect_equal(0.133381833060556 - offset, seg$seg.mean[1], tolerance = .0001) 10 | expect_equal(-0.6394 - offset, seg$seg.mean[54], tolerance = .0001) 11 | }) 12 | 13 | test_that("Example GATK4 data matches", { 14 | seg.file <- system.file("extdata", "example_gatk4_modelfinal.seg.gz", 15 | package = "PureCN") 16 | seg <- readSegmentationFile(seg.file, "Sample1") 17 | offset <- -0.0037 18 | expect_equal(23, nrow(seg)) 19 | expect_equal(-0.004295 - offset, seg$seg.mean[1], tolerance = .0001) 20 | expect_equal(0.002534 - offset, seg$seg.mean[23], tolerance = .0001) 21 | }) 22 | 23 | test_that("Missing values raise warning", { 24 | seg.file <- system.file("extdata", "buggy_cnvkit.seg.gz", 25 | package = "PureCN") 26 | expect_output(readSegmentationFile(seg.file, "SC_9030.tumour.recalibrated"), 27 | "Coordinates in seg.file contain missing values") 28 | }) 29 | 30 | -------------------------------------------------------------------------------- /tests/testthat/test_calculatePowerDetectSomatic.R: -------------------------------------------------------------------------------- 1 | context("calculatePowerDetectSomatic") 2 | 3 | test_that("Power is calculated correctly for examples", { 4 | p1 <- calculatePowerDetectSomatic(coverage = 5, purity = 1, 5 | ploidy = 2)$power 6 | p2 <- calculatePowerDetectSomatic(coverage = 5, f = 0.5)$power 7 | expect_equal(p1, 0.6407084, tolerance=0.0001) 8 | expect_equal(p2, 0.6407084, tolerance=0.0001) 9 | p3 <- calculatePowerDetectSomatic(coverage = 33, purity = 0.5, 10 | ploidy = 6)$power 11 | expect_equal(p3, 0.8, tolerance=0.001) 12 | p4 <- calculatePowerDetectSomatic(coverage = 330, purity = 0.2, 13 | ploidy = 2, cell.fraction = 0.2)$power 14 | expect_equal(p4, 0.8, tolerance=0.001) 15 | }) 16 | 17 | test_that("Exceptions happen with wrong input", { 18 | expect_error(calculatePowerDetectSomatic(coverage = 5)) 19 | expect_error(calculatePowerDetectSomatic(coverage = 5, f = 1.1)) 20 | expect_error(calculatePowerDetectSomatic(coverage = 1, f = 0.9)) 21 | expect_error(calculatePowerDetectSomatic(coverage = 3, purity = 1.1, 22 | ploidy = 2)) 23 | expect_error(calculatePowerDetectSomatic(coverage = 3, purity = 1, 24 | ploidy = -1)) 25 | expect_error(calculatePowerDetectSomatic(coverage = 5, cell.fraction = 1.1)) 26 | }) 27 | -------------------------------------------------------------------------------- /man/calculateLogRatio.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 
| % Please edit documentation in R/calculateLogRatio.R 3 | \name{calculateLogRatio} 4 | \alias{calculateLogRatio} 5 | \title{Calculate coverage log-ratio of tumor vs. normal} 6 | \usage{ 7 | calculateLogRatio(normal, tumor) 8 | } 9 | \arguments{ 10 | \item{normal}{Normal coverage read in by the \code{\link{readCoverageFile}} 11 | function.} 12 | 13 | \item{tumor}{Tumor coverage read in by the \code{\link{readCoverageFile}} 14 | function.} 15 | } 16 | \value{ 17 | \code{numeric(length(tumor))}, tumor vs. normal copy number log-ratios 18 | for all targets. 19 | } 20 | \description{ 21 | This function is automatically called by \code{\link{runAbsoluteCN}} when 22 | normal and tumor coverage are provided (and not a segmentation file or 23 | target-level log-ratios). This function is therefore normally not called by 24 | the user. 25 | } 26 | \examples{ 27 | 28 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 29 | package = "PureCN") 30 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 31 | package = "PureCN") 32 | normal <- readCoverageFile(normal.coverage.file) 33 | tumor <- readCoverageFile(tumor.coverage.file) 34 | log.ratio <- calculateLogRatio(normal, tumor) 35 | 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /man/readSegmentationFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readSegmentationFile.R 3 | \name{readSegmentationFile} 4 | \alias{readSegmentationFile} 5 | \title{Read file containing segmentations} 6 | \usage{ 7 | readSegmentationFile( 8 | seg.file, 9 | sampleid, 10 | model.homozygous = FALSE, 11 | format, 12 | zero = FALSE, 13 | verbose = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{seg.file}{File with segmentation} 18 | 19 | \item{sampleid}{Sampleid, for segmentation files containing multiple samples} 20 | 21 | \item{model.homozygous}{Unless \code{TRUE}, checks for very small log2-ratios 22 | that cannot happen in samples with normal contamination} 23 | 24 | \item{format}{File format. If missing, derived from the file 25 | extension. Currently DNAcopy, and GATK4 26 | (ModelSegments) format supported. CNVkit uses DNAcopy format.} 27 | 28 | \item{zero}{Start position is 0-based. Default is \code{FALSE}.} 29 | 30 | \item{verbose}{Verbose output.} 31 | } 32 | \value{ 33 | A \code{data.frame}. 34 | } 35 | \description{ 36 | Read segmentation files produced by DNAcopy, CNVkit or GATK4. 37 | } 38 | \examples{ 39 | 40 | seg.file <- system.file("extdata", "example_seg.txt", 41 | package = "PureCN") 42 | seg <- readSegmentationFile(seg.file, "Sample1") 43 | 44 | } 45 | \author{ 46 | Markus Riester 47 | } 48 | -------------------------------------------------------------------------------- /man/readCoverageFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readCoverageFile.R 3 | \name{readCoverageFile} 4 | \alias{readCoverageFile} 5 | \title{Read coverage file} 6 | \usage{ 7 | readCoverageFile(file, format, zero = NULL, read.length = 100) 8 | } 9 | \arguments{ 10 | \item{file}{Target coverage file.} 11 | 12 | \item{format}{File format. If missing, derived from the file 13 | extension. 
Currently GATK3 DepthofCoverage, GATK4 CollectFragmentCounts 14 | (hdf5), and CNVkit formats supported.} 15 | 16 | \item{zero}{Start position is 0-based. Default is \code{FALSE} 17 | for GATK, \code{TRUE} for BED file based intervals.} 18 | 19 | \item{read.length}{For output formats which do not provide both counts 20 | and total coverages, approximate them using the specified read length.} 21 | } 22 | \value{ 23 | A \code{data.frame} with the parsed coverage information. 24 | } 25 | \description{ 26 | Read coverage file produced by external tools like The Genome Analysis 27 | Toolkit or by \code{\link{calculateBamCoverageByInterval}}. 28 | } 29 | \examples{ 30 | 31 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 32 | package = "PureCN") 33 | coverage <- readCoverageFile(tumor.coverage.file) 34 | 35 | } 36 | \seealso{ 37 | \code{\link{calculateBamCoverageByInterval}} 38 | } 39 | \author{ 40 | Markus Riester 41 | } 42 | -------------------------------------------------------------------------------- /man/bootstrapResults.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bootstrapResults.R 3 | \name{bootstrapResults} 4 | \alias{bootstrapResults} 5 | \title{Bootstrapping variant fits} 6 | \usage{ 7 | bootstrapResults(res, n = 500, top = NULL, reorder = FALSE) 8 | } 9 | \arguments{ 10 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 11 | 12 | \item{n}{Number of bootstrap replicates.} 13 | 14 | \item{top}{Include solution if it appears in the top \code{n} solutions of 15 | any bootstrap replicate. If \code{NULL}, do not filter solutions.} 16 | 17 | \item{reorder}{Reorder results by bootstrap value.} 18 | } 19 | \value{ 20 | Returns a \code{\link{runAbsoluteCN}} object with added bootstrap 21 | value to each solution. This value 22 | is the fraction of bootstrap replicates in which the solution ranked first. 23 | } 24 | \description{ 25 | This function bootstraps variants, then optionally re-ranks solutions by 26 | using the bootstrap estimate of the likelihood score, and then optionally 27 | removes solutions that never ranked high in any bootstrap replicate. 28 | } 29 | \examples{ 30 | 31 | data(purecn.example.output) 32 | ret.boot <- bootstrapResults(purecn.example.output, n=100) 33 | plotAbs(ret.boot, type="overview") 34 | 35 | } 36 | \seealso{ 37 | \code{\link{runAbsoluteCN}} 38 | } 39 | \author{ 40 | Markus Riester 41 | } 42 | -------------------------------------------------------------------------------- /man/callCIN.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callCIN.R 3 | \name{callCIN} 4 | \alias{callCIN} 5 | \title{Call Chromosomal Instability} 6 | \usage{ 7 | callCIN( 8 | res, 9 | id = 1, 10 | allele.specific = TRUE, 11 | reference.state = c("dominant", "normal") 12 | ) 13 | } 14 | \arguments{ 15 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 16 | 17 | \item{id}{Candidate solution to extract CIN from. \code{id=1} will use the 18 | maximum likelihood solution.} 19 | 20 | \item{allele.specific}{Use allele-specific or only total copy number for 21 | detecting abnormal regions. 
Copy-number neutral LOH would be ignored when 22 | this parameter is set to \code{FALSE}.} 23 | 24 | \item{reference.state}{Copy number regions different from the reference 25 | state are counted as abnormal. Default is \code{dominant} means the most 26 | common state. The other option is \code{normal}, which defines normal 27 | heterozygous, diploid as reference. The default is robust to errors in 28 | ploidy.} 29 | } 30 | \value{ 31 | Returns \code{double(1)} with CIN value. 32 | } 33 | \description{ 34 | This function provides detailed CIN information. 35 | } 36 | \examples{ 37 | 38 | data(purecn.example.output) 39 | head(callCIN(purecn.example.output)) 40 | 41 | } 42 | \seealso{ 43 | \code{\link{runAbsoluteCN}} 44 | } 45 | \author{ 46 | Markus Riester 47 | } 48 | -------------------------------------------------------------------------------- /tests/testthat/test_readLogRatioFile.R: -------------------------------------------------------------------------------- 1 | context("readLogRatioFile") 2 | data(purecn.example.output) 3 | 4 | test_that("Example data matches", { 5 | logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz", 6 | package = "PureCN") 7 | logratio <- readLogRatioFile(logratio.file) 8 | expect_equal(21, length(logratio)) 9 | expect_equal(0.109473, logratio$log.ratio[1], tolerance = .00001) 10 | expect_equal(-0.185664, logratio$log.ratio[21], tolerance = .00001) 11 | expect_equivalent(seqlengths(logratio), c(248956422, 242193529, 156040895)) 12 | logratio.file2 <- system.file("extdata", "example_logratio.txt.gz", 13 | package = "PureCN") 14 | logratio2 <- readLogRatioFile(logratio.file2) 15 | expect_equal(as.character(logratio), as.character(logratio2)) 16 | expect_equal(logratio$log.ratio, logratio2$log.ratio) 17 | }) 18 | 19 | test_that("parsing -> writing -> parsing works", { 20 | x <- purecn.example.output$input 21 | y <- x 22 | y$log.ratio$log.ratio <- NULL 23 | output.file <- tempfile(fileext = ".tsv") 24 | expect_error( 25 | PureCN:::.writeLogRatioFileGATK4(y, 1, output.file), 26 | "log.ratio NULL" 27 | ) 28 | PureCN:::.writeLogRatioFileGATK4(x, 1, output.file) 29 | z <- readLogRatioFile(output.file) 30 | expect_equivalent(x$log.ratio$log.ratio, z$log.ratio) 31 | file.remove(output.file) 32 | }) 33 | 34 | -------------------------------------------------------------------------------- /tests/testthat/test_callMutationBurden.R: -------------------------------------------------------------------------------- 1 | context("callMutationBurden") 2 | 3 | data(purecn.example.output) 4 | callableBed <- import(system.file("extdata", "example_callable.bed.gz", 5 | package = "PureCN")) 6 | 7 | test_that("Example is called correctly", { 8 | calls <- callMutationBurden(purecn.example.output) 9 | expect_false(is.na(calls$callable.bases.ontarget)) 10 | expect_true(calls$callable.bases.ontarget > 0) 11 | exclude <- GRanges(seqnames = "chr1", IRanges(start = 1, 12 | end = max(end(callableBed)))) 13 | myVcfFilter <- function(vcf) seqnames(vcf) != "chr2" 14 | callsCallable <- callMutationBurden(purecn.example.output, 15 | callable = callableBed, exclude = exclude, fun.countMutation = myVcfFilter) 16 | expect_true(callsCallable$callable.bases.ontarget > 0) 17 | expect_true(callsCallable$callable.bases.flanking > callsCallable$callable.bases.ontarget) 18 | expect_true(callsCallable$callable.bases.all > callsCallable$callable.bases.flanking) 19 | }) 20 | 21 | test_that("Exceptions happen with wrong input", { 22 | expect_error(callMutationBurden(purecn.example.output, 
callable = callableBed, 23 | exclude = exclude, fun.countMutation = "helloworld")) 24 | expect_error(callMutationBurden(purecn.example.output, callable = callableBed, 25 | exclude = "helloworld")) 26 | expect_error(callMutationBurden(purecn.example.output, callable = "helloworld")) 27 | }) 28 | -------------------------------------------------------------------------------- /inst/extdata/example_allelic_counts.tsv: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 2 | @SQ SN:chr1 LN:249250621 3 | @SQ SN:chr2 LN:243199373 4 | @SQ SN:chr3 LN:198022430 5 | @SQ SN:chr4 LN:191154276 6 | @SQ SN:chr5 LN:180915260 7 | @SQ SN:chr6 LN:171115067 8 | @SQ SN:chr7 LN:159138663 9 | @SQ SN:chr8 LN:146364022 10 | @SQ SN:chr9 LN:141213431 11 | @SQ SN:chr10 LN:135534747 12 | @SQ SN:chr11 LN:135006516 13 | @SQ SN:chr12 LN:133851895 14 | @SQ SN:chr13 LN:115169878 15 | @SQ SN:chr14 LN:107349540 16 | @SQ SN:chr15 LN:102531392 17 | @SQ SN:chr16 LN:90354753 18 | @SQ SN:chr17 LN:81195210 19 | @SQ SN:chr18 LN:78077248 20 | @SQ SN:chr19 LN:59128983 21 | @SQ SN:chr20 LN:63025520 22 | @SQ SN:chr21 LN:48129895 23 | @SQ SN:chr22 LN:51304566 24 | @SQ SN:chrX LN:155270560 25 | @SQ SN:chrY LN:59373566 26 | @SQ SN:chrM LN:16571 27 | @RG ID:PureCN SM:LIB-02240e4 28 | CONTIG POSITION REF_COUNT ALT_COUNT REF_NUCLEOTIDE ALT_NUCLEOTIDE 29 | chr1 114515871 177 189 G A 30 | chr1 150044293 119 157 T G 31 | chr1 158449835 209 222 A G 32 | chr1 158450154 401 294 G A 33 | chr1 158450311 323 262 C T 34 | chr1 158450374 351 269 A G 35 | chr1 160062206 76 63 C T 36 | chr1 177902370 97 60 C A 37 | chr1 200967559 115 137 C G 38 | chr1 247419414 138 205 T C 39 | chr1 247419499 242 320 C T 40 | chr1 248085080 35 32 C T 41 | chr1 248085104 157 116 G A 42 | chr2 10262881 121 136 C T 43 | chr2 10263895 118 96 C G 44 | chr2 69472504 121 70 C T 45 | chr2 138413092 35 79 G A 46 | chr2 138434106 56 134 A G 47 | chr2 185798411 34 67 G A 48 | chr2 188361624 50 107 A G 49 | -------------------------------------------------------------------------------- /man/findHighQualitySNPs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateMappingBiasVcf.R 3 | \name{findHighQualitySNPs} 4 | \alias{findHighQualitySNPs} 5 | \title{Find High Quality SNPs} 6 | \usage{ 7 | findHighQualitySNPs( 8 | mapping.bias.file, 9 | max.bias = 0.2, 10 | min.pon = 2, 11 | triallelic = FALSE, 12 | vcf.file = NULL, 13 | genome 14 | ) 15 | } 16 | \arguments{ 17 | \item{mapping.bias.file}{Generated by \code{\link{calculateMappingBiasVcf}}.} 18 | 19 | \item{max.bias}{Maximum mapping bias} 20 | 21 | \item{min.pon}{Minimum number of normal samples, useful to get reliable 22 | mapping bias.} 23 | 24 | \item{triallelic}{By default, ignore positions with multiple alt alleles.} 25 | 26 | \item{vcf.file}{Optional VCF file (for example dbSNP). Needs to be 27 | bgzip and tabix processed.} 28 | 29 | \item{genome}{See \code{readVcf}} 30 | } 31 | \value{ 32 | A \code{GRanges} object with mapping bias passing filters. 33 | If \code{vcf.file} is provided, it will be the variants in the 34 | corresponding file overlapping with the passed variants. 35 | } 36 | \description{ 37 | Function to extract high quality SNPs from the mapping bias database. 38 | Useful for generating fingerprinting panels etc. 
39 | } 40 | \examples{ 41 | 42 | normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", 43 | package = "PureCN") 44 | bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "hg19") 45 | 46 | } 47 | \author{ 48 | Markus Riester 49 | } 50 | -------------------------------------------------------------------------------- /tests/testthat/test_predictSomatic.R: -------------------------------------------------------------------------------- 1 | context("predictSomatic") 2 | 3 | data(purecn.example.output) 4 | ret <- predictSomatic(purecn.example.output) 5 | 6 | test_that("Gene symbol annotation matches", { 7 | expect_equal(class(ret), "data.frame") 8 | expect_equal(nrow(ret), nrow(purecn.example.output$results[[1]]$SNV.posterior$posteriors)) 9 | esr2 <- ret[which(ret$gene.symbol == "ESR2"), ] 10 | expect_equal(as.character(esr2$chr), "chr14") 11 | expect_true(esr2$start > 64699747) 12 | expect_true(esr2$end < 64761128) 13 | }) 14 | 15 | test_that("VCF and data.frame provide equivalent results", { 16 | ret.vcf <- predictSomatic(purecn.example.output, return.vcf = TRUE) 17 | expect_equal(start(ret.vcf), ret$start) 18 | expect_equal(end(ret.vcf), ret$end) 19 | expect_equal(as.character(seqnames(ret.vcf)), as.character(ret$chr)) 20 | expect_equal(info(ret.vcf)$SM1, round(ret$SOMATIC.M1, digits = 4)) 21 | expect_equal(info(ret.vcf)$GM1, round(ret$GERMLINE.M1, digits = 4)) 22 | expect_equal(info(ret.vcf)$PS, round(ret$POSTERIOR.SOMATIC, 23 | digits = 4)) 24 | expect_equal(info(ret.vcf)$GS, ret$gene.symbol) 25 | }) 26 | 27 | test_that("Segments are flagged", { 28 | flagged <- lapply(split(ret$seg.id, ret$M.SEGMENT.FLAGGED), 29 | table) 30 | expect_true(min(flagged$`FALSE`) >= 5) 31 | expect_true(max(flagged$`TRUE`) < 5) 32 | expect_true(min(ret$M.SEGMENT.POSTERIOR) > 0.5) 33 | expect_equal(max(ret$M.SEGMENT.POSTERIOR), 1) 34 | }) 35 | -------------------------------------------------------------------------------- /man/adjustLogRatio.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adjustLogRatio.R 3 | \name{adjustLogRatio} 4 | \alias{adjustLogRatio} 5 | \title{Adjust tumor vs. normal coverage log ratio for tumor purity and ploidy} 6 | \usage{ 7 | adjustLogRatio(ratio, purity, ploidy, is.log2 = TRUE, min.ratio = 2^-8) 8 | } 9 | \arguments{ 10 | \item{ratio}{Vector of log2 tumor vs normal coverage ratios.} 11 | 12 | \item{purity}{Purity of sample.} 13 | 14 | \item{ploidy}{Ploidy of sample.} 15 | 16 | \item{is.log2}{\code{log.ratio} is \code{log2} transformed.} 17 | 18 | \item{min.ratio}{Minimum (non-log2-transformed) ratio. Set to approx -8 19 | \code{log2} adjusted.} 20 | } 21 | \value{ 22 | \code{numeric(length(log.ratio))}, \code{log.ratio} adjusted 23 | for \code{purity} and \code{ploidy} 24 | } 25 | \description{ 26 | This function can be used to adjust the log ratio for tumor purity and 27 | ploidy for downstream tools that expect a log2 ratio (for example GISTIC). 28 | } 29 | \examples{ 30 | 31 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 32 | package = "PureCN") 33 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 34 | package = "PureCN") 35 | normal <- readCoverageFile(normal.coverage.file) 36 | tumor <- readCoverageFile(tumor.coverage.file) 37 | log.ratio <- calculateLogRatio(normal, tumor) 38 | log.ratio.adjusted <- adjustLogRatio(log.ratio, 0.65, 1.73) 39 | 40 | } 41 | \references{ 42 | * Zack et al. (2012), Pan-cancer patterns of somatic copy number alteration, Nature Biotechnology.
43 | * Toal (2018), https://github.com/lima1/PureCN/issues/40 44 | } 45 | \author{ 46 | Markus Riester 47 | } 48 | -------------------------------------------------------------------------------- /man/callAlterations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callAlterations.R 3 | \name{callAlterations} 4 | \alias{callAlterations} 5 | \title{Calling of amplifications and deletions} 6 | \usage{ 7 | callAlterations( 8 | res, 9 | id = 1, 10 | cutoffs = c(0.5, 6, 7), 11 | log.ratio.cutoffs = c(-0.9, 0.9), 12 | failed = NULL, 13 | all.genes = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 18 | 19 | \item{id}{Candidate solutions to be used. \code{id=1} will use the maximum 20 | likelihood (or curated) solution.} 21 | 22 | \item{cutoffs}{Copy numbers cutoffs to call losses, focal amplifications and 23 | broad amplifications.} 24 | 25 | \item{log.ratio.cutoffs}{Copy numbers log-ratio cutoffs to call losses and 26 | amplifications in failed samples.} 27 | 28 | \item{failed}{Indicates whether sample was failed. If \code{NULL}, use 29 | available annotation, which can be set in the curation file.} 30 | 31 | \item{all.genes}{If \code{FALSE}, then only return amplifications and 32 | deletions passing the thresholds.} 33 | } 34 | \value{ 35 | A \code{data.frame} with gene-level amplification and deletion 36 | calls. 37 | } 38 | \description{ 39 | Function to extract major copy number alterations from a 40 | \code{\link{runAbsoluteCN}} return object. 41 | } 42 | \examples{ 43 | 44 | data(purecn.example.output) 45 | callAlterations(purecn.example.output) 46 | callAlterations(purecn.example.output, all.genes=TRUE)["ESR2",] 47 | 48 | } 49 | \seealso{ 50 | \code{\link{runAbsoluteCN}} 51 | } 52 | \author{ 53 | Markus Riester 54 | } 55 | -------------------------------------------------------------------------------- /tests/testthat/test_readAllelicCountsFile.R: -------------------------------------------------------------------------------- 1 | context("readAllelicCountsFile") 2 | 3 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 4 | ac.file <- system.file("extdata", "example_allelic_counts.tsv", package = "PureCN") 5 | ac.empty.file <- system.file("extdata", "example_allelic_counts_empty.tsv", package = "PureCN") 6 | vcf <- readVcf(vcf.file, "hg19") 7 | data(purecn.example.output) 8 | normal.coverage.file <- system.file('extdata', 'example_normal.txt.gz', 9 | package = 'PureCN') 10 | tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz', 11 | package = 'PureCN') 12 | 13 | test_that("example parses correctly", { 14 | vcf_ac <- readAllelicCountsFile(ac.file) 15 | expect_equal(as.character(ref(vcf_ac)), as.character(ref(head(vcf,20)))) 16 | expect_error(readAllelicCountsFile(ac.empty.file), "Error reading AllelicCountsFile") 17 | }) 18 | 19 | test_that("parsing -> writing -> parsing works", { 20 | output.file <- tempfile(fileext = ".tsv") 21 | PureCN:::.writeAllelicCountsFileGatk(vcf, 1, output.file) 22 | vcf_ac <- readAllelicCountsFile(output.file) 23 | expect_equal(as.character(ref(vcf_ac)), as.character(ref(vcf))) 24 | ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file, 25 | tumor.coverage.file = tumor.coverage.file, 26 | candidates = purecn.example.output$candidates, 27 | vcf.file = vcf, 28 | genome = "hg19", 29 | test.purity = seq(0.4, 0.7, by = 0.05), 
min.ploidy = 1.5, 30 | max.ploidy = 2.4, max.candidate.solutions = 1, plot.cnv = FALSE) 31 | expect_true(length(ret$results) > 0) 32 | file.remove(output.file) 33 | }) 34 | -------------------------------------------------------------------------------- /tests/testthat/test_callLOH.R: -------------------------------------------------------------------------------- 1 | context("callLOH") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | ret <- callLOH(purecn.example.output) 6 | expect_true(is(ret, "data.frame")) 7 | expect_equal(13, ncol(ret)) 8 | }) 9 | 10 | test_that("NCBI-style chromosome names work", { 11 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 12 | package = "PureCN") 13 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 14 | package = "PureCN") 15 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 16 | vcf <- readVcf(vcf.file) 17 | normal <- readCoverageFile(normal.coverage.file) 18 | tumor <- readCoverageFile(tumor.coverage.file) 19 | seqlevelsStyle(vcf) <- "Ensembl" 20 | seqlevelsStyle(normal) <- "Ensembl" 21 | seqlevelsStyle(tumor) <- "Ensembl" 22 | ret <- runAbsoluteCN(normal.coverage.file = normal, tumor.coverage.file = tumor, 23 | genome = "hg19", vcf.file = vcf, sampleid = "Sample1", 24 | min.ploidy = 1.4, max.ploidy = 2.4, test.purity = seq(0.4, 25 | 0.7, by = 0.05), max.candidate.solutions = 1, plot = FALSE) 26 | loh <- callLOH(ret) 27 | expect_equal(unique(loh$chr), as.character(1:22)) 28 | loh2 <- callLOH(ret, keep.no.snp.segments = FALSE) 29 | expect_true(nrow(loh) > nrow(loh2)) 30 | idx <- !is.na(loh$M) 31 | expect_equal(loh$C[idx], loh2$C) 32 | expect_equal(is.na(loh$M), is.na(loh$type)) 33 | }) 34 | 35 | test_that("No crash without centromeres", { 36 | x <- purecn.example.output 37 | x$input$centromeres <- NULL 38 | loh <- callLOH(x) 39 | expect_equal(13, ncol(loh)) 40 | }) 41 | -------------------------------------------------------------------------------- /inst/extdata/ex2_reference.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | TTCCTAAGGGTCAGATAGGCTCTGCGAGCCGCACTTCACTTGACGAAGAATTCTAGTTGTGATTATGATACCCTTCCTGC 3 | CGAACAGACCTGTCTCAGTATAATAAGACCAATTAAATGATAAAAGCAGAACAATAGTAGCGAACCCAACTCGCGACCAA 4 | TGTCGTGCCGTATGAACCACTATACACAATCTCGAACTTGCGCGGCGTTTGAGAATGTCCCCTACGCAACAGCTCAGATG 5 | CGGTAGGTATAAGTAGTCCCATTGGCTGTTTCTGAGTCTTCATAGTCACGAACTACGCTAAGTCTAGGACGTGAGGCCAC 6 | GAAAATATTGAAATCCGCTATTCACGTTTCAATGCTATACGTAAGCTTCGAAGTTTCTCTAGAAACGATAACTTACTTCT 7 | ACGTGGCTTTCCCTCCGTTGGAGCCCCCGTGCCGGCTGGAGGACGCCCCAGTCACACATGAGCCGATCCATCACTCCCAG 8 | GGAGGGTTAATGAAGACTCTTGGTGCGTCTATTTAGTCAGAAACGATCCGCTTTGAAATGATTCCTTGAGGAGGTGTGAC 9 | CTTGAGTATACTCCGCGGGCGAGGATCCACATTGGCGGGAAGGAAAAACCGTGGTCTGCATATCCTGTGTACAGCCATTG 10 | CTAGGGCTCAGCAACGCTTCCCTGCCTAATCTGCACGGATCGAAGGTTGACTCGCCGTGAAATCGTGGCGACCCGCGTGC 11 | GTATGGGGGTAAACGCGACTCTTATGTGCTCTAAGCTGGCAGTTGCATTCAGCTCCGTGCGGCGATGCGCACTGTCCCGT 12 | >seq2 13 | ACGGGAATGGTCAGACCGTGCCCCAAAATCCTCTGGCAGCTCTCACGATGCTAGACACTTGTTGCAAACCTCCTTCGACA 14 | ACTCAAGGCTTGCGACACCAAGCGAAAATCCAGTTGCAACCGACGGGCGCGAGTCTAGGTGCTGGCGGCGACAGTGCGTA 15 | ACCGTGGTCGGGGATCTATGCGTCGGATGCTTAACACAATAGCGTTGCCTACATTCACGTATGGTCATGCGGCGTAAGCA 16 | CTACCACGCAAATCACCGTGCAGGGCCGTGTTCGACACCCTCGACTGATTGTAGCCCATAGTAGATCAGTCTGGATCGAA 17 | AGCGTGCTGAAGCAATTGCCCATTGTCACATGAATCGGTTTCGAAGGAAACTATAGATGTAACGTAGGCCGGGTATCAGG 18 | GACGCATGAGTACCACGCTCAAGCGGGGGCTCTAGTGGATTGGTGGATTGATATTTTGCCGATTTTGCACTTCAACCAGC 19 | 
TTCTGACCATCACAAAACCGAAGGTCGTTTTTTTTTGGTTAACGAAACTCAAGGTCCGAGAGTGGCGATCGAGTTGAACT 20 | AAAAGATCGTTCTAGATGAACTTTACCAGATACAGCCAGGGCTCACATAACTTTCACTTCTATGGGTGGTTTTTCATCAT 21 | TCACAATACGACAACCAAAGAGCTAAACCTCGGGCTTGTCATTGCAAATGTCCCAGACGTTTGTTCAAAATTAACTCGAG 22 | ACACTGATGGATCCGCAAATTAAGAGGATAACTGTTTTACGCGCGGTCTCATAGACTTGTCGTACCCAAGTCCTTTGAGA 23 | -------------------------------------------------------------------------------- /inst/extdata/ex3_reference.fa: -------------------------------------------------------------------------------- 1 | >chr1 2 | TTCCTAAGGGTCAGATAGGCTCTGCGAGCCGCACTTCACTTGACGAAGAATTCTAGTTGTGATTATGATACCCTTCCTGC 3 | CGAACAGACCTGTCTCAGTATAATAAGACCAATTAAATGATAAAAGCAGAACAATAGTAGCGAACCCAACTCGCGACCAA 4 | TGTCGTGCCGTATGAACCACTATACACAATCTCGAACTTGCGCGGCGTTTGAGAATGTCCCCTACGCAACAGCTCAGATG 5 | CGGTAGGTATAAGTAGTCCCATTGGCTGTTTCTGAGTCTTCATAGTCACGAACTACGCTAAGTCTAGGACGTGAGGCCAC 6 | GAAAATATTGAAATCCGCTATTCACGTTTCAATGCTATACGTAAGCTTCGAAGTTTCTCTAGAAACGATAACTTACTTCT 7 | ACGTGGCTTTCCCTCCGTTGGAGCCCCCGTGCCGGCTGGAGGACGCCCCAGTCACACATGAGCCGATCCATCACTCCCAG 8 | GGAGGGTTAATGAAGACTCTTGGTGCGTCTATTTAGTCAGAAACGATCCGCTTTGAAATGATTCCTTGAGGAGGTGTGAC 9 | CTTGAGTATACTCCGCGGGCGAGGATCCACATTGGCGGGAAGGAAAAACCGTGGTCTGCATATCCTGTGTACAGCCATTG 10 | CTAGGGCTCAGCAACGCTTCCCTGCCTAATCTGCACGGATCGAAGGTTGACTCGCCGTGAAATCGTGGCGACCCGCGTGC 11 | GTATGGGGGTAAACGCGACTCTTATGTGCTCTAAGCTGGCAGTTGCATTCAGCTCCGTGCGGCGATGCGCACTGTCCCGT 12 | >chr2 13 | ACGGGAATGGTCAGACCGTGCCCCAAAATCCTCTGGCAGCTCTCACGATGCTAGACACTTGTTGCAAACCTCCTTCGACA 14 | ACTCAAGGCTTGCGACACCAAGCGAAAATCCAGTTGCAACCGACGGGCGCGAGTCTAGGTGCTGGCGGCGACAGTGCGTA 15 | ACCGTGGTCGGGGATCTATGCGTCGGATGCTTAACACAATAGCGTTGCCTACATTCACGTATGGTCATGCGGCGTAAGCA 16 | CTACCACGCAAATCACCGTGCAGGGCCGTGTTCGACACCCTCGACTGATTGTAGCCCATAGTAGATCAGTCTGGATCGAA 17 | AGCGTGCTGAAGCAATTGCCCATTGTCACATGAATCGGTTTCGAAGGAAACTATAGATGTAACGTAGGCCGGGTATCAGG 18 | GACGCATGAGTACCACGCTCAAGCGGGGGCTCTAGTGGATTGGTGGATTGATATTTTGCCGATTTTGCACTTCAACCAGC 19 | TTCTGACCATCACAAAACCGAAGGTCGTTTTTTTTTGGTTAACGAAACTCAAGGTCCGAGAGTGGCGATCGAGTTGAACT 20 | AAAAGATCGTTCTAGATGAACTTTACCAGATACAGCCAGGGCTCACATAACTTTCACTTCTATGGGTGGTTTTTCATCAT 21 | TCACAATACGACAACCAAAGAGCTAAACCTCGGGCTTGTCATTGCAAATGTCCCAGACGTTTGTTCAAAATTAACTCGAG 22 | ACACTGATGGATCCGCAAATTAAGAGGATAACTGTTTTACGCGCGGTCTCATAGACTTGTCGTACCCAAGTCCTTTGAGA 23 | -------------------------------------------------------------------------------- /man/predictSomatic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictSomatic.R 3 | \name{predictSomatic} 4 | \alias{predictSomatic} 5 | \title{Predict germline vs. somatic status} 6 | \usage{ 7 | predictSomatic(res, id = 1, return.vcf = FALSE) 8 | } 9 | \arguments{ 10 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 11 | 12 | \item{id}{Candidate solutions to be analyzed. \code{id=1} will analyze the 13 | maximum likelihood solution.} 14 | 15 | \item{return.vcf}{Returns an annotated \code{CollapsedVCF} object. Note that 16 | this VCF will only contain variants not filtered out by the \code{filterVcf} 17 | functions. Variants outside segments or intervals might be included or not 18 | depending on \code{\link{runAbsoluteCN}} arguments.} 19 | } 20 | \value{ 21 | A \code{data.frame} or \code{CollapsedVCF} with SNV state posterior 22 | probabilities. 23 | } 24 | \description{ 25 | This function takes as input the output of a \code{\link{runAbsoluteCN}} run 26 | and provides SNV posterior probabilities for all possible states. 
27 | } 28 | \examples{ 29 | 30 | data(purecn.example.output) 31 | # the output data was created using a matched normal sample, but in case 32 | # no matched normal is available, this will help predicting somatic vs. 33 | # germline status 34 | purecnSnvs <- predictSomatic(purecn.example.output) 35 | 36 | # Prefer GRanges? 37 | purecnSnvs <- GRanges(predictSomatic(purecn.example.output)) 38 | 39 | # write a VCF file 40 | purecnVcf <- predictSomatic(purecn.example.output, return.vcf=TRUE) 41 | writeVcf(purecnVcf, file = "Sample1_PureCN.vcf") 42 | 43 | } 44 | \seealso{ 45 | \code{\link{runAbsoluteCN}} 46 | } 47 | \author{ 48 | Markus Riester 49 | } 50 | -------------------------------------------------------------------------------- /man/calculateTangentNormal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createNormalDatabase.R 3 | \name{calculateTangentNormal} 4 | \alias{calculateTangentNormal} 5 | \title{Calculate tangent normal} 6 | \usage{ 7 | calculateTangentNormal( 8 | tumor.coverage.file, 9 | normalDB, 10 | num.eigen = 20, 11 | ignore.sex = FALSE, 12 | sex = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{tumor.coverage.file}{Coverage file or data of a tumor sample.} 17 | 18 | \item{normalDB}{Database of normal samples, created with 19 | \code{\link{createNormalDatabase}}.} 20 | 21 | \item{num.eigen}{Number of eigen vectors used.} 22 | 23 | \item{ignore.sex}{If \code{FALSE}, detects sex of sample and returns best 24 | normals with matching sex.} 25 | 26 | \item{sex}{Sex of sample. If \code{NULL}, determine with 27 | \code{\link{getSexFromCoverage}} and default parameters. Valid values are 28 | \code{F} for female, \code{M} for male. If all chromosomes are diploid, 29 | specify \code{diploid}.} 30 | } 31 | \description{ 32 | Reimplementation of GATK4 denoising. Please cite the relevant GATK 33 | publication if you use this in a publication. 
34 | } 35 | \examples{ 36 | 37 | tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz', 38 | package = 'PureCN') 39 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 40 | package = "PureCN") 41 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 42 | package = "PureCN") 43 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 44 | normalDB <- createNormalDatabase(normal.coverage.files) 45 | pool <- calculateTangentNormal(tumor.coverage.file, normalDB) 46 | 47 | } 48 | \seealso{ 49 | \code{\link{createNormalDatabase}} 50 | } 51 | \author{ 52 | Markus Riester 53 | } 54 | -------------------------------------------------------------------------------- /tests/testthat/test_getSexFromCoverage.R: -------------------------------------------------------------------------------- 1 | context("getSexFromCoverage") 2 | 3 | library(GenomeInfoDb) # for renameSeqlevels() 4 | 5 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 6 | package = "PureCN") 7 | coverage <- readCoverageFile(tumor.coverage.file) 8 | chr22 <- coverage[which(seqnames(coverage) == "chr22")] 9 | chrX <- renameSeqlevels(chr22, c(chr22 = "chrX")) 10 | 11 | test_that("Warning with missing coverage data", { 12 | sex <- getSexFromCoverage(coverage) 13 | expect_true(is.na(sex)) 14 | expect_output( getSexFromCoverage(coverage), "WARN" ) 15 | }) 16 | 17 | test_that("Warning with missing coverage data in file", { 18 | sex <- getSexFromCoverage(tumor.coverage.file) 19 | expect_true(is.na(sex)) 20 | expect_output( getSexFromCoverage(coverage), "WARN" ) 21 | }) 22 | 23 | test_that("Male correct from coverage data", { 24 | chrY <- renameSeqlevels(chr22, c(chr22 = "chrY")) 25 | coverage_fakemale <- suppressWarnings(c(coverage, chrX, chrY)) 26 | sex <- getSexFromCoverage(coverage_fakemale) 27 | expect_identical("M", sex) 28 | }) 29 | 30 | test_that("Female correct from coverage data", { 31 | chrY <- renameSeqlevels(chr22, c(chr22 = "chrY")) 32 | chrY$average.coverage <- chrY$average.coverage/50 33 | coverage_fakefemale <- suppressWarnings( c(coverage, chrX, chrY) ) 34 | sex <- getSexFromCoverage(coverage_fakefemale) 35 | expect_identical("F", sex) 36 | }) 37 | 38 | test_that("NA correct from contaminated coverage data", { 39 | chrY <- renameSeqlevels(chr22, c(chr22 = "chrY")) 40 | chrY$average.coverage <- chrY$average.coverage / 21 41 | coverage_fakecontamination <- suppressWarnings( c(coverage, chrX, chrY)) 42 | sex <- getSexFromCoverage(coverage_fakecontamination) 43 | expect_true(is.na(sex)) 44 | }) 45 | -------------------------------------------------------------------------------- /R/adjustLogRatio.R: -------------------------------------------------------------------------------- 1 | #' Adjust tumor vs. normal coverage log ratio for tumor purity and ploidy 2 | #' 3 | #' This function can be used to adjust the log ratio for tumor purity and 4 | #' ploidy for downstream tools that expect a log2 ratio (for example GISTIC). 5 | #' 6 | #' 7 | #' @param ratio Vector of log2 tumor vs normal coverage ratios. 8 | #' @param purity Purity of sample. 9 | #' @param ploidy Ploidy of sample. 10 | #' @param is.log2 \code{log.ratio} is \code{log2} transformed. 11 | #' @param min.ratio Minimum (non-log2-transformed) ratio. Set to approx -8 12 | #' \code{log2} adjusted. 
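#'
#' @details
#' In outline (this mirrors the function body below): if \code{is.log2}, the
#' input is first converted back to a linear ratio; each ratio is then
#' rescaled as
#' \code{(purity * ploidy * ratio + 2 * (1 - purity) * ratio - 2 * (1 - purity)) / (purity * ploidy)},
#' floored at \code{min.ratio}, and log2-transformed again.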
13 | #' @return \code{numeric(length(log.ratio))}, \code{log.ratio} adjusted 14 | #' for \code{purity} and \code{ploidy} 15 | #' @author Markus Riester 16 | #' @references 17 | #' * Zack et al. (2012), Pan-cancer patterns of somatic copy number alteration 18 | #' Nature Biotechnology. 19 | #' * Toal (2018), https://github.com/lima1/PureCN/issues/40 20 | #' 21 | #' @examples 22 | #' 23 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 24 | #' package = "PureCN") 25 | #' tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 26 | #' package = "PureCN") 27 | #' normal <- readCoverageFile(normal.coverage.file) 28 | #' tumor <- readCoverageFile(tumor.coverage.file) 29 | #' log.ratio <- calculateLogRatio(normal, tumor) 30 | #' log.ratio.adjusted <- adjustLogRatio(log.ratio, 0.65, 1.73) 31 | #' 32 | #' @export adjustLogRatio 33 | adjustLogRatio <- function(ratio, purity, ploidy, is.log2 = TRUE, min.ratio = 2^-8) { 34 | if (is.log2) ratio <- 2^ratio 35 | adjusted <- (purity * ploidy * ratio + 2 * (1 - purity) * ratio - 2 * (1 - purity)) / (purity * ploidy) 36 | adjusted <- pmax(min.ratio, adjusted) 37 | if (is.log2) adjusted <- log2(adjusted) 38 | return(adjusted) 39 | } 40 | 41 | -------------------------------------------------------------------------------- /man/filterVcfMuTect2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filterVcfMuTect2.R 3 | \name{filterVcfMuTect2} 4 | \alias{filterVcfMuTect2} 5 | \title{Filter VCF MuTect2} 6 | \usage{ 7 | filterVcfMuTect2( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | ignore = c("clustered_events", "t_lod", "str_contraction", "read_position", "position", 11 | "fragment_length", "multiallelic", "clipping", "strand_artifact", "strand_bias", 12 | "slippage", "weak_evidence", "orientation", "haplotype"), 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 18 | function from the VariantAnnotation package.} 19 | 20 | \item{tumor.id.in.vcf}{The tumor id in the VCF file, optional.} 21 | 22 | \item{ignore}{MuTect2 flags that mark variants for exclusion.} 23 | 24 | \item{\dots}{Additional arguments passed to \code{\link{filterVcfBasic}}.} 25 | } 26 | \value{ 27 | A list with elements \code{vcf}, \code{flag} and 28 | \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 29 | \code{flag} a \code{logical(1)} flag if problems were identified, further 30 | described in \code{flag_comment}. 31 | } 32 | \description{ 33 | Function to remove artifacts and low confidence/quality calls from a 34 | GATK4/MuTect2 generated VCF file. Also applies filters defined in 35 | \code{filterVcfBasic}. 36 | } 37 | \examples{ 38 | 39 | ### This function is typically only called by runAbsoluteCN via the 40 | ### fun.filterVcf and args.filterVcf arguments.
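### filterVcfMuTect (used below) checks whether the VCF was generated by
### MuTect2 and, if so, dispatches to filterVcfMuTect2 (see the stats.file
### argument of filterVcfMuTect), so the same example covers both functions.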
41 | library(VariantAnnotation) 42 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 43 | vcf <- readVcf(vcf.file, "hg19") 44 | vcf.filtered <- filterVcfMuTect(vcf) 45 | 46 | } 47 | \seealso{ 48 | \code{\link{filterVcfBasic}} 49 | } 50 | \author{ 51 | Markus Riester 52 | } 53 | -------------------------------------------------------------------------------- /man/getSexFromCoverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSex.R 3 | \name{getSexFromCoverage} 4 | \alias{getSexFromCoverage} 5 | \title{Get sample sex from coverage} 6 | \usage{ 7 | getSexFromCoverage( 8 | coverage.file, 9 | min.ratio = 25, 10 | min.ratio.na = 20, 11 | remove.outliers = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{coverage.file}{Coverage file or data read with 16 | \code{\link{readCoverageFile}}.} 17 | 18 | \item{min.ratio}{Min chrX/chrY coverage ratio to call sample as female.} 19 | 20 | \item{min.ratio.na}{Min chrX/chrY coverage ratio to call sample as 21 | \code{NA}. This ratio defines a grey zone from \code{min.ratio.na} to 22 | \code{min.ratio} in which samples are not called. The default is set to a 23 | copy number ratio that would be rare in male samples, but lower than 24 | expected in female samples. Contamination can be a source of ambiguous 25 | calls. Mappability issues on chromosome Y resulting in low coverage need to 26 | be considered when setting cutoffs.} 27 | 28 | \item{remove.outliers}{Removes coverage outliers before calculating mean 29 | chromosome coverages.} 30 | } 31 | \value{ 32 | Returns a \code{character(1)} with \code{M} for male, \code{F} for 33 | female, or \code{NA} if unknown. 34 | } 35 | \description{ 36 | This function determines the sex of a sample by the coverage ratio of chrX 37 | and chrY. Loss of chromosome Y (LOY) can result in a wrong female call. For 38 | small targeted panels, this will only work when sufficient sex marker genes 39 | such as AMELY are covered. For optimal results, parameters might need to be 40 | tuned for the assay. 
41 | } 42 | \examples{ 43 | 44 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 45 | package = "PureCN") 46 | sex <- getSexFromCoverage(tumor.coverage.file) 47 | 48 | } 49 | \seealso{ 50 | \code{\link{getSexFromVcf}} 51 | } 52 | \author{ 53 | Markus Riester 54 | } 55 | -------------------------------------------------------------------------------- /tests/testthat/test_callAmplificationsInLowPurity.R: -------------------------------------------------------------------------------- 1 | context("callAmplificationsInLowPurity") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 4 | package = "PureCN") 5 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 6 | package = "PureCN") 7 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 8 | normalDB <- createNormalDatabase(normal.coverage.files) 9 | data(purecn.example.output) 10 | 11 | test_that("Example is called correctly", { 12 | m <- callAmplificationsInLowPurity(purecn.example.output, 13 | normalDB, all.genes = TRUE, purity = 0.65) 14 | m2 <- callAmplificationsInLowPurity(purecn.example.output, 15 | normalDB, all.genes = TRUE, purity = 0.65, BPPARAM = BiocParallel::bpparam()) 16 | esr2 <- m["ESR2", ] 17 | expect_equal(as.character(esr2$chr), "chr14") 18 | expect_true(esr2$start > 64694600) 19 | expect_true(esr2$end < 64761128) 20 | expect_true(esr2$C < 3 && esr2$C >= 2) 21 | expect_gt(cor(m$p.value, m2$p.value), 0.99) 22 | }) 23 | test_that("Exceptions happen with incorrect input data", { 24 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 25 | normalDB, pvalue.cutoff = 1.2), "pvalue.cutoff") 26 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 27 | normalDB, pvalue.cutoff = -1.2), "pvalue.cutoff") 28 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 29 | normalDB, percentile.cutoff = 120), "percentile.cutoff") 30 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 31 | normalDB, percentile.cutoff = -120), "percentile.cutoff") 32 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 33 | normalDB, purity = -120), "purity") 34 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 35 | normalDB, purity = 80), "purity") 36 | }) 37 | -------------------------------------------------------------------------------- /man/findFocal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/findFocal.R 3 | \name{findFocal} 4 | \alias{findFocal} 5 | \title{Find focal amplifications} 6 | \usage{ 7 | findFocal(seg, max.size = 3e+06, cn.diff = 2, min.amp.cn = 5) 8 | } 9 | \arguments{ 10 | \item{seg}{Segmentation data.} 11 | 12 | \item{max.size}{Cutoff for focal in base pairs.} 13 | 14 | \item{cn.diff}{Minimum copy number delta between neighboring segments.} 15 | 16 | \item{min.amp.cn}{Minimum amplification integer copy number. Segments with 17 | lower copy number are not tested.} 18 | } 19 | \value{ 20 | \code{logical(n)}, indicating for all n segments whether they are 21 | focally amplified or not. 22 | } 23 | \description{ 24 | Function to find focal amplifications in segmented data. This is 25 | automatically called in \code{\link{runAbsoluteCN}}. 
26 | } 27 | \examples{ 28 | 29 | normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 30 | package = "PureCN") 31 | tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 32 | package = "PureCN") 33 | vcf.file <- system.file("extdata", "example.vcf.gz", 34 | package = "PureCN") 35 | interval.file <- system.file("extdata", "example_intervals_tiny.txt", 36 | package = "PureCN") 37 | 38 | # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 39 | # non-default values to speed-up this example. This is not a good idea for real 40 | # samples. 41 | ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, 42 | tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, 43 | genome="hg19", sampleid = "Sample1", interval.file = interval.file, 44 | max.candidate.solutions = 1, max.ploidy = 4, 45 | test.purity = seq(0.3, 0.7, by = 0.05), 46 | args.focal=list(max.size = 2e+06), fun.focal = findFocal) 47 | 48 | } 49 | \seealso{ 50 | \code{\link{runAbsoluteCN}} 51 | } 52 | \author{ 53 | Markus Riester 54 | } 55 | -------------------------------------------------------------------------------- /man/readCurationFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readCurationFile.R 3 | \name{readCurationFile} 4 | \alias{readCurationFile} 5 | \title{Read curation file} 6 | \usage{ 7 | readCurationFile( 8 | file.rds, 9 | file.curation = gsub(".rds$", ".csv", file.rds), 10 | remove.failed = FALSE, 11 | report.best.only = FALSE, 12 | min.ploidy = NULL, 13 | max.ploidy = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{file.rds}{Output of the \code{\link{runAbsoluteCN}} function, 18 | serialized with \code{saveRDS}.} 19 | 20 | \item{file.curation}{Filename of a curation file that points to the correct 21 | tumor purity and ploidy solution.} 22 | 23 | \item{remove.failed}{Do not return solutions that failed.} 24 | 25 | \item{report.best.only}{Only return correct/best solution (useful on low 26 | memory machines when lots of samples are loaded).} 27 | 28 | \item{min.ploidy}{Minimum ploidy to be considered. If \code{NULL}, all. Can 29 | be used to automatically ignore unlikely solutions.} 30 | 31 | \item{max.ploidy}{Maximum ploidy to be considered. If \code{NULL}, all. Can 32 | be used to automatically ignore unlikely solutions.} 33 | } 34 | \value{ 35 | The return value of the corresponding \code{\link{runAbsoluteCN}} 36 | call, but with the results array manipulated according the curation CSV file 37 | and arguments of this function. 38 | } 39 | \description{ 40 | Function that can be used to read the curated output of the 41 | \code{\link{runAbsoluteCN}} function. 42 | } 43 | \examples{ 44 | 45 | data(purecn.example.output) 46 | file.rds <- "Sample1_PureCN.rds" 47 | createCurationFile(file.rds) 48 | # User can change the maximum likelihood solution manually in the generated 49 | # CSV file. The correct solution is then loaded with readCurationFile. 50 | purecn.curated.example.output <-readCurationFile(file.rds) 51 | 52 | } 53 | \seealso{ 54 | \code{\link{runAbsoluteCN} \link{createCurationFile}} 55 | } 56 | \author{ 57 | Markus Riester 58 | } 59 | -------------------------------------------------------------------------------- /R/callCIN.R: -------------------------------------------------------------------------------- 1 | #' Call Chromosomal Instability 2 | #' 3 | #' This function provides detailed CIN information. 
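#' In outline (this mirrors the function body below), the returned value is
#' the fraction of the genome, weighted by segment size, whose copy number
#' state (allele-specific by default) differs from the chosen reference
#' state.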
4 | #' 5 | #' 6 | #' @param res Return object of the \code{\link{runAbsoluteCN}} function. 7 | #' @param id Candidate solution to extract CIN from. \code{id=1} will use the 8 | #' maximum likelihood solution. 9 | #' @param allele.specific Use allele-specific or only total copy number for 10 | #' detecting abnormal regions. Copy-number neutral LOH would be ignored when 11 | #' this parameter is set to \code{FALSE}. 12 | #' @param reference.state Copy number regions different from the reference 13 | #' state are counted as abnormal. Default is \code{dominant} means the most 14 | #' common state. The other option is \code{normal}, which defines normal 15 | #' heterozygous, diploid as reference. The default is robust to errors in 16 | #' ploidy. 17 | #' @return Returns \code{double(1)} with CIN value. 18 | #' @author Markus Riester 19 | #' @seealso \code{\link{runAbsoluteCN}} 20 | #' @examples 21 | #' 22 | #' data(purecn.example.output) 23 | #' head(callCIN(purecn.example.output)) 24 | #' 25 | #' @export callCIN 26 | callCIN <- function(res, id = 1, allele.specific = TRUE, reference.state = 27 | c("dominant", "normal")) { 28 | loh <- callLOH(res, id) 29 | loh$size <- loh$end - loh$start + 1 30 | # should not happen 31 | loh <- loh[!is.na(loh$size), ] 32 | if (allele.specific) loh <- loh[!is.na(loh$M), ] 33 | reference.state <- match.arg(reference.state) 34 | loh$state <- if (allele.specific) paste0(loh$C, "/", loh$M) else loh$C 35 | dominant.state <- sort(sapply(split(loh$size, loh$state), sum), 36 | decreasing = TRUE)[1] 37 | reference.state.cn <- names(dominant.state) 38 | if (reference.state == "normal") { 39 | reference.state.cn <- if (allele.specific) "2/1" else "2" 40 | } 41 | loh$is.reference <- loh$state == reference.state.cn 42 | sum(loh$size[!loh$is.reference]) / sum(loh$size) 43 | } 44 | -------------------------------------------------------------------------------- /R/poolCoverage.R: -------------------------------------------------------------------------------- 1 | #' Pool coverage from multiple samples 2 | #' 3 | #' Averages the coverage of a list of samples. 4 | #' 5 | #' 6 | #' @param all.data List of normals, read with \code{\link{readCoverageFile}}. 7 | #' @param remove.chrs Remove these chromosomes from the pool. 8 | #' @param w \code{numeric(length(all.data))} vector of weights. If \code{NULL}, 9 | #' weight all samples equally. 10 | #' @return A \code{data.frame} with the averaged coverage over all normals. 
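#'
#' @details
#' In outline (this mirrors the function body below): with a single sample,
#' the coverage is returned unchanged apart from \code{remove.chrs};
#' otherwise, per-interval \code{coverage} and \code{counts} are combined as
#' weighted sums, \code{sum(w[i] * all.data[[i]]$coverage)}, with weights
#' defaulting to 1, average coverage is recomputed, and intervals on
#' \code{remove.chrs} are set to \code{NA}.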
11 | #' @author Markus Riester 12 | #' @seealso \code{\link{readCoverageFile}} 13 | #' @examples 14 | #' 15 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 16 | #' package = "PureCN") 17 | #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 18 | #' package = "PureCN") 19 | #' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 20 | #' pool <- poolCoverage(lapply(normal.coverage.files, readCoverageFile), 21 | #' remove.chrs = c("chrX", "chrY")) 22 | #' 23 | #' @export poolCoverage 24 | poolCoverage <- function(all.data, remove.chrs=c(), w = NULL) { 25 | pool <- all.data[[1]] 26 | 27 | if (length(all.data) == 1) { 28 | return(.removeChr(pool, remove.chrs)) 29 | } 30 | if (is.null(w)) { 31 | w <- rep(1, length(all.data)) 32 | } else if (length(w) != length(all.data)) { 33 | .stopUserError("all.data and w have different lengths.") 34 | } 35 | 36 | pool$coverage <- 0 37 | pool$counts <- 0 38 | 39 | for (i in seq_along(all.data)) { 40 | pool$coverage <- pool$coverage + (w[i] * all.data[[i]]$coverage) 41 | pool$counts <- pool$counts + (w[i] * all.data[[i]]$counts) 42 | } 43 | pool <- .addAverageCoverage(pool) 44 | return(.removeChr(pool, remove.chrs)) 45 | } 46 | 47 | .removeChr <- function(pool, remove.chrs = c()) { 48 | idx <- seqnames(pool) %in% remove.chrs 49 | if (sum(idx)) { 50 | pool[idx]$coverage <- NA 51 | pool[idx]$average.coverage <- NA 52 | } 53 | pool 54 | } 55 | -------------------------------------------------------------------------------- /R/filterVcfMuTect2.R: -------------------------------------------------------------------------------- 1 | #' Filter VCF MuTect2 2 | #' 3 | #' Function to remove artifacts and low confidence/quality calls from a 4 | #' GATK4/MuTect2 generated VCF file. Also applies filters defined in 5 | #' \code{filterVcfBasic}. 6 | #' 7 | #' 8 | #' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf} 9 | #' function from the VariantAnnotation package. 10 | #' @param tumor.id.in.vcf The tumor id in the VCF file, optional. 11 | #' @param ignore MuTect2 flags that mark variants for exclusion. 12 | #' @param \dots Additional arguments passed to \code{\link{filterVcfBasic}}. 13 | #' @return A list with elements \code{vcf}, \code{flag} and 14 | #' \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 15 | #' \code{flag} a \code{logical(1)} flag if problems were identified, further 16 | #' described in \code{flag_comment}. 17 | #' @author Markus Riester 18 | #' @seealso \code{\link{filterVcfBasic}} 19 | #' @examples 20 | #' 21 | #' ### This function is typically only called by runAbsolute via the 22 | #' ### fun.filterVcf and args.filterVcf comments. 
23 | #' library(VariantAnnotation) 24 | #' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 25 | #' vcf <- readVcf(vcf.file, "hg19") 26 | #' vcf.filtered <- filterVcfMuTect(vcf) 27 | #' 28 | #' @export filterVcfMuTect2 29 | filterVcfMuTect2 <- function(vcf, tumor.id.in.vcf = NULL, 30 | ignore=c("clustered_events", "t_lod", "str_contraction", 31 | "read_position", "position", "fragment_length", "multiallelic", "clipping", 32 | "strand_artifact", "strand_bias", "slippage", "weak_evidence", 33 | "orientation", "haplotype"), 34 | ...){ 35 | if (is.null(fixed(vcf)$FILTER)) return( 36 | filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 37 | 38 | n <- .countVariants(vcf) 39 | 40 | ids <- sort(unique(unlist(sapply(ignore, grep, fixed(vcf)$FILTER)))) 41 | vcf <- .removeVariants(vcf, ids, "Mutect2") 42 | flog.info("Removing %i Mutect2 calls due to blacklisted failure reasons.", 43 | n-.countVariants(vcf)) 44 | filterVcfBasic(vcf, tumor.id.in.vcf, ...) 45 | } 46 | -------------------------------------------------------------------------------- /man/filterVcfMuTect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filterVcfMuTect.R 3 | \name{filterVcfMuTect} 4 | \alias{filterVcfMuTect} 5 | \title{Filter VCF MuTect} 6 | \usage{ 7 | filterVcfMuTect( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | stats.file = NULL, 11 | ignore = c("clustered_read_position", "fstar_tumor_lod", "nearby_gap_events", 12 | "poor_mapping_region_alternate_allele_mapq", "poor_mapping_region_mapq0", 13 | "possible_contamination", "strand_artifact", "seen_in_panel_of_normals"), 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 19 | function from the VariantAnnotation package.} 20 | 21 | \item{tumor.id.in.vcf}{The tumor id in the VCF file, optional.} 22 | 23 | \item{stats.file}{MuTect stats file. If \code{NULL}, will check if VCF 24 | was generated by MuTect2 and if yes will call \code{\link{filterVcfMuTect2}} 25 | instead.} 26 | 27 | \item{ignore}{MuTect flags that mark variants for exclusion.} 28 | 29 | \item{\dots}{Additional arguments passed to \code{\link{filterVcfBasic}}.} 30 | } 31 | \value{ 32 | A list with elements \code{vcf}, \code{flag} and 33 | \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 34 | \code{flag} a \code{logical(1)} flag if problems were identified, further 35 | described in \code{flag_comment}. 36 | } 37 | \description{ 38 | Function to remove artifacts and low confidence/quality calls from a MuTect 39 | generated VCF file. Also applies filters defined in \code{filterVcfBasic}. 40 | This function will only keep variants listed in the stats file and those not 41 | matching the specified failure reasons. 42 | } 43 | \examples{ 44 | 45 | ### This function is typically only called by runAbsolute via the 46 | ### fun.filterVcf and args.filterVcf comments. 
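### The MuTect stats file is optional; with one available it could be
### passed as, e.g. (hypothetical path):
### vcf.filtered <- filterVcfMuTect(vcf, stats.file = "tumor.stats")
### If stats.file is NULL and the VCF was generated by MuTect2,
### filterVcfMuTect2 is called instead.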
47 | library(VariantAnnotation) 48 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 49 | vcf <- readVcf(vcf.file, "hg19") 50 | vcf.filtered <- filterVcfMuTect(vcf) 51 | 52 | } 53 | \seealso{ 54 | \code{\link{filterVcfBasic}} 55 | } 56 | \author{ 57 | Markus Riester 58 | } 59 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: PureCN 2 | Type: Package 3 | Title: Copy number calling and SNV classification using 4 | targeted short read sequencing 5 | Version: 2.15.4 6 | Date: 2025-07-22 7 | Authors@R: c(person("Markus", "Riester", 8 | role = c("aut", "cre"), 9 | email = "markus.riester@novartis.com", 10 | comment = c(ORCID = "0000-0002-4759-8332")), 11 | person("Angad P.", "Singh", role = "aut")) 12 | Description: This package estimates tumor purity, copy number, and loss of 13 | heterozygosity (LOH), and classifies single nucleotide variants (SNVs) by 14 | somatic status and clonality. PureCN is designed for targeted short read 15 | sequencing data, integrates well with standard somatic variant detection 16 | and copy number pipelines, and has support for tumor samples without 17 | matching normal samples. 18 | Depends: 19 | R (>= 3.5.0), 20 | DNAcopy, 21 | VariantAnnotation (>= 1.14.1) 22 | Imports: 23 | GenomicRanges (>= 1.20.3), 24 | IRanges (>= 2.2.1), 25 | RColorBrewer, 26 | S4Vectors, 27 | data.table, 28 | grDevices, 29 | graphics, 30 | stats, 31 | utils, 32 | SummarizedExperiment, 33 | Seqinfo, 34 | GenomeInfoDb, 35 | GenomicFeatures, 36 | Rsamtools, 37 | Biobase, 38 | Biostrings, 39 | BiocGenerics, 40 | rtracklayer, 41 | ggplot2, 42 | gridExtra, 43 | futile.logger, 44 | VGAM, 45 | tools, 46 | methods, 47 | mclust, 48 | rhdf5, 49 | Matrix 50 | Suggests: 51 | BiocParallel, 52 | BiocStyle, 53 | PSCBS, 54 | R.utils, 55 | TxDb.Hsapiens.UCSC.hg19.knownGene, 56 | covr, 57 | knitr, 58 | optparse, 59 | org.Hs.eg.db, 60 | jsonlite, 61 | markdown, 62 | rmarkdown, 63 | testthat 64 | Enhances: 65 | genomicsdb (>= 0.0.3) 66 | VignetteBuilder: knitr 67 | License: Artistic-2.0 68 | BugReports: https://github.com/lima1/PureCN/issues 69 | URL: https://github.com/lima1/PureCN 70 | biocViews: CopyNumberVariation, Software, Sequencing, 71 | VariantAnnotation, VariantDetection, Coverage, ImmunoOncology 72 | NeedsCompilation: no 73 | ByteCompile: yes 74 | RoxygenNote: 7.3.1 75 | -------------------------------------------------------------------------------- /man/setMappingBiasVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/setMappingBiasVcf.R 3 | \name{setMappingBiasVcf} 4 | \alias{setMappingBiasVcf} 5 | \title{Set Mapping Bias VCF} 6 | \usage{ 7 | setMappingBiasVcf( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | mapping.bias.file = NULL, 11 | smooth = TRUE, 12 | smooth.n = 5 13 | ) 14 | } 15 | \arguments{ 16 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 17 | function from the VariantAnnotation package.} 18 | 19 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 20 | VCF.} 21 | 22 | \item{mapping.bias.file}{A precomputed mapping bias database 23 | obtained by \code{\link{calculateMappingBiasVcf}}. 24 | Position-specific mapping bias from this database is added to the 25 | \code{INFO} field of the returned
\code{CollapsedVCF}.} 26 | 27 | \item{smooth}{Impute mapping bias of variants not found in the panel by 28 | smoothing of neighboring SNPs. Requires \code{mapping.bias.file}.} 29 | 30 | \item{smooth.n}{Number of neighboring variants used for smoothing.} 31 | } 32 | \value{ 33 | Adds elements to the \code{vcf} \code{INFO} field 34 | \item{bias}{A \code{numeric(nrow(vcf))} 35 | vector with the mapping bias for each 36 | variant in the \code{CollapsedVCF}. Mapping bias is expected as scaling 37 | factor. Adjusted allelic fraction is (observed allelic fraction)/(mapping 38 | bias). Maximum scaling factor is 1 and means no bias.} 39 | \item{pon.count}{A \code{numeric(nrow(vcf))} vector with the number 40 | of hits in the \code{mapping.bias.file}.} 41 | \item{shape1, shape2}{Fit of a beta distribution.} 42 | } 43 | \description{ 44 | Function to set mapping bias for each variant in the provided 45 | \code{CollapsedVCF} object. By default, it returns the same value for all 46 | variants, but a mapping bias file can be provided for position-specific 47 | mapping bias calculation. 48 | } 49 | \examples{ 50 | 51 | # This function is typically only called by runAbsoluteCN via 52 | # fun.setMappingBiasVcf and args.setMappingBiasVcf. 53 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 54 | vcf <- readVcf(vcf.file, "hg19") 55 | vcf.bias <- setMappingBiasVcf(vcf) 56 | 57 | } 58 | \author{ 59 | Markus Riester 60 | } 61 | -------------------------------------------------------------------------------- /man/correctCoverageBias.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/correctCoverageBias.R 3 | \name{correctCoverageBias} 4 | \alias{correctCoverageBias} 5 | \title{Correct for library-specific coverage biases} 6 | \usage{ 7 | correctCoverageBias( 8 | coverage.file, 9 | interval.file, 10 | output.file = NULL, 11 | plot.bias = FALSE, 12 | plot.max.density = 50000, 13 | output.qc.file = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{coverage.file}{Coverage file or coverage data parsed with the 18 | \code{\link{readCoverageFile}} function.} 19 | 20 | \item{interval.file}{File providing GC content for each exon in the coverage 21 | files. First column in format CHR:START-END. Additional optional columns 22 | provide gene symbols, mappability and replication timing. This file is 23 | generated with the \code{\link{preprocessIntervals}} function.} 24 | 25 | \item{output.file}{Optionally, write file with GC corrected coverage. Can be 26 | read with the \code{\link{readCoverageFile}} function.} 27 | 28 | \item{plot.bias}{Optionally, plot profiles of the pre-normalized and 29 | post-normalized coverage. Provides a quick visual check of coverage bias.} 30 | 31 | \item{plot.max.density}{By default, if the number of intervals in the 32 | probe-set is > 50000, uses a kernel density estimate to plot the coverage 33 | distribution. This uses the \code{stat_density} function from the ggplot2 34 | package. Using this parameter, change the threshold at which density 35 | estimation is applied. If the \code{plot.bias} parameter is set as 36 | \code{FALSE}, this will be ignored.} 37 | 38 | \item{output.qc.file}{Write miscellaneous coverage QC metrics to file.} 39 | } 40 | \description{ 41 | Takes as input coverage data and a mapping file for GC content and 42 | optionally replication timing. Will then normalize coverage data for 43 | GC-bias.
Plots the pre and post normalization GC profiles. 44 | } 45 | \examples{ 46 | 47 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 48 | package = "PureCN") 49 | interval.file <- system.file("extdata", "example_intervals.txt", 50 | package = "PureCN") 51 | coverage <- correctCoverageBias(normal.coverage.file, interval.file) 52 | 53 | } 54 | \seealso{ 55 | \code{\link{preprocessIntervals}} 56 | } 57 | \author{ 58 | Angad Singh, Markus Riester 59 | } 60 | -------------------------------------------------------------------------------- /man/calculateBamCoverageByInterval.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateBamCoverageByInterval.R 3 | \name{calculateBamCoverageByInterval} 4 | \alias{calculateBamCoverageByInterval} 5 | \title{Function to calculate coverage from BAM file} 6 | \usage{ 7 | calculateBamCoverageByInterval( 8 | bam.file, 9 | interval.file, 10 | output.file = NULL, 11 | index.file = bam.file, 12 | keep.duplicates = FALSE, 13 | chunks = 20, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{bam.file}{Filename of a BAM file.} 19 | 20 | \item{interval.file}{File specifying the intervals. Interval is expected in 21 | first column in format CHR:START-END.} 22 | 23 | \item{output.file}{Optionally, write minimal coverage file. Can be read with 24 | the \code{\link{readCoverageFile}} function.} 25 | 26 | \item{index.file}{The bai index. This is expected without the .bai file 27 | suffix, see \code{?scanBam}.} 28 | 29 | \item{keep.duplicates}{Keep or remove duplicated reads.} 30 | 31 | \item{chunks}{Split \code{interval.file} into specified number of chunks 32 | to reduce memory usage.} 33 | 34 | \item{...}{Additional parameters passed to \code{ScanBamParam}.} 35 | } 36 | \value{ 37 | Returns total and average coverage by intervals. 38 | } 39 | \description{ 40 | Takes a BAM file and an interval file as input and returns coverage for each 41 | interval. Coverage should be then GC-normalized using the 42 | \code{\link{correctCoverageBias}} function before determining purity and 43 | ploidy with \code{\link{runAbsoluteCN}}. Uses the \code{scanBam} function 44 | and applies low quality, duplicate reads as well as secondary alignment 45 | filters. 46 | } 47 | \examples{ 48 | 49 | bam.file <- system.file("extdata", "ex1.bam", package = "PureCN", 50 | mustWork = TRUE) 51 | interval.file <- system.file("extdata", "ex1_intervals.txt", 52 | package = "PureCN", mustWork = TRUE) 53 | 54 | # Calculate raw coverage from BAM file. These need to be corrected for 55 | # GC-bias using the correctCoverageBias function before determining purity 56 | # and ploidy. 
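# Note: the optional output.file argument would additionally write a minimal
# coverage file that readCoverageFile() can parse later; the file name is
# user-chosen and not part of the shipped example data.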
57 | coverage <- calculateBamCoverageByInterval(bam.file = bam.file, 58 | interval.file = interval.file) 59 | 60 | } 61 | \seealso{ 62 | \code{\link{preprocessIntervals} 63 | \link{correctCoverageBias} \link{runAbsoluteCN}} 64 | } 65 | \author{ 66 | Markus Riester 67 | } 68 | -------------------------------------------------------------------------------- /man/callAmplificationsInLowPurity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callAmplificationsInLowPurity.R 3 | \name{callAmplificationsInLowPurity} 4 | \alias{callAmplificationsInLowPurity} 5 | \title{Calling of amplifications in low purity samples} 6 | \usage{ 7 | callAmplificationsInLowPurity( 8 | res, 9 | normalDB, 10 | pvalue.cutoff = 0.001, 11 | percentile.cutoff = 90, 12 | min.width = 3, 13 | all.genes = FALSE, 14 | purity = NULL, 15 | BPPARAM = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 20 | 21 | \item{normalDB}{Normal database, created with 22 | \code{\link{createNormalDatabase}}.} 23 | 24 | \item{pvalue.cutoff}{Copy numbers log-ratio cutoffs to call 25 | amplifications as calculating using the log-ratios observed in 26 | \code{normalDB}} 27 | 28 | \item{percentile.cutoff}{Only report genes with log2-ratio mean 29 | exceeding this sample-wise cutoff.} 30 | 31 | \item{min.width}{Minimum number of targets} 32 | 33 | \item{all.genes}{If \code{FALSE}, then only return amplifications 34 | passing the thresholds.} 35 | 36 | \item{purity}{If not \code{NULL}, then scale log2-ratios to the 37 | corresponding integer copy number. Useful when accurate ctDNA 38 | fractions (between 4-10 percent) are available.} 39 | 40 | \item{BPPARAM}{\code{BiocParallelParam} object. If \code{NULL}, does not 41 | use parallelization for fitting local optima.} 42 | } 43 | \value{ 44 | A \code{data.frame} with gene-level amplification calls. 45 | } 46 | \description{ 47 | Function to extract amplification from a 48 | \code{\link{runAbsoluteCN}} return object in samples of too low purity 49 | for the standard \code{\link{callAlterations}}. 
50 | } 51 | \examples{ 52 | 53 | data(purecn.example.output) 54 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 55 | package = "PureCN") 56 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 57 | package = "PureCN") 58 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 59 | normalDB <- createNormalDatabase(normal.coverage.files) 60 | callAmplificationsInLowPurity(purecn.example.output, normalDB)["EIF2A", ] 61 | 62 | } 63 | \seealso{ 64 | \code{\link{runAbsoluteCN}} \code{\link{callAlterations}} 65 | } 66 | \author{ 67 | Markus Riester 68 | } 69 | -------------------------------------------------------------------------------- /man/calculateMappingBiasVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateMappingBiasVcf.R 3 | \name{calculateMappingBiasVcf} 4 | \alias{calculateMappingBiasVcf} 5 | \title{Calculate Mapping Bias} 6 | \usage{ 7 | calculateMappingBiasVcf( 8 | normal.panel.vcf.file, 9 | min.normals = 1, 10 | min.normals.betafit = 7, 11 | min.normals.assign.betafit = 3, 12 | min.normals.position.specific.fit = 10, 13 | min.median.coverage.betafit = 5, 14 | num.betafit.clusters = 9, 15 | min.betafit.rho = 1e-04, 16 | max.betafit.rho = 0.2, 17 | yieldSize = 50000, 18 | genome 19 | ) 20 | } 21 | \arguments{ 22 | \item{normal.panel.vcf.file}{\code{character(1)} Combined VCF file of 23 | a panel of normals, reference and alt counts as AD genotype field. 24 | Needs to be compressed and indexed with bgzip and tabix, respectively.} 25 | 26 | \item{min.normals}{Minimum number of normals with heterozygous SNP for 27 | calculating position-specific mapping bias.} 28 | 29 | \item{min.normals.betafit}{Minimum number of normals with heterozygous SNP 30 | fitting a beta binomial distribution} 31 | 32 | \item{min.normals.assign.betafit}{Minimum number of normals with 33 | heterozygous SNPs to assign to a beta binomal fit cluster} 34 | 35 | \item{min.normals.position.specific.fit}{Minimum normals to use 36 | position-specific beta-binomial fits. Otherwise only clustered fits are 37 | used.} 38 | 39 | \item{min.median.coverage.betafit}{Minimum median coverage of normals with 40 | heterozygous SNP for fitting a beta binomial distribution} 41 | 42 | \item{num.betafit.clusters}{Maximum number of beta binomial fit clusters} 43 | 44 | \item{min.betafit.rho}{Minimum dispersion factor rho} 45 | 46 | \item{max.betafit.rho}{Maximum dispersion factor rho} 47 | 48 | \item{yieldSize}{See \code{TabixFile}} 49 | 50 | \item{genome}{See \code{readVcf}} 51 | } 52 | \value{ 53 | A \code{GRanges} object with mapping bias and number of normal 54 | samples with this variant. 55 | } 56 | \description{ 57 | Function calculate mapping bias for each variant in the provided 58 | panel of normals VCF. 
59 | } 60 | \examples{ 61 | 62 | normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", 63 | package = "PureCN") 64 | bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19") 65 | saveRDS(bias, "mapping_bias.rds") 66 | 67 | } 68 | \author{ 69 | Markus Riester 70 | } 71 | -------------------------------------------------------------------------------- /man/callAlterationsFromSegmentation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callAlterations.R 3 | \name{callAlterationsFromSegmentation} 4 | \alias{callAlterationsFromSegmentation} 5 | \title{Calling of amplifications and deletions from segmentations} 6 | \usage{ 7 | callAlterationsFromSegmentation( 8 | sampleid, 9 | chr, 10 | start, 11 | end, 12 | num.mark = NA, 13 | seg.mean, 14 | C, 15 | interval.file, 16 | fun.focal = findFocal, 17 | args.focal = list(), 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{sampleid}{The sampleid column in the segmentation file.} 23 | 24 | \item{chr}{The chromosome column.} 25 | 26 | \item{start}{The start positions of the segments.} 27 | 28 | \item{end}{The end positions of the segments.} 29 | 30 | \item{num.mark}{Optionally, the number of probes or markers in each segment.} 31 | 32 | \item{seg.mean}{The segment mean.} 33 | 34 | \item{C}{The segment integer copy number.} 35 | 36 | \item{interval.file}{A mapping file that assigns GC content and gene symbols 37 | to each exon in the coverage files. Used for generating gene-level calls. 38 | First column in format CHR:START-END. Second column GC content (0 to 1). 39 | Third column gene symbol. This file is generated with the 40 | \code{\link{preprocessIntervals}} function.} 41 | 42 | \item{fun.focal}{Function for identifying focal amplifications. Defaults to 43 | \code{\link{findFocal}}.} 44 | 45 | \item{args.focal}{Arguments for focal amplification function.} 46 | 47 | \item{\dots}{Arguments passed to \code{\link{callAlterations}}.} 48 | } 49 | \value{ 50 | A list of \code{\link{callAlterations}} \code{data.frame} objects, 51 | one for each sample. 52 | } 53 | \description{ 54 | This function can be used to obtain gene-level copy number calls from 55 | segmentations. This is useful for comparing PureCN's segmentations with 56 | segmentations obtained by different tools on the gene-level. Segmentation 57 | file can contain multiple samples. 
58 | } 59 | \examples{ 60 | 61 | data(purecn.example.output) 62 | seg <- purecn.example.output$results[[1]]$seg 63 | interval.file <- system.file("extdata", "example_intervals.txt", 64 | package = "PureCN") 65 | 66 | calls <- callAlterationsFromSegmentation(sampleid = seg$ID, chr = seg$chrom, 67 | start = seg$loc.start, end = seg$loc.end, num.mark = seg$num.mark, 68 | seg.mean = seg$seg.mean, C = seg$C, interval.file = interval.file) 69 | 70 | } 71 | \author{ 72 | Markus Riester 73 | } 74 | -------------------------------------------------------------------------------- /man/calculateMappingBiasGatk4.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateMappingBiasVcf.R 3 | \name{calculateMappingBiasGatk4} 4 | \alias{calculateMappingBiasGatk4} 5 | \title{Calculate Mapping Bias from GATK4 GenomicsDB} 6 | \usage{ 7 | calculateMappingBiasGatk4( 8 | workspace, 9 | reference.genome, 10 | min.normals = 1, 11 | min.normals.betafit = 7, 12 | min.normals.assign.betafit = 3, 13 | min.normals.position.specific.fit = 10, 14 | min.median.coverage.betafit = 5, 15 | num.betafit.clusters = 9, 16 | min.betafit.rho = 1e-04, 17 | max.betafit.rho = 0.2, 18 | AF.info.field = "AF" 19 | ) 20 | } 21 | \arguments{ 22 | \item{workspace}{Path to the GenomicsDB created by \code{GenomicsDBImport}} 23 | 24 | \item{reference.genome}{Reference FASTA file.} 25 | 26 | \item{min.normals}{Minimum number of normals with heterozygous SNP for 27 | calculating position-specific mapping bias.} 28 | 29 | \item{min.normals.betafit}{Minimum number of normals with heterozygous SNP 30 | fitting a beta distribution} 31 | 32 | \item{min.normals.assign.betafit}{Minimum number of normals with 33 | heterozygous SNPs to assign to a beta binomal fit cluster} 34 | 35 | \item{min.normals.position.specific.fit}{Minimum normals to use 36 | position-specific beta-binomial fits. Otherwise only clustered fits are 37 | used.} 38 | 39 | \item{min.median.coverage.betafit}{Minimum median coverage of normals with 40 | heterozygous SNP for fitting a beta distribution} 41 | 42 | \item{num.betafit.clusters}{Maximum number of beta binomial fit clusters} 43 | 44 | \item{min.betafit.rho}{Minimum dispersion factor rho} 45 | 46 | \item{max.betafit.rho}{Maximum dispersion factor rho} 47 | 48 | \item{AF.info.field}{Field in the \code{workspace} that stores the allelic 49 | fraction} 50 | } 51 | \value{ 52 | A \code{GRanges} object with mapping bias and number of normal 53 | samples with this variant. 54 | } 55 | \description{ 56 | Function calculate mapping bias for each variant in the provided 57 | panel of normals GenomicsDB. 
58 | } 59 | \examples{ 60 | 61 | \dontrun{ 62 | resources_file <- system.file("extdata", "gatk4_pon_db.tgz", 63 | package = "PureCN") 64 | tmp_dir <- tempdir() 65 | untar(resources_file, exdir = tmp_dir) 66 | workspace <- file.path(tmp_dir, "gatk4_pon_db") 67 | bias <- calculateMappingBiasGatk4(workspace, "hg19") 68 | saveRDS(bias, "mapping_bias.rds") 69 | unlink(tmp_dir, recursive=TRUE) 70 | } 71 | 72 | } 73 | \author{ 74 | Markus Riester 75 | } 76 | -------------------------------------------------------------------------------- /inst/extdata/example_seg.txt: -------------------------------------------------------------------------------- 1 | ID chrom loc.start loc.end num.mark seg.mean 2 | Sample1 1 1216044 248722319 933 0.133381833060556 3 | Sample1 2 1638036 231775198 707 -0.417889405204461 4 | Sample1 2 236403412 241737117 93 0.0831 5 | Sample1 3 11832017 149470198 436 0.1151 6 | Sample1 3 150264604 151542537 18 1.447 7 | Sample1 3 151545662 195938114 80 0.1254 8 | Sample1 4 843512 70146579 133 0.1301 9 | Sample1 4 75673305 77700146 39 -0.3815 10 | Sample1 4 81188156 108831608 44 0.8534 11 | Sample1 4 110635592 186611721 139 0.0788 12 | Sample1 5 442758 10761153 38 0.1437 13 | Sample1 5 38869183 180687408 359 -0.470303870967742 14 | Sample1 6 2623865 144219759 293 -0.4282 15 | Sample1 6 144224235 170862274 117 0.2096 16 | Sample1 7 938572 14028655 56 0.1258 17 | Sample1 7 23286512 23313764 11 1.5421 18 | Sample1 7 26232167 156469231 309 -0.4089 19 | Sample1 8 6264200 145537891 337 -0.4565 20 | Sample1 9 214953 139440208 369 -0.417889405204461 21 | Sample1 10 323391 72576623 233 0.0987 22 | Sample1 10 72604313 72645621 16 -0.4625 23 | Sample1 10 74768015 75000741 29 0.0539 24 | Sample1 10 82300671 82403793 7 -1.4644 25 | Sample1 10 85982056 88768887 12 0.8815 26 | Sample1 10 91066426 99790218 36 -1.301 27 | Sample1 10 102283640 102289566 5 1.0576 28 | Sample1 10 103541552 121214530 73 -1.4029 29 | Sample1 10 124591880 134121207 24 -0.049 30 | Sample1 11 2291272 34378689 106 -0.470303870967742 31 | Sample1 11 36614927 44081429 15 0.2505 32 | Sample1 11 46880700 57317513 71 -0.388 33 | Sample1 11 57947383 65172437 77 0.0606 34 | Sample1 11 65340286 66335024 33 -0.4986 35 | Sample1 11 66335504 71847083 27 0.7708 36 | Sample1 11 71850100 77907628 66 0.1291 37 | Sample1 11 77909048 77924776 9 0.7195 38 | Sample1 11 82536056 134134828 152 -0.3788 39 | Sample1 12 1740561 99126271 379 0.8812 40 | Sample1 12 113537804 124428836 212 0.1012 41 | Sample1 13 20398996 114438189 329 0.8029 42 | Sample1 14 20757846 101349087 318 0.112465182186235 43 | Sample1 15 27216709 99926271 394 0.133381833060556 44 | Sample1 16 230533 31123514 225 0.0953 45 | Sample1 16 56899289 56947247 25 0.7182 46 | Sample1 16 57507348 57722319 20 -0.4244 47 | Sample1 16 66918983 90038049 183 0.1368 48 | Sample1 17 1399145 76832319 621 0.133381833060556 49 | Sample1 17 77768896 80559277 24 -0.4355 50 | Sample1 18 5394737 71825663 132 -0.4495 51 | Sample1 19 1481982 57301280 496 0.133381833060556 52 | Sample1 20 207959 62610775 330 0.1101 53 | Sample1 21 11098731 47865219 176 0.112465182186235 54 | Sample1 22 17443695 45996257 148 0.1203 55 | Sample1 22 50703417 51066096 20 -0.6394 56 | -------------------------------------------------------------------------------- /man/createNormalDatabase.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createNormalDatabase.R 3 | \name{createNormalDatabase} 4 | 
\alias{createNormalDatabase} 5 | \title{Create database of normal samples} 6 | \usage{ 7 | createNormalDatabase( 8 | normal.coverage.files, 9 | sex = NULL, 10 | coverage.outliers = c(0.25, 4), 11 | min.coverage = 0.25, 12 | max.missing = 0.03, 13 | low.coverage = 15, 14 | optimal.off.target.counts = 120, 15 | plot = FALSE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{normal.coverage.files}{Vector with file names pointing to 21 | coverage files of normal samples.} 22 | 23 | \item{sex}{\code{character(length(normal.coverage.files))} with sex for all 24 | files. \code{F} for female, \code{M} for male. If all chromosomes are 25 | diploid, specify \code{diploid}. If \code{NULL}, determine from coverage.} 26 | 27 | \item{coverage.outliers}{Exclude samples with coverages below or above 28 | the specified cutoffs (fractions of the normal sample coverages median). 29 | Only for databases with more than 5 samples.} 30 | 31 | \item{min.coverage}{Exclude intervals with coverage lower than 32 | the specified fraction of the chromosome median in the pool of normals.} 33 | 34 | \item{max.missing}{Exclude intervals with zero coverage in the 35 | specified fraction of normal samples.} 36 | 37 | \item{low.coverage}{Specifies the maximum number of total reads 38 | (NOT average coverage) to call a target low coverage.} 39 | 40 | \item{optimal.off.target.counts}{Used to suggest an optimal off-target 41 | interval width (BETA).} 42 | 43 | \item{plot}{Diagnostics plot, useful to tune parameters.} 44 | 45 | \item{\dots}{Arguments passed to the \code{prcomp} function.} 46 | } 47 | \value{ 48 | A normal database that can be used in the 49 | \code{\link{calculateTangentNormal}} function to retrieve a coverage 50 | normalization sample for a given tumor sample. 51 | } 52 | \description{ 53 | Function to create a database of normal samples, used to normalize 54 | tumor coverages. 55 | } 56 | \examples{ 57 | 58 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 59 | package = "PureCN") 60 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 61 | package = "PureCN") 62 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 63 | normalDB <- createNormalDatabase(normal.coverage.files) 64 | 65 | } 66 | \seealso{ 67 | \code{\link{calculateTangentNormal}} 68 | } 69 | \author{ 70 | Markus Riester 71 | } 72 | -------------------------------------------------------------------------------- /R/readIntervalFile.R: -------------------------------------------------------------------------------- 1 | #' Read interval file 2 | #' 3 | #' Read file containing coordinates of on- and off-target intervals 4 | #' generated by \code{\link{preprocessIntervals}}. 5 | #' 6 | #' @param interval.file A mapping file that assigns GC content and gene symbols 7 | #' to each exon in the coverage files. Used for generating gene-level calls. 8 | #' First column in format CHR:START-END. Second column GC content (0 to 1). 9 | #' Third column gene symbol. This file is generated with the 10 | #' \code{\link{preprocessIntervals}} function. 11 | #' @param strict Error out with missing columns 12 | #' @param verbose Verbose output 13 | #' @return A \code{GRanges} object with the parsed intervals. 
14 | #' @author Markus Riester 15 | #' @examples 16 | #' 17 | #' interval.file <- system.file("extdata", "example_intervals.txt", 18 | #' package = "PureCN") 19 | #' x <- readIntervalFile(interval.file) 20 | #' 21 | #' @export readIntervalFile 22 | readIntervalFile <- function(interval.file, strict = TRUE, verbose = TRUE) { 23 | con <- file(interval.file, open = "r") 24 | header <- .parseGATKHeader(con) 25 | intervals <- read.delim(con, header = FALSE, stringsAsFactors = FALSE) 26 | colnames(intervals) <- strsplit(header$last_line, "\t")[[1]] 27 | close(con) 28 | if (is.null(intervals$gc_bias) && strict) { 29 | .stopUserError("No gc_bias column in interval.file.") 30 | } 31 | if (is.null(intervals$Gene)) { 32 | if (verbose) flog.info("No Gene column in interval.file. You won't get gene-level calls.") 33 | intervals$Gene <- "." 34 | } 35 | if (is.null(intervals$on_target)) { 36 | if (verbose) flog.info("No on_target column in interval.file. Recreate this file with IntervalFile.R.") 37 | intervals$on_target <- TRUE 38 | } 39 | if (is.null(intervals$mappability)) { 40 | if (verbose) flog.info("No mappability column in interval.file.") 41 | intervals$mappability <- 1 42 | } 43 | if (is.null(intervals$reptiming)) { 44 | if (verbose) flog.info("No reptiming column in interval.file.") 45 | intervals$reptiming <- NA 46 | } 47 | 48 | gr <- GRanges(intervals[, 1], ranges = NULL, strand = NULL, intervals[, -1]) 49 | gr <- sort(sortSeqlevels(gr)) 50 | # TODO cleanup 51 | gr$on.target <- gr$on_target 52 | gr$on_target <- NULL 53 | 54 | if (length(header$sl)) { 55 | header$sl <- sapply(header$sl, as.numeric) 56 | seqlengths(gr) <- header$sl[names(seqlengths(gr))] 57 | } 58 | return(gr) 59 | } 60 | -------------------------------------------------------------------------------- /man/getSexFromVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSex.R 3 | \name{getSexFromVcf} 4 | \alias{getSexFromVcf} 5 | \title{Get sample sex from a VCF file} 6 | \usage{ 7 | getSexFromVcf( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | min.or = 4, 11 | min.or.na = 2.5, 12 | max.pv = 0.001, 13 | homozygous.cutoff = 0.95, 14 | af.cutoff = 0.2, 15 | min.coverage = 15, 16 | use.somatic.status = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{vcf}{CollapsedVCF object, read in with the \code{readVcf} function 21 | from the VariantAnnotation package.} 22 | 23 | \item{tumor.id.in.vcf}{The tumor id in the CollapsedVCF (optional).} 24 | 25 | \item{min.or}{Minimum odds-ratio to call sample as male. If p-value is not 26 | significant due to a small number of SNPs on chromosome X, sample will be 27 | called as NA even when odds-ratio exceeds this cutoff.} 28 | 29 | \item{min.or.na}{Minimum odds-ratio to not call a sample. Odds-ratios in the 30 | range \code{min.or.na} to \code{min.or} define a grey area in which samples 31 | are not called. Contamination can be a source of ambiguous calls.} 32 | 33 | \item{max.pv}{Maximum Fisher's exact p-value to call sample as male.} 34 | 35 | \item{homozygous.cutoff}{Minimum allelic fraction to call position 36 | homozygous.} 37 | 38 | \item{af.cutoff}{Remove all SNVs with allelic fraction lower than the 39 | specified value.} 40 | 41 | \item{min.coverage}{Minimum coverage in tumor. 
Variants with lower coverage 42 | are ignored.} 43 | 44 | \item{use.somatic.status}{If somatic status and germline data is available, 45 | then exclude somatic variants.} 46 | } 47 | \value{ 48 | Returns a \code{character(1)} with \code{M} for male, \code{F} for 49 | female, or \code{NA} if unknown. 50 | } 51 | \description{ 52 | This function detects non-random distribution of homozygous variants on 53 | chromosome X compared to all other chromosomes. A non-significant Fisher's 54 | exact p-value indicates more than one chromosome X copy. This function is 55 | called in runAbsoluteCN as sanity check when a VCF is provided. It is also 56 | useful for determining sex when no sex marker genes on chrY (e.g. AMELY) are 57 | available. 58 | } 59 | \examples{ 60 | 61 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 62 | vcf <- readVcf(vcf.file, "hg19") 63 | # This example vcf is filtered and contains no homozygous calls, 64 | # which are necessary for determining sex from chromosome X. 65 | getSexFromVcf(vcf) 66 | 67 | } 68 | \seealso{ 69 | \code{\link{getSexFromCoverage}} 70 | } 71 | \author{ 72 | Markus Riester 73 | } 74 | -------------------------------------------------------------------------------- /man/setPriorVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/setPriorVcf.R 3 | \name{setPriorVcf} 4 | \alias{setPriorVcf} 5 | \title{Set Somatic Prior VCF} 6 | \usage{ 7 | setPriorVcf( 8 | vcf, 9 | prior.somatic = c(0.5, 5e-04, 0.999, 1e-04, 0.995, 0.5), 10 | tumor.id.in.vcf = NULL, 11 | min.cosmic.cnt = 6, 12 | DB.info.flag = "DB", 13 | Cosmic.CNT.info.field = "Cosmic.CNT" 14 | ) 15 | } 16 | \arguments{ 17 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 18 | function from the VariantAnnotation package.} 19 | 20 | \item{prior.somatic}{Prior probabilities for somatic mutations. First value 21 | is for the case when no matched normals are available and the variant is not 22 | in germline databases (second value). Third value is for variants with MuTect 23 | somatic call. Different from 1, because somatic mutations in segments of copy 24 | number 0 have 0 probability and artifacts can thus have dramatic influence on 25 | likelihood score. Forth value is for variants not labeled as somatic by 26 | MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it 27 | will set the prior probability for variants with CNT > 6 to the first of 28 | those values in case of no matched normal available (0.995 default). Final 29 | value is for the case that variant is in both germline databases and 30 | COSMIC count > 6.} 31 | 32 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 33 | VCF.} 34 | 35 | \item{min.cosmic.cnt}{Minimum number of hits in the COSMIC database to 36 | call variant as likely somatic.} 37 | 38 | \item{DB.info.flag}{Flag in INFO of VCF that marks presence in common 39 | germline databases. Defaults to \code{DB} that may contain somatic variants 40 | if it is from an unfiltered germline database.} 41 | 42 | \item{Cosmic.CNT.info.field}{Info field containing hits in the Cosmic database} 43 | } 44 | \value{ 45 | The \code{vcf} with \code{numeric(nrow(vcf))} vector with the 46 | prior probability of somatic status for each variant in the 47 | \code{CollapsedVCF} added to the \code{INFO} field \code{PR}. 
48 | } 49 | \description{ 50 | Function to set prior for somatic mutation status for each variant in the 51 | provided \code{CollapsedVCF} object. 52 | } 53 | \examples{ 54 | 55 | # This function is typically only called by runAbsoluteCN via the 56 | # fun.setPriorVcf and args.setPriorVcf comments. 57 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 58 | vcf <- readVcf(vcf.file, "hg19") 59 | vcf <- setPriorVcf(vcf) 60 | 61 | } 62 | \author{ 63 | Markus Riester 64 | } 65 | -------------------------------------------------------------------------------- /R/findFocal.R: -------------------------------------------------------------------------------- 1 | #' Find focal amplifications 2 | #' 3 | #' Function to find focal amplifications in segmented data. This is 4 | #' automatically called in \code{\link{runAbsoluteCN}}. 5 | #' 6 | #' 7 | #' @param seg Segmentation data. 8 | #' @param max.size Cutoff for focal in base pairs. 9 | #' @param cn.diff Minimum copy number delta between neighboring segments. 10 | #' @param min.amp.cn Minimum amplification integer copy number. Segments with 11 | #' lower copy number are not tested. 12 | #' @return \code{logical(n)}, indicating for all n segments whether they are 13 | #' focally amplified or not. 14 | #' @author Markus Riester 15 | #' @seealso \code{\link{runAbsoluteCN}} 16 | #' @examples 17 | #' 18 | #' normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 19 | #' package = "PureCN") 20 | #' tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 21 | #' package = "PureCN") 22 | #' vcf.file <- system.file("extdata", "example.vcf.gz", 23 | #' package = "PureCN") 24 | #' interval.file <- system.file("extdata", "example_intervals_tiny.txt", 25 | #' package = "PureCN") 26 | #' 27 | #' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 28 | #' # non-default values to speed-up this example. This is not a good idea for real 29 | #' # samples. 
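#' # A minimal direct call on a toy segmentation: findFocal() only needs the
#' # integer copy number (C) and segment size columns; the values below are
#' # made up purely for illustration.
#' seg.toy <- data.frame(C = c(2, 8, 2), size = c(5e7, 1e6, 5e7))
#' findFocal(seg.toy)
#' # (returns FALSE TRUE FALSE: only the small, high-copy middle segment is focal)
#'
#' # In a full run, findFocal is passed to runAbsoluteCN via fun.focal and
#' # args.focal: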
30 | #' ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, 31 | #' tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, 32 | #' genome="hg19", sampleid = "Sample1", interval.file = interval.file, 33 | #' max.candidate.solutions = 1, max.ploidy = 4, 34 | #' test.purity = seq(0.3, 0.7, by = 0.05), 35 | #' args.focal=list(max.size = 2e+06), fun.focal = findFocal) 36 | #' 37 | #' @export findFocal 38 | findFocal <- function(seg, max.size = 3000000, cn.diff = 2, min.amp.cn = 5) { 39 | focal <- rep(FALSE, nrow(seg)) 40 | for (i in seq_len(nrow(seg))) { 41 | if (seg$C[i] < min.amp.cn) next 42 | if (seg$size[i] > max.size) next 43 | size <- seg$size[i] 44 | if (i > 1) { 45 | for (j in (i - 1):1) { 46 | if (seg$C[j] < seg$C[i] - cn.diff) { 47 | break 48 | } 49 | size <- size + seg$size[j] 50 | } 51 | } 52 | if (i < nrow(seg)) { 53 | for (j in (i + 1):nrow(seg)) { 54 | if (seg$C[j] < seg$C[i] - cn.diff) { 55 | break 56 | } 57 | size <- size + seg$size[j] 58 | } 59 | } 60 | focal[i] <- size < max.size 61 | } 62 | focal 63 | } 64 | -------------------------------------------------------------------------------- /tests/testthat/test_correctCoverageBias.R: -------------------------------------------------------------------------------- 1 | context("correctCoverageBias") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 4 | package = "PureCN") 5 | interval.file <- system.file("extdata", "ex2_intervals.txt", 6 | package = "PureCN", mustWork = TRUE) 7 | interval.file2 <- system.file("extdata", "example_intervals.txt", 8 | package = "PureCN") 9 | 10 | test_that("Example data matches after normalization", { 11 | output.file <- tempfile(fileext = ".txt") 12 | coverage <- correctCoverageBias(normal.coverage.file, interval.file2, 13 | output.file = output.file) 14 | expect_equal(class(coverage)[1], "GRanges") 15 | expect_equal(length(coverage), 10049) 16 | correctCoverageBias(normal.coverage.file, interval.file2, plot.max.density = 100, 17 | plot.bias = TRUE) 18 | x <- readCoverageFile(output.file) 19 | expect_equal(x$average.coverage, coverage$average.coverage) 20 | correctCoverageBias(head(x, 200), interval.file2) 21 | gc.data <- read.delim(interval.file2, as.is = TRUE) 22 | gc.data$Gene <- NULL 23 | tmpFile <- tempfile() 24 | write.table(gc.data, file = tmpFile, row.names = FALSE, quote = FALSE, 25 | sep = "\t") 26 | coverage2 <- correctCoverageBias(normal.coverage.file, tmpFile, 27 | output.file = output.file) 28 | corCov <- cor(coverage$average.coverage, coverage2$average.coverage, 29 | use = "complete.obs") 30 | expect_true(corCov > 0.99) 31 | }) 32 | 33 | test_that("Exceptions happen with wrong input", { 34 | expect_error(correctCoverageBias(normal.coverage.file, interval.file)) 35 | coverage <- readCoverageFile(normal.coverage.file) 36 | coverage$average.coverage <- 0 37 | expect_error(correctCoverageBias(coverage, interval.file), "zero") 38 | }) 39 | 40 | test_that("Example data qc matches", { 41 | output.qc.file <- tempfile(fileext = ".txt") 42 | coverage <- correctCoverageBias(normal.coverage.file, interval.file2, 43 | output.qc.file = output.qc.file) 44 | x <- read.delim(output.qc.file, sep=" ") 45 | expect_equal(1, nrow(x)) 46 | expect_equal(10, ncol(x)) 47 | file.remove(output.qc.file) 48 | }) 49 | 50 | test_that("Example data without reptiming works", { 51 | x <- read.delim(interval.file2) 52 | interval.file3 <- tempfile(fileext = ".txt") 53 | x$reptiming <- NULL 54 | write.table(x, file=interval.file3, row.names=FALSE, quote=FALSE, sep="\t") 
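    # The interval file written above lacks the optional reptiming column;
    # correctCoverageBias() should still accept it.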
55 | coverage <- correctCoverageBias(normal.coverage.file, interval.file3, 56 | plot.bias=TRUE) 57 | expect_equal(nrow(x), length(coverage)) 58 | file.remove(interval.file3) 59 | }) 60 | -------------------------------------------------------------------------------- /tests/testthat/test_calculateBamCoverageByInterval.R: -------------------------------------------------------------------------------- 1 | context("calculateBamCoverageByInterval") 2 | 3 | output.file <- tempfile(fileext = ".txt") 4 | 5 | test_that("Coverage from test BAM file matches", { 6 | bam.file <- system.file("extdata", "ex1.bam", package = "PureCN", 7 | mustWork = TRUE) 8 | interval.file <- system.file("extdata", "ex1_intervals.txt", 9 | package = "PureCN", mustWork = TRUE) 10 | coverage <- calculateBamCoverageByInterval(bam.file = bam.file, 11 | interval.file = interval.file, output.file = output.file) 12 | expect_equal(coverage$average.coverage, c(20.95205, 43.78357, 13 | 21.29271), tolerance = 0.01) 14 | expect_equal(coverage$counts, c(610, 1158, 636), tolerance = 0.01) 15 | expect_equal(unlist(coverage$duplication.rate), rep(0, 3), check.names = FALSE) 16 | }) 17 | 18 | test_that("Coverage from test BAM file matches", { 19 | bam.file <- system.file("extdata", "ex1.bam", package = "PureCN", 20 | mustWork = TRUE) 21 | interval.file <- system.file("extdata", "ex1_intervals_headered.txt", 22 | package = "PureCN", mustWork = TRUE) 23 | coverage <- calculateBamCoverageByInterval(bam.file = bam.file, 24 | interval.file = interval.file) 25 | expect_equal(coverage$average.coverage, c(37.49301, 43.78357, 39.10000), 26 | tolerance = 0.01) 27 | expect_equal(coverage$counts, c(568, 1158, 595), tolerance = 0.01) 28 | }) 29 | 30 | 31 | test_that("Coverage output is correct", { 32 | x <- readCoverageFile(output.file) 33 | expect_equal(x$average.coverage, c(20.95205, 43.78357, 21.29271), 34 | tolerance = 0.01) 35 | expect_equal(x$counts, c(610, 1158, 636), tolerance = 0.01) 36 | interval.file <- system.file("extdata", "example_intervals.txt", 37 | package = "PureCN") 38 | expect_error(correctCoverageBias(x, interval.file)) 39 | }) 40 | 41 | test_that("Reading BAM in chunks works", { 42 | fl <- system.file("extdata", "ex1.bam", package = "Rsamtools", 43 | mustWork = TRUE) 44 | res0 <- scanBam(fl)[[1]] # always list-of-lists 45 | idx <- sort(sample(length(res0[[1]]), 300)) 46 | idx <- idx[!is.na(res0$pos[idx])] 47 | x <- GRanges(seqnames = res0$rname[idx], 48 | IRanges(start = res0$pos[idx], end = res0$pos[idx] + 20)) 49 | x$Gene <- "." 
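    # Together with Gene above, the columns below complete the annotation set
    # (on.target, gc_bias, mappability, reptiming) that PureCN interval files carry.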
50 | x$on.target <- TRUE 51 | x$gc_bias <- NA 52 | x$mappability <- NA 53 | x$reptiming <- NA 54 | f2 <- tempfile() 55 | suppressWarnings(PureCN:::.writeIntervals(x, f2)) 56 | r1 <- calculateBamCoverageByInterval(fl, f2) 57 | r2 <- calculateBamCoverageByInterval(fl, f2, chunks = 3) 58 | file.remove(f2) 59 | expect_equal(as.character(r1), as.character(x)) 60 | expect_equal(as.character(r2), as.character(x)) 61 | expect_equivalent(r1$counts, r2$counts) 62 | }) 63 | file.remove(output.file) 64 | -------------------------------------------------------------------------------- /man/calculatePowerDetectSomatic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/powerDetectSomatic.R 3 | \name{calculatePowerDetectSomatic} 4 | \alias{calculatePowerDetectSomatic} 5 | \title{Power calculation for detecting somatic mutations} 6 | \usage{ 7 | calculatePowerDetectSomatic( 8 | coverage, 9 | f = NULL, 10 | purity = NULL, 11 | ploidy = NULL, 12 | cell.fraction = 1, 13 | error = 0.001, 14 | fpr = 5e-07, 15 | verbose = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{coverage}{Mean sequencing coverage.} 20 | 21 | \item{f}{Mean expected allelic fraction. If \code{NULL}, requires purity and 22 | ploidy and then calculates the expected fraction.} 23 | 24 | \item{purity}{Purity of sample. Only required when \code{f} is \code{NULL}.} 25 | 26 | \item{ploidy}{Ploidy of sample. Only required when \code{f} is \code{NULL}.} 27 | 28 | \item{cell.fraction}{Fraction of cells harboring mutation. Ignored if 29 | \code{f} is not \code{NULL}.} 30 | 31 | \item{error}{Estimated sequencing error rate.} 32 | 33 | \item{fpr}{Required false positive rate for mutation vs. sequencing error.} 34 | 35 | \item{verbose}{Verbose output.} 36 | } 37 | \value{ 38 | A list with elements \item{power}{Power to detect somatic 39 | mutations.} \item{k}{Minimum number of supporting reads.} \item{f}{Expected 40 | allelic fraction. } 41 | } 42 | \description{ 43 | This function calculates the probability of correctly rejecting the null 44 | hypothesis that an alt allele is a sequencing error rather than a true 45 | (mono-)clonal mutation. 46 | } 47 | \examples{ 48 | 49 | purity <- c(0.1, 0.15, 0.2, 0.25, 0.4, 0.6, 1) 50 | coverage <- seq(5, 35, 1) 51 | power <- lapply(purity, function(p) sapply(coverage, function(cv) 52 | calculatePowerDetectSomatic(coverage = cv, purity = p, ploidy = 2, 53 | verbose = FALSE)$power)) 54 | 55 | # Figure S7b in Carter et al. 56 | plot(coverage, power[[1]], col = 1, xlab = "Sequence coverage", 57 | ylab = "Detection power", ylim = c(0, 1), type = "l") 58 | 59 | for (i in 2:length(power)) lines(coverage, power[[i]], col = i) 60 | abline(h = 0.8, lty = 2, col = "grey") 61 | legend("bottomright", legend = paste("Purity", purity), 62 | fill = seq_along(purity)) 63 | 64 | # Figure S7c in Carter et al. 65 | coverage <- seq(5, 350, 1) 66 | power <- lapply(purity, function(p) sapply(coverage, function(cv) 67 | calculatePowerDetectSomatic(coverage = cv, purity = p, ploidy = 2, 68 | cell.fraction = 0.2, verbose = FALSE)$power)) 69 | plot(coverage, power[[1]], col = 1, xlab = "Sequence coverage", 70 | ylab = "Detection power", ylim = c(0, 1), type = "l") 71 | 72 | for (i in 2:length(power)) lines(coverage, power[[i]], col = i) 73 | abline(h = 0.8, lty = 2, col = "grey") 74 | legend("bottomright", legend = paste("Purity", purity), 75 | fill = seq_along(purity)) 76 | 77 | } 78 | \references{ 79 | Carter et al. 
(2012), Absolute quantification of somatic DNA 80 | alterations in human cancer. Nature Biotechnology. 81 | } 82 | \author{ 83 | Markus Riester 84 | } 85 | -------------------------------------------------------------------------------- /man/segmentationHclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/segmentationHclust.R 3 | \name{segmentationHclust} 4 | \alias{segmentationHclust} 5 | \title{Minimal segmentation function} 6 | \usage{ 7 | segmentationHclust( 8 | seg, 9 | vcf = NULL, 10 | tumor.id.in.vcf = 1, 11 | normal.id.in.vcf = NULL, 12 | min.logr.sdev = 0.15, 13 | prune.hclust.h = NULL, 14 | prune.hclust.method = "ward.D", 15 | chr.hash = NULL, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{seg}{If segmentation was provided by the user, this data structure 21 | will contain this segmentation. Useful for minimal segmentation functions. 22 | Otherwise PureCN will re-segment the data. This segmentation function 23 | ignores this user provided segmentation.} 24 | 25 | \item{vcf}{Optional \code{CollapsedVCF} object with germline allelic ratios.} 26 | 27 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 28 | VCF.} 29 | 30 | \item{normal.id.in.vcf}{Id of normal in VCF. Currently not used.} 31 | 32 | \item{min.logr.sdev}{Minimum log-ratio standard deviation used in the 33 | model. Useful to make fitting more robust to outliers in very clean 34 | data (currently not used in this segmentation function).} 35 | 36 | \item{prune.hclust.h}{Height in the \code{hclust} pruning step. Increasing 37 | this value will merge segments more aggressively. If NULL, try to find a 38 | sensible default.} 39 | 40 | \item{prune.hclust.method}{Cluster method used in the \code{hclust} pruning 41 | step. See documentation for the \code{hclust} function.} 42 | 43 | \item{chr.hash}{Mapping of non-numerical chromosome names to numerical names 44 | (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromosomes are 45 | properly ordered.} 46 | 47 | \item{...}{Currently unused arguments provided to other segmentation 48 | functions.} 49 | } 50 | \value{ 51 | \code{data.frame} containing the segmentation. 52 | } 53 | \description{ 54 | A minimal segmentation function useful when segmentation was performed by 55 | third-party tools. When a \code{CollapsedVCF} with germline SNPs is provided, 56 | it will cluster segments using \code{hclust}. Otherwise it will use the 57 | segmentation as provided. 58 | This function is called via the 59 | \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The 60 | arguments are passed via \code{args.segmentation}.
61 | } 62 | \examples{ 63 | 64 | vcf.file <- system.file("extdata", "example.vcf.gz", 65 | package="PureCN") 66 | interval.file <- system.file("extdata", "example_intervals_tiny.txt", 67 | package="PureCN") 68 | seg.file <- system.file('extdata', 'example_seg.txt', 69 | package = 'PureCN') 70 | 71 | res <- runAbsoluteCN(seg.file = seg.file, 72 | fun.segmentation = segmentationHclust, 73 | max.ploidy = 4, vcf.file = vcf.file, 74 | test.purity = seq(0.3, 0.7, by = 0.05), 75 | max.candidate.solutions = 1, 76 | genome = 'hg19', interval.file = interval.file) 77 | 78 | } 79 | \seealso{ 80 | \code{\link{runAbsoluteCN}} 81 | } 82 | \author{ 83 | Markus Riester 84 | } 85 | -------------------------------------------------------------------------------- /R/createCurationFile.R: -------------------------------------------------------------------------------- 1 | #' Create file to curate PureCN results 2 | #' 3 | #' Function to create a CSV file that can be used to mark the correct solution 4 | #' in the output of a \code{\link{runAbsoluteCN}} run. 5 | #' 6 | #' 7 | #' @param file.rds Output of the \code{\link{runAbsoluteCN}} function, 8 | #' serialized with \code{saveRDS}. 9 | #' @param overwrite.uncurated Overwrite existing files unless flagged as 10 | #' \sQuote{Curated}. 11 | #' @param overwrite.curated Overwrite existing files even if flagged as 12 | #' \sQuote{Curated}. 13 | #' @return A \code{data.frame} with the tumor purity and ploidy of the maximum 14 | #' likelihood solution. 15 | #' @author Markus Riester 16 | #' @seealso \code{\link{runAbsoluteCN}} 17 | #' @examples 18 | #' 19 | #' data(purecn.example.output) 20 | #' file.rds <- "Sample1_PureCN.rds" 21 | #' saveRDS(purecn.example.output, file = file.rds) 22 | #' createCurationFile(file.rds) 23 | #' 24 | #' @export createCurationFile 25 | #' @importFrom utils write.csv 26 | createCurationFile <- function(file.rds, overwrite.uncurated = TRUE, 27 | overwrite.curated = FALSE) { 28 | rds <- readRDS(file.rds) 29 | res <- rds$results[[1]] 30 | contamination <- res$SNV.posterior$posterior.contamination 31 | contamination <- if (is.null(contamination)) 0 else contamination 32 | d.f.curation <- data.frame( 33 | Sampleid = res$seg$ID[1], 34 | Purity = res$purity, 35 | Ploidy = res$ploidy, 36 | Sex = .getSexFromRds(rds), 37 | Contamination = contamination, 38 | Flagged = res$flag, 39 | Failed = FALSE, 40 | Curated = FALSE, 41 | Comment = res$flag_comment 42 | ) 43 | 44 | filename <- file.path(dirname(file.rds), 45 | paste(gsub(".rds$", "", basename(file.rds)), "csv", sep = ".")) 46 | 47 | if (file.exists(filename)) { 48 | tmp <- read.csv(filename, as.is = TRUE) 49 | if (tmp$Curated[1] && !overwrite.curated) { 50 | warning(filename, 51 | " already exists and seems to be edited.", 52 | " Will not overwrite it.") 53 | } else if (!overwrite.uncurated) { 54 | warning(filename, " already exists. Will not overwrite it.") 55 | } else { 56 | write.csv(d.f.curation, file = filename, row.names = FALSE) 57 | } 58 | } else { 59 | write.csv(d.f.curation, file = filename, row.names = FALSE) 60 | } 61 | invisible(d.f.curation) 62 | } 63 | 64 | .getSexFromRds <- function(rds) { 65 | # if run without VCF, then we don't have sex information from VCF 66 | if (is.null(rds$input$sex.vcf)) return(rds$input$sex) 67 | 68 | # conflict of coverage and snp based sex genotyper? 
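    # If both calls agree, report the consensus sex; otherwise report both
    # values so the discrepancy is visible in the curation file.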
69 | if (!is.na(rds$input$sex) && !is.na(rds$input$sex.vcf)) { 70 | if (rds$input$sex == rds$input$sex.vcf) return(rds$input$sex) 71 | return(paste("Coverage:", rds$input$sex, "VCF:", rds$input$sex.vcf)) 72 | } 73 | # believe coverage based more than VCF in case we have only limited 74 | # number of SNPs on chrX 75 | if (!is.na(rds$input$sex)) { 76 | return(rds$input$sex) 77 | } 78 | return(rds$input$sex.vcf) 79 | } 80 | -------------------------------------------------------------------------------- /man/callMutationBurden.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callMutationBurden.R 3 | \name{callMutationBurden} 4 | \alias{callMutationBurden} 5 | \title{Call mutation burden} 6 | \usage{ 7 | callMutationBurden( 8 | res, 9 | id = 1, 10 | remove.flagged = TRUE, 11 | min.prior.somatic = 0.1, 12 | max.prior.somatic = 1, 13 | min.cellfraction = 0, 14 | fun.countMutation = function(vcf) width(vcf) == 1, 15 | callable = NULL, 16 | exclude = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 21 | 22 | \item{id}{Candidate solution to extract mutation burden from. 23 | \code{id=1} will use the maximum likelihood solution.} 24 | 25 | \item{remove.flagged}{Remove variants flagged by 26 | \code{\link{predictSomatic}}.} 27 | 28 | \item{min.prior.somatic}{Exclude variants with somatic prior 29 | probability lower than this cutoff.} 30 | 31 | \item{max.prior.somatic}{Exclude variants with somatic prior 32 | probability higher than this cutoff. This is useful for removing 33 | hotspot mutations in small panels that might inflate the mutation 34 | burden.} 35 | 36 | \item{min.cellfraction}{Exclude variants with cellular fraction 37 | lower than this cutoff. These are sub-clonal mutations or artifacts 38 | with very low allelic fraction.} 39 | 40 | \item{fun.countMutation}{Function that can be used to filter the 41 | input VCF further for filtering, for example to only keep missense 42 | mutations. Expects a \code{logical} vector indicating whether variant 43 | should be counted (\code{TRUE}) or not (\code{FALSE}). Default 44 | is to keep only single nucleotide variants.} 45 | 46 | \item{callable}{\code{GRanges} object with callable genomic regions, 47 | for example obtained by \sQuote{GATK CallableLoci} BED file, imported 48 | with \code{rtracklayer}.} 49 | 50 | \item{exclude}{\code{GRanges} object with genomic regions that 51 | should be excluded from the \code{callable} regions, for example 52 | intronic regions. Requires \code{callable}.} 53 | } 54 | \value{ 55 | Returns \code{data.frame} with mutation counts and sizes 56 | of callable regions. 57 | } 58 | \description{ 59 | This function provides detailed mutation burden information. 60 | } 61 | \examples{ 62 | 63 | data(purecn.example.output) 64 | callMutationBurden(purecn.example.output) 65 | 66 | # To calculate exact mutations per megabase, we can provide a BED 67 | # file containing all callable regions 68 | callableBed <- import(system.file("extdata", "example_callable.bed.gz", 69 | package = "PureCN")) 70 | 71 | # We can exclude some regions for mutation burden calculation, 72 | # for example intronic regions. 
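# The toy exclude region below simply masks chr1 up to the end of the callable
# regions; a real analysis would supply actual intron coordinates instead.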
73 | exclude <- GRanges(seqnames = "chr1", IRanges(start = 1, 74 | end = max(end(callableBed)))) 75 | 76 | # We can also exclude specific mutations by filtering the input VCF 77 | myVcfFilter <- function(vcf) seqnames(vcf)!="chr2" 78 | 79 | callsCallable <- callMutationBurden(purecn.example.output, 80 | callable = callableBed, exclude = exclude, 81 | fun.countMutation = myVcfFilter) 82 | 83 | } 84 | \seealso{ 85 | \code{\link{runAbsoluteCN}} \code{\link{predictSomatic}} 86 | } 87 | \author{ 88 | Markus Riester 89 | } 90 | -------------------------------------------------------------------------------- /tests/testthat/test_readCoverageFile.R: -------------------------------------------------------------------------------- 1 | context("readCoverageFile") 2 | 3 | test_that("Example data matches and pooling works", { 4 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 5 | package = "PureCN") 6 | coverage <- readCoverageFile(tumor.coverage.file) 7 | expect_equal(length(coverage), 10049) 8 | expect_identical("chr1", as.character(seqnames(coverage)[1])) 9 | expect_equal(sum(!is.na(coverage[seqnames(coverage) == "chr21"]$coverage)), 10 | 179) 11 | pool <- poolCoverage(list(coverage)) 12 | expect_equal(pool$average.coverage, coverage$average.coverage) 13 | pool <- poolCoverage(list(coverage), remove.chrs = "chr21") 14 | expect_equal(sum(is.na(pool[seqnames(coverage) == "chr21"]$coverage)), 15 | 179) 16 | }) 17 | 18 | test_that("Overlapping intervals were merged and warned", { 19 | tumor.overlapping.coverage.file <- system.file("extdata", 20 | "test_coverage_overlapping_intervals.txt", package = "PureCN") 21 | expect_output(coverage <- readCoverageFile(tumor.overlapping.coverage.file), 22 | "WARN") 23 | expect_equal(length(coverage), 3) 24 | expect_equal(start(coverage), c(1216042, 1216606, 1216791)) 25 | expect_equal(end(coverage), c(1216050, 1216678, 1217991)) 26 | }) 27 | 28 | test_that("CNVkit *cnn example data is parsed correctly", { 29 | coverageFile <- system.file("extdata", "example_normal3.cnn", 30 | package = "PureCN") 31 | coverage <- readCoverageFile(coverageFile) 32 | expect_equal(length(coverage), 4) 33 | expect_equal(start(coverage), c(762097, 861281, 865591, 866325) + 34 | 1) 35 | expect_equal(end(coverage), c(762270, 861490, 865791, 866498)) 36 | expect_equal(coverage$on.target, c(TRUE, TRUE, TRUE, TRUE)) 37 | coverage <- readCoverageFile(coverageFile, zero = FALSE) 38 | expect_equal(length(coverage), 4) 39 | expect_equal(start(coverage), c(762097, 861281, 865591, 866325)) 40 | expect_equal(end(coverage), c(762270, 861490, 865791, 866498)) 41 | expect_equal(coverage$on.target, c(TRUE, TRUE, TRUE, TRUE)) 42 | }) 43 | 44 | test_that("CNVkit *cnr example data is parsed correctly", { 45 | coverageFile <- system.file("extdata", "example_normal4.cnr", 46 | package = "PureCN") 47 | coverage <- readCoverageFile(coverageFile) 48 | expect_equal(length(coverage), 5) 49 | expect_equal(start(coverage), c(10500, 70509, 227917, 318219, 50 | 367658) + 1) 51 | expect_equal(end(coverage), c(68590, 176917, 267219, 367158, 52 | 367893)) 53 | expect_equal(coverage$on.target, c(FALSE, FALSE, FALSE, FALSE, 54 | TRUE)) 55 | }) 56 | 57 | test_that("GATK4 *hdf5 example data is parsed correctly", { 58 | coverageFile <- system.file("extdata", "example_normal5.hdf5", 59 | package = "PureCN") 60 | coverage <- readCoverageFile(coverageFile) 61 | expect_equal(length(coverage), 10) 62 | expect_equal(head(start(coverage)), 63 | c(3598833, 3599562, 3607444, 3624039, 3638537, 3639872)) 64 | 
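    # Raw read counts stored in the HDF5 file are exposed via the counts column.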
expect_equal(head(coverage$counts), 65 | c(127, 305, 78, 699, 566, 344)) 66 | 67 | expect_equal(head(coverage$on.target), rep(TRUE, 6)) 68 | }) 69 | -------------------------------------------------------------------------------- /R/processMultipleSamples.R: -------------------------------------------------------------------------------- 1 | #' Multi sample normalization and segmentation 2 | #' 3 | #' This function performs normalization and segmentation when multiple 4 | #' samples for the same patient are available. 5 | #' 6 | #' CURRENTLY DEFUNCT BECAUSE IT DEPENDS ON THE DEFUNCT COPYNUMBER PACKAGE. 7 | #' We are working on a replacement. 8 | #' 9 | #' 10 | #' @param tumor.coverage.files Coverage data for tumor samples. 11 | #' @param sampleids Sample ids, used in output files. 12 | #' @param normalDB Database of normal samples, created with 13 | #' \code{\link{createNormalDatabase}}. 14 | #' @param num.eigen Number of eigen vectors used. 15 | #' @param genome Genome version, for example hg19. Needed to get centromere 16 | #' positions. 17 | #' @param plot.cnv Segmentation plots. 18 | #' @param min.interval.weight Can be used to ignore intervals with low weights. 19 | #' @param w Weight of samples. Can be used to downweight poor quality samples. 20 | #' If \code{NULL}, sets to inverse of median on-target duplication rate if 21 | #' available, otherwise does not do any weighting. 22 | #' @param max.segments If not \code{NULL}, try a higher \code{undo.SD} 23 | #' parameter if number of segments exceeds the threshold. 24 | #' @param chr.hash Mapping of non-numerical chromosome names to numerical names 25 | #' (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromosomes are 26 | #' properly ordered. 27 | #' @param centromeres A \code{GRanges} object with centromere positions. 28 | #' @param ... Arguments passed to the segmentation function. 29 | #' @return \code{data.frame} containing the segmentation. 30 | #' @author Markus Riester 31 | #' @references Nilsen G., Liestol K., Van Loo P., Vollan H., Eide M., Rueda O., 32 | #' Chin S., Russell R., Baumbusch L., Caldas C., Borresen-Dale A., 33 | #' Lingjaerde O. (2012). "Copynumber: Efficient algorithms for single- and 34 | #' multi-track copy number segmentation." BMC Genomics, 13(1), 591. 35 | #' 36 | #' @seealso \code{\link{runAbsoluteCN}} 37 | #' @examples 38 | #' 39 | #' normal1.coverage.file <- system.file("extdata", "example_normal.txt.gz", 40 | #' package = "PureCN") 41 | #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 42 | #' package = "PureCN") 43 | #' tumor1.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 44 | #' package = "PureCN") 45 | #' tumor2.coverage.file <- system.file("extdata", "example_tumor2.txt.gz", 46 | #' package = "PureCN") 47 | #' 48 | #' normal.coverage.files <- c(normal1.coverage.file, normal2.coverage.file) 49 | #' tumor.coverage.files <- c(tumor1.coverage.file, tumor2.coverage.file) 50 | #' 51 | #' normalDB <- createNormalDatabase(normal.coverage.files) 52 | #' 53 | #' # seg <- processMultipleSamples(tumor.coverage.files, 54 | #' # sampleids = c("Sample1", "Sample2"), 55 | #' # normalDB = normalDB, 56 | #' # genome = "hg19") 57 | #' 58 | #' @export processMultipleSamples 59 | processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB, 60 | num.eigen = 20, genome, plot.cnv = TRUE, w = NULL, 61 | min.interval.weight = 1 / 3, 62 | max.segments = NULL, chr.hash = NULL, centromeres = NULL, ...)
{ 63 | .Defunct(msg="processMultipleSamples is temporarily defunct") 64 | } 65 | -------------------------------------------------------------------------------- /R/bootstrapResults.R: -------------------------------------------------------------------------------- 1 | #' Bootstrapping variant fits 2 | #' 3 | #' This function bootstraps variants, then optionally re-ranks solutions by 4 | #' using the bootstrap estimate of the likelihood score, and then optionally 5 | #' removes solutions that never ranked high in any bootstrap replicate. 6 | #' 7 | #' 8 | #' @param res Return object of the \code{\link{runAbsoluteCN}} function. 9 | #' @param n Number of bootstrap replicates. 10 | #' @param top Include a solution if it appears in the top \code{top} solutions of 11 | #' any bootstrap replicate. If \code{NULL}, do not filter solutions. 12 | #' @param reorder Reorder results by bootstrap value. 13 | #' @return Returns a \code{\link{runAbsoluteCN}} object with added bootstrap 14 | #' value to each solution. This value 15 | #' is the fraction of bootstrap replicates in which the solution ranked first. 16 | #' @author Markus Riester 17 | #' @seealso \code{\link{runAbsoluteCN}} 18 | #' @examples 19 | #' 20 | #' data(purecn.example.output) 21 | #' ret.boot <- bootstrapResults(purecn.example.output, n=100) 22 | #' plotAbs(ret.boot, type="overview") 23 | #' 24 | #' @export bootstrapResults 25 | #' @importFrom utils head 26 | bootstrapResults <- function(res, n = 500, top = NULL, reorder = FALSE) { 27 | if (length(res$results) < 2) return(res) 28 | if (is.null(top)) top <- length(res$results) 29 | res$results <- .bootstrapResults(res$results, n = n, top = top, 30 | reorder = reorder) 31 | res 32 | } 33 | 34 | .bootstrapResults <- function(results, n, top, reorder) { 35 | ## Sample SNVs with replacement and recalculate log-likelihood. 36 | .bootstrapResult <- function(result) { 37 | lliks <- log(apply(result$SNV.posterior$likelihoods[ 38 | !result$SNV.posterior$posteriors$FLAGGED, ], 1, max)) 39 | lliks <- sum(sample(lliks, replace = TRUE)) 40 | result$log.likelihood + sum(lliks) - 41 | sum(result$SNV.posterior$posteriors$FLAGGED) 42 | } 43 | best <- replicate(n, head(order(sapply(results, .bootstrapResult), 44 | decreasing = TRUE), top)) 45 | 46 | ## Calculate the bootstrap value as the fraction of replicates in which the solution ranked first. 47 | bootstrap.value <- sapply(seq_along(results), function(i) 48 | sum(best[1, ] == i)) / ncol(best) 49 | for (i in seq_along(results)) { 50 | results[[i]]$bootstrap.value <- bootstrap.value[i] 51 | } 52 | 53 | ## Return only solutions that had ranked high in at least one replicate.
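    ## `best` holds, for each bootstrap replicate, the indices of the solutions
    ## ranked among the top `top`.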
54 | best <- as.vector(best) 55 | results <- results[sort(unique(best))] 56 | if (reorder) { 57 | results <- results[order(sapply(results, function(x) x$bootstrap.value), 58 | decreasing = TRUE)] 59 | } 60 | .flagBootstrap(results) 61 | } 62 | 63 | .flagBootstrap <- function(results) { 64 | if (!is.null(results[[1]]$bootstrap.value)) { 65 | # max should be first, but be safe 66 | maxBootstrap <- max(sapply(results, function(r) r$bootstrap.value), 67 | na.rm = TRUE) 68 | if (maxBootstrap < 0.95) { 69 | for (i in seq_along(results)) { 70 | results[[i]]$flag <- TRUE 71 | results[[i]]$flag_comment <- .appendComment( 72 | results[[i]]$flag_comment, "LOW BOOTSTRAP VALUE") 73 | } 74 | } 75 | } 76 | results 77 | } 78 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM bioconductor/bioconductor_docker:RELEASE_3_19 2 | #FROM bioconductor/bioconductor_docker:devel 3 | 4 | # install base packages 5 | RUN Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")}; \ 6 | BiocManager::install(c("TxDb.Hsapiens.UCSC.hg38.knownGene", "TxDb.Hsapiens.UCSC.hg19.knownGene"))' 7 | RUN Rscript -e 'install.packages(c("optparse", "R.utils")); \ 8 | BiocManager::install(c("remotes", "raerose01/deconstructSigs"));' 9 | RUN Rscript -e 'BiocManager::install(c("GenomicRanges", "IRanges", "DNAcopy", "Biostrings", "GenomicFeatures", "rtracklayer",\ 10 | "S4Vectors", "rhdf5", "VariantAnnotation", "Rsamtools", "BiocGenerics"))' 11 | 12 | # patched PSCBS with support of interval weights 13 | RUN Rscript -e 'BiocManager::install("lima1/PSCBS", ref="add_dnacopy_weighting")' 14 | 15 | RUN apt update \ 16 | && apt install -y --no-install-recommends apt-utils python-is-python3 \ 17 | openjdk-17-jre-headless \ 18 | && apt-get clean \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # tex support for building vignettes 22 | # RUN apt update \ 23 | # && apt install -y --no-install-recommends \ 24 | # texlive \ 25 | # texlive-latex-extra \ 26 | # texlive-fonts-extra \ 27 | # texlive-bibtex-extra \ 28 | # texlive-science \ 29 | # texi2html \ 30 | # texinfo \ 31 | # && apt-get clean \ 32 | # && rm -rf /var/lib/apt/lists/* 33 | 34 | # install GenomicsDB 35 | ENV GENOMICSDB_PATH=/opt/GenomicsDB 36 | ENV GENOMICSDB_BRANCH=master 37 | RUN mkdir $GENOMICSDB_PATH 38 | ENV INSTALL_PREFIX=$GENOMICSDB_PATH 39 | ENV PREREQS_ENV=$GENOMICSDB_PATH/genomicsdb_prereqs.sh 40 | #ARG TARGETPLATFORM 41 | #RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"; else JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64; fi 42 | #ENV MAVEN_VERSION=3.9.5 43 | 44 | WORKDIR /tmp 45 | 46 | RUN git clone --recursive --branch $GENOMICSDB_BRANCH https://github.com/GenomicsDB/GenomicsDB.git && \ 47 | cd GenomicsDB/scripts/prereqs && \ 48 | ./install_prereqs.sh && \ 49 | apt-get clean && \ 50 | rm -rf /var/lib/apt/lists/* 51 | 52 | RUN chmod +x $PREREQS_ENV && \ 53 | $PREREQS_ENV && \ 54 | cmake -DCMAKE_INSTALL_PREFIX=$GENOMICSDB_PATH -DCMAKE_BUILD_TYPE=Release ./GenomicsDB && \ 55 | make && make install && \ 56 | rm -rf /tmp/GenomicsDB 57 | 58 | # install GenomicsDB R bindings 59 | RUN Rscript -e 'library(remotes);\ 60 | remotes::install_github("nalinigans/GenomicsDB-R", ref="master", configure.args="--with-genomicsdb=/opt/GenomicsDB/")' 61 | 62 | # install PureCN 63 | RUN Rscript -e 'BiocManager::install("PureCN", dependencies = TRUE)' 64 | #RUN Rscript -e 
'BiocManager::install("lima1/PureCN", ref = "RELEASE_3_19", dependencies = TRUE)' 65 | ENV PURECN=/usr/local/lib/R/site-library/PureCN/extdata 66 | 67 | # add symbolic link and paths 68 | ENV PATH $GENOMICSDB_PATH/bin:$PATH 69 | WORKDIR /opt 70 | RUN ln -s $PURECN /opt/PureCN 71 | 72 | # install GATK4 73 | ENV GATK_VERSION="4.5.0.0" 74 | RUN wget --no-verbose https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip && \ 75 | unzip gatk-${GATK_VERSION}.zip -d /opt && \ 76 | rm gatk-${GATK_VERSION}.zip 77 | 78 | ENV PATH /opt/gatk-${GATK_VERSION}:$PATH 79 | 80 | CMD ["/bin/bash"] 81 | -------------------------------------------------------------------------------- /man/processMultipleSamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processMultipleSamples.R 3 | \name{processMultipleSamples} 4 | \alias{processMultipleSamples} 5 | \title{Multi sample normalization and segmentation} 6 | \usage{ 7 | processMultipleSamples( 8 | tumor.coverage.files, 9 | sampleids, 10 | normalDB, 11 | num.eigen = 20, 12 | genome, 13 | plot.cnv = TRUE, 14 | w = NULL, 15 | min.interval.weight = 1/3, 16 | max.segments = NULL, 17 | chr.hash = NULL, 18 | centromeres = NULL, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{tumor.coverage.files}{Coverage data for tumor samples.} 24 | 25 | \item{sampleids}{Sample ids, used in output files.} 26 | 27 | \item{normalDB}{Database of normal samples, created with 28 | \code{\link{createNormalDatabase}}.} 29 | 30 | \item{num.eigen}{Number of eigen vectors used.} 31 | 32 | \item{genome}{Genome version, for example hg19. Needed to get centromere 33 | positions.} 34 | 35 | \item{plot.cnv}{Segmentation plots.} 36 | 37 | \item{w}{Weight of samples. Can be used to downweight poor quality samples. 38 | If \code{NULL}, sets to inverse of median on-target duplication rate if 39 | available, otherwise does not do any weighting.} 40 | 41 | \item{min.interval.weight}{Can be used to ignore intervals with low weights.} 42 | 43 | \item{max.segments}{If not \code{NULL}, try a higher \code{undo.SD} 44 | parameter if number of segments exceeds the threshold.} 45 | 46 | \item{chr.hash}{Mapping of non-numerical chromsome names to numerical names 47 | (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromsomes are 48 | properly ordered.} 49 | 50 | \item{centromeres}{A \code{GRanges} object with centromere positions.} 51 | 52 | \item{...}{Arguments passed to the segmentation function.} 53 | } 54 | \value{ 55 | \code{data.frame} containing the segmentation. 56 | } 57 | \description{ 58 | This function performs normalization and segmentation when multiple 59 | for the same patient are available. 60 | } 61 | \details{ 62 | CURRENTLY DEFUNCT BECAUSE IT DEPENDS ON THE DEFUNCT COPYNUMBER PACKAGE. 63 | We are working on a replacement. 
64 | } 65 | \examples{ 66 | 67 | normal1.coverage.file <- system.file("extdata", "example_normal.txt.gz", 68 | package = "PureCN") 69 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 70 | package = "PureCN") 71 | tumor1.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 72 | package = "PureCN") 73 | tumor2.coverage.file <- system.file("extdata", "example_tumor2.txt.gz", 74 | package = "PureCN") 75 | 76 | normal.coverage.files <- c(normal1.coverage.file, normal2.coverage.file) 77 | tumor.coverage.files <- c(tumor1.coverage.file, tumor2.coverage.file) 78 | 79 | normalDB <- createNormalDatabase(normal.coverage.files) 80 | 81 | # seg <- processMultipleSamples(tumor.coverage.files, 82 | # sampleids = c("Sample1", "Sample2"), 83 | # normalDB = normalDB, 84 | # genome = "hg19") 85 | 86 | } 87 | \references{ 88 | Nilsen G., Liestol K., Van Loo P., Vollan H., Eide M., Rueda O., 89 | Chin S., Russell R., Baumbusch L., Caldas C., Borresen-Dale A., 90 | Lingjaerde O. (2012). "Copynumber: Efficient algorithms for single- and 91 | multi-track copy number segmentation." BMC Genomics, 13(1), 591. 92 | } 93 | \seealso{ 94 | \code{\link{runAbsoluteCN}} 95 | } 96 | \author{ 97 | Markus Riester 98 | } 99 | -------------------------------------------------------------------------------- /tests/testthat/test_createNormalDatabase.R: -------------------------------------------------------------------------------- 1 | context("createNormalDatabase") 2 | 3 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 4 | package = "PureCN") 5 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 6 | package = "PureCN") 7 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 8 | package = "PureCN") 9 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 10 | normalDB <- createNormalDatabase(normal.coverage.files) 11 | 12 | test_that("NormalDB of example data matches expectated values", { 13 | expect_identical(normalDB$sex, c(NA, NA)) 14 | pool <- calculateTangentNormal(normal.coverage.files[1], normalDB) 15 | 16 | n <- lapply(normal.coverage.files, readCoverageFile) 17 | expect_equal(length(pool), length(n[[1]])) 18 | expect_equal(as.character(n[[1]]), normalDB$intervals) 19 | }) 20 | 21 | test_that("Provided sex is handled correctly", { 22 | expect_warning( 23 | normalDB2 <- createNormalDatabase(normal.coverage.files, sex = c("A", 24 | NA)) 25 | ) 26 | expect_equal(normalDB2$sex, as.character(c(NA, NA))) 27 | expect_warning( 28 | normalDB2 <- createNormalDatabase(normal.coverage.files, sex = c("A", 29 | "F")) 30 | ) 31 | expect_equal(normalDB2$sex, c(NA, "F")) 32 | expect_equal(normalDB2$normal.coverage.files, 33 | sapply(normal.coverage.files, normalizePath, 34 | USE.NAMES = FALSE)) 35 | 36 | expect_error(createNormalDatabase(normal.coverage.files, sex = "A")) 37 | }) 38 | 39 | test_that("Exceptions happen with wrong input", { 40 | interval.file <- system.file("extdata", "example_intervals.txt", 41 | package = "PureCN") 42 | normal <- readCoverageFile(normal.coverage.file) 43 | correctCoverageBias(normal, interval.file) 44 | output.file <- tempfile(fileext = ".txt") 45 | expect_output(correctCoverageBias(normal[sample(length(normal)), 46 | ], interval.file, output.file), "WARN") 47 | createNormalDatabase(c(normal.coverage.files, output.file)) 48 | best.normal.coverage.file <- calculateTangentNormal(tumor.coverage.file, 49 | normalDB) 50 | normal3.coverage.file <- system.file("extdata", 
"example_normal3.cnn", 51 | package = "PureCN") 52 | expect_error(calculateTangentNormal(normal3.coverage.file, normalDB), 53 | "not align") 54 | expect_error(createNormalDatabase(normal.coverage.file), "At least 2") 55 | expect_output(createNormalDatabase( c(normal.coverage.file, normal.coverage.file, 56 | normal2.coverage.file)), "duplicated") 57 | file.remove(output.file) 58 | }) 59 | 60 | 61 | test_that("Exceptions happen with outdated databases", { 62 | normalDB2 <- normalDB 63 | normalDB2$version <- NULL 64 | expect_error(runAbsoluteCN(normal.coverage.file, tumor.coverage.file, normalDB = normalDB2), 65 | "incompatible") 66 | expect_error( calculateTangentNormal(tumor.coverage.file, normalDB2), "incompatible") 67 | }) 68 | 69 | 70 | test_that("Exception thrown when user mixed gc-normalized and raw coverages.", { 71 | normal.coverage.files.wrong <- c(tempfile(fileext="_coverage.txt"), tempfile(fileext="_loess.txt")) 72 | file.create(normal.coverage.files.wrong) 73 | expect_error( createNormalDatabase(normal.coverage.files.wrong), "suffix") 74 | file.remove(normal.coverage.files.wrong) 75 | }) 76 | -------------------------------------------------------------------------------- /R/readLogRatioFile.R: -------------------------------------------------------------------------------- 1 | #' Read file containing interval-level log2 tumor/normal ratios 2 | #' 3 | #' Read log2 ratio file produced by external tools like The Genome Analysis 4 | #' Toolkit version 4. 5 | #' 6 | #' @param file Log2 coverage file. 7 | #' @param format File format. If missing, derived from the file 8 | #' extension. Currently GATK4 DenoiseReadCounts format supported. 9 | #' A simple GATK3-style format, two columns with coordinates 10 | #' as string in format chr:start-stop in first and log2-ratio 11 | #' in second is also supported. 12 | #' @param zero Start position is 0-based. Default is \code{FALSE} 13 | #' for GATK, \code{TRUE} for BED file based intervals. 14 | #' @return A \code{GRange} with the log2 ratio. 
15 | #' @author Markus Riester 16 | #' @examples 17 | #' 18 | #' logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz", 19 | #' package = "PureCN") 20 | #' logratio <- readLogRatioFile(logratio.file) 21 | #' 22 | #' @export readLogRatioFile 23 | readLogRatioFile <- function(file, format, zero = NULL) { 24 | if (missing(format)) format <- .getLogRatioFormat(file) 25 | if (format == "GATK3") return(.readLogRatioFileGATK3(file, zero = FALSE)) 26 | if (format == "GATK4") return(.readLogRatioFileGATK4(file, zero = FALSE)) 27 | } 28 | 29 | .getLogRatioFormat <- function(file) { 30 | header <- scan(file, what = character(), sep = "\n", nmax = 1, quiet = TRUE) 31 | format <- "GATK4" 32 | if (grepl("^Target", header)[1]) return("GATK3") 33 | format 34 | } 35 | 36 | .readLogRatioFileGATK3 <- function(file, zero = FALSE) { 37 | x <- fread(file, data.table = FALSE) 38 | gr <- GRanges(x[, 1]) 39 | gr$log.ratio <- x[, 2] 40 | gr 41 | } 42 | 43 | .readLogRatioFileGATK4 <- function(file, zero = FALSE) { 44 | con <- file(file, open = "r") 45 | header <- .parseGATKHeader(con) 46 | x <- read.delim(con, header = FALSE, as.is = TRUE) 47 | colnames(x) <- strsplit(header$last_line, "\t")[[1]] 48 | gr <- GRanges(x[, 1], IRanges(start = x[, 2], end = x[, 3])) 49 | gr$log.ratio <- x[, 4] 50 | gr <- sort(sortSeqlevels(gr)) 51 | if (length(header$sl)) { 52 | header$sl <- sapply(header$sl, as.numeric) 53 | seqlengths(gr) <- header$sl[names(seqlengths(gr))] 54 | } 55 | return(gr) 56 | } 57 | 58 | .writeLogRatioFileGATK4 <- function(x, id = 1, file) { 59 | gr <- x$log.ratio 60 | if (is.null(gr$log.ratio)) { 61 | .stopRuntimeError("log.ratio NULL in .writeLogRatioFileGATK4") 62 | } 63 | output <- data.frame( 64 | CONTIG = seqnames(gr), 65 | START = start(gr), 66 | END = end(gr), 67 | LOG2_COPY_RATIO = gr$log.ratio 68 | ) 69 | con <- file(file, open = "w") 70 | .writeGATKHeader(x$vcf, id, con, "log-ratio") 71 | write.table(output, con, row.names = FALSE, quote = FALSE, sep = "\t") 72 | close(con) 73 | invisible(output) 74 | } 75 | 76 | .writeGATKHeader <- function(vcf, id = 1, con, file_type) { 77 | writeLines(paste("@HD", "VN:1.6", sep = "\t"), con) 78 | if (any(is.na(seqlengths(vcf)))) { 79 | flog.warn("Cannot find all contig lengths while exporting %s file.", 80 | file_type) 81 | } else { 82 | sl <- seqlengths(vcf) 83 | writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep = "\t"), con) 84 | } 85 | if (!is.null(id)) { 86 | sampleid <- .getSampleIdFromVcf(vcf, id) 87 | writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep = "\t"), con) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /R/segmentationHclust.R: -------------------------------------------------------------------------------- 1 | #' Minimal segmentation function 2 | #' 3 | #' A minimal segmentation function useful when segmentation was performed by 4 | #' third-pary tools. When a \code{CollapsedVCF} with germline SNPs is provided, 5 | #' it will cluster segments using \code{hclust}. Otherwise it will use the 6 | #' segmentation as provided. 7 | #' This function is called via the 8 | #' \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The 9 | #' arguments are passed via \code{args.segmentation}. 10 | #' 11 | #' 12 | #' @param seg If segmentation was provided by the user, this data structure 13 | #' will contain this segmentation. Useful for minimal segmentation functions. 14 | #' Otherwise PureCN will re-segment the data. 
This segmentation function 15 | #' ignores this user provided segmentation. 16 | #' @param vcf Optional \code{CollapsedVCF} object with germline allelic ratios. 17 | #' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in 18 | #' VCF. 19 | #' @param normal.id.in.vcf Id of normal in in VCF. Currently not used. 20 | #' @param min.logr.sdev Minimum log-ratio standard deviation used in the 21 | #' model. Useful to make fitting more robust to outliers in very clean 22 | #' data (currently not used in this segmentation function). 23 | #' @param prune.hclust.h Height in the \code{hclust} pruning step. Increasing 24 | #' this value will merge segments more aggressively. If NULL, try to find a 25 | #' sensible default. 26 | #' @param prune.hclust.method Cluster method used in the \code{hclust} pruning 27 | #' step. See documentation for the \code{hclust} function. 28 | #' @param chr.hash Mapping of non-numerical chromsome names to numerical names 29 | #' (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromsomes are 30 | #' properly ordered. 31 | #' @param ... Currently unused arguments provided to other segmentation 32 | #' functions. 33 | #' @return \code{data.frame} containing the segmentation. 34 | #' @author Markus Riester 35 | #' 36 | #' @seealso \code{\link{runAbsoluteCN}} 37 | #' @examples 38 | #' 39 | #' vcf.file <- system.file("extdata", "example.vcf.gz", 40 | #' package="PureCN") 41 | #' interval.file <- system.file("extdata", "example_intervals_tiny.txt", 42 | #' package="PureCN") 43 | #' seg.file <- system.file('extdata', 'example_seg.txt', 44 | #' package = 'PureCN') 45 | #' 46 | #' res <- runAbsoluteCN(seg.file = seg.file, 47 | #' fun.segmentation = segmentationHclust, 48 | #' max.ploidy = 4, vcf.file = vcf.file, 49 | #' test.purity = seq(0.3, 0.7, by = 0.05), 50 | #' max.candidate.solutions = 1, 51 | #' genome = 'hg19', interval.file = interval.file) 52 | #' 53 | #' @export segmentationHclust 54 | segmentationHclust <- function(seg, 55 | vcf = NULL, tumor.id.in.vcf = 1, normal.id.in.vcf = NULL, 56 | min.logr.sdev = 0.15, prune.hclust.h = NULL, prune.hclust.method = "ward.D", 57 | chr.hash = NULL, ...) 
{ 58 | if (is.null(seg)) { 59 | .stopUserError("segmentationHclust requires an input segmentation.") 60 | } 61 | .checkParametersSegmentation(alpha = NULL, undo.SD = NULL, 62 | max.segments = NULL, min.logr.sdev = min.logr.sdev, 63 | prune.hclust.h = prune.hclust.h) 64 | 65 | if (!is.null(vcf)) { 66 | if (is.null(chr.hash)) chr.hash <- .getChrHash(seqlevels(vcf)) 67 | seg <- .pruneByHclust(seg, vcf, tumor.id.in.vcf, 68 | h = prune.hclust.h, 69 | method = prune.hclust.method, chr.hash = chr.hash) 70 | } 71 | idx.enough.markers <- seg$num.mark > 1 72 | rownames(seg) <- NULL 73 | seg[idx.enough.markers, ] 74 | } 75 | -------------------------------------------------------------------------------- /tests/testthat/test_segmentation.R: -------------------------------------------------------------------------------- 1 | context("segmentation") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 4 | package = "PureCN") 5 | tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 6 | package = "PureCN") 7 | vcf.file <- system.file("extdata", "example.vcf.gz", 8 | package = "PureCN") 9 | seg.file <- system.file("extdata", "example_seg.txt", 10 | package = "PureCN") 11 | 12 | test_that("Precomputed boudaries are correct", { 13 | data(purecn.DNAcopy.bdry) 14 | alpha <- formals(segmentationCBS)$alpha 15 | eta <- formals(segment)$eta 16 | nperm <- formals(segment)$nperm 17 | max.ones <- floor(nperm * alpha) + 1 18 | set.seed(123) 19 | sbdry <- getbdry(eta, nperm, max.ones) 20 | expect_equal(purecn.DNAcopy.bdry, sbdry) 21 | }) 22 | 23 | 24 | test_that("GATK4 wrapper works for example data.", { 25 | skip_if_not(PureCN:::.checkGATK4Version("4.1.7.0") >= 0, 26 | "gatk binary > 4.1.7.0 required") 27 | 28 | ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file, 29 | tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, 30 | sampleid = "Sample1", genome = "hg19", 31 | fun.segmentation = segmentationGATK4, max.ploidy = 4, 32 | test.purity = seq(0.3, 0.7, by = 0.05), 33 | max.candidate.solutions = 1, plot.cnv = FALSE) 34 | 35 | expect_equal(0.65, ret$results[[1]]$purity, tolerance = 0.02) 36 | expect_equal(1.62, ret$results[[1]]$ploidy, tolerance = 0.2) 37 | }) 38 | 39 | test_that("Hclust segmentation works", { 40 | expect_error(runAbsoluteCN(normal.coverage.file = normal.coverage.file, 41 | tumor.coverage.file = tumor.coverage.file, 42 | sampleid = "Sample1", genome = "hg19", 43 | fun.segmentation = segmentationHclust, 44 | max.candidate.solutions = 1, plot.cnv = FALSE), 45 | "segmentationHclust requires an") 46 | }) 47 | 48 | 49 | test_that("private function .fixBreakpoint.", { 50 | seg <- readSegmentationFile(seg.file, "Sample1") 51 | data(purecn.example.output) 52 | gr <- purecn.example.output$input$log.ratio 53 | lr <- gr$log.ratio 54 | seg_1 <- PureCN:::.fixBreakpointsInBaits(gr, lr, seg, purecn.example.output$input$chr.hash) 55 | expect_equivalent(seg_1$loc.start, seg$loc.start) 56 | expect_equivalent(seg_1$loc.end, seg$loc.end) 57 | 58 | seg[24, "loc.start"] <- 82403793 + 1 59 | seg[44, "loc.end"] <- 57507347 60 | 61 | seg_1 <- PureCN:::.fixBreakpointsInBaits(gr, lr, seg, purecn.example.output$input$chr.hash) 62 | 63 | expect_equivalent(seg[23, "loc.start"], seg_1[23, "loc.start"]) 64 | expect_equivalent(82403838, seg_1[23, "loc.end"]) 65 | expect_equivalent(82403838 + 1, seg_1[24, "loc.start"]) 66 | expect_equivalent(seg[24, "loc.end"], seg_1[24, "loc.end"]) 67 | 68 | expect_equivalent(seg[44, "loc.start"], seg_1[44, "loc.start"]) 69 | 
expect_equivalent(57507289 - 1, seg_1[44, "loc.end"]) 70 | expect_equivalent(57507289, seg_1[45, "loc.start"]) 71 | expect_equivalent(seg[45, "loc.end"], seg_1[45, "loc.end"]) 72 | 73 | expect_equivalent(seg$loc.start[-c(23, 24, 44, 45)], 74 | seg_1$loc.start[-c(23, 24, 44, 45)]) 75 | expect_equivalent(seg$loc.end[-c(23, 24, 44, 45)], 76 | seg_1$loc.end[-c(23, 24, 44, 45)]) 77 | }) 78 | 79 | test_that("issue 201 is fixed.", { 80 | expect_error(runAbsoluteCN(normal.coverage.file = normal.coverage.file, 81 | tumor.coverage.file = tumor.coverage.file, 82 | sampleid = "Sample1", genome = "hg19", 83 | args.segmentation = list(undo.SD = "A"), 84 | max.candidate.solutions = 1, plot.cnv = FALSE), 85 | "undo.SD") 86 | }) 87 | -------------------------------------------------------------------------------- /R/calculateLogRatio.R: -------------------------------------------------------------------------------- 1 | #' Calculate coverage log-ratio of tumor vs. normal 2 | #' 3 | #' This function is automatically called by \code{\link{runAbsoluteCN}} when 4 | #' normal and tumor coverage are provided (and not a segmentation file or 5 | #' target-level log-ratios). This function is therefore normally not called by 6 | #' the user. 7 | #' 8 | #' 9 | #' @param normal Normal coverage read in by the \code{\link{readCoverageFile}} 10 | #' function. 11 | #' @param tumor Tumor coverage read in by the \code{\link{readCoverageFile}} 12 | #' function. 13 | #' @return \code{numeric(length(tumor))}, tumor vs. normal copy number log-ratios 14 | #' for all targets. 15 | #' @author Markus Riester 16 | #' @examples 17 | #' 18 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 19 | #' package = "PureCN") 20 | #' tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 21 | #' package = "PureCN") 22 | #' normal <- readCoverageFile(normal.coverage.file) 23 | #' tumor <- readCoverageFile(tumor.coverage.file) 24 | #' log.ratio <- calculateLogRatio(normal, tumor) 25 | #' 26 | #' @export calculateLogRatio 27 | calculateLogRatio <- function(normal, tumor) { 28 | # make sure that normal and tumor align 29 | if (!identical(as.character(normal), as.character(tumor))) { 30 | .stopUserError("Interval files in normal and tumor different.") 31 | } 32 | if (is.null(tumor$on.target)) tumor$on.target <- TRUE 33 | 34 | avgCovTumor <- mean(tumor$average.coverage[tumor$on.target], na.rm=TRUE) 35 | avgCovNormal <- mean(normal$average.coverage[tumor$on.target], na.rm=TRUE) 36 | 37 | flog.info("Mean target coverages: %.0fX (tumor) %.0fX (normal).", 38 | avgCovTumor, avgCovNormal) 39 | if (avgCovNormal/avgCovTumor < 0.25 || avgCovNormal/avgCovTumor > 4) { 40 | flog.warn("Large difference in coverage of tumor and normal.") 41 | } 42 | 43 | tumor$log.ratio <- 0. 
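    # The loop below computes, separately for on-target and off-target
    # intervals, log2(tumor average coverage / normal average coverage) plus a
    # library size correction log2(total normal counts / total tumor counts),
    # and then re-centers the log-ratios by their interval-width weighted mean
    # (.calibrate_log_ratio).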
44 | 45 | for (on.target in c(FALSE, TRUE)) { 46 | idx <- tumor$on.target==on.target 47 | if (!sum(idx)) next 48 | total.cov.normal <- sum(as.numeric(normal[idx]$coverage), na.rm = TRUE) 49 | total.cov.tumor <- sum(as.numeric(tumor[idx]$coverage), na.rm = TRUE) 50 | 51 | log.ratio <- log2(tumor[idx]$average.coverage/normal[idx]$average.coverage) + 52 | log2(total.cov.normal/total.cov.tumor) 53 | tumor[idx]$log.ratio <- .calibrate_log_ratio(log.ratio, tumor[idx]) 54 | } 55 | if (!all(tumor$on.target)) { 56 | # try to align the off-target and on-target log-ratios better 57 | tumor$log.ratio <- .calibrate_off_target_log_ratio(tumor) 58 | } 59 | tumor$log.ratio 60 | } 61 | 62 | .calibrate_log_ratio <- function(log.ratio, granges) { 63 | idxFinite <- is.finite(log.ratio) 64 | if (!sum(idxFinite)) { 65 | .stopUserError("No finite intervals.") 66 | } 67 | mean.log.ratio <- weighted.mean(log.ratio[idxFinite], 68 | width(granges)[idxFinite]) 69 | # calibrate 70 | flog.debug("Calibrating %i log-ratios by %f.", 71 | sum(idxFinite), mean.log.ratio) 72 | return(log.ratio - mean.log.ratio) 73 | } 74 | 75 | .calibrate_off_target_log_ratio <- function(granges) { 76 | idx <- granges$on.target 77 | g1 <- granges[idx] 78 | g2 <- granges[!idx] 79 | nr <- nearest(g1,g2) 80 | d2 <- median(g1$log.ratio - g2$log.ratio[nr], na.rm = TRUE) / 2 81 | if (d2 > 0.1) { 82 | flog.warn("Large potential mis-calibration of on- and off-target log2 ratios: %.2f", d2) 83 | } 84 | granges$log.ratio[idx] <- granges$log.ratio[idx] - d2 85 | granges$log.ratio[!idx] <- granges$log.ratio[!idx] + d2 86 | return(granges$log.ratio) 87 | } 88 | -------------------------------------------------------------------------------- /man/segmentationGATK4.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/segmentationGATK4.R 3 | \name{segmentationGATK4} 4 | \alias{segmentationGATK4} 5 | \title{GATK4 ModelSegments segmentation function} 6 | \usage{ 7 | segmentationGATK4( 8 | normal, 9 | tumor, 10 | log.ratio, 11 | seg, 12 | vcf = NULL, 13 | tumor.id.in.vcf = 1, 14 | normal.id.in.vcf = NULL, 15 | min.logr.sdev = 0.15, 16 | prune.hclust.h = NULL, 17 | prune.hclust.method = NULL, 18 | changepoints.penality = NULL, 19 | additional.cmd.args = "", 20 | chr.hash = NULL, 21 | ... 22 | ) 23 | } 24 | \arguments{ 25 | \item{normal}{Coverage data for normal sample. Ignored in this function.} 26 | 27 | \item{tumor}{Coverage data for tumor sample.} 28 | 29 | \item{log.ratio}{Copy number log-ratios, one for each exon in coverage file.} 30 | 31 | \item{seg}{If segmentation was provided by the user, this data structure 32 | will contain this segmentation. Useful for minimal segmentation functions. 33 | Otherwise PureCN will re-segment the data. This segmentation function 34 | ignores this user provided segmentation.} 35 | 36 | \item{vcf}{Optional \code{CollapsedVCF} object with germline allelic ratios.} 37 | 38 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 39 | VCF.} 40 | 41 | \item{normal.id.in.vcf}{Id of normal in in VCF. Currently not used.} 42 | 43 | \item{min.logr.sdev}{Minimum log-ratio standard deviation used in the 44 | model. Useful to make fitting more robust to outliers in very clean 45 | data.} 46 | 47 | \item{prune.hclust.h}{Ignored in this function.} 48 | 49 | \item{prune.hclust.method}{Ignored in this function.} 50 | 51 | \item{changepoints.penality}{The \code{--number-of-changepoints-penalty-factor}. 
52 | If \code{NULL}, find a sensible default. Ignored when provided in 53 | \code{additional.cmd.args}.} 54 | 55 | \item{additional.cmd.args}{\code{character(1)}. By default, 56 | \code{ModelSegments} is called with default parameters. Provide additional 57 | arguments here.} 58 | 59 | \item{chr.hash}{Not needed here since \code{ModelSegments} does not 60 | require numbered chromosome names.} 61 | 62 | \item{...}{Currently unused arguments provided to other segmentation 63 | functions.} 64 | } 65 | \value{ 66 | \code{data.frame} containing the segmentation. 67 | } 68 | \description{ 69 | A wrapper for GATK4s ModelSegmentation function, useful when normalization 70 | is performed with other tools than GATK4, for example PureCN. 71 | This function is called via the 72 | \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The 73 | arguments are passed via \code{args.segmentation}. 74 | } 75 | \examples{ 76 | 77 | normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 78 | package="PureCN") 79 | tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 80 | package="PureCN") 81 | vcf.file <- system.file("extdata", "example.vcf.gz", 82 | package="PureCN") 83 | 84 | # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 85 | # non-default values to speed-up this example. This is not a good idea for real 86 | # samples. 87 | \dontrun{ 88 | ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, 89 | tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, 90 | sampleid="Sample1", genome="hg19", 91 | fun.segmentation = segmentationGATK4, max.ploidy=4, 92 | args.segmentation = list(additional.cmd.args = "--gcs-max-retries 19"), 93 | test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1) 94 | } 95 | 96 | } 97 | \seealso{ 98 | \code{\link{runAbsoluteCN}} 99 | } 100 | \author{ 101 | Markus Riester 102 | } 103 | -------------------------------------------------------------------------------- /man/plotAbs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotAbs.R 3 | \name{plotAbs} 4 | \alias{plotAbs} 5 | \title{Plots for analyzing PureCN solutions} 6 | \usage{ 7 | plotAbs( 8 | res, 9 | id = 1, 10 | type = c("hist", "overview", "BAF", "AF", "all"), 11 | chr = NULL, 12 | germline.only = TRUE, 13 | show.contour = FALSE, 14 | purity = NULL, 15 | ploidy = NULL, 16 | alpha = TRUE, 17 | show.segment.means = c("SNV", "segments", "both"), 18 | max.mapping.bias = 0.8, 19 | palette.name = "Paired", 20 | col.snps = "#2b6391", 21 | col.chr.shading = "#f0f0f0", 22 | ... 23 | ) 24 | } 25 | \arguments{ 26 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 27 | 28 | \item{id}{Candidate solutions to be plotted. \code{id=1} will draw the 29 | plot for the maximum likelihood solution.} 30 | 31 | \item{type}{Different types of plots. \code{hist} will plot a histogram, 32 | assigning log-ratio peaks to integer values. \code{overview} will plot all 33 | local optima, sorted by likelihood. \code{BAF} plots 34 | something like a B-allele frequency plot known from SNP arrays: it plots 35 | allele frequencies of germline variants (or most likely germline when status 36 | is not available) against copy number. \code{AF} plots observed allelic 37 | fractions against expected (purity), maximum likelihood (optimal 38 | multiplicity) allelic fractions. 
\code{all} plots types \code{BAF} and 39 | \code{AF} for all local optima, and is useful for generating a PDF for 40 | manual inspection.} 41 | 42 | \item{chr}{If \code{NULL}, show all chromosomes, otherwise only the ones 43 | specified (\code{type="BAF"} only).} 44 | 45 | \item{germline.only}{If \code{TRUE}, show only variants most likely being 46 | germline in BAF plot. Useful to set to \code{FALSE} (in combination with 47 | \code{chr}) to study potential artifacts.} 48 | 49 | \item{show.contour}{For \code{type="overview"}, display contour plot.} 50 | 51 | \item{purity}{Display expected integer copy numbers for purity, defaults to 52 | purity of the solution (\code{type="hist"} and \code{"AF"} only).} 53 | 54 | \item{ploidy}{Display expected integer copy numbers for ploidy, defaults to 55 | ploidy of the solution (\code{type="hist"} and \code{"AF"} only).} 56 | 57 | \item{alpha}{Add transparency to the plot if VCF contains many variants 58 | (>2000, \code{type="AF"} and \code{type="BAF"} only).} 59 | 60 | \item{show.segment.means}{Show segment means in germline allele frequency 61 | plot? If \code{both}, show SNVs and segment means. If \code{SNV} show all 62 | SNVs. Only for \code{type="AF"}.} 63 | 64 | \item{max.mapping.bias}{Exclude variants with high mapping bias from 65 | plotting. Note that bias is reported on an inverse scale; a variant with 66 | mapping bias of 1 has no bias. (\code{type="AF"} and \code{type="BAF"} 67 | only).} 68 | 69 | \item{palette.name}{The default \code{RColorBrewer} palette.} 70 | 71 | \item{col.snps}{The color used for germline SNPs.} 72 | 73 | \item{col.chr.shading}{The color used for shading alternate chromosomes.} 74 | 75 | \item{\dots}{Additonal parameters passed to the \code{plot} function.} 76 | } 77 | \value{ 78 | Returns \code{NULL}. 79 | } 80 | \description{ 81 | This function provides various plots for finding correct purity and ploidy 82 | combinations in the results of a \code{\link{runAbsoluteCN}} call. 
83 | } 84 | \examples{ 85 | 86 | data(purecn.example.output) 87 | plotAbs(purecn.example.output, type="overview") 88 | # plot details for the maximum likelihood solution (rank 1) 89 | plotAbs(purecn.example.output, 1, type="hist") 90 | plotAbs(purecn.example.output, 1, type="BAF") 91 | plotAbs(purecn.example.output, 1, type = "BAF", chr="chr2") 92 | 93 | } 94 | \seealso{ 95 | \code{\link{runAbsoluteCN}} 96 | } 97 | \author{ 98 | Markus Riester 99 | } 100 | -------------------------------------------------------------------------------- /man/filterIntervals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filterIntervals.R 3 | \name{filterIntervals} 4 | \alias{filterIntervals} 5 | \title{Remove low quality intervals} 6 | \usage{ 7 | filterIntervals( 8 | normal, 9 | tumor, 10 | log.ratio, 11 | seg.file, 12 | filter.lowhigh.gc = 0.001, 13 | min.coverage = 15, 14 | min.total.counts = 100, 15 | min.targeted.base = 5, 16 | min.mappability = c(0.6, 0.1), 17 | min.fraction.offtarget = 0.05, 18 | normalDB = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{normal}{Coverage data for normal sample.} 23 | 24 | \item{tumor}{Coverage data for tumor sample.} 25 | 26 | \item{log.ratio}{Copy number log-ratios, one for each interval in the 27 | coverage file.} 28 | 29 | \item{seg.file}{If not \code{NULL}, then do not filter intervals, because data 30 | is already segmented via the provided segmentation file.} 31 | 32 | \item{filter.lowhigh.gc}{Quantile q (defines lower q and upper 1-q) for 33 | removing intervals with outlier GC profile. Assuming that GC correction might 34 | not have been worked on those. Requires \code{interval.file}.} 35 | 36 | \item{min.coverage}{Minimum coverage in both normal and tumor. Intervals with 37 | lower coverage are ignored. If a \code{normalDB} is provided, then this 38 | database already provides information about low quality intervals and the 39 | \code{min.coverage} is set to \code{min.coverage/10000}.} 40 | 41 | \item{min.total.counts}{Exclude intervals with fewer than that many reads 42 | in combined tumor and normal.} 43 | 44 | \item{min.targeted.base}{Exclude intervals with targeted base (size in bp) 45 | smaller than this cutoff. This is useful when the same interval file was 46 | used to calculate GC content. For such small targets, the GC content is 47 | likely very different from the true GC content of the probes.} 48 | 49 | \item{min.mappability}{\code{double(2)} specifying the minimum mappability score 50 | for on-target, off-target in that order.} 51 | 52 | \item{min.fraction.offtarget}{Skip off-target regions when less than the 53 | specified fraction of all intervals passes all filters} 54 | 55 | \item{normalDB}{Normal database, created with 56 | \code{\link{createNormalDatabase}}.} 57 | } 58 | \value{ 59 | \code{logical(length(log.ratio))} specifying which intervals should be 60 | used in segmentation. 61 | } 62 | \description{ 63 | This function determines which intervals in the coverage files should be 64 | included or excluded in the segmentation. It is called via the 65 | \code{fun.filterIntervals} argument of \code{\link{runAbsoluteCN}}. The 66 | arguments are passed via \code{args.filterIntervals}. 
67 | } 68 | \examples{ 69 | 70 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 71 | package = "PureCN") 72 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 73 | package = "PureCN") 74 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 75 | normalDB <- createNormalDatabase(normal.coverage.files) 76 | 77 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 78 | package = "PureCN") 79 | vcf.file <- system.file("extdata", "example.vcf.gz", 80 | package = "PureCN") 81 | interval.file <- system.file("extdata", "example_intervals.txt", 82 | package = "PureCN") 83 | 84 | # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 85 | # non-default values to speed-up this example. This is not a good idea for real 86 | # samples. 87 | ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, 88 | tumor.coverage.file = tumor.coverage.file, 89 | genome = "hg19", vcf.file = vcf.file, normalDB = normalDB, 90 | sampleid = "Sample1", interval.file = interval.file, 91 | args.filterIntervals = list(min.targeted.base = 10), max.ploidy = 4, 92 | test.purity = seq(0.3, 0.7, by = 0.05), max.candidate.solutions = 1) 93 | 94 | } 95 | \author{ 96 | Markus Riester 97 | } 98 | -------------------------------------------------------------------------------- /tests/testthat/test_createCurationFile.R: -------------------------------------------------------------------------------- 1 | context("createCurationFile") 2 | 3 | data(purecn.example.output) 4 | file.rds <- tempfile(fileext = ".rds") 5 | saveRDS(purecn.example.output, file = file.rds) 6 | 7 | test_that("Example data is processed correctly", { 8 | ret <- createCurationFile(file.rds) 9 | expect_equal(ret$Purity, purecn.example.output$results[[1]]$purity) 10 | expect_equal(ret$Ploidy, purecn.example.output$results[[1]]$ploidy) 11 | expect_false(ret$Curated) 12 | expect_true(ret$Flagged) 13 | expect_equal(as.character(ret$Sampleid), purecn.example.output$input$sampleid) 14 | }) 15 | 16 | test_that("Default curation file stores the first result", { 17 | retx <- readCurationFile(file.rds) 18 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 19 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 20 | }) 21 | 22 | test_that("min.ploidy=2 ignores the first result", { 23 | retx <- readCurationFile(file.rds, min.ploidy = 2) 24 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[2]]$purity) 25 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[2]]$ploidy) 26 | }) 27 | 28 | test_that("max.ploidy=2 ignores higher ploidy solutions", { 29 | retx <- readCurationFile(file.rds, max.ploidy = 2) 30 | expect_equal(sapply(retx$results, function(x) x$ploidy) < 31 | 2, rep(TRUE, length(retx$results))) 32 | }) 33 | 34 | test_that("report.best.only works as expected", { 35 | retx <- readCurationFile(file.rds, report.best.only = TRUE) 36 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 37 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 38 | expect_equal(length(retx$results), 1) 39 | }) 40 | 41 | test_that("overwriting works as expected", { 42 | retx <- purecn.example.output 43 | retx$results[[1]]$purity <- 0.8 44 | saveRDS(retx, file = file.rds) 45 | filename <- file.path(dirname(file.rds), paste0(gsub(".rds$", 46 | "", basename(file.rds)), ".csv")) 47 | 
expect_warning(createCurationFile(file.rds, overwrite.uncurated = FALSE)) 48 | ret <- read.csv(filename, as.is = TRUE) 49 | expect_equal(ret$Purity, purecn.example.output$results[[1]]$purity) 50 | expect_equal(ret$Ploidy, purecn.example.output$results[[1]]$ploidy) 51 | createCurationFile(file.rds) 52 | ret <- read.csv(filename, as.is = TRUE) 53 | expect_equal(ret$Purity, retx$results[[1]]$purity) 54 | expect_equal(ret$Ploidy, retx$results[[1]]$ploidy) 55 | ret$Curated <- TRUE 56 | write.csv(ret, file = filename, row.names = FALSE) 57 | saveRDS(purecn.example.output, file = file.rds) 58 | expect_warning(createCurationFile(file.rds)) 59 | ret <- read.csv(filename, as.is = TRUE) 60 | expect_true(ret$Curated) 61 | expect_equal(ret$Purity, 0.8) 62 | ret$Ploidy <- 3.4 63 | write.csv(ret, file = filename, row.names = FALSE) 64 | retx <- readCurationFile(file.rds) 65 | expect_equal(ret$Purity, retx$results[[1]]$purity, tolerance=0.2) 66 | expect_equal(ret$Ploidy, retx$results[[1]]$ploidy, tolerance=0.5) 67 | ret$Purity <- "2.2w" 68 | write.csv(ret, file = filename, row.names = FALSE) 69 | expect_error(readCurationFile(file.rds)) 70 | ret$Purity <- 2.2 71 | ret$Failed <- TRUE 72 | write.csv(ret, file = filename, row.names = FALSE) 73 | retx <- readCurationFile(file.rds, remove.failed = TRUE) 74 | expect_true(is.na(retx)) 75 | ret$Failed <- "true" 76 | write.csv(ret, file = filename, row.names = FALSE) 77 | expect_error(readCurationFile(file.rds, remove.failed = TRUE), "logical") 78 | file.remove(filename) 79 | }) 80 | 81 | test_that("warning occurs with missing curation file", { 82 | ret <- createCurationFile(file.rds) 83 | file.remove(gsub(".rds", ".csv", file.rds)) 84 | expect_output(retx <- readCurationFile(file.rds), "does not exist, creating") 85 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 86 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 87 | }) 88 | 89 | file.remove(file.rds) 90 | -------------------------------------------------------------------------------- /R/annotateTargets.R: -------------------------------------------------------------------------------- 1 | #' Annotate targets with gene symbols 2 | #' 3 | #' This function can be used to add a \sQuote{Gene} meta column containing 4 | #' gene symbols to a \code{GRanges} object. 5 | #' It applies heuristics to find the protein coding genes that were 6 | #' likely meant to target in the assay design in case transcripts 7 | #' overlap. 8 | #' 9 | #' @param x A \code{GRanges} object with interals to annotate 10 | #' @param txdb A \code{TxDb} database, e.g. 11 | #' \code{TxDb.Hsapiens.UCSC.hg19.knownGene} 12 | #' @param org A \code{OrgDb} object, e.g. \code{org.Hs.eg.db}. 13 | #' @return A \code{GRanges} object. 
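#' Intervals for which no suitable gene symbol is found are labeled
#' \dQuote{.} in the \sQuote{Gene} metadata column.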
14 | #' @author Markus Riester 15 | #' @examples 16 | #' library(TxDb.Hsapiens.UCSC.hg19.knownGene) 17 | #' library(org.Hs.eg.db) 18 | #' 19 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 20 | #' package = "PureCN") 21 | #' x <- head(readCoverageFile(normal.coverage.file), 100) 22 | #' x <- annotateTargets(x,TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db) 23 | #' 24 | #' @importFrom GenomicFeatures transcriptsByOverlaps exonsByOverlaps cdsByOverlaps 25 | #' @export annotateTargets 26 | annotateTargets <- function(x, txdb, org) { 27 | if (!is.null(x$on.target)) { 28 | idx <- x$on.target 29 | } else { 30 | idx <- seq_along(x) 31 | } 32 | txdb <- .checkSeqlevelStyle(x, txdb, "txdb", "interval file") 33 | id <- transcriptsByOverlaps(txdb, ranges = x[idx], columns = "GENEID") 34 | id$SYMBOL <- suppressWarnings( 35 | select(org, vapply(id$GENEID, function(x) x[1], character(1)), 36 | "SYMBOL")[, 2]) 37 | 38 | idCds <- cdsByOverlaps(txdb, ranges = x[idx], columns = "GENEID") 39 | idExons <- exonsByOverlaps(txdb, ranges = x[idx], columns = "GENEID") 40 | idExons$SYMBOL <- suppressWarnings( 41 | select(org, vapply(idExons$GENEID, function(x) x[1], character(1)), 42 | "SYMBOL")[, 2]) 43 | 44 | ov <- findOverlaps(x[idx], id) 45 | ovExons <- findOverlaps(x[idx], idExons) 46 | 47 | # for targets with multiple gene hits, use the one with most overlapping 48 | # targets 49 | d.f <- data.frame(i = queryHits(ov), 50 | GENEID = as.character(id$GENEID[subjectHits(ov)]), 51 | SYMBOL = as.character(id$SYMBOL[subjectHits(ov)])) 52 | d.f <- d.f[!duplicated(d.f), ] 53 | 54 | # remove non-coding transcripts 55 | d.f <- d.f[!grepl("-AS\\d$", d.f$SYMBOL), ] 56 | d.f <- d.f[!grepl("^LOC\\d", d.f$SYMBOL), ] 57 | d.f <- d.f[!grepl("^FLJ\\d+$", d.f$SYMBOL), ] 58 | 59 | d.f$Count <- table(d.f$SYMBOL)[d.f$SYMBOL] 60 | 61 | # in case multiple symbols have the same number of targets, prioritize the 62 | # ones overlapping exons 63 | d.fExons <- data.frame( 64 | i = queryHits(ovExons), 65 | SYMBOL = as.character(idExons$SYMBOL[subjectHits(ovExons)])) 66 | 67 | # downweight orfs 68 | d.fExons <- d.fExons[!grepl("\\dorf\\d", d.fExons$SYMBOL), ] 69 | d.f$CountExons <- table(d.fExons$SYMBOL)[d.f$SYMBOL] 70 | d.f$CountExons[is.na(d.f$CountExons)] <- 0 71 | 72 | d.f$OverlapsExon <- ifelse(paste(d.f$i, d.f$SYMBOL) %in% 73 | paste(d.fExons$i, d.fExons$SYMBOL), 1, 0) 74 | d.f$IsCds <- ifelse(d.f$GENEID %in% unique(unlist(idCds$GENEID)), 1, 0) 75 | 76 | # reorder and pick the best transcript: 77 | # - deprioritize non-protein coding transcripts 78 | # - deprioritize non-exon overlapping intervals 79 | # - deprioritize genes with low total exon count (might not be the main target) 80 | # - in the very unlikely case of a tie, use the total transcript count 81 | d.f <- d.f[order(d.f$i, d.f$IsCds, d.f$OverlapsExon, d.f$CountExons, d.f$Count), ] 82 | d.f$FLAG <- duplicated(d.f$i, fromLast = TRUE) 83 | d.f <- d.f[order(d.f$i, d.f$FLAG), ] 84 | d.f <- d.f[!duplicated(d.f$i), ] 85 | 86 | # Exclude targets for which we have multiple hits, but only one interval 87 | d.f <- d.f[!d.f$FLAG | d.f$Count > 2, ] 88 | if (is.null(x$Gene)) x$Gene <- "." 89 | x[idx]$Gene[d.f$i] <- as.character(d.f$SYMBOL) 90 | x$Gene[is.na(x$Gene)] <- "." 91 | 92 | flog.warn("Attempted adding gene symbols to intervals. 
Heuristics have %s", 93 | "been used to pick symbols for overlapping genes.") 94 | x 95 | } 96 | -------------------------------------------------------------------------------- /man/preprocessIntervals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/preprocessIntervals.R 3 | \name{preprocessIntervals} 4 | \alias{preprocessIntervals} 5 | \title{Preprocess intervals} 6 | \usage{ 7 | preprocessIntervals( 8 | interval.file, 9 | reference.file, 10 | output.file = NULL, 11 | off.target = FALSE, 12 | average.target.width = 400, 13 | min.target.width = 100, 14 | min.off.target.width = 20000, 15 | average.off.target.width = 2e+05, 16 | off.target.padding = -500, 17 | mappability = NULL, 18 | min.mappability = c(0.6, 0.1, 0.7), 19 | reptiming = NULL, 20 | average.reptiming.width = 1e+05, 21 | exclude = NULL, 22 | off.target.seqlevels = c("targeted", "all"), 23 | small.targets = c("resize", "drop") 24 | ) 25 | } 26 | \arguments{ 27 | \item{interval.file}{File specifying the intervals. Interval is expected in 28 | first column in format CHR:START-END. Instead of a file, a \code{GRanges} 29 | object can be provided. This allows the use of BED files for example. Note 30 | that GATK interval files are 1-based (first position of the genome is 1). 31 | Other formats like BED files are often 0-based. The \code{import} function 32 | will automatically convert to 1-based \code{GRanges}.} 33 | 34 | \item{reference.file}{Reference FASTA file.} 35 | 36 | \item{output.file}{Optionally, write GC content file.} 37 | 38 | \item{off.target}{Include off-target regions.} 39 | 40 | \item{average.target.width}{Split large targets to approximately this size.} 41 | 42 | \item{min.target.width}{Make sure that target regions are of at least 43 | this specified width. See \code{small.targets}.} 44 | 45 | \item{min.off.target.width}{Only include off-target regions of that 46 | size} 47 | 48 | \item{average.off.target.width}{Split off-target regions to that} 49 | 50 | \item{off.target.padding}{Pad off-target regions.} 51 | 52 | \item{mappability}{Annotate intervals with mappability score. Assumed on a scale 53 | from 0 to 1, with score being 1/(number alignments). Expected as \code{GRanges} 54 | object with first meta column being the score. Regions outside these ranges are 55 | ignored, assuming that \code{mappability} covers the whole accessible genome.} 56 | 57 | \item{min.mappability}{\code{double(3)} specifying the minimum mappability score 58 | for on-target, off-target, and chrY regions in that order. The chrY regions 59 | are only used for sex determination in \sQuote{PureCN} and are therefore 60 | treated differently. Requires \code{mappability}.} 61 | 62 | \item{reptiming}{Annotate intervals with replication timing score. Expected as 63 | \code{GRanges} object with first meta column being the score.} 64 | 65 | \item{average.reptiming.width}{Tile \code{reptiming} into bins of specified 66 | width.} 67 | 68 | \item{exclude}{Any target that overlaps with this \code{GRanges} object 69 | will be excluded.} 70 | 71 | \item{off.target.seqlevels}{Controls how to deal with chromosomes/contigs 72 | found in the \code{reference.file} but not in the \code{interval.file}.} 73 | 74 | \item{small.targets}{Strategy to deal with targets smaller than 75 | \code{min.target.width}.} 76 | } 77 | \value{ 78 | Returns GC content by interval as \code{GRanges} object. 
79 | } 80 | \description{ 81 | Optimize intervals for copy number calling by tiling long intervals and by 82 | including off-target regions. Uses \code{scanFa} from the Rsamtools package 83 | to retrieve GC content of intervals in a reference FASTA file. If provided, 84 | will annotate intervals with mappability and replication timing scores. 85 | } 86 | \examples{ 87 | 88 | reference.file <- system.file("extdata", "ex2_reference.fa", 89 | package = "PureCN", mustWork = TRUE) 90 | interval.file <- system.file("extdata", "ex2_intervals.txt", 91 | package = "PureCN", mustWork = TRUE) 92 | bed.file <- system.file("extdata", "ex2_intervals.bed", 93 | package = "PureCN", mustWork = TRUE) 94 | preprocessIntervals(interval.file, reference.file, 95 | output.file = "gc_file.txt") 96 | 97 | intervals <- import(bed.file) 98 | preprocessIntervals(intervals, reference.file, 99 | output.file = "gc_file.txt") 100 | 101 | } 102 | \references{ 103 | Talevich et al. (2016). CNVkit: Genome-Wide Copy Number 104 | Detection and Visualization from Targeted DNA Sequencing. PLoS Comput Biol. 105 | } 106 | \author{ 107 | Markus Riester 108 | } 109 | -------------------------------------------------------------------------------- /R/setPriorVcf.R: -------------------------------------------------------------------------------- 1 | #' Set Somatic Prior VCF 2 | #' 3 | #' Function to set prior for somatic mutation status for each variant in the 4 | #' provided \code{CollapsedVCF} object. 5 | #' 6 | #' 7 | #' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf} 8 | #' function from the VariantAnnotation package. 9 | #' @param prior.somatic Prior probabilities for somatic mutations. First value 10 | #' is for the case when no matched normals are available and the variant is not 11 | #' in germline databases (second value). Third value is for variants with MuTect 12 | #' somatic call. Different from 1, because somatic mutations in segments of copy 13 | #' number 0 have 0 probability and artifacts can thus have dramatic influence on 14 | #' likelihood score. Forth value is for variants not labeled as somatic by 15 | #' MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it 16 | #' will set the prior probability for variants with CNT > 6 to the first of 17 | #' those values in case of no matched normal available (0.995 default). Final 18 | #' value is for the case that variant is in both germline databases and 19 | #' COSMIC count > 6. 20 | #' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in 21 | #' VCF. 22 | #' @param min.cosmic.cnt Minimum number of hits in the COSMIC database to 23 | #' call variant as likely somatic. 24 | #' @param DB.info.flag Flag in INFO of VCF that marks presence in common 25 | #' germline databases. Defaults to \code{DB} that may contain somatic variants 26 | #' if it is from an unfiltered germline database. 27 | #' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database 28 | #' @return The \code{vcf} with \code{numeric(nrow(vcf))} vector with the 29 | #' prior probability of somatic status for each variant in the 30 | #' \code{CollapsedVCF} added to the \code{INFO} field \code{PR}. 31 | #' @author Markus Riester 32 | #' @examples 33 | #' 34 | #' # This function is typically only called by runAbsoluteCN via the 35 | #' # fun.setPriorVcf and args.setPriorVcf comments. 
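#' #
#' # The example below stores the priors in an INFO field with suffix "PR"
#' # (the exact key may carry a run-specific prefix). A hedged sketch of how
#' # they could be inspected afterwards:
#' # info(vcf)[[grep("PR$", rownames(info(header(vcf))), value = TRUE)[1]]]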
36 | #' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 37 | #' vcf <- readVcf(vcf.file, "hg19") 38 | #' vcf <- setPriorVcf(vcf) 39 | #' 40 | #' @export setPriorVcf 41 | setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001, 42 | 0.995, 0.5), 43 | tumor.id.in.vcf = NULL, min.cosmic.cnt = 6, 44 | DB.info.flag = "DB", Cosmic.CNT.info.field = "Cosmic.CNT") { 45 | if (is.null(tumor.id.in.vcf)) { 46 | tumor.id.in.vcf <- .getTumorIdInVcf(vcf) 47 | } 48 | if (!is.null(info(vcf)$SOMATIC)) { 49 | tmp <- prior.somatic 50 | prior.somatic <- ifelse(info(vcf)$SOMATIC, 51 | prior.somatic[3],prior.somatic[4]) 52 | 53 | flog.info("Found SOMATIC annotation in VCF.") 54 | flog.info("Setting somatic prior probabilities for somatic variants to %f or to %f otherwise.", 55 | tmp[3], tmp[4]) 56 | } else { 57 | tmp <- prior.somatic 58 | prior.somatic <- ifelse(info(vcf)[[DB.info.flag]], 59 | prior.somatic[2], prior.somatic[1]) 60 | if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) { 61 | flog.info("Found COSMIC annotation in VCF. Requiring %i hits.", 62 | min.cosmic.cnt) 63 | flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and likely germline based on dbSNP membership or population allele frequency.", 64 | tmp[5], tmp[6]) 65 | 66 | prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt)] <- tmp[5] 67 | prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt & 68 | info(vcf)[[DB.info.flag]])] <- tmp[6] 69 | } else { 70 | flog.info("Setting somatic prior probabilities for likely germline hits to %f or to %f otherwise.", 71 | tmp[2], tmp[1]) 72 | } 73 | } 74 | .annotateVcfPrior(vcf, prior.somatic) 75 | } 76 | .annotateVcfPrior <- function(vcf, prior.somatic) { 77 | key <- paste0(.getPureCNPrefixVcf(vcf), "PR") 78 | newInfo <- DataFrame( 79 | Number = 1, Type = "Float", 80 | Description = "Prior probability somatic", 81 | row.names = key) 82 | 83 | info(header(vcf)) <- rbind(info(header(vcf)), newInfo) 84 | info(vcf)[[key]] <- prior.somatic 85 | return(vcf) 86 | } 87 | 88 | -------------------------------------------------------------------------------- /R/filterVcfMuTect.R: -------------------------------------------------------------------------------- 1 | #' Filter VCF MuTect 2 | #' 3 | #' Function to remove artifacts and low confidence/quality calls from a MuTect 4 | #' generated VCF file. Also applies filters defined in \code{filterVcfBasic}. 5 | #' This function will only keep variants listed in the stats file and those not 6 | #' matching the specified failure reasons. 7 | #' 8 | #' 9 | #' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf} 10 | #' function from the VariantAnnotation package. 11 | #' @param tumor.id.in.vcf The tumor id in the VCF file, optional. 12 | #' @param stats.file MuTect stats file. If \code{NULL}, will check if VCF 13 | #' was generated by MuTect2 and if yes will call \code{\link{filterVcfMuTect2}} 14 | #' instead. 15 | #' @param ignore MuTect flags that mark variants for exclusion. 16 | #' @param \dots Additional arguments passed to \code{\link{filterVcfBasic}}. 17 | #' @return A list with elements \code{vcf}, \code{flag} and 18 | #' \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 19 | #' \code{flag} a \code{logical(1)} flag if problems were identified, further 20 | #' described in \code{flag_comment}. 
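#' Variants without a matching entry in the stats file are removed as
#' unmatched before the failure-reason filter is applied.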
21 | #' @author Markus Riester 22 | #' @seealso \code{\link{filterVcfBasic}} 23 | #' @examples 24 | #' 25 | #' ### This function is typically only called by runAbsolute via the 26 | #' ### fun.filterVcf and args.filterVcf comments. 27 | #' library(VariantAnnotation) 28 | #' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 29 | #' vcf <- readVcf(vcf.file, "hg19") 30 | #' vcf.filtered <- filterVcfMuTect(vcf) 31 | #' 32 | #' @export filterVcfMuTect 33 | filterVcfMuTect <- function(vcf, tumor.id.in.vcf = NULL, stats.file = NULL, 34 | ignore=c("clustered_read_position", "fstar_tumor_lod", "nearby_gap_events", 35 | "poor_mapping_region_alternate_allele_mapq", "poor_mapping_region_mapq0", 36 | "possible_contamination", "strand_artifact", "seen_in_panel_of_normals"), 37 | ...){ 38 | if (is.null(stats.file) && .detectCaller(vcf) == "MuTect2/GATK4") { 39 | flog.info("Detected MuTect2 VCF.") 40 | return(filterVcfMuTect2(vcf, tumor.id.in.vcf, ...)) 41 | } 42 | if (is.null(stats.file)) return( 43 | filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 44 | 45 | stats <- read.delim(stats.file, as.is=TRUE, skip=1) 46 | 47 | if (is.null(stats$contig) || is.null(stats$position)) { 48 | flog.warn("MuTect stats file lacks contig and position columns.") 49 | return(filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 50 | } 51 | 52 | # check for excessive flags that can point to input data issues, 53 | # correct variants that were incorrectly flagged 54 | for (flag in c("nearby_gap_events", "seen_in_panel_of_normals")) { 55 | if (flag %in% ignore && 56 | sum(grepl(flag, stats$failure_reasons))/nrow(stats) > 0.2) { 57 | ignore <- ignore[-match(flag, ignore)] 58 | flog.warn("Excessive %s, ignoring this flag. Check your data.", flag) 59 | } 60 | } 61 | gr.stats <- GRanges(seqnames=stats$contig, 62 | IRanges(start=stats$position, end=stats$position)) 63 | 64 | ov <- findOverlaps(vcf, gr.stats) 65 | 66 | if (!identical(queryHits(ov),subjectHits(ov)) || 67 | nrow(vcf) != nrow(stats)) { 68 | n <- .countVariants(vcf) 69 | stats <- stats[subjectHits(ov),] 70 | vcf <- .removeVariants(vcf, !seq(length(vcf)) %in% queryHits(ov), 71 | "MuTect align") 72 | flog.warn("MuTect stats file and VCF file do not align perfectly. Will remove %i unmatched variants.", 73 | n-.countVariants(vcf)) 74 | } 75 | if (is.null(stats$failure_reasons)) { 76 | flog.warn("MuTect stats file lacks failure_reasons column.%s", 77 | " Keeping all variants listed in stats file.") 78 | return(filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 79 | } 80 | 81 | n <- .countVariants(vcf) 82 | 83 | ids <- sort(unique(unlist(sapply(ignore, grep, stats$failure_reasons)))) 84 | vcf <- .removeVariants(vcf, ids, "MuTect") 85 | 86 | flog.info("Removing %i MuTect calls due to blacklisted failure reasons.", 87 | n-.countVariants(vcf)) 88 | filterVcfBasic(vcf, tumor.id.in.vcf, ...) 89 | } 90 | 91 | .detectCaller <- function(vcf) { 92 | gatkVersion <- meta(header(vcf))[["GATKCommandLine"]]$Version[1] 93 | if (!is.null(gatkVersion)) { 94 | gatkVersion <- gsub("\\\"", "", gatkVersion) 95 | if (grepl("^4", gatkVersion)) return("MuTect2/GATK4") 96 | } 97 | return("") 98 | } 99 | -------------------------------------------------------------------------------- /R/readCurationFile.R: -------------------------------------------------------------------------------- 1 | #' Read curation file 2 | #' 3 | #' Function that can be used to read the curated output of the 4 | #' \code{\link{runAbsoluteCN}} function. 
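#' The curation file is expected to provide numeric \code{Purity} and
#' \code{Ploidy} values and logical \code{Curated}, \code{Flagged} and
#' \code{Failed} columns, as written by \code{\link{createCurationFile}}.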
5 | #' 6 | #' 7 | #' @param file.rds Output of the \code{\link{runAbsoluteCN}} function, 8 | #' serialized with \code{saveRDS}. 9 | #' @param file.curation Filename of a curation file that points to the correct 10 | #' tumor purity and ploidy solution. 11 | #' @param remove.failed Do not return solutions that failed. 12 | #' @param report.best.only Only return correct/best solution (useful on low 13 | #' memory machines when lots of samples are loaded). 14 | #' @param min.ploidy Minimum ploidy to be considered. If \code{NULL}, all. Can 15 | #' be used to automatically ignore unlikely solutions. 16 | #' @param max.ploidy Maximum ploidy to be considered. If \code{NULL}, all. Can 17 | #' be used to automatically ignore unlikely solutions. 18 | #' @return The return value of the corresponding \code{\link{runAbsoluteCN}} 19 | #' call, but with the results array manipulated according to the curation CSV file 20 | #' and arguments of this function. 21 | #' @author Markus Riester 22 | #' @seealso \code{\link{runAbsoluteCN} \link{createCurationFile}} 23 | #' @examples 24 | #' 25 | #' data(purecn.example.output) 26 | #' file.rds <- "Sample1_PureCN.rds" 27 | #' createCurationFile(file.rds) 28 | #' # User can change the maximum likelihood solution manually in the generated 29 | #' # CSV file. The correct solution is then loaded with readCurationFile. 30 | #' purecn.curated.example.output <- readCurationFile(file.rds) 31 | #' 32 | #' @export readCurationFile 33 | #' @importFrom utils read.csv 34 | readCurationFile <- function(file.rds, 35 | file.curation = gsub(".rds$", ".csv", file.rds), 36 | remove.failed = FALSE, report.best.only = FALSE, min.ploidy = NULL, 37 | max.ploidy = NULL) { 38 | flog.info("Reading %s...", file.rds) 39 | res <- readRDS(file.rds) 40 | if (!file.exists(file.curation)) { 41 | flog.warn("Curation file %s does not exist, creating one.", file.curation) 42 | output <- try(createCurationFile(file.rds)) 43 | if (is(output, "try-error")) { 44 | flog.warn("Failed to write %s: %s", file.curation, output) 45 | return(res) 46 | } 47 | } 48 | curation <- read.csv(file.curation, as.is=TRUE, nrows=1) 49 | .checkLogical <- function(field) { 50 | if (!is.logical(curation[[field]])) { 51 | .stopUserError("'", field, "' column in ", file.curation, 52 | " not logical(1).") 53 | } 54 | } 55 | .checkLogical("Failed") 56 | .checkLogical("Curated") 57 | .checkLogical("Flagged") 58 | 59 | ## Mark all solutions as failed if sample is curated as failed 60 | if (curation$Failed) { 61 | if (remove.failed) return(NA) 62 | for (i in seq_along(res$results)) res$results[[i]]$failed <- TRUE 63 | } else { 64 | for (i in seq_along(res$results)) res$results[[i]]$failed <- FALSE 65 | } 66 | 67 | # Make sure purity and ploidy are numeric. Stop (rather than warn) if they are not.
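# suppressWarnings() turns non-numeric curation entries into NA; the
# range check below then rejects NA values, purities outside [0, 1] and
# ploidies outside [0, 8] before the closest matching solution is moved
# to the front of the results list.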
68 | curation$Purity <- suppressWarnings(as.numeric(curation$Purity)) 69 | curation$Ploidy <- suppressWarnings(as.numeric(curation$Ploidy)) 70 | 71 | if (is.na(curation$Purity) || is.na(curation$Ploidy) || 72 | curation$Purity < 0 || curation$Purity > 1 || 73 | curation$Ploidy < 0 || curation$Ploidy > 8) { 74 | .stopUserError("Purity or Ploidy not numeric or in expected range.") 75 | } 76 | res$results <- .findClosestSolution(res$results, curation$Purity, 77 | curation$Ploidy) 78 | 79 | ## Filter by ploidy if necessary 80 | ploidy <- sapply(res$results, function(x) x$ploidy) 81 | if (is.null(min.ploidy)) min.ploidy <- min(ploidy) 82 | if (is.null(max.ploidy)) max.ploidy <- max(ploidy) 83 | idxPloidyOk <- which(ploidy>=min.ploidy & ploidy <= max.ploidy) 84 | res$results <- res$results[idxPloidyOk] 85 | 86 | if (report.best.only) { 87 | res$results <- res$results[1] 88 | } 89 | res 90 | } 91 | 92 | .findClosestSolution <- function(results, purity, ploidy, ploidy.div = 6) { 93 | # Find purity/ploidy solution most similar to curation 94 | diffCurated <- vapply(results, function(x) { 95 | abs(x$purity - purity) + (abs(x$ploidy - ploidy) / ploidy.div) 96 | }, double(1)) 97 | idxCurated <- which.min(diffCurated) 98 | if (idxCurated != 1) { 99 | results[c(1, idxCurated)] <- results[c(idxCurated, 1)] 100 | } 101 | results 102 | } 103 | -------------------------------------------------------------------------------- /R/readAllelicCountsFile.R: -------------------------------------------------------------------------------- 1 | #' Read allelic counts file 2 | #' 3 | #' Read file containing counts of ref and alt alleles of common 4 | #' SNPs generated by external tools such as the Genome Analysis 5 | #' Toolkit 4. 6 | #' 7 | #' @param file Input file containing counts of ref and alt alleles 8 | #' @param format File format. If missing, derived from the file 9 | #' extension. Currently only GATK4 CollectAllelicCounts (tsv) 10 | #' format is supported. 11 | #' @param zero Start position is 0-based. Default is \code{FALSE} 12 | #' for GATK, \code{TRUE} for BED file based intervals. 13 | #' @return A \code{CollapsedVCF} with the parsed allelic counts.
14 | #' @author Markus Riester 15 | #' @examples 16 | #' 17 | #' ac.file <- system.file("extdata", "example_allelic_counts.tsv", 18 | #' package="PureCN") 19 | #' vcf_ac <- readAllelicCountsFile(ac.file) 20 | #' 21 | #' @importFrom utils write.table 22 | #' @importFrom Biostrings DNAStringSet DNAStringSetList 23 | #' @export readAllelicCountsFile 24 | readAllelicCountsFile <- function(file, format, zero=NULL) { 25 | if (missing(format)) format <- "tsv" 26 | .readAllelicCountsFileGatk4(file, zero) 27 | } 28 | 29 | .writeAllelicCountsFileGatk <- function(vcf, id = 1, file) { 30 | outputCounts <- data.frame( 31 | CONTIG = seqnames(vcf), 32 | POSITION = start(vcf), 33 | REF_COUNT = sapply(geno(vcf)$AD[,id], function(x) x[1]), 34 | ALT_COUNT = sapply(geno(vcf)$AD[,id], function(x) x[2]), 35 | REF_NUCLEOTIDE = as.character(ref(vcf)), 36 | ALT_NUCLEOTIDE = unlist(CharacterList(alt(vcf))) 37 | ) 38 | con <- file(file, open = "w") 39 | .writeGATKHeader(vcf, id, con, "allelic counts") 40 | write.table(outputCounts, con, row.names = FALSE, quote = FALSE, sep = "\t") 41 | close(con) 42 | invisible(outputCounts) 43 | } 44 | 45 | .parseGATKHeader <- function(con) { 46 | .extractField <- function(line, field) { 47 | fields <- strsplit(line, "\t")[[1]] 48 | key <- paste0("^", field, ":") 49 | fields <- fields[grep(key, fields)] 50 | gsub(key, "", fields[1]) 51 | } 52 | sid <- NULL 53 | sl <- list() 54 | while ( TRUE ) { 55 | line <- readLines(con, n = 1) 56 | if ( length(line) == 0 || !grepl("^@", line)[1]) { 57 | break 58 | } 59 | if (grepl("^@RG", line)[1]) sid <- .extractField(line, "SM") 60 | if (grepl("^@SQ", line)[1]) { 61 | sl[[.extractField(line, "SN")]] <- .extractField(line, "LN") 62 | } 63 | } 64 | return(list(sid = sid, sl = sl, last_line = line)) 65 | } 66 | 67 | .readAllelicCountsFileGatk4 <- function(file, zero) { 68 | if (!is.null(zero)) flog.warn("zero ignored for GATK4 files.") 69 | con <- file(file, open = "r") 70 | header <- .parseGATKHeader(con) 71 | inputCounts <- try(read.delim(con, header = FALSE, stringsAsFactors = FALSE)) 72 | if (is(inputCounts, "try-error")) { 73 | .stopUserError("Error reading AllelicCountsFile ", file) 74 | } 75 | colnames(inputCounts) <- strsplit(header$last_line, "\t")[[1]] 76 | close(con) 77 | gr <- GRanges(seqnames = inputCounts$CONTIG, IRanges(start = inputCounts$POSITION, end = inputCounts$POSITION)) 78 | vcf <- VCF(gr, 79 | colData = DataFrame(Samples = 1, row.names = header$sid), 80 | exptData = list(header = VCFHeader(samples = header$sid))) 81 | ref(vcf) <- DNAStringSet(inputCounts$REF_NUCLEOTIDE) 82 | #alt(vcf) <- DNAStringSetList(split(inputCounts$ALT_NUCLEOTIDE, seq(length(vcf)))) 83 | alt(vcf) <- DNAStringSetList(as.list(inputCounts$ALT_NUCLEOTIDE)) 84 | 85 | info(header(vcf)) <- DataFrame( 86 | Number = "0", 87 | Type = "Flag", 88 | Description = "Likely somatic status, based on SOMATIC or Cosmic.CNT info fields, population allele frequency, or germline database membership", 89 | row.names = "DB") 90 | 91 | geno(header(vcf)) <- DataFrame( 92 | Number =".", 93 | Type = "Integer", 94 | Description = "Allelic depths for the ref and alt alleles in the order listed", 95 | row.names = "AD") 96 | 97 | info(vcf)$DB <- TRUE 98 | geno(vcf)$AD <- matrix(lapply(seq(nrow(inputCounts)), function(i) 99 | c(inputCounts$REF_COUNT[i], inputCounts$ALT_COUNT[i])), 100 | ncol = 1, dimnames = list(NULL, header$sid)) 101 | 102 | names(vcf) <- paste0(seqnames(vcf), ":", start(vcf)) 103 | if (length(header$sl)) { 104 | header$sl <- sapply(header$sl, as.numeric) 105 | 
seqlengths(vcf) <- header$sl[names(seqlengths(vcf))] 106 | } 107 | .readAndCheckVcf(vcf) 108 | } 109 | --------------------------------------------------------------------------------
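A minimal usage sketch (not part of the package sources), assuming the example_allelic_counts.tsv file shipped in inst/extdata and the exported readAllelicCountsFile() shown above; the geno()/info() accessors come from VariantAnnotation:

library(PureCN)
library(VariantAnnotation)  # geno() and info() accessors used below
ac.file <- system.file("extdata", "example_allelic_counts.tsv",
    package = "PureCN")
vcf_ac <- readAllelicCountsFile(ac.file)
vcf_ac                      # CollapsedVCF; sample name taken from the @RG SM tag
head(geno(vcf_ac)$AD[, 1])  # ref/alt read counts per position
table(info(vcf_ac)$DB)      # the parser flags every position as likely germline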