├── .github
│   ├── .gitignore
│   └── ISSUE_TEMPLATE
│       └── issue-report.md
├── inst
│   ├── extdata
│   │   ├── ex2_reference.fa.fai
│   │   ├── ex3_reference.fa.fai
│   │   ├── example_normal.list
│   │   ├── ex1.bam
│   │   ├── ex1.bam.bai
│   │   ├── ex1_intervals.txt
│   │   ├── example.vcf.gz
│   │   ├── issue62.vcf.gz
│   │   ├── gatk4_pon_db.tgz
│   │   ├── issue109.vcf.gz
│   │   ├── issue184.vcf.gz
│   │   ├── issue184_2.vcf.gz
│   │   ├── issue184_2_mb.rds
│   │   ├── buggy_cnvkit.seg.gz
│   │   ├── ex2_intervals.bed
│   │   ├── ex3_intervals.bed
│   │   ├── example_vcf.vcf.gz
│   │   ├── issue62.vcf.gz.tbi
│   │   ├── normalpanel.vcf.gz
│   │   ├── ex1_gcgene.txt
│   │   ├── ex2_mappability.bigWig
│   │   ├── example_cosmic.vcf.gz
│   │   ├── example_mutect2.vcf.gz
│   │   ├── example_normal.txt.gz
│   │   ├── example_normal2.txt.gz
│   │   ├── example_normal5.hdf5
│   │   ├── example_single.vcf.gz
│   │   ├── example_tumor.txt.gz
│   │   ├── example_tumor2.txt.gz
│   │   ├── example_vcf.vcf.gz.tbi
│   │   ├── normalpanel.vcf.gz.tbi
│   │   ├── example_callable.bed.gz
│   │   ├── example_logratio.txt.gz
│   │   ├── gatk4_m2_test_pon_db.tgz
│   │   ├── ex2_intervals.txt
│   │   ├── example_cosmic.vcf.gz.tbi
│   │   ├── example_mutect2.vcf.gz.tbi
│   │   ├── example_single.vcf.gz.tbi
│   │   ├── example_gatk4_denoised_cr.tsv.gz
│   │   ├── example_gatk4_modelfinal.seg.gz
│   │   ├── example_intervals_tiny_ot.txt.gz
│   │   ├── ex3_mappability.bed
│   │   ├── ex2_mappability.bed
│   │   ├── ex1_intervals_headered.txt
│   │   ├── ex2_reptiming.bed
│   │   ├── example_normal3.cnn
│   │   ├── issue192_tumor.seg
│   │   ├── example_normal4.cnr
│   │   ├── test_coverage_overlapping_intervals.txt
│   │   ├── dist
│   │   │   ├── calculateSbdry.R
│   │   │   └── downloadCentromeres.R
│   │   ├── example_allelic_counts_empty.tsv
│   │   ├── issue192_tumor.cnr
│   │   ├── example_allelic_counts.tsv
│   │   ├── ex2_reference.fa
│   │   ├── ex3_reference.fa
│   │   └── example_seg.txt
│   └── CITATION
├── data
│   ├── centromeres.rda
│   ├── purecn.DNAcopy.bdry.rda
│   └── purecn.example.output.rda
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test_plotAbs.R
│       ├── test_getSexFromVcf.R
│       ├── test_findFocal.R
│       ├── test_setPriorVcf.R
│       ├── test_callCIN.R
│       ├── test_adjustLogRatio.R
│       ├── test_annotateTargets.R
│       ├── test_callAlterationsFromSegmentation.R
│       ├── test_callAlterations.R
│       ├── test_bootstrapResults.R
│       ├── test_poolCoverage.R
│       ├── test_calculateLogRatio.R
│       ├── test_readSegmentationFile.R
│       ├── test_calculatePowerDetectSomatic.R
│       ├── test_readLogRatioFile.R
│       ├── test_callMutationBurden.R
│       ├── test_predictSomatic.R
│       ├── test_readAllelicCountsFile.R
│       ├── test_callLOH.R
│       ├── test_getSexFromCoverage.R
│       ├── test_callAmplificationsInLowPurity.R
│       ├── test_correctCoverageBias.R
│       ├── test_calculateBamCoverageByInterval.R
│       ├── test_readCoverageFile.R
│       ├── test_createNormalDatabase.R
│       ├── test_segmentation.R
│       └── test_createCurationFile.R
├── .Rbuildignore
├── man
│   ├── purecn.DNAcopy.bdry.Rd
│   ├── purecn.example.output.Rd
│   ├── PureCN-deprecated.Rd
│   ├── PureCN-defunct.Rd
│   ├── readAllelicCountsFile.Rd
│   ├── callLOH.Rd
│   ├── centromeres.Rd
│   ├── readIntervalFile.Rd
│   ├── readLogRatioFile.Rd
│   ├── annotateTargets.Rd
│   ├── createCurationFile.Rd
│   ├── poolCoverage.Rd
│   ├── calculateLogRatio.Rd
│   ├── readSegmentationFile.Rd
│   ├── readCoverageFile.Rd
│   ├── bootstrapResults.Rd
│   ├── callCIN.Rd
│   ├── findHighQualitySNPs.Rd
│   ├── adjustLogRatio.Rd
│   ├── callAlterations.Rd
│   ├── predictSomatic.Rd
│   ├── calculateTangentNormal.Rd
│   ├── filterVcfMuTect2.Rd
│   ├── getSexFromCoverage.Rd
│   ├── findFocal.Rd
│   ├── readCurationFile.Rd
│   ├── filterVcfMuTect.Rd
│   ├── setMappingBiasVcf.Rd
│   ├── correctCoverageBias.Rd
│   ├── calculateBamCoverageByInterval.Rd
│   ├── callAmplificationsInLowPurity.Rd
│   ├── calculateMappingBiasVcf.Rd
│   ├── callAlterationsFromSegmentation.Rd
│   ├── calculateMappingBiasGatk4.Rd
│   ├── createNormalDatabase.Rd
│   ├── getSexFromVcf.Rd
│   ├── setPriorVcf.Rd
│   ├── calculatePowerDetectSomatic.Rd
│   ├── segmentationHclust.Rd
│   ├── callMutationBurden.Rd
│   ├── processMultipleSamples.Rd
│   ├── segmentationGATK4.Rd
│   ├── plotAbs.Rd
│   ├── filterIntervals.Rd
│   └── preprocessIntervals.Rd
├── R
│   ├── adjustLogRatio.R
│   ├── callCIN.R
│   ├── poolCoverage.R
│   ├── filterVcfMuTect2.R
│   ├── readIntervalFile.R
│   ├── findFocal.R
│   ├── createCurationFile.R
│   ├── processMultipleSamples.R
│   ├── bootstrapResults.R
│   ├── readLogRatioFile.R
│   ├── segmentationHclust.R
│   ├── calculateLogRatio.R
│   ├── annotateTargets.R
│   ├── setPriorVcf.R
│   ├── filterVcfMuTect.R
│   ├── readCurationFile.R
│   └── readAllelicCountsFile.R
├── DESCRIPTION
└── Dockerfile
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 
--------------------------------------------------------------------------------
/inst/extdata/ex2_reference.fa.fai:
--------------------------------------------------------------------------------
1 | seq1 800 6 80 81
2 | seq2 800 822 80 81
--------------------------------------------------------------------------------
/inst/extdata/ex3_reference.fa.fai:
--------------------------------------------------------------------------------
1 | chr1 800 6 80 81
2 | chr2 800 822 80 81
--------------------------------------------------------------------------------
/inst/extdata/example_normal.list:
--------------------------------------------------------------------------------
1 | example_normal.txt
2 | example_normal2.txt
--------------------------------------------------------------------------------
/data/centromeres.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/data/centromeres.rda
--------------------------------------------------------------------------------
/inst/extdata/ex1.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/ex1.bam
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(PureCN)
3 | 
4 | test_check("PureCN")
5 | 
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | .github
2 | .travis.yml
3 | LICENSE
4 | codecov.R
5 | ^\.github$
6 | Dockerfile
--------------------------------------------------------------------------------
/inst/extdata/ex1.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/ex1.bam.bai
--------------------------------------------------------------------------------
/inst/extdata/ex1_intervals.txt:
--------------------------------------------------------------------------------
1 | Targets
2 | seq1:1000-2000
3 | seq2:100-1000
4 | seq2:1001-2000
--------------------------------------------------------------------------------
/inst/extdata/example.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example.vcf.gz
--------------------------------------------------------------------------------
/inst/extdata/issue62.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue62.vcf.gz -------------------------------------------------------------------------------- /data/purecn.DNAcopy.bdry.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/data/purecn.DNAcopy.bdry.rda -------------------------------------------------------------------------------- /data/purecn.example.output.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/data/purecn.example.output.rda -------------------------------------------------------------------------------- /inst/extdata/gatk4_pon_db.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/gatk4_pon_db.tgz -------------------------------------------------------------------------------- /inst/extdata/issue109.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue109.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue184.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue184.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue184_2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue184_2.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue184_2_mb.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue184_2_mb.rds -------------------------------------------------------------------------------- /inst/extdata/buggy_cnvkit.seg.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/buggy_cnvkit.seg.gz -------------------------------------------------------------------------------- /inst/extdata/ex2_intervals.bed: -------------------------------------------------------------------------------- 1 | seq1 100 250 2 | seq1 300 650 3 | seq2 0 150 4 | seq2 400 550 5 | seq2 700 750 6 | -------------------------------------------------------------------------------- /inst/extdata/ex3_intervals.bed: -------------------------------------------------------------------------------- 1 | chr1 100 250 2 | chr1 300 650 3 | chr2 0 150 4 | chr2 400 550 5 | chr2 700 750 6 | -------------------------------------------------------------------------------- /inst/extdata/example_vcf.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_vcf.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/issue62.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/issue62.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/normalpanel.vcf.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/normalpanel.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/ex1_gcgene.txt: -------------------------------------------------------------------------------- 1 | Targets gc_bias 2 | seq1:1000-2000 0.45 3 | seq2:100-1000 0.55 4 | seq2:1001-2000 0.46 5 | -------------------------------------------------------------------------------- /inst/extdata/ex2_mappability.bigWig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/ex2_mappability.bigWig -------------------------------------------------------------------------------- /inst/extdata/example_cosmic.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_cosmic.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/example_mutect2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_mutect2.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/example_normal.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_normal.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_normal2.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_normal2.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_normal5.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_normal5.hdf5 -------------------------------------------------------------------------------- /inst/extdata/example_single.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_single.vcf.gz -------------------------------------------------------------------------------- /inst/extdata/example_tumor.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_tumor.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_tumor2.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_tumor2.txt.gz -------------------------------------------------------------------------------- /inst/extdata/example_vcf.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_vcf.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/normalpanel.vcf.gz.tbi: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/normalpanel.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_callable.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_callable.bed.gz -------------------------------------------------------------------------------- /inst/extdata/example_logratio.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_logratio.txt.gz -------------------------------------------------------------------------------- /inst/extdata/gatk4_m2_test_pon_db.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/gatk4_m2_test_pon_db.tgz -------------------------------------------------------------------------------- /inst/extdata/ex2_intervals.txt: -------------------------------------------------------------------------------- 1 | Target 2 | seq1:101-250 3 | seq1:301-650 4 | seq2:1-150 5 | seq2:401-550 6 | seq2:701-750 7 | -------------------------------------------------------------------------------- /inst/extdata/example_cosmic.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_cosmic.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_mutect2.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_mutect2.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_single.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_single.vcf.gz.tbi -------------------------------------------------------------------------------- /inst/extdata/example_gatk4_denoised_cr.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_gatk4_denoised_cr.tsv.gz -------------------------------------------------------------------------------- /inst/extdata/example_gatk4_modelfinal.seg.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_gatk4_modelfinal.seg.gz -------------------------------------------------------------------------------- /inst/extdata/example_intervals_tiny_ot.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lima1/PureCN/HEAD/inst/extdata/example_intervals_tiny_ot.txt.gz -------------------------------------------------------------------------------- /inst/extdata/ex3_mappability.bed: -------------------------------------------------------------------------------- 1 | chr1 0 250 . 1 . 2 | chr1 250 650 . 1 . 3 | chr2 0 150 . 0.7 . 4 | chr2 150 550 . 1 . 5 | chr2 550 750 . 1 . 
6 | -------------------------------------------------------------------------------- /inst/extdata/ex2_mappability.bed: -------------------------------------------------------------------------------- 1 | seq1 0 250 . 1 . 2 | seq1 250 650 . 1 . 3 | seq2 0 150 . 0.699999988079071 . 4 | seq2 150 550 . 1 . 5 | seq2 550 750 . 1 . 6 | -------------------------------------------------------------------------------- /inst/extdata/ex1_intervals_headered.txt: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 2 | @SQ SN:seq1 LN:1575 3 | @SQ SN:seq2 LN:1584 4 | Targets 5 | seq1:1000-1500 6 | seq2:100-1000 7 | seq2:1001-1500 8 | -------------------------------------------------------------------------------- /inst/extdata/ex2_reptiming.bed: -------------------------------------------------------------------------------- 1 | seq1 0 100 . 10 2 | seq1 100 200 . 15 3 | seq1 200 300 . 20 4 | seq1 300 400 . 10 5 | seq1 400 500 . 12 6 | seq2 0 150 . 50 7 | seq2 150 300 . 80 8 | seq2 300 750 . 10 9 | -------------------------------------------------------------------------------- /inst/extdata/example_normal3.cnn: -------------------------------------------------------------------------------- 1 | chromosome start end gene depth log2 2 | chr1 762097 762270 LINC00115 174.89 7.45031 3 | chr1 861281 861490 SAMD11 28.9043 4.85321 4 | chr1 865591 865791 SAMD11 51.26 5.67976 5 | chr1 866325 866498 SAMD11 14 3.80735 6 | -------------------------------------------------------------------------------- /inst/extdata/issue192_tumor.seg: -------------------------------------------------------------------------------- 1 | ID chrom loc.start loc.end num.mark seg.mean 2 | tumor 1 105930 57961455 5665 0.0372507 3 | tumor 1 61680889 80343975 668 0.163111 4 | tumor 1 81495027 136720674 2523 -0.648407 5 | tumor 1 136791606 151618297 777 0.119727 6 | tumor 1 151809054 152269420 71 0.736822 7 | tumor 1 152277886 170640583 1200 0.111717 8 | -------------------------------------------------------------------------------- /inst/extdata/example_normal4.cnr: -------------------------------------------------------------------------------- 1 | chromosome start end gene log2 depth weight 2 | chr1 10500 68590 Background 0.55584 0.70587 0.466868 3 | chr1 70509 176917 Background 0.235896 1.02411 0.482562 4 | chr1 227917 267219 Background 0.163203 0.387996 0.408305 5 | chr1 318219 367158 Background 0.375418 1.42616 0.424955 6 | chr1 367658 367893 . 
0.68569 17.617 0.310347 7 | -------------------------------------------------------------------------------- /inst/extdata/test_coverage_overlapping_intervals.txt: -------------------------------------------------------------------------------- 1 | Target total_coverage average_coverage 2 | chr1:1216042-1216047 316.551528468946 80.8786439075042 3 | chr1:1216045-1216050 316.551528468946 80.8786439075042 4 | chr1:1216606-1216678 5839.39523091608 129.022717424915 5 | chr1:1216791-1216991 26857.8564530621 220.417338871495 6 | chr1:1216991-1217991 26857.8564530621 220.417338871495 7 | -------------------------------------------------------------------------------- /tests/testthat/test_plotAbs.R: -------------------------------------------------------------------------------- 1 | context("plotAbs") 2 | 3 | test_that("Exceptions happen with wrong input", { 4 | data(purecn.example.output) 5 | expect_error( plotAbs(purecn.example.output, id = "hello", "BAF"), 6 | "No solution with id hello") 7 | expect_error( plotAbs(purecn.example.output, id = 100, "BAF"), 8 | "No solution with id 100") 9 | }) 10 | -------------------------------------------------------------------------------- /inst/extdata/dist/calculateSbdry.R: -------------------------------------------------------------------------------- 1 | library(PureCN) 2 | 3 | alpha <- formals(segmentationCBS)$alpha 4 | eta <- formals(segment)$eta 5 | nperm <- formals(segment)$nperm 6 | max.ones <- floor(nperm * alpha) + 1 7 | set.seed(123) 8 | 9 | purecn.DNAcopy.bdry <- getbdry(eta, nperm, max.ones) 10 | save(purecn.DNAcopy.bdry, file="~/git/PureCN/data/purecn.DNAcopy.bdry.rda", compress="xz") 11 | -------------------------------------------------------------------------------- /man/purecn.DNAcopy.bdry.Rd: -------------------------------------------------------------------------------- 1 | \name{purecn.DNAcopy.bdry} 2 | \docType{data} 3 | \alias{purecn.DNAcopy.bdry} 4 | \title{DNAcopy boundary data} 5 | \description{ 6 | This provides the output of the \code{DNAcopy::getbdry} call using \code{\link{segmentationCBS}} 7 | default parameters. 8 | } 9 | \usage{data(purecn.DNAcopy.bdry)} 10 | \value{Output of the \code{DNAcopy::getbdry} call.} 11 | \keyword{datasets} 12 | -------------------------------------------------------------------------------- /man/purecn.example.output.Rd: -------------------------------------------------------------------------------- 1 | \name{purecn.example.output} 2 | \docType{data} 3 | \alias{purecn.example.output} 4 | \title{Example output} 5 | \description{ 6 | This provides the output of the \code{\link{runAbsoluteCN}} call used in the 7 | vignette and examples. 
8 | } 9 | \usage{data(purecn.example.output)} 10 | \value{Output of the \code{\link{runAbsoluteCN}} call used in the vignette.} 11 | \keyword{datasets} 12 | -------------------------------------------------------------------------------- /tests/testthat/test_getSexFromVcf.R: -------------------------------------------------------------------------------- 1 | context("getSexFromVcf") 2 | 3 | test_that("Example data is called correctly", { 4 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 5 | vcf <- readVcf(vcf.file, "hg19") 6 | sex <- getSexFromVcf(vcf) 7 | expect_true(is.na(sex)) 8 | vcfs <- vcf[info(vcf)$SOMATIC] 9 | getSexFromVcf(vcfs, "LIB-02240e4") 10 | expect_true(is.na(sex)) 11 | }) 12 | -------------------------------------------------------------------------------- /tests/testthat/test_findFocal.R: -------------------------------------------------------------------------------- 1 | context("findFocal") 2 | 3 | test_that("Example data is called correctly", { 4 | data(purecn.example.output) 5 | ret <- findFocal(purecn.example.output$results[[1]]$seg) 6 | expect_equal(class(ret), "logical") 7 | expect_true(nrow(purecn.example.output$results[[1]]$seg) == 8 | length(ret)) 9 | expect_true(min(purecn.example.output$results[[1]]$seg[ret, 10 | "C"]) >= 5) 11 | }) 12 | -------------------------------------------------------------------------------- /man/PureCN-deprecated.Rd: -------------------------------------------------------------------------------- 1 | \name{PureCN-deprecated} 2 | \alias{PureCN-deprecated} 3 | \title{Deprecated functions in package \sQuote{PureCN}} 4 | 5 | \description{ 6 | These functions are provided for compatibility with older versions 7 | of \sQuote{PureCN} only, and will be defunct at the next release. 
8 | } 9 | 10 | \details{ 11 | The following functions are deprecated and will be made defunct; use 12 | the replacement indicated below: 13 | % \itemize{ 14 | % 15 | % } 16 | } 17 | -------------------------------------------------------------------------------- /tests/testthat/test_setPriorVcf.R: -------------------------------------------------------------------------------- 1 | context("setPriorVcf") 2 | 3 | test_that("Example data matches expected values", { 4 | vcf.file <- system.file("extdata", "example_vcf.vcf.gz", package = "PureCN") 5 | vcf <- readVcf(vcf.file, "hg19") 6 | vcf <- setPriorVcf(vcf) 7 | vcf.priorsomatic <- info(vcf)$PR 8 | expected <- c(2322, 9) 9 | names(expected) <- c(1e-04, 0.999) 10 | expect_equal(sort(table(vcf.priorsomatic))[2], expected[1]) 11 | expect_equal(sort(table(vcf.priorsomatic))[1], expected[2]) 12 | }) 13 | 14 | -------------------------------------------------------------------------------- /tests/testthat/test_callCIN.R: -------------------------------------------------------------------------------- 1 | context("callCIN") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | loh <- callLOH(purecn.example.output) 6 | loh$size <- loh$end - loh$start + 1 7 | idx <- loh$C == 2 8 | ret <- callCIN(purecn.example.output, reference.state = "normal", 9 | allele.specific = FALSE) 10 | expect_equal(sum(loh$size[!idx])/sum(loh$size), ret, tol = 0.001) 11 | loh <- loh[!is.na(loh$M),] 12 | ret <- callCIN(purecn.example.output) 13 | expect_equal(0.481, ret, tol = 0.02) 14 | ret <- callCIN(purecn.example.output, reference.state = "normal") 15 | idx <- loh$C == 2 & loh$M == 1 16 | expect_equal(sum(loh$size[!idx])/sum(loh$size), ret, tol = 0.001) 17 | }) 18 | -------------------------------------------------------------------------------- /tests/testthat/test_adjustLogRatio.R: -------------------------------------------------------------------------------- 1 | context("adjustLogRatio") 2 | 3 | test_that("Function returns expected values for example coverage", { 4 | data(purecn.example.output) 5 | log.ratio <- purecn.example.output$results[[1]]$seg$seg.mean 6 | purity <- purecn.example.output$results[[1]]$purity 7 | ploidy <- purecn.example.output$results[[1]]$ploidy 8 | log.ratio.adjusted <- adjustLogRatio(log.ratio, purity, ploidy) 9 | total.ploidy <- 1.73 10 | p <- 1 11 | log.ratio.offset <- 0 12 | opt.C <- (2^(log.ratio.adjusted + log.ratio.offset) * total.ploidy)/p - ((2 * (1 - p))/p) 13 | expect_lt(abs(min(log.ratio.adjusted, na.rm=TRUE) + 8), 0.001) 14 | expect_lt(median(abs(opt.C - purecn.example.output$results[[1]]$seg$C)), 0.1) 15 | }) 16 | 17 | -------------------------------------------------------------------------------- /tests/testthat/test_annotateTargets.R: -------------------------------------------------------------------------------- 1 | context("annotateTargets") 2 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 3 | library(org.Hs.eg.db) 4 | test_coverage <- readCoverageFile(system.file("extdata", "example_normal.txt.gz", 5 | package = "PureCN")) 6 | 7 | test_that("KIF1B is correctly annotated with UCSC chromosome names", { 8 | x <- head(test_coverage, 100) 9 | x <- annotateTargets(x, TxDb.Hsapiens.UCSC.hg19.knownGene, 10 | org.Hs.eg.db) 11 | expect_equal(x$Gene[67], "KIF1B") 12 | }) 13 | 14 | test_that("KIF1B is correctly annotated with NCBI chromosome names", { 15 | x <- head(test_coverage, 100) 16 | seqlevelsStyle(x) <- "Ensembl" 17 | x <- annotateTargets(x, TxDb.Hsapiens.UCSC.hg19.knownGene, 18 | org.Hs.eg.db) 
19 | expect_equal(x$Gene[67], "KIF1B") 20 | }) 21 | -------------------------------------------------------------------------------- /tests/testthat/test_callAlterationsFromSegmentation.R: -------------------------------------------------------------------------------- 1 | context("callAlterationsFromSegmentation") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | seg <- purecn.example.output$results[[1]]$seg 6 | interval.file <- system.file("extdata", "example_intervals.txt", 7 | package = "PureCN") 8 | calls <- callAlterationsFromSegmentation(sampleid = seg$ID, 9 | chr = seg$chrom, start = seg$loc.start, end = seg$loc.end, 10 | num.mark = seg$num.mark, seg.mean = seg$seg.mean, C = seg$C, 11 | interval.file = interval.file) 12 | calls2 <- callAlterations(purecn.example.output) 13 | expect_equal(sort(rownames(calls$Sample1[calls$Sample1$type == 14 | "AMPLIFICATION", ])), sort(rownames(calls2[calls2$type == 15 | "AMPLIFICATION", ]))) 16 | }) 17 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry(bibtype = "Article", 2 | author = c(person(given = "Markus", family = "Riester"), 3 | person(given = "Angad", family = "Singh"), 4 | person(given = "A. Rose", family = "Brannon"), 5 | person(given = "Kun", family = "Yu"), 6 | person(given = "Catarina D.", family = "Campbell"), 7 | person(given = "Derek Y.", family = "Chiang"), 8 | person(given = "Michael", family = "Morrissey")), 9 | title = "PureCN: Copy number calling and SNV classification using 10 | targeted short read sequencing", 11 | year="2016", 12 | volume="11", 13 | pages="13", 14 | doi="10.1186/s13029-016-0060-z", 15 | journal = "Source Code for Biology and Medicine" 16 | ) 17 | -------------------------------------------------------------------------------- /inst/extdata/example_allelic_counts_empty.tsv: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 2 | @SQ SN:chr1 LN:249250621 3 | @SQ SN:chr2 LN:243199373 4 | @SQ SN:chr3 LN:198022430 5 | @SQ SN:chr4 LN:191154276 6 | @SQ SN:chr5 LN:180915260 7 | @SQ SN:chr6 LN:171115067 8 | @SQ SN:chr7 LN:159138663 9 | @SQ SN:chr8 LN:146364022 10 | @SQ SN:chr9 LN:141213431 11 | @SQ SN:chr10 LN:135534747 12 | @SQ SN:chr11 LN:135006516 13 | @SQ SN:chr12 LN:133851895 14 | @SQ SN:chr13 LN:115169878 15 | @SQ SN:chr14 LN:107349540 16 | @SQ SN:chr15 LN:102531392 17 | @SQ SN:chr16 LN:90354753 18 | @SQ SN:chr17 LN:81195210 19 | @SQ SN:chr18 LN:78077248 20 | @SQ SN:chr19 LN:59128983 21 | @SQ SN:chr20 LN:63025520 22 | @SQ SN:chr21 LN:48129895 23 | @SQ SN:chr22 LN:51304566 24 | @SQ SN:chrX LN:155270560 25 | @SQ SN:chrY LN:59373566 26 | @SQ SN:chrM LN:16571 27 | @RG ID:PureCN SM:LIB-02240e4 28 | CONTIG POSITION REF_COUNT ALT_COUNT REF_NUCLEOTIDE ALT_NUCLEOTIDE 29 | -------------------------------------------------------------------------------- /tests/testthat/test_callAlterations.R: -------------------------------------------------------------------------------- 1 | context("callAlterations") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | calls <- callAlterations(purecn.example.output) 6 | expect_true(sum(calls$C < 6 & calls$C > 0.5) == 0) 7 | calls <- callAlterations(purecn.example.output, failed = TRUE) 8 | expect_true(sum(calls$gene.mean < 0.9 & calls$gene.mean > 9 | -0.9) == 0) 10 | esr2 <- callAlterations(purecn.example.output, all.genes = 
TRUE)["ESR2", ] 11 | expect_equal(as.character(esr2$chr), "chr14") 12 | expect_true(esr2$start > 64694600) 13 | expect_true(esr2$end < 64761128) 14 | }) 15 | 16 | test_that("issue_292 is fixed", { 17 | data(purecn.example.output) 18 | calls <- callAlterations(purecn.example.output, id = 2, all.genes = TRUE) 19 | expect_true(abs(mean(calls$C) - purecn.example.output$results[[2]]$ploidy) < 0.5) 20 | }) 21 | -------------------------------------------------------------------------------- /tests/testthat/test_bootstrapResults.R: -------------------------------------------------------------------------------- 1 | context("bootstrapResults") 2 | 3 | test_that("Bootstrapping removed solutions", { 4 | data(purecn.example.output) 5 | set.seed(123) 6 | ret <- bootstrapResults(purecn.example.output, n = 100, top = 2) 7 | expect_equal(ret$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 8 | expect_equal(ret$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 9 | expect_true(length(ret$results) < length(purecn.example.output$results)) 10 | expect_true(ret$results[[1]]$bootstrap.value >= 0.5) 11 | expect_true(ret$results[[2]]$bootstrap.value < 0.5) 12 | expect_true(length(ret$results) >= 2) 13 | ret <- bootstrapResults(purecn.example.output, n = 100, top = 3) 14 | expect_true(length(ret$results) >= 3) 15 | ret <- bootstrapResults(purecn.example.output, n = 100) 16 | expect_equal(length(purecn.example.output$results), length(ret$results)) 17 | }) 18 | 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue report 3 | about: For issues related to your PureCN output, please use this template if possible. 4 | Otherwise start with a blank issue. 5 | title: '' 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | **Describe the issue** 12 | A clear and concise description of what the issue is. 13 | 14 | **To Reproduce** 15 | Copy and paste your complete command line arguments from PureCN.R. If possible and potentially relevant, also copy the output of NormalDB.R and Coverage.R. 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | 20 | **Log file** 21 | Please copy and paste the log file (Sampleid.log) of a representative example 22 | 23 | **B-allele frequency plot** 24 | Please take a screenshot of the B-allele frequency plot of the maximum likelihood solution 25 | (Sampleid.pdf). 26 | 27 | **Session Info** 28 | Please start R, type sessionInfo() and paste the output. 29 | -------------------------------------------------------------------------------- /man/PureCN-defunct.Rd: -------------------------------------------------------------------------------- 1 | \name{PureCN-defunct} 2 | \alias{PureCN-defunct} 3 | \title{Defunct functions in package \sQuote{PureCN}} 4 | 5 | \description{ 6 | These functions are defunct and no longer available. 
7 | }
8 | 
9 | \details{
10 | The following functions are defunct; use
11 | the replacement indicated below:
12 | \itemize{
13 | \item{autoCurateResults: no replacement}
14 | \item{calculateGCContentByInterval: \code{\link{preprocessIntervals}}}
15 | \item{calculateIntervalWeights: \code{\link{createNormalDatabase}}}
16 | \item{createExonWeightFile: \code{\link{createNormalDatabase}}}
17 | \item{createSNPBlacklist: \code{\link{setMappingBiasVcf}}}
18 | \item{createTargetWeights: \code{\link{createNormalDatabase}}}
19 | \item{filterTargets: \code{\link{filterIntervals}}}
20 | \item{findBestNormal: \code{\link{calculateTangentNormal}}}
21 | \item{getDiploid: no replacement}
22 | \item{plotBestNormal: no replacement}
23 | \item{readCoverageGatk: \code{\link{readCoverageFile}}}
24 | }
25 | }
--------------------------------------------------------------------------------
/man/readAllelicCountsFile.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readAllelicCountsFile.R
3 | \name{readAllelicCountsFile}
4 | \alias{readAllelicCountsFile}
5 | \title{Read allelic counts file}
6 | \usage{
7 | readAllelicCountsFile(file, format, zero = NULL)
8 | }
9 | \arguments{
10 | \item{file}{Input file containing counts of ref and alt alleles}
11 | 
12 | \item{format}{File format. If missing, derived from the file
13 | extension. Currently only GATK4 CollectAllelicCounts (tsv)
14 | format supported.}
15 | 
16 | \item{zero}{Start position is 0-based. Default is \code{FALSE}
17 | for GATK, \code{TRUE} for BED file based intervals.}
18 | }
19 | \value{
20 | A \code{CollapsedVCF} with the parsed allelic counts.
21 | }
22 | \description{
23 | Read file containing counts of ref and alt alleles of common
24 | SNPs, generated by tools such as The Genome Analysis Toolkit 4.
25 | }
26 | \examples{
27 | 
28 | ac.file <- system.file("extdata", "example_allelic_counts.tsv",
29 |     package="PureCN")
30 | vcf_ac <- readAllelicCountsFile(ac.file)
31 | 
32 | }
33 | \author{
34 | Markus Riester
35 | }
--------------------------------------------------------------------------------
/man/callLOH.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/callLOH.R
3 | \name{callLOH}
4 | \alias{callLOH}
5 | \title{Get regions of LOH}
6 | \usage{
7 | callLOH(res, id = 1, arm.cutoff = 0.9, keep.no.snp.segments = TRUE)
8 | }
9 | \arguments{
10 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.}
11 | 
12 | \item{id}{Candidate solution to extract LOH from. \code{id=1} will use the
13 | maximum likelihood solution.}
14 | 
15 | \item{arm.cutoff}{Min fraction LOH on a chromosome arm to call whole arm
16 | events.}
17 | 
18 | \item{keep.no.snp.segments}{Segments without heterozygous SNPs
19 | have no LOH information. This defines whether these segments should
20 | be reported anyways.}
21 | }
22 | \value{
23 | Returns \code{data.frame} with LOH regions.
24 | }
25 | \description{
26 | This function provides detailed LOH information by region.
27 | } 28 | \examples{ 29 | 30 | data(purecn.example.output) 31 | head(callLOH(purecn.example.output)) 32 | 33 | } 34 | \seealso{ 35 | \code{\link{runAbsoluteCN}} 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /tests/testthat/test_poolCoverage.R: -------------------------------------------------------------------------------- 1 | context("poolCoverage") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 4 | package = "PureCN") 5 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 6 | package = "PureCN") 7 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 8 | 9 | test_that("Example coverage is averaged", { 10 | coverage <- lapply(normal.coverage.files, readCoverageFile) 11 | pool <- poolCoverage(coverage) 12 | expect_equal(coverage[[1]]$average.coverage + coverage[[2]]$average.coverage, 13 | pool$average.coverage) 14 | expect_equal(coverage[[1]]$coverage + coverage[[2]]$coverage, 15 | pool$coverage) 16 | pool2 <- poolCoverage(coverage, w = c(0.5, 0.5)) 17 | expect_equal((coverage[[1]]$coverage + coverage[[2]]$coverage) / 2, 18 | pool2$coverage) 19 | }) 20 | 21 | test_that("Exceptions happend with wrong input", { 22 | coverage <- lapply(normal.coverage.files, readCoverageFile) 23 | expect_error(poolCoverage(coverage, w = seq(3)), "different lengths") 24 | }) 25 | -------------------------------------------------------------------------------- /inst/extdata/dist/downloadCentromeres.R: -------------------------------------------------------------------------------- 1 | library(rtracklayer) 2 | library(data.table) 3 | library(PureCN) 4 | 5 | data(chr.hash) 6 | mySession <- browserSession("UCSC") 7 | genomes <- c("hg18", "hg19", "hg38") 8 | centromeres <- list() 9 | 10 | for (genome in genomes) { 11 | genome(mySession) <- genome 12 | if (genome == "hg38") { 13 | tbl.gaps <- getTable( ucscTableQuery(mySession,track="Centromeres", 14 | table="centromeres")) 15 | } else { 16 | tbl.gaps <- getTable( ucscTableQuery(mySession, track="Gap", 17 | table="gap")) 18 | tbl.gaps <- tbl.gaps[tbl.gaps$type=="centromere",] 19 | } 20 | tbl.gaps.dt <- data.table(tbl.gaps) 21 | tbl.centromeres <- as.data.frame(tbl.gaps.dt[, 22 | list(chromStart=min(chromStart),chromEnd=max(chromEnd)),by=chrom]) 23 | centromeres[[genome]] <- tbl.centromeres 24 | } 25 | 26 | centromeres <- lapply(centromeres, function(x) { 27 | x$chromNumerical <- chr.hash$number[match(x$chrom, chr.hash$chr)] 28 | x[order(x$chromNumerical),1:3] 29 | }) 30 | 31 | save(centromeres, file="data/centromeres.rda", compress="xz") 32 | -------------------------------------------------------------------------------- /tests/testthat/test_calculateLogRatio.R: -------------------------------------------------------------------------------- 1 | context("calculateLogRatio") 2 | 3 | test_that("Misaligned on- and off-target regions are aligned", { 4 | x <- readCoverageFile( 5 | system.file("extdata", "example_intervals_tiny_ot.txt.gz", 6 | package = "PureCN")) 7 | set.seed(123) 8 | l1 <- rnorm(length(x), mean = 0.25, sd=0.3) 9 | l2 <- rnorm(length(x), mean = -0.25, sd=0.3) 10 | x$log.ratio <- l1 11 | x$log.ratio[x$on.target] <- l2[x$on.target] 12 | expect_lt(t.test( x$log.ratio[x$on.target], x$log.ratio[!x$on.target])$p.value, 0.001) 13 | 14 | xc <- x 15 | xc$log.ratio <- PureCN:::.calibrate_off_target_log_ratio(x) 16 | expect_gt(t.test( xc$log.ratio[x$on.target], xc$log.ratio[!x$on.target])$p.value, 
0.001) 17 | 18 | x$log.ratio <- l2 19 | x$log.ratio[x$on.target] <- l1[x$on.target] 20 | expect_lt(t.test( x$log.ratio[x$on.target], x$log.ratio[!x$on.target])$p.value, 0.001) 21 | 22 | xc <- x 23 | xc$log.ratio <- PureCN:::.calibrate_off_target_log_ratio(x) 24 | expect_gt(t.test( xc$log.ratio[x$on.target], xc$log.ratio[!x$on.target])$p.value, 0.001) 25 | 26 | }) 27 | 28 | -------------------------------------------------------------------------------- /man/centromeres.Rd: -------------------------------------------------------------------------------- 1 | \name{centromeres} 2 | \alias{centromeres} 3 | \docType{data} 4 | \title{ 5 | A list of data.frames containing centromere positions. 6 | } 7 | \description{ 8 | A list of data.frames containing centromere positions for hg18, hg19 and hg38. 9 | Downloaded from the UCSC genome browser. 10 | } 11 | \usage{data(centromeres)} 12 | \value{ 13 | A list with three data frames, "hg18", "hg19", and "hg38". Each containes 14 | three columns 15 | \describe{ 16 | \item{\code{chrom}}{a factor with levels \code{chr1} \code{chr10} \code{chr11} \code{chr12} \code{chr13} \code{chr14} \code{chr15} \code{chr16} \code{chr17} \code{chr18} \code{chr19} \code{chr2} \code{chr20} \code{chr21} \code{chr22} \code{chr3} \code{chr4} \code{chr5} \code{chr6} \code{chr7} \code{chr8} \code{chr9} \code{chrX} \code{chrY}} 17 | \item{\code{chromStart}}{a numeric vector} 18 | \item{\code{chromEnd}}{a numeric vector} 19 | } 20 | } 21 | \references{ 22 | The script downloadCentromeres.R in the extdata directory was used to generate 23 | the data.frames. 24 | } 25 | \examples{ 26 | data(centromeres) 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /inst/extdata/issue192_tumor.cnr: -------------------------------------------------------------------------------- 1 | chromosome start end gene depth log2 weight 2 | 6 105929 106231 OR4F1P 73.2318 -0.0897448 0.739157 3 | 6 106533 106835 OR4F1P 88.4636 -0.137046 0.866497 4 | 6 203388 203765 AL035696.3,AL035696.1 104.095 -0.154834 0.880158 5 | 6 304510 304686 DUSP22 261.83 0.421809 0.735374 6 | 6 335056 335176 DUSP22 50.45 -0.122885 0.93186 7 | 6 335179 335299 DUSP22 82.7833 -0.0522508 0.789679 8 | 6 345832 346041 DUSP22 219.225 0.65042 0.724212 9 | 6 348419 348700 DUSP22 44.8505 -1.18952 0.479454 10 | 6 348735 348903 DUSP22 166.81 0.258922 0.933433 11 | 6 350872 351180 DUSP22 63.7468 -1.33098 0.157633 12 | 6 367261 367484 - 172.04 0.386116 0.805822 13 | 6 391692 391881 IRF4 54.0159 0.217291 0.162526 14 | 6 394719 394871 IRF4 113.289 0.195861 0.854984 15 | 6 394874 394994 IRF4 150.883 0.171075 0.948817 16 | 6 395754 395940 IRF4 74.6882 -0.281235 0.107269 17 | 6 398733 398924 IRF4 96.1047 -0.268284 0.811244 18 | 6 401490 401848 IRF4 253.478 0.580141 0.342134 19 | 6 404920 405216 IRF4 141.395 0.115385 0.954922 20 | 6 406554 406753 IRF4 67.0905 -0.166957 0.563356 21 | 6 406755 406875 IRF4 36 -0.465117 0.905217 22 | 6 407459 407580 IRF4 90.6446 0.0794558 0.902095 23 | -------------------------------------------------------------------------------- /man/readIntervalFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readIntervalFile.R 3 | \name{readIntervalFile} 4 | \alias{readIntervalFile} 5 | \title{Read interval file} 6 | \usage{ 7 | readIntervalFile(interval.file, strict = TRUE, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{interval.file}{A 
mapping file that assigns GC content and gene symbols
11 | to each exon in the coverage files. Used for generating gene-level calls.
12 | First column in format CHR:START-END. Second column GC content (0 to 1).
13 | Third column gene symbol. This file is generated with the
14 | \code{\link{preprocessIntervals}} function.}
15 | 
16 | \item{strict}{Error out with missing columns}
17 | 
18 | \item{verbose}{Verbose output}
19 | }
20 | \value{
21 | A \code{GRanges} object with the parsed intervals.
22 | }
23 | \description{
24 | Read file containing coordinates of on- and off-target intervals
25 | generated by \code{\link{preprocessIntervals}}.
26 | }
27 | \examples{
28 | 
29 | interval.file <- system.file("extdata", "example_intervals.txt",
30 |     package = "PureCN")
31 | x <- readIntervalFile(interval.file)
32 | 
33 | }
34 | \author{
35 | Markus Riester
36 | }
--------------------------------------------------------------------------------
/man/readLogRatioFile.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readLogRatioFile.R
3 | \name{readLogRatioFile}
4 | \alias{readLogRatioFile}
5 | \title{Read file containing interval-level log2 tumor/normal ratios}
6 | \usage{
7 | readLogRatioFile(file, format, zero = NULL)
8 | }
9 | \arguments{
10 | \item{file}{Log2 coverage file.}
11 | 
12 | \item{format}{File format. If missing, derived from the file
13 | extension. Currently GATK4 DenoiseReadCounts format supported.
14 | A simple GATK3-style format, two columns with coordinates
15 | as string in format chr:start-stop in first and log2-ratio
16 | in second is also supported.}
17 | 
18 | \item{zero}{Start position is 0-based. Default is \code{FALSE}
19 | for GATK, \code{TRUE} for BED file based intervals.}
20 | }
21 | \value{
22 | A \code{GRanges} with the log2 ratio.
23 | }
24 | \description{
25 | Read log2 ratio file produced by external tools like The Genome Analysis
26 | Toolkit version 4.
27 | }
28 | \examples{
29 | 
30 | logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz",
31 |     package = "PureCN")
32 | logratio <- readLogRatioFile(logratio.file)
33 | 
34 | }
35 | \author{
36 | Markus Riester
37 | }
--------------------------------------------------------------------------------
/man/annotateTargets.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/annotateTargets.R
3 | \name{annotateTargets}
4 | \alias{annotateTargets}
5 | \title{Annotate targets with gene symbols}
6 | \usage{
7 | annotateTargets(x, txdb, org)
8 | }
9 | \arguments{
10 | \item{x}{A \code{GRanges} object with intervals to annotate}
11 | 
12 | \item{txdb}{A \code{TxDb} database, e.g.
13 | \code{TxDb.Hsapiens.UCSC.hg19.knownGene}}
14 | 
15 | \item{org}{An \code{OrgDb} object, e.g. \code{org.Hs.eg.db}.}
16 | }
17 | \value{
18 | A \code{GRanges} object.
19 | }
20 | \description{
21 | This function can be used to add a \sQuote{Gene} meta column containing
22 | gene symbols to a \code{GRanges} object.
23 | It applies heuristics to find the protein coding genes that were
24 | likely meant to be targeted in the assay design in case transcripts
25 | overlap.
26 | } 27 | \examples{ 28 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 29 | library(org.Hs.eg.db) 30 | 31 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 32 | package = "PureCN") 33 | x <- head(readCoverageFile(normal.coverage.file), 100) 34 | x <- annotateTargets(x,TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db) 35 | 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /man/createCurationFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createCurationFile.R 3 | \name{createCurationFile} 4 | \alias{createCurationFile} 5 | \title{Create file to curate PureCN results} 6 | \usage{ 7 | createCurationFile( 8 | file.rds, 9 | overwrite.uncurated = TRUE, 10 | overwrite.curated = FALSE 11 | ) 12 | } 13 | \arguments{ 14 | \item{file.rds}{Output of the \code{\link{runAbsoluteCN}} function, 15 | serialized with \code{saveRDS}.} 16 | 17 | \item{overwrite.uncurated}{Overwrite existing files unless flagged as 18 | \sQuote{Curated}.} 19 | 20 | \item{overwrite.curated}{Overwrite existing files even if flagged as 21 | \sQuote{Curated}.} 22 | } 23 | \value{ 24 | A \code{data.frame} with the tumor purity and ploidy of the maximum 25 | likelihood solution. 26 | } 27 | \description{ 28 | Function to create a CSV file that can be used to mark the correct solution 29 | in the output of a \code{\link{runAbsoluteCN}} run. 30 | } 31 | \examples{ 32 | 33 | data(purecn.example.output) 34 | file.rds <- "Sample1_PureCN.rds" 35 | saveRDS(purecn.example.output, file = file.rds) 36 | createCurationFile(file.rds) 37 | 38 | } 39 | \seealso{ 40 | \code{\link{runAbsoluteCN}} 41 | } 42 | \author{ 43 | Markus Riester 44 | } 45 | -------------------------------------------------------------------------------- /man/poolCoverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/poolCoverage.R 3 | \name{poolCoverage} 4 | \alias{poolCoverage} 5 | \title{Pool coverage from multiple samples} 6 | \usage{ 7 | poolCoverage(all.data, remove.chrs = c(), w = NULL) 8 | } 9 | \arguments{ 10 | \item{all.data}{List of normals, read with \code{\link{readCoverageFile}}.} 11 | 12 | \item{remove.chrs}{Remove these chromosomes from the pool.} 13 | 14 | \item{w}{\code{numeric(length(all.data))} vector of weights. If \code{NULL}, 15 | weight all samples equally.} 16 | } 17 | \value{ 18 | A \code{data.frame} with the averaged coverage over all normals. 19 | } 20 | \description{ 21 | Averages the coverage of a list of samples. 
22 | } 23 | \examples{ 24 | 25 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 26 | package = "PureCN") 27 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 28 | package = "PureCN") 29 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 30 | pool <- poolCoverage(lapply(normal.coverage.files, readCoverageFile), 31 | remove.chrs = c("chrX", "chrY")) 32 | 33 | } 34 | \seealso{ 35 | \code{\link{readCoverageFile}} 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /tests/testthat/test_readSegmentationFile.R: -------------------------------------------------------------------------------- 1 | context("readSegmentationFile") 2 | 3 | test_that("Example DNAcopy data matches", { 4 | seg.file <- system.file("extdata", "example_seg.txt", 5 | package = "PureCN") 6 | seg <- readSegmentationFile(seg.file, "Sample1") 7 | offset <- -0.0033 8 | expect_equal(54, nrow(seg)) 9 | expect_equal(0.133381833060556 - offset, seg$seg.mean[1], tolerance = .0001) 10 | expect_equal(-0.6394 - offset, seg$seg.mean[54], tolerance = .0001) 11 | }) 12 | 13 | test_that("Example GATK4 data matches", { 14 | seg.file <- system.file("extdata", "example_gatk4_modelfinal.seg.gz", 15 | package = "PureCN") 16 | seg <- readSegmentationFile(seg.file, "Sample1") 17 | offset <- -0.0037 18 | expect_equal(23, nrow(seg)) 19 | expect_equal(-0.004295 - offset, seg$seg.mean[1], tolerance = .0001) 20 | expect_equal(0.002534 - offset, seg$seg.mean[23], tolerance = .0001) 21 | }) 22 | 23 | test_that("Missing values raise warning", { 24 | seg.file <- system.file("extdata", "buggy_cnvkit.seg.gz", 25 | package = "PureCN") 26 | expect_output(readSegmentationFile(seg.file, "SC_9030.tumour.recalibrated"), 27 | "Coordinates in seg.file contain missing values") 28 | }) 29 | 30 | -------------------------------------------------------------------------------- /tests/testthat/test_calculatePowerDetectSomatic.R: -------------------------------------------------------------------------------- 1 | context("calculatePowerDetectSomatic") 2 | 3 | test_that("Power is calculated correctly for examples", { 4 | p1 <- calculatePowerDetectSomatic(coverage = 5, purity = 1, 5 | ploidy = 2)$power 6 | p2 <- calculatePowerDetectSomatic(coverage = 5, f = 0.5)$power 7 | expect_equal(p1, 0.6407084, tolerance=0.0001) 8 | expect_equal(p2, 0.6407084, tolerance=0.0001) 9 | p3 <- calculatePowerDetectSomatic(coverage = 33, purity = 0.5, 10 | ploidy = 6)$power 11 | expect_equal(p3, 0.8, tolerance=0.001) 12 | p4 <- calculatePowerDetectSomatic(coverage = 330, purity = 0.2, 13 | ploidy = 2, cell.fraction = 0.2)$power 14 | expect_equal(p4, 0.8, tolerance=0.001) 15 | }) 16 | 17 | test_that("Exceptions happen with wrong input", { 18 | expect_error(calculatePowerDetectSomatic(coverage = 5)) 19 | expect_error(calculatePowerDetectSomatic(coverage = 5, f = 1.1)) 20 | expect_error(calculatePowerDetectSomatic(coverage = 1, f = 0.9)) 21 | expect_error(calculatePowerDetectSomatic(coverage = 3, purity = 1.1, 22 | ploidy = 2)) 23 | expect_error(calculatePowerDetectSomatic(coverage = 3, purity = 1, 24 | ploidy = -1)) 25 | expect_error(calculatePowerDetectSomatic(coverage = 5, cell.fraction = 1.1)) 26 | }) 27 | -------------------------------------------------------------------------------- /man/calculateLogRatio.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 
| % Please edit documentation in R/calculateLogRatio.R 3 | \name{calculateLogRatio} 4 | \alias{calculateLogRatio} 5 | \title{Calculate coverage log-ratio of tumor vs. normal} 6 | \usage{ 7 | calculateLogRatio(normal, tumor) 8 | } 9 | \arguments{ 10 | \item{normal}{Normal coverage read in by the \code{\link{readCoverageFile}} 11 | function.} 12 | 13 | \item{tumor}{Tumor coverage read in by the \code{\link{readCoverageFile}} 14 | function.} 15 | } 16 | \value{ 17 | \code{numeric(length(tumor))}, tumor vs. normal copy number log-ratios 18 | for all targets. 19 | } 20 | \description{ 21 | This function is automatically called by \code{\link{runAbsoluteCN}} when 22 | normal and tumor coverage are provided (and not a segmentation file or 23 | target-level log-ratios). This function is therefore normally not called by 24 | the user. 25 | } 26 | \examples{ 27 | 28 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 29 | package = "PureCN") 30 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 31 | package = "PureCN") 32 | normal <- readCoverageFile(normal.coverage.file) 33 | tumor <- readCoverageFile(tumor.coverage.file) 34 | log.ratio <- calculateLogRatio(normal, tumor) 35 | 36 | } 37 | \author{ 38 | Markus Riester 39 | } 40 | -------------------------------------------------------------------------------- /man/readSegmentationFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readSegmentationFile.R 3 | \name{readSegmentationFile} 4 | \alias{readSegmentationFile} 5 | \title{Read file containing segmentations} 6 | \usage{ 7 | readSegmentationFile( 8 | seg.file, 9 | sampleid, 10 | model.homozygous = FALSE, 11 | format, 12 | zero = FALSE, 13 | verbose = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{seg.file}{File with segmentation} 18 | 19 | \item{sampleid}{Sampleid, for segmentation files containing multiple samples} 20 | 21 | \item{model.homozygous}{Unless \code{TRUE}, checks for very small log2-ratios 22 | that cannot happen in samples with normal contamination} 23 | 24 | \item{format}{File format. If missing, derived from the file 25 | extension. Currently DNAcopy, and GATK4 26 | (ModelSegments) format supported. CNVkit uses DNAcopy format.} 27 | 28 | \item{zero}{Start position is 0-based. Default is \code{FALSE}.} 29 | 30 | \item{verbose}{Verbose output.} 31 | } 32 | \value{ 33 | A \code{data.frame}. 34 | } 35 | \description{ 36 | Read segmentation files produced by DNAcopy, CNVkit or GATK4. 37 | } 38 | \examples{ 39 | 40 | seg.file <- system.file("extdata", "example_seg.txt", 41 | package = "PureCN") 42 | seg <- readSegmentationFile(seg.file, "Sample1") 43 | 44 | } 45 | \author{ 46 | Markus Riester 47 | } 48 | -------------------------------------------------------------------------------- /man/readCoverageFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readCoverageFile.R 3 | \name{readCoverageFile} 4 | \alias{readCoverageFile} 5 | \title{Read coverage file} 6 | \usage{ 7 | readCoverageFile(file, format, zero = NULL, read.length = 100) 8 | } 9 | \arguments{ 10 | \item{file}{Target coverage file.} 11 | 12 | \item{format}{File format. If missing, derived from the file 13 | extension. 
Currently GATK3 DepthofCoverage, GATK4 CollectFragmentCounts 14 | (hdf5), and CNVkit formats supported.} 15 | 16 | \item{zero}{Start position is 0-based. Default is \code{FALSE} 17 | for GATK, \code{TRUE} for BED file based intervals.} 18 | 19 | \item{read.length}{For output formats which do not provide both counts 20 | and total coverages, approximate them using the specified read length.} 21 | } 22 | \value{ 23 | A \code{data.frame} with the parsed coverage information. 24 | } 25 | \description{ 26 | Read coverage file produced by external tools like The Genome Analysis 27 | Toolkit or by \code{\link{calculateBamCoverageByInterval}}. 28 | } 29 | \examples{ 30 | 31 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 32 | package = "PureCN") 33 | coverage <- readCoverageFile(tumor.coverage.file) 34 | 35 | } 36 | \seealso{ 37 | \code{\link{calculateBamCoverageByInterval}} 38 | } 39 | \author{ 40 | Markus Riester 41 | } 42 | -------------------------------------------------------------------------------- /man/bootstrapResults.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bootstrapResults.R 3 | \name{bootstrapResults} 4 | \alias{bootstrapResults} 5 | \title{Bootstrapping variant fits} 6 | \usage{ 7 | bootstrapResults(res, n = 500, top = NULL, reorder = FALSE) 8 | } 9 | \arguments{ 10 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 11 | 12 | \item{n}{Number of bootstrap replicates.} 13 | 14 | \item{top}{Include solution if it appears in the top \code{n} solutions of 15 | any bootstrap replicate. If \code{NULL}, do not filter solutions.} 16 | 17 | \item{reorder}{Reorder results by bootstrap value.} 18 | } 19 | \value{ 20 | Returns a \code{\link{runAbsoluteCN}} object with added bootstrap 21 | value to each solution. This value 22 | is the fraction of bootstrap replicates in which the solution ranked first. 23 | } 24 | \description{ 25 | This function bootstraps variants, then optionally re-ranks solutions by 26 | using the bootstrap estimate of the likelihood score, and then optionally 27 | removes solutions that never ranked high in any bootstrap replicate. 28 | } 29 | \examples{ 30 | 31 | data(purecn.example.output) 32 | ret.boot <- bootstrapResults(purecn.example.output, n=100) 33 | plotAbs(ret.boot, type="overview") 34 | 35 | } 36 | \seealso{ 37 | \code{\link{runAbsoluteCN}} 38 | } 39 | \author{ 40 | Markus Riester 41 | } 42 | -------------------------------------------------------------------------------- /man/callCIN.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callCIN.R 3 | \name{callCIN} 4 | \alias{callCIN} 5 | \title{Call Chromosomal Instability} 6 | \usage{ 7 | callCIN( 8 | res, 9 | id = 1, 10 | allele.specific = TRUE, 11 | reference.state = c("dominant", "normal") 12 | ) 13 | } 14 | \arguments{ 15 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 16 | 17 | \item{id}{Candidate solution to extract CIN from. \code{id=1} will use the 18 | maximum likelihood solution.} 19 | 20 | \item{allele.specific}{Use allele-specific or only total copy number for 21 | detecting abnormal regions. 
Copy-number neutral LOH would be ignored when 22 | this parameter is set to \code{FALSE}.} 23 | 24 | \item{reference.state}{Copy number regions different from the reference 25 | state are counted as abnormal. Default is \code{dominant} means the most 26 | common state. The other option is \code{normal}, which defines normal 27 | heterozygous, diploid as reference. The default is robust to errors in 28 | ploidy.} 29 | } 30 | \value{ 31 | Returns \code{double(1)} with CIN value. 32 | } 33 | \description{ 34 | This function provides detailed CIN information. 35 | } 36 | \examples{ 37 | 38 | data(purecn.example.output) 39 | head(callCIN(purecn.example.output)) 40 | 41 | } 42 | \seealso{ 43 | \code{\link{runAbsoluteCN}} 44 | } 45 | \author{ 46 | Markus Riester 47 | } 48 | -------------------------------------------------------------------------------- /tests/testthat/test_readLogRatioFile.R: -------------------------------------------------------------------------------- 1 | context("readLogRatioFile") 2 | data(purecn.example.output) 3 | 4 | test_that("Example data matches", { 5 | logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz", 6 | package = "PureCN") 7 | logratio <- readLogRatioFile(logratio.file) 8 | expect_equal(21, length(logratio)) 9 | expect_equal(0.109473, logratio$log.ratio[1], tolerance = .00001) 10 | expect_equal(-0.185664, logratio$log.ratio[21], tolerance = .00001) 11 | expect_equivalent(seqlengths(logratio), c(248956422, 242193529, 156040895)) 12 | logratio.file2 <- system.file("extdata", "example_logratio.txt.gz", 13 | package = "PureCN") 14 | logratio2 <- readLogRatioFile(logratio.file2) 15 | expect_equal(as.character(logratio), as.character(logratio2)) 16 | expect_equal(logratio$log.ratio, logratio2$log.ratio) 17 | }) 18 | 19 | test_that("parsing -> writing -> parsing works", { 20 | x <- purecn.example.output$input 21 | y <- x 22 | y$log.ratio$log.ratio <- NULL 23 | output.file <- tempfile(fileext = ".tsv") 24 | expect_error( 25 | PureCN:::.writeLogRatioFileGATK4(y, 1, output.file), 26 | "log.ratio NULL" 27 | ) 28 | PureCN:::.writeLogRatioFileGATK4(x, 1, output.file) 29 | z <- readLogRatioFile(output.file) 30 | expect_equivalent(x$log.ratio$log.ratio, z$log.ratio) 31 | file.remove(output.file) 32 | }) 33 | 34 | -------------------------------------------------------------------------------- /tests/testthat/test_callMutationBurden.R: -------------------------------------------------------------------------------- 1 | context("callMutationBurden") 2 | 3 | data(purecn.example.output) 4 | callableBed <- import(system.file("extdata", "example_callable.bed.gz", 5 | package = "PureCN")) 6 | 7 | test_that("Example is called correctly", { 8 | calls <- callMutationBurden(purecn.example.output) 9 | expect_false(is.na(calls$callable.bases.ontarget)) 10 | expect_true(calls$callable.bases.ontarget > 0) 11 | exclude <- GRanges(seqnames = "chr1", IRanges(start = 1, 12 | end = max(end(callableBed)))) 13 | myVcfFilter <- function(vcf) seqnames(vcf) != "chr2" 14 | callsCallable <- callMutationBurden(purecn.example.output, 15 | callable = callableBed, exclude = exclude, fun.countMutation = myVcfFilter) 16 | expect_true(callsCallable$callable.bases.ontarget > 0) 17 | expect_true(callsCallable$callable.bases.flanking > callsCallable$callable.bases.ontarget) 18 | expect_true(callsCallable$callable.bases.all > callsCallable$callable.bases.flanking) 19 | }) 20 | 21 | test_that("Exceptions happen with wrong input", { 22 | expect_error(callMutationBurden(purecn.example.output, 
callable = callableBed, 23 | exclude = exclude, fun.countMutation = "helloworld")) 24 | expect_error(callMutationBurden(purecn.example.output, callable = callableBed, 25 | exclude = "helloworld")) 26 | expect_error(callMutationBurden(purecn.example.output, callable = "helloworld")) 27 | }) 28 | -------------------------------------------------------------------------------- /inst/extdata/example_allelic_counts.tsv: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 2 | @SQ SN:chr1 LN:249250621 3 | @SQ SN:chr2 LN:243199373 4 | @SQ SN:chr3 LN:198022430 5 | @SQ SN:chr4 LN:191154276 6 | @SQ SN:chr5 LN:180915260 7 | @SQ SN:chr6 LN:171115067 8 | @SQ SN:chr7 LN:159138663 9 | @SQ SN:chr8 LN:146364022 10 | @SQ SN:chr9 LN:141213431 11 | @SQ SN:chr10 LN:135534747 12 | @SQ SN:chr11 LN:135006516 13 | @SQ SN:chr12 LN:133851895 14 | @SQ SN:chr13 LN:115169878 15 | @SQ SN:chr14 LN:107349540 16 | @SQ SN:chr15 LN:102531392 17 | @SQ SN:chr16 LN:90354753 18 | @SQ SN:chr17 LN:81195210 19 | @SQ SN:chr18 LN:78077248 20 | @SQ SN:chr19 LN:59128983 21 | @SQ SN:chr20 LN:63025520 22 | @SQ SN:chr21 LN:48129895 23 | @SQ SN:chr22 LN:51304566 24 | @SQ SN:chrX LN:155270560 25 | @SQ SN:chrY LN:59373566 26 | @SQ SN:chrM LN:16571 27 | @RG ID:PureCN SM:LIB-02240e4 28 | CONTIG POSITION REF_COUNT ALT_COUNT REF_NUCLEOTIDE ALT_NUCLEOTIDE 29 | chr1 114515871 177 189 G A 30 | chr1 150044293 119 157 T G 31 | chr1 158449835 209 222 A G 32 | chr1 158450154 401 294 G A 33 | chr1 158450311 323 262 C T 34 | chr1 158450374 351 269 A G 35 | chr1 160062206 76 63 C T 36 | chr1 177902370 97 60 C A 37 | chr1 200967559 115 137 C G 38 | chr1 247419414 138 205 T C 39 | chr1 247419499 242 320 C T 40 | chr1 248085080 35 32 C T 41 | chr1 248085104 157 116 G A 42 | chr2 10262881 121 136 C T 43 | chr2 10263895 118 96 C G 44 | chr2 69472504 121 70 C T 45 | chr2 138413092 35 79 G A 46 | chr2 138434106 56 134 A G 47 | chr2 185798411 34 67 G A 48 | chr2 188361624 50 107 A G 49 | -------------------------------------------------------------------------------- /man/findHighQualitySNPs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateMappingBiasVcf.R 3 | \name{findHighQualitySNPs} 4 | \alias{findHighQualitySNPs} 5 | \title{Find High Quality SNPs} 6 | \usage{ 7 | findHighQualitySNPs( 8 | mapping.bias.file, 9 | max.bias = 0.2, 10 | min.pon = 2, 11 | triallelic = FALSE, 12 | vcf.file = NULL, 13 | genome 14 | ) 15 | } 16 | \arguments{ 17 | \item{mapping.bias.file}{Generated by \code{\link{calculateMappingBiasVcf}}.} 18 | 19 | \item{max.bias}{Maximum mapping bias} 20 | 21 | \item{min.pon}{Minimum number of normal samples, useful to get reliable 22 | mapping bias.} 23 | 24 | \item{triallelic}{By default, ignore positions with multiple alt alleles.} 25 | 26 | \item{vcf.file}{Optional VCF file (for example dbSNP). Needs to be 27 | bgzip and tabix processed.} 28 | 29 | \item{genome}{See \code{readVcf}} 30 | } 31 | \value{ 32 | A \code{GRanges} object with mapping bias passing filters. 33 | If \code{vcf.file} is provided, it will be the variants in the 34 | corresponding file overlapping with the passed variants. 35 | } 36 | \description{ 37 | Function to extract high quality SNPs from the mapping bias database. 38 | Useful for generating fingerprinting panels etc. 
39 | } 40 | \examples{ 41 | 42 | normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", 43 | package = "PureCN") 44 | bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "hg19") 45 | 46 | } 47 | \author{ 48 | Markus Riester 49 | } 50 | -------------------------------------------------------------------------------- /tests/testthat/test_predictSomatic.R: -------------------------------------------------------------------------------- 1 | context("predictSomatic") 2 | 3 | data(purecn.example.output) 4 | ret <- predictSomatic(purecn.example.output) 5 | 6 | test_that("Gene symbol annotation matches", { 7 | expect_equal(class(ret), "data.frame") 8 | expect_equal(nrow(ret), nrow(purecn.example.output$results[[1]]$SNV.posterior$posteriors)) 9 | esr2 <- ret[which(ret$gene.symbol == "ESR2"), ] 10 | expect_equal(as.character(esr2$chr), "chr14") 11 | expect_true(esr2$start > 64699747) 12 | expect_true(esr2$end < 64761128) 13 | }) 14 | 15 | test_that("VCF and data.frame provide equivalent results", { 16 | ret.vcf <- predictSomatic(purecn.example.output, return.vcf = TRUE) 17 | expect_equal(start(ret.vcf), ret$start) 18 | expect_equal(end(ret.vcf), ret$end) 19 | expect_equal(as.character(seqnames(ret.vcf)), as.character(ret$chr)) 20 | expect_equal(info(ret.vcf)$SM1, round(ret$SOMATIC.M1, digits = 4)) 21 | expect_equal(info(ret.vcf)$GM1, round(ret$GERMLINE.M1, digits = 4)) 22 | expect_equal(info(ret.vcf)$PS, round(ret$POSTERIOR.SOMATIC, 23 | digits = 4)) 24 | expect_equal(info(ret.vcf)$GS, ret$gene.symbol) 25 | }) 26 | 27 | test_that("Segments are flagged", { 28 | flagged <- lapply(split(ret$seg.id, ret$M.SEGMENT.FLAGGED), 29 | table) 30 | expect_true(min(flagged$`FALSE`) >= 5) 31 | expect_true(max(flagged$`TRUE`) < 5) 32 | expect_true(min(ret$M.SEGMENT.POSTERIOR) > 0.5) 33 | expect_equal(max(ret$M.SEGMENT.POSTERIOR), 1) 34 | }) 35 | -------------------------------------------------------------------------------- /man/adjustLogRatio.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adjustLogRatio.R 3 | \name{adjustLogRatio} 4 | \alias{adjustLogRatio} 5 | \title{Adjust tumor vs. normal coverage log ratio for tumor purity and ploidy} 6 | \usage{ 7 | adjustLogRatio(ratio, purity, ploidy, is.log2 = TRUE, min.ratio = 2^-8) 8 | } 9 | \arguments{ 10 | \item{ratio}{Vector of log2 tumor vs normal coverage ratios.} 11 | 12 | \item{purity}{Purity of sample.} 13 | 14 | \item{ploidy}{Ploidy of sample.} 15 | 16 | \item{is.log2}{\code{log.ratio} is \code{log2} transformed.} 17 | 18 | \item{min.ratio}{Minimum (non-log2-transformed) ratio. Set to approx -8 19 | \code{log2} adjusted.} 20 | } 21 | \value{ 22 | \code{numeric(length(log.ratio))}, \code{log.ratio} adjusted 23 | for \code{purity} and \code{ploidy} 24 | } 25 | \description{ 26 | This function can be used to adjust the log ratio for tumor purity and 27 | ploidy for downstream tools that expect a log2 ratio (for example GISTIC). 28 | } 29 | \examples{ 30 | 31 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 32 | package = "PureCN") 33 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 34 | package = "PureCN") 35 | normal <- readCoverageFile(normal.coverage.file) 36 | tumor <- readCoverageFile(tumor.coverage.file) 37 | log.ratio <- calculateLogRatio(normal, tumor) 38 | log.ratio.adjusted <- adjustLogRatio(log.ratio, 0.65, 1.73) 39 | 40 | } 41 | \references{ 42 | * Zack et al. (2012), Pan-cancer patterns of somatic copy number alteration, Nature Biotechnology.
43 | * Toal (2018), https://github.com/lima1/PureCN/issues/40 44 | } 45 | \author{ 46 | Markus Riester 47 | } 48 | -------------------------------------------------------------------------------- /man/callAlterations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callAlterations.R 3 | \name{callAlterations} 4 | \alias{callAlterations} 5 | \title{Calling of amplifications and deletions} 6 | \usage{ 7 | callAlterations( 8 | res, 9 | id = 1, 10 | cutoffs = c(0.5, 6, 7), 11 | log.ratio.cutoffs = c(-0.9, 0.9), 12 | failed = NULL, 13 | all.genes = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 18 | 19 | \item{id}{Candidate solutions to be used. \code{id=1} will use the maximum 20 | likelihood (or curated) solution.} 21 | 22 | \item{cutoffs}{Copy numbers cutoffs to call losses, focal amplifications and 23 | broad amplifications.} 24 | 25 | \item{log.ratio.cutoffs}{Copy numbers log-ratio cutoffs to call losses and 26 | amplifications in failed samples.} 27 | 28 | \item{failed}{Indicates whether sample was failed. If \code{NULL}, use 29 | available annotation, which can be set in the curation file.} 30 | 31 | \item{all.genes}{If \code{FALSE}, then only return amplifications and 32 | deletions passing the thresholds.} 33 | } 34 | \value{ 35 | A \code{data.frame} with gene-level amplification and deletion 36 | calls. 37 | } 38 | \description{ 39 | Function to extract major copy number alterations from a 40 | \code{\link{runAbsoluteCN}} return object. 41 | } 42 | \examples{ 43 | 44 | data(purecn.example.output) 45 | callAlterations(purecn.example.output) 46 | callAlterations(purecn.example.output, all.genes=TRUE)["ESR2",] 47 | 48 | } 49 | \seealso{ 50 | \code{\link{runAbsoluteCN}} 51 | } 52 | \author{ 53 | Markus Riester 54 | } 55 | -------------------------------------------------------------------------------- /tests/testthat/test_readAllelicCountsFile.R: -------------------------------------------------------------------------------- 1 | context("readAllelicCountsFile") 2 | 3 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 4 | ac.file <- system.file("extdata", "example_allelic_counts.tsv", package = "PureCN") 5 | ac.empty.file <- system.file("extdata", "example_allelic_counts_empty.tsv", package = "PureCN") 6 | vcf <- readVcf(vcf.file, "hg19") 7 | data(purecn.example.output) 8 | normal.coverage.file <- system.file('extdata', 'example_normal.txt.gz', 9 | package = 'PureCN') 10 | tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz', 11 | package = 'PureCN') 12 | 13 | test_that("example parses correctly", { 14 | vcf_ac <- readAllelicCountsFile(ac.file) 15 | expect_equal(as.character(ref(vcf_ac)), as.character(ref(head(vcf,20)))) 16 | expect_error(readAllelicCountsFile(ac.empty.file), "Error reading AllelicCountsFile") 17 | }) 18 | 19 | test_that("parsing -> writing -> parsing works", { 20 | output.file <- tempfile(fileext = ".tsv") 21 | PureCN:::.writeAllelicCountsFileGatk(vcf, 1, output.file) 22 | vcf_ac <- readAllelicCountsFile(output.file) 23 | expect_equal(as.character(ref(vcf_ac)), as.character(ref(vcf))) 24 | ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file, 25 | tumor.coverage.file = tumor.coverage.file, 26 | candidates = purecn.example.output$candidates, 27 | vcf.file = vcf, 28 | genome = "hg19", 29 | test.purity = seq(0.4, 0.7, by = 0.05), 
min.ploidy = 1.5, 30 | max.ploidy = 2.4, max.candidate.solutions = 1, plot.cnv = FALSE) 31 | expect_true(length(ret$results) > 0) 32 | file.remove(output.file) 33 | }) 34 | -------------------------------------------------------------------------------- /tests/testthat/test_callLOH.R: -------------------------------------------------------------------------------- 1 | context("callLOH") 2 | 3 | test_that("Example is called correctly", { 4 | data(purecn.example.output) 5 | ret <- callLOH(purecn.example.output) 6 | expect_true(is(ret, "data.frame")) 7 | expect_equal(13, ncol(ret)) 8 | }) 9 | 10 | test_that("NCBI-style chromosome names work", { 11 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 12 | package = "PureCN") 13 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 14 | package = "PureCN") 15 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 16 | vcf <- readVcf(vcf.file) 17 | normal <- readCoverageFile(normal.coverage.file) 18 | tumor <- readCoverageFile(tumor.coverage.file) 19 | seqlevelsStyle(vcf) <- "Ensembl" 20 | seqlevelsStyle(normal) <- "Ensembl" 21 | seqlevelsStyle(tumor) <- "Ensembl" 22 | ret <- runAbsoluteCN(normal.coverage.file = normal, tumor.coverage.file = tumor, 23 | genome = "hg19", vcf.file = vcf, sampleid = "Sample1", 24 | min.ploidy = 1.4, max.ploidy = 2.4, test.purity = seq(0.4, 25 | 0.7, by = 0.05), max.candidate.solutions = 1, plot = FALSE) 26 | loh <- callLOH(ret) 27 | expect_equal(unique(loh$chr), as.character(1:22)) 28 | loh2 <- callLOH(ret, keep.no.snp.segments = FALSE) 29 | expect_true(nrow(loh) > nrow(loh2)) 30 | idx <- !is.na(loh$M) 31 | expect_equal(loh$C[idx], loh2$C) 32 | expect_equal(is.na(loh$M), is.na(loh$type)) 33 | }) 34 | 35 | test_that("No crash without centromeres", { 36 | x <- purecn.example.output 37 | x$input$centromeres <- NULL 38 | loh <- callLOH(x) 39 | expect_equal(13, ncol(loh)) 40 | }) 41 | -------------------------------------------------------------------------------- /inst/extdata/ex2_reference.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | TTCCTAAGGGTCAGATAGGCTCTGCGAGCCGCACTTCACTTGACGAAGAATTCTAGTTGTGATTATGATACCCTTCCTGC 3 | CGAACAGACCTGTCTCAGTATAATAAGACCAATTAAATGATAAAAGCAGAACAATAGTAGCGAACCCAACTCGCGACCAA 4 | TGTCGTGCCGTATGAACCACTATACACAATCTCGAACTTGCGCGGCGTTTGAGAATGTCCCCTACGCAACAGCTCAGATG 5 | CGGTAGGTATAAGTAGTCCCATTGGCTGTTTCTGAGTCTTCATAGTCACGAACTACGCTAAGTCTAGGACGTGAGGCCAC 6 | GAAAATATTGAAATCCGCTATTCACGTTTCAATGCTATACGTAAGCTTCGAAGTTTCTCTAGAAACGATAACTTACTTCT 7 | ACGTGGCTTTCCCTCCGTTGGAGCCCCCGTGCCGGCTGGAGGACGCCCCAGTCACACATGAGCCGATCCATCACTCCCAG 8 | GGAGGGTTAATGAAGACTCTTGGTGCGTCTATTTAGTCAGAAACGATCCGCTTTGAAATGATTCCTTGAGGAGGTGTGAC 9 | CTTGAGTATACTCCGCGGGCGAGGATCCACATTGGCGGGAAGGAAAAACCGTGGTCTGCATATCCTGTGTACAGCCATTG 10 | CTAGGGCTCAGCAACGCTTCCCTGCCTAATCTGCACGGATCGAAGGTTGACTCGCCGTGAAATCGTGGCGACCCGCGTGC 11 | GTATGGGGGTAAACGCGACTCTTATGTGCTCTAAGCTGGCAGTTGCATTCAGCTCCGTGCGGCGATGCGCACTGTCCCGT 12 | >seq2 13 | ACGGGAATGGTCAGACCGTGCCCCAAAATCCTCTGGCAGCTCTCACGATGCTAGACACTTGTTGCAAACCTCCTTCGACA 14 | ACTCAAGGCTTGCGACACCAAGCGAAAATCCAGTTGCAACCGACGGGCGCGAGTCTAGGTGCTGGCGGCGACAGTGCGTA 15 | ACCGTGGTCGGGGATCTATGCGTCGGATGCTTAACACAATAGCGTTGCCTACATTCACGTATGGTCATGCGGCGTAAGCA 16 | CTACCACGCAAATCACCGTGCAGGGCCGTGTTCGACACCCTCGACTGATTGTAGCCCATAGTAGATCAGTCTGGATCGAA 17 | AGCGTGCTGAAGCAATTGCCCATTGTCACATGAATCGGTTTCGAAGGAAACTATAGATGTAACGTAGGCCGGGTATCAGG 18 | GACGCATGAGTACCACGCTCAAGCGGGGGCTCTAGTGGATTGGTGGATTGATATTTTGCCGATTTTGCACTTCAACCAGC 19 | 
TTCTGACCATCACAAAACCGAAGGTCGTTTTTTTTTGGTTAACGAAACTCAAGGTCCGAGAGTGGCGATCGAGTTGAACT 20 | AAAAGATCGTTCTAGATGAACTTTACCAGATACAGCCAGGGCTCACATAACTTTCACTTCTATGGGTGGTTTTTCATCAT 21 | TCACAATACGACAACCAAAGAGCTAAACCTCGGGCTTGTCATTGCAAATGTCCCAGACGTTTGTTCAAAATTAACTCGAG 22 | ACACTGATGGATCCGCAAATTAAGAGGATAACTGTTTTACGCGCGGTCTCATAGACTTGTCGTACCCAAGTCCTTTGAGA 23 | -------------------------------------------------------------------------------- /inst/extdata/ex3_reference.fa: -------------------------------------------------------------------------------- 1 | >chr1 2 | TTCCTAAGGGTCAGATAGGCTCTGCGAGCCGCACTTCACTTGACGAAGAATTCTAGTTGTGATTATGATACCCTTCCTGC 3 | CGAACAGACCTGTCTCAGTATAATAAGACCAATTAAATGATAAAAGCAGAACAATAGTAGCGAACCCAACTCGCGACCAA 4 | TGTCGTGCCGTATGAACCACTATACACAATCTCGAACTTGCGCGGCGTTTGAGAATGTCCCCTACGCAACAGCTCAGATG 5 | CGGTAGGTATAAGTAGTCCCATTGGCTGTTTCTGAGTCTTCATAGTCACGAACTACGCTAAGTCTAGGACGTGAGGCCAC 6 | GAAAATATTGAAATCCGCTATTCACGTTTCAATGCTATACGTAAGCTTCGAAGTTTCTCTAGAAACGATAACTTACTTCT 7 | ACGTGGCTTTCCCTCCGTTGGAGCCCCCGTGCCGGCTGGAGGACGCCCCAGTCACACATGAGCCGATCCATCACTCCCAG 8 | GGAGGGTTAATGAAGACTCTTGGTGCGTCTATTTAGTCAGAAACGATCCGCTTTGAAATGATTCCTTGAGGAGGTGTGAC 9 | CTTGAGTATACTCCGCGGGCGAGGATCCACATTGGCGGGAAGGAAAAACCGTGGTCTGCATATCCTGTGTACAGCCATTG 10 | CTAGGGCTCAGCAACGCTTCCCTGCCTAATCTGCACGGATCGAAGGTTGACTCGCCGTGAAATCGTGGCGACCCGCGTGC 11 | GTATGGGGGTAAACGCGACTCTTATGTGCTCTAAGCTGGCAGTTGCATTCAGCTCCGTGCGGCGATGCGCACTGTCCCGT 12 | >chr2 13 | ACGGGAATGGTCAGACCGTGCCCCAAAATCCTCTGGCAGCTCTCACGATGCTAGACACTTGTTGCAAACCTCCTTCGACA 14 | ACTCAAGGCTTGCGACACCAAGCGAAAATCCAGTTGCAACCGACGGGCGCGAGTCTAGGTGCTGGCGGCGACAGTGCGTA 15 | ACCGTGGTCGGGGATCTATGCGTCGGATGCTTAACACAATAGCGTTGCCTACATTCACGTATGGTCATGCGGCGTAAGCA 16 | CTACCACGCAAATCACCGTGCAGGGCCGTGTTCGACACCCTCGACTGATTGTAGCCCATAGTAGATCAGTCTGGATCGAA 17 | AGCGTGCTGAAGCAATTGCCCATTGTCACATGAATCGGTTTCGAAGGAAACTATAGATGTAACGTAGGCCGGGTATCAGG 18 | GACGCATGAGTACCACGCTCAAGCGGGGGCTCTAGTGGATTGGTGGATTGATATTTTGCCGATTTTGCACTTCAACCAGC 19 | TTCTGACCATCACAAAACCGAAGGTCGTTTTTTTTTGGTTAACGAAACTCAAGGTCCGAGAGTGGCGATCGAGTTGAACT 20 | AAAAGATCGTTCTAGATGAACTTTACCAGATACAGCCAGGGCTCACATAACTTTCACTTCTATGGGTGGTTTTTCATCAT 21 | TCACAATACGACAACCAAAGAGCTAAACCTCGGGCTTGTCATTGCAAATGTCCCAGACGTTTGTTCAAAATTAACTCGAG 22 | ACACTGATGGATCCGCAAATTAAGAGGATAACTGTTTTACGCGCGGTCTCATAGACTTGTCGTACCCAAGTCCTTTGAGA 23 | -------------------------------------------------------------------------------- /man/predictSomatic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictSomatic.R 3 | \name{predictSomatic} 4 | \alias{predictSomatic} 5 | \title{Predict germline vs. somatic status} 6 | \usage{ 7 | predictSomatic(res, id = 1, return.vcf = FALSE) 8 | } 9 | \arguments{ 10 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 11 | 12 | \item{id}{Candidate solutions to be analyzed. \code{id=1} will analyze the 13 | maximum likelihood solution.} 14 | 15 | \item{return.vcf}{Returns an annotated \code{CollapsedVCF} object. Note that 16 | this VCF will only contain variants not filtered out by the \code{filterVcf} 17 | functions. Variants outside segments or intervals might be included or not 18 | depending on \code{\link{runAbsoluteCN}} arguments.} 19 | } 20 | \value{ 21 | A \code{data.frame} or \code{CollapsedVCF} with SNV state posterior 22 | probabilities. 23 | } 24 | \description{ 25 | This function takes as input the output of a \code{\link{runAbsoluteCN}} run 26 | and provides SNV posterior probabilities for all possible states. 
27 | } 28 | \examples{ 29 | 30 | data(purecn.example.output) 31 | # the output data was created using a matched normal sample, but in case 32 | # no matched normal is available, this will help predicting somatic vs. 33 | # germline status 34 | purecnSnvs <- predictSomatic(purecn.example.output) 35 | 36 | # Prefer GRanges? 37 | purecnSnvs <- GRanges(predictSomatic(purecn.example.output)) 38 | 39 | # write a VCF file 40 | purecnVcf <- predictSomatic(purecn.example.output, return.vcf=TRUE) 41 | writeVcf(purecnVcf, file = "Sample1_PureCN.vcf") 42 | 43 | } 44 | \seealso{ 45 | \code{\link{runAbsoluteCN}} 46 | } 47 | \author{ 48 | Markus Riester 49 | } 50 | -------------------------------------------------------------------------------- /man/calculateTangentNormal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createNormalDatabase.R 3 | \name{calculateTangentNormal} 4 | \alias{calculateTangentNormal} 5 | \title{Calculate tangent normal} 6 | \usage{ 7 | calculateTangentNormal( 8 | tumor.coverage.file, 9 | normalDB, 10 | num.eigen = 20, 11 | ignore.sex = FALSE, 12 | sex = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{tumor.coverage.file}{Coverage file or data of a tumor sample.} 17 | 18 | \item{normalDB}{Database of normal samples, created with 19 | \code{\link{createNormalDatabase}}.} 20 | 21 | \item{num.eigen}{Number of eigen vectors used.} 22 | 23 | \item{ignore.sex}{If \code{FALSE}, detects sex of sample and returns best 24 | normals with matching sex.} 25 | 26 | \item{sex}{Sex of sample. If \code{NULL}, determine with 27 | \code{\link{getSexFromCoverage}} and default parameters. Valid values are 28 | \code{F} for female, \code{M} for male. If all chromosomes are diploid, 29 | specify \code{diploid}.} 30 | } 31 | \description{ 32 | Reimplementation of GATK4 denoising. Please cite the relevant GATK 33 | publication if you use this in a publication. 
34 | } 35 | \examples{ 36 | 37 | tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz', 38 | package = 'PureCN') 39 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 40 | package = "PureCN") 41 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 42 | package = "PureCN") 43 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 44 | normalDB <- createNormalDatabase(normal.coverage.files) 45 | pool <- calculateTangentNormal(tumor.coverage.file, normalDB) 46 | 47 | } 48 | \seealso{ 49 | \code{\link{createNormalDatabase}} 50 | } 51 | \author{ 52 | Markus Riester 53 | } 54 | -------------------------------------------------------------------------------- /tests/testthat/test_getSexFromCoverage.R: -------------------------------------------------------------------------------- 1 | context("getSexFromCoverage") 2 | 3 | library(GenomeInfoDb) # for renameSeqlevels() 4 | 5 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 6 | package = "PureCN") 7 | coverage <- readCoverageFile(tumor.coverage.file) 8 | chr22 <- coverage[which(seqnames(coverage) == "chr22")] 9 | chrX <- renameSeqlevels(chr22, c(chr22 = "chrX")) 10 | 11 | test_that("Warning with missing coverage data", { 12 | sex <- getSexFromCoverage(coverage) 13 | expect_true(is.na(sex)) 14 | expect_output( getSexFromCoverage(coverage), "WARN" ) 15 | }) 16 | 17 | test_that("Warning with missing coverage data in file", { 18 | sex <- getSexFromCoverage(tumor.coverage.file) 19 | expect_true(is.na(sex)) 20 | expect_output( getSexFromCoverage(coverage), "WARN" ) 21 | }) 22 | 23 | test_that("Male correct from coverage data", { 24 | chrY <- renameSeqlevels(chr22, c(chr22 = "chrY")) 25 | coverage_fakemale <- suppressWarnings(c(coverage, chrX, chrY)) 26 | sex <- getSexFromCoverage(coverage_fakemale) 27 | expect_identical("M", sex) 28 | }) 29 | 30 | test_that("Female correct from coverage data", { 31 | chrY <- renameSeqlevels(chr22, c(chr22 = "chrY")) 32 | chrY$average.coverage <- chrY$average.coverage/50 33 | coverage_fakefemale <- suppressWarnings( c(coverage, chrX, chrY) ) 34 | sex <- getSexFromCoverage(coverage_fakefemale) 35 | expect_identical("F", sex) 36 | }) 37 | 38 | test_that("NA correct from contaminated coverage data", { 39 | chrY <- renameSeqlevels(chr22, c(chr22 = "chrY")) 40 | chrY$average.coverage <- chrY$average.coverage / 21 41 | coverage_fakecontamination <- suppressWarnings( c(coverage, chrX, chrY)) 42 | sex <- getSexFromCoverage(coverage_fakecontamination) 43 | expect_true(is.na(sex)) 44 | }) 45 | -------------------------------------------------------------------------------- /R/adjustLogRatio.R: -------------------------------------------------------------------------------- 1 | #' Adjust tumor vs. normal coverage log ratio for tumor purity and ploidy 2 | #' 3 | #' This function can be used to adjust the log ratio for tumor purity and 4 | #' ploidy for downstream tools that expect a log2 ratio (for example GISTIC). 5 | #' 6 | #' 7 | #' @param ratio Vector of log2 tumor vs normal coverage ratios. 8 | #' @param purity Purity of sample. 9 | #' @param ploidy Ploidy of sample. 10 | #' @param is.log2 \code{log.ratio} is \code{log2} transformed. 11 | #' @param min.ratio Minimum (non-log2-transformed) ratio. Set to approx -8 12 | #' \code{log2} adjusted. 
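#'
#' @details
#' In outline (this mirrors the function body below): if \code{is.log2}, the
#' input is first converted back to a linear ratio; each ratio is then
#' rescaled as
#' \code{(purity * ploidy * ratio + 2 * (1 - purity) * ratio - 2 * (1 - purity)) / (purity * ploidy)},
#' floored at \code{min.ratio}, and log2-transformed again.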
13 | #' @return \code{numeric(length(log.ratio))}, \code{log.ratio} adjusted 14 | #' for \code{purity} and \code{ploidy} 15 | #' @author Markus Riester 16 | #' @references 17 | #' * Zack et al. (2012), Pan-cancer patterns of somatic copy number alteration 18 | #' Nature Biotechnology. 19 | #' * Toal (2018), https://github.com/lima1/PureCN/issues/40 20 | #' 21 | #' @examples 22 | #' 23 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 24 | #' package = "PureCN") 25 | #' tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 26 | #' package = "PureCN") 27 | #' normal <- readCoverageFile(normal.coverage.file) 28 | #' tumor <- readCoverageFile(tumor.coverage.file) 29 | #' log.ratio <- calculateLogRatio(normal, tumor) 30 | #' log.ratio.adjusted <- adjustLogRatio(log.ratio, 0.65, 1.73) 31 | #' 32 | #' @export adjustLogRatio 33 | adjustLogRatio <- function(ratio, purity, ploidy, is.log2 = TRUE, min.ratio = 2^-8) { 34 | if (is.log2) ratio <- 2^ratio 35 | adjusted <- (purity * ploidy * ratio + 2 * (1 - purity) * ratio - 2 * (1 - purity)) / (purity * ploidy) 36 | adjusted <- pmax(min.ratio, adjusted) 37 | if (is.log2) adjusted <- log2(adjusted) 38 | return(adjusted) 39 | } 40 | 41 | -------------------------------------------------------------------------------- /man/filterVcfMuTect2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filterVcfMuTect2.R 3 | \name{filterVcfMuTect2} 4 | \alias{filterVcfMuTect2} 5 | \title{Filter VCF MuTect2} 6 | \usage{ 7 | filterVcfMuTect2( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | ignore = c("clustered_events", "t_lod", "str_contraction", "read_position", "position", 11 | "fragment_length", "multiallelic", "clipping", "strand_artifact", "strand_bias", 12 | "slippage", "weak_evidence", "orientation", "haplotype"), 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 18 | function from the VariantAnnotation package.} 19 | 20 | \item{tumor.id.in.vcf}{The tumor id in the VCF file, optional.} 21 | 22 | \item{ignore}{MuTect2 flags that mark variants for exclusion.} 23 | 24 | \item{\dots}{Additional arguments passed to \code{\link{filterVcfBasic}}.} 25 | } 26 | \value{ 27 | A list with elements \code{vcf}, \code{flag} and 28 | \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 29 | \code{flag} a \code{logical(1)} flag if problems were identified, further 30 | described in \code{flag_comment}. 31 | } 32 | \description{ 33 | Function to remove artifacts and low confidence/quality calls from a 34 | GATK4/MuTect2 generated VCF file. Also applies filters defined in 35 | \code{filterVcfBasic}. 36 | } 37 | \examples{ 38 | 39 | ### This function is typically only called by runAbsoluteCN via the 40 | ### fun.filterVcf and args.filterVcf arguments.
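### filterVcfMuTect (used below) checks whether the VCF was generated by
### MuTect2 and, if so, dispatches to filterVcfMuTect2 (see the stats.file
### argument of filterVcfMuTect), so the same example covers both functions.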
41 | library(VariantAnnotation) 42 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 43 | vcf <- readVcf(vcf.file, "hg19") 44 | vcf.filtered <- filterVcfMuTect(vcf) 45 | 46 | } 47 | \seealso{ 48 | \code{\link{filterVcfBasic}} 49 | } 50 | \author{ 51 | Markus Riester 52 | } 53 | -------------------------------------------------------------------------------- /man/getSexFromCoverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSex.R 3 | \name{getSexFromCoverage} 4 | \alias{getSexFromCoverage} 5 | \title{Get sample sex from coverage} 6 | \usage{ 7 | getSexFromCoverage( 8 | coverage.file, 9 | min.ratio = 25, 10 | min.ratio.na = 20, 11 | remove.outliers = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{coverage.file}{Coverage file or data read with 16 | \code{\link{readCoverageFile}}.} 17 | 18 | \item{min.ratio}{Min chrX/chrY coverage ratio to call sample as female.} 19 | 20 | \item{min.ratio.na}{Min chrX/chrY coverage ratio to call sample as 21 | \code{NA}. This ratio defines a grey zone from \code{min.ratio.na} to 22 | \code{min.ratio} in which samples are not called. The default is set to a 23 | copy number ratio that would be rare in male samples, but lower than 24 | expected in female samples. Contamination can be a source of ambiguous 25 | calls. Mappability issues on chromosome Y resulting in low coverage need to 26 | be considered when setting cutoffs.} 27 | 28 | \item{remove.outliers}{Removes coverage outliers before calculating mean 29 | chromosome coverages.} 30 | } 31 | \value{ 32 | Returns a \code{character(1)} with \code{M} for male, \code{F} for 33 | female, or \code{NA} if unknown. 34 | } 35 | \description{ 36 | This function determines the sex of a sample by the coverage ratio of chrX 37 | and chrY. Loss of chromosome Y (LOY) can result in a wrong female call. For 38 | small targeted panels, this will only work when sufficient sex marker genes 39 | such as AMELY are covered. For optimal results, parameters might need to be 40 | tuned for the assay. 
41 | } 42 | \examples{ 43 | 44 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 45 | package = "PureCN") 46 | sex <- getSexFromCoverage(tumor.coverage.file) 47 | 48 | } 49 | \seealso{ 50 | \code{\link{getSexFromVcf}} 51 | } 52 | \author{ 53 | Markus Riester 54 | } 55 | -------------------------------------------------------------------------------- /tests/testthat/test_callAmplificationsInLowPurity.R: -------------------------------------------------------------------------------- 1 | context("callAmplificationsInLowPurity") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 4 | package = "PureCN") 5 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 6 | package = "PureCN") 7 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 8 | normalDB <- createNormalDatabase(normal.coverage.files) 9 | data(purecn.example.output) 10 | 11 | test_that("Example is called correctly", { 12 | m <- callAmplificationsInLowPurity(purecn.example.output, 13 | normalDB, all.genes = TRUE, purity = 0.65) 14 | m2 <- callAmplificationsInLowPurity(purecn.example.output, 15 | normalDB, all.genes = TRUE, purity = 0.65, BPPARAM = BiocParallel::bpparam()) 16 | esr2 <- m["ESR2", ] 17 | expect_equal(as.character(esr2$chr), "chr14") 18 | expect_true(esr2$start > 64694600) 19 | expect_true(esr2$end < 64761128) 20 | expect_true(esr2$C < 3 && esr2$C >= 2) 21 | expect_gt(cor(m$p.value, m2$p.value), 0.99) 22 | }) 23 | test_that("Exceptions happen with incorrect input data", { 24 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 25 | normalDB, pvalue.cutoff = 1.2), "pvalue.cutoff") 26 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 27 | normalDB, pvalue.cutoff = -1.2), "pvalue.cutoff") 28 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 29 | normalDB, percentile.cutoff = 120), "percentile.cutoff") 30 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 31 | normalDB, percentile.cutoff = -120), "percentile.cutoff") 32 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 33 | normalDB, purity = -120), "purity") 34 | expect_error(callAmplificationsInLowPurity(purecn.example.output, 35 | normalDB, purity = 80), "purity") 36 | }) 37 | -------------------------------------------------------------------------------- /man/findFocal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/findFocal.R 3 | \name{findFocal} 4 | \alias{findFocal} 5 | \title{Find focal amplifications} 6 | \usage{ 7 | findFocal(seg, max.size = 3e+06, cn.diff = 2, min.amp.cn = 5) 8 | } 9 | \arguments{ 10 | \item{seg}{Segmentation data.} 11 | 12 | \item{max.size}{Cutoff for focal in base pairs.} 13 | 14 | \item{cn.diff}{Minimum copy number delta between neighboring segments.} 15 | 16 | \item{min.amp.cn}{Minimum amplification integer copy number. Segments with 17 | lower copy number are not tested.} 18 | } 19 | \value{ 20 | \code{logical(n)}, indicating for all n segments whether they are 21 | focally amplified or not. 22 | } 23 | \description{ 24 | Function to find focal amplifications in segmented data. This is 25 | automatically called in \code{\link{runAbsoluteCN}}. 
26 | } 27 | \examples{ 28 | 29 | normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 30 | package = "PureCN") 31 | tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 32 | package = "PureCN") 33 | vcf.file <- system.file("extdata", "example.vcf.gz", 34 | package = "PureCN") 35 | interval.file <- system.file("extdata", "example_intervals_tiny.txt", 36 | package = "PureCN") 37 | 38 | # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 39 | # non-default values to speed-up this example. This is not a good idea for real 40 | # samples. 41 | ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, 42 | tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, 43 | genome="hg19", sampleid = "Sample1", interval.file = interval.file, 44 | max.candidate.solutions = 1, max.ploidy = 4, 45 | test.purity = seq(0.3, 0.7, by = 0.05), 46 | args.focal=list(max.size = 2e+06), fun.focal = findFocal) 47 | 48 | } 49 | \seealso{ 50 | \code{\link{runAbsoluteCN}} 51 | } 52 | \author{ 53 | Markus Riester 54 | } 55 | -------------------------------------------------------------------------------- /man/readCurationFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readCurationFile.R 3 | \name{readCurationFile} 4 | \alias{readCurationFile} 5 | \title{Read curation file} 6 | \usage{ 7 | readCurationFile( 8 | file.rds, 9 | file.curation = gsub(".rds$", ".csv", file.rds), 10 | remove.failed = FALSE, 11 | report.best.only = FALSE, 12 | min.ploidy = NULL, 13 | max.ploidy = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{file.rds}{Output of the \code{\link{runAbsoluteCN}} function, 18 | serialized with \code{saveRDS}.} 19 | 20 | \item{file.curation}{Filename of a curation file that points to the correct 21 | tumor purity and ploidy solution.} 22 | 23 | \item{remove.failed}{Do not return solutions that failed.} 24 | 25 | \item{report.best.only}{Only return correct/best solution (useful on low 26 | memory machines when lots of samples are loaded).} 27 | 28 | \item{min.ploidy}{Minimum ploidy to be considered. If \code{NULL}, all. Can 29 | be used to automatically ignore unlikely solutions.} 30 | 31 | \item{max.ploidy}{Maximum ploidy to be considered. If \code{NULL}, all. Can 32 | be used to automatically ignore unlikely solutions.} 33 | } 34 | \value{ 35 | The return value of the corresponding \code{\link{runAbsoluteCN}} 36 | call, but with the results array manipulated according the curation CSV file 37 | and arguments of this function. 38 | } 39 | \description{ 40 | Function that can be used to read the curated output of the 41 | \code{\link{runAbsoluteCN}} function. 42 | } 43 | \examples{ 44 | 45 | data(purecn.example.output) 46 | file.rds <- "Sample1_PureCN.rds" 47 | createCurationFile(file.rds) 48 | # User can change the maximum likelihood solution manually in the generated 49 | # CSV file. The correct solution is then loaded with readCurationFile. 50 | purecn.curated.example.output <-readCurationFile(file.rds) 51 | 52 | } 53 | \seealso{ 54 | \code{\link{runAbsoluteCN} \link{createCurationFile}} 55 | } 56 | \author{ 57 | Markus Riester 58 | } 59 | -------------------------------------------------------------------------------- /R/callCIN.R: -------------------------------------------------------------------------------- 1 | #' Call Chromosomal Instability 2 | #' 3 | #' This function provides detailed CIN information. 
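#' In outline (this mirrors the function body below), the returned value is
#' the fraction of the genome, weighted by segment size, whose copy number
#' state (allele-specific by default) differs from the chosen reference
#' state.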
4 | #' 5 | #' 6 | #' @param res Return object of the \code{\link{runAbsoluteCN}} function. 7 | #' @param id Candidate solution to extract CIN from. \code{id=1} will use the 8 | #' maximum likelihood solution. 9 | #' @param allele.specific Use allele-specific or only total copy number for 10 | #' detecting abnormal regions. Copy-number neutral LOH would be ignored when 11 | #' this parameter is set to \code{FALSE}. 12 | #' @param reference.state Copy number regions different from the reference 13 | #' state are counted as abnormal. Default is \code{dominant} means the most 14 | #' common state. The other option is \code{normal}, which defines normal 15 | #' heterozygous, diploid as reference. The default is robust to errors in 16 | #' ploidy. 17 | #' @return Returns \code{double(1)} with CIN value. 18 | #' @author Markus Riester 19 | #' @seealso \code{\link{runAbsoluteCN}} 20 | #' @examples 21 | #' 22 | #' data(purecn.example.output) 23 | #' head(callCIN(purecn.example.output)) 24 | #' 25 | #' @export callCIN 26 | callCIN <- function(res, id = 1, allele.specific = TRUE, reference.state = 27 | c("dominant", "normal")) { 28 | loh <- callLOH(res, id) 29 | loh$size <- loh$end - loh$start + 1 30 | # should not happen 31 | loh <- loh[!is.na(loh$size), ] 32 | if (allele.specific) loh <- loh[!is.na(loh$M), ] 33 | reference.state <- match.arg(reference.state) 34 | loh$state <- if (allele.specific) paste0(loh$C, "/", loh$M) else loh$C 35 | dominant.state <- sort(sapply(split(loh$size, loh$state), sum), 36 | decreasing = TRUE)[1] 37 | reference.state.cn <- names(dominant.state) 38 | if (reference.state == "normal") { 39 | reference.state.cn <- if (allele.specific) "2/1" else "2" 40 | } 41 | loh$is.reference <- loh$state == reference.state.cn 42 | sum(loh$size[!loh$is.reference]) / sum(loh$size) 43 | } 44 | -------------------------------------------------------------------------------- /R/poolCoverage.R: -------------------------------------------------------------------------------- 1 | #' Pool coverage from multiple samples 2 | #' 3 | #' Averages the coverage of a list of samples. 4 | #' 5 | #' 6 | #' @param all.data List of normals, read with \code{\link{readCoverageFile}}. 7 | #' @param remove.chrs Remove these chromosomes from the pool. 8 | #' @param w \code{numeric(length(all.data))} vector of weights. If \code{NULL}, 9 | #' weight all samples equally. 10 | #' @return A \code{data.frame} with the averaged coverage over all normals. 
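#'
#' @details
#' In outline (this mirrors the function body below): with a single sample,
#' the coverage is returned unchanged apart from \code{remove.chrs};
#' otherwise, per-interval \code{coverage} and \code{counts} are combined as
#' weighted sums, \code{sum(w[i] * all.data[[i]]$coverage)}, with weights
#' defaulting to 1, average coverage is recomputed, and intervals on
#' \code{remove.chrs} are set to \code{NA}.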
11 | #' @author Markus Riester 12 | #' @seealso \code{\link{readCoverageFile}} 13 | #' @examples 14 | #' 15 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 16 | #' package = "PureCN") 17 | #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 18 | #' package = "PureCN") 19 | #' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 20 | #' pool <- poolCoverage(lapply(normal.coverage.files, readCoverageFile), 21 | #' remove.chrs = c("chrX", "chrY")) 22 | #' 23 | #' @export poolCoverage 24 | poolCoverage <- function(all.data, remove.chrs=c(), w = NULL) { 25 | pool <- all.data[[1]] 26 | 27 | if (length(all.data) == 1) { 28 | return(.removeChr(pool, remove.chrs)) 29 | } 30 | if (is.null(w)) { 31 | w <- rep(1, length(all.data)) 32 | } else if (length(w) != length(all.data)) { 33 | .stopUserError("all.data and w have different lengths.") 34 | } 35 | 36 | pool$coverage <- 0 37 | pool$counts <- 0 38 | 39 | for (i in seq_along(all.data)) { 40 | pool$coverage <- pool$coverage + (w[i] * all.data[[i]]$coverage) 41 | pool$counts <- pool$counts + (w[i] * all.data[[i]]$counts) 42 | } 43 | pool <- .addAverageCoverage(pool) 44 | return(.removeChr(pool, remove.chrs)) 45 | } 46 | 47 | .removeChr <- function(pool, remove.chrs = c()) { 48 | idx <- seqnames(pool) %in% remove.chrs 49 | if (sum(idx)) { 50 | pool[idx]$coverage <- NA 51 | pool[idx]$average.coverage <- NA 52 | } 53 | pool 54 | } 55 | -------------------------------------------------------------------------------- /R/filterVcfMuTect2.R: -------------------------------------------------------------------------------- 1 | #' Filter VCF MuTect2 2 | #' 3 | #' Function to remove artifacts and low confidence/quality calls from a 4 | #' GATK4/MuTect2 generated VCF file. Also applies filters defined in 5 | #' \code{filterVcfBasic}. 6 | #' 7 | #' 8 | #' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf} 9 | #' function from the VariantAnnotation package. 10 | #' @param tumor.id.in.vcf The tumor id in the VCF file, optional. 11 | #' @param ignore MuTect2 flags that mark variants for exclusion. 12 | #' @param \dots Additional arguments passed to \code{\link{filterVcfBasic}}. 13 | #' @return A list with elements \code{vcf}, \code{flag} and 14 | #' \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 15 | #' \code{flag} a \code{logical(1)} flag if problems were identified, further 16 | #' described in \code{flag_comment}. 17 | #' @author Markus Riester 18 | #' @seealso \code{\link{filterVcfBasic}} 19 | #' @examples 20 | #' 21 | #' ### This function is typically only called by runAbsolute via the 22 | #' ### fun.filterVcf and args.filterVcf comments. 
23 | #' library(VariantAnnotation) 24 | #' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 25 | #' vcf <- readVcf(vcf.file, "hg19") 26 | #' vcf.filtered <- filterVcfMuTect(vcf) 27 | #' 28 | #' @export filterVcfMuTect2 29 | filterVcfMuTect2 <- function(vcf, tumor.id.in.vcf = NULL, 30 | ignore=c("clustered_events", "t_lod", "str_contraction", 31 | "read_position", "position", "fragment_length", "multiallelic", "clipping", 32 | "strand_artifact", "strand_bias", "slippage", "weak_evidence", 33 | "orientation", "haplotype"), 34 | ...){ 35 | if (is.null(fixed(vcf)$FILTER)) return( 36 | filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 37 | 38 | n <- .countVariants(vcf) 39 | 40 | ids <- sort(unique(unlist(sapply(ignore, grep, fixed(vcf)$FILTER)))) 41 | vcf <- .removeVariants(vcf, ids, "Mutect2") 42 | flog.info("Removing %i Mutect2 calls due to blacklisted failure reasons.", 43 | n-.countVariants(vcf)) 44 | filterVcfBasic(vcf, tumor.id.in.vcf, ...) 45 | } 46 | -------------------------------------------------------------------------------- /man/filterVcfMuTect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filterVcfMuTect.R 3 | \name{filterVcfMuTect} 4 | \alias{filterVcfMuTect} 5 | \title{Filter VCF MuTect} 6 | \usage{ 7 | filterVcfMuTect( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | stats.file = NULL, 11 | ignore = c("clustered_read_position", "fstar_tumor_lod", "nearby_gap_events", 12 | "poor_mapping_region_alternate_allele_mapq", "poor_mapping_region_mapq0", 13 | "possible_contamination", "strand_artifact", "seen_in_panel_of_normals"), 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 19 | function from the VariantAnnotation package.} 20 | 21 | \item{tumor.id.in.vcf}{The tumor id in the VCF file, optional.} 22 | 23 | \item{stats.file}{MuTect stats file. If \code{NULL}, will check if VCF 24 | was generated by MuTect2 and if yes will call \code{\link{filterVcfMuTect2}} 25 | instead.} 26 | 27 | \item{ignore}{MuTect flags that mark variants for exclusion.} 28 | 29 | \item{\dots}{Additional arguments passed to \code{\link{filterVcfBasic}}.} 30 | } 31 | \value{ 32 | A list with elements \code{vcf}, \code{flag} and 33 | \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 34 | \code{flag} a \code{logical(1)} flag if problems were identified, further 35 | described in \code{flag_comment}. 36 | } 37 | \description{ 38 | Function to remove artifacts and low confidence/quality calls from a MuTect 39 | generated VCF file. Also applies filters defined in \code{filterVcfBasic}. 40 | This function will only keep variants listed in the stats file and those not 41 | matching the specified failure reasons. 42 | } 43 | \examples{ 44 | 45 | ### This function is typically only called by runAbsolute via the 46 | ### fun.filterVcf and args.filterVcf comments. 
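### The MuTect stats file is optional; with one available it could be
### passed as, e.g. (hypothetical path):
### vcf.filtered <- filterVcfMuTect(vcf, stats.file = "tumor.stats")
### If stats.file is NULL and the VCF was generated by MuTect2,
### filterVcfMuTect2 is called instead.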
47 | library(VariantAnnotation) 48 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 49 | vcf <- readVcf(vcf.file, "hg19") 50 | vcf.filtered <- filterVcfMuTect(vcf) 51 | 52 | } 53 | \seealso{ 54 | \code{\link{filterVcfBasic}} 55 | } 56 | \author{ 57 | Markus Riester 58 | } 59 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: PureCN 2 | Type: Package 3 | Title: Copy number calling and SNV classification using 4 | targeted short read sequencing 5 | Version: 2.15.4 6 | Date: 2025-07-22 7 | Authors@R: c(person("Markus", "Riester", 8 | role = c("aut", "cre"), 9 | email = "markus.riester@novartis.com", 10 | comment = c(ORCID = "0000-0002-4759-8332")), 11 | person("Angad P.", "Singh", role = "aut")) 12 | Description: This package estimates tumor purity, copy number, and loss of 13 | heterozygosity (LOH), and classifies single nucleotide variants (SNVs) by 14 | somatic status and clonality. PureCN is designed for targeted short read 15 | sequencing data, integrates well with standard somatic variant detection 16 | and copy number pipelines, and has support for tumor samples without 17 | matching normal samples. 18 | Depends: 19 | R (>= 3.5.0), 20 | DNAcopy, 21 | VariantAnnotation (>= 1.14.1) 22 | Imports: 23 | GenomicRanges (>= 1.20.3), 24 | IRanges (>= 2.2.1), 25 | RColorBrewer, 26 | S4Vectors, 27 | data.table, 28 | grDevices, 29 | graphics, 30 | stats, 31 | utils, 32 | SummarizedExperiment, 33 | Seqinfo, 34 | GenomeInfoDb, 35 | GenomicFeatures, 36 | Rsamtools, 37 | Biobase, 38 | Biostrings, 39 | BiocGenerics, 40 | rtracklayer, 41 | ggplot2, 42 | gridExtra, 43 | futile.logger, 44 | VGAM, 45 | tools, 46 | methods, 47 | mclust, 48 | rhdf5, 49 | Matrix 50 | Suggests: 51 | BiocParallel, 52 | BiocStyle, 53 | PSCBS, 54 | R.utils, 55 | TxDb.Hsapiens.UCSC.hg19.knownGene, 56 | covr, 57 | knitr, 58 | optparse, 59 | org.Hs.eg.db, 60 | jsonlite, 61 | markdown, 62 | rmarkdown, 63 | testthat 64 | Enhances: 65 | genomicsdb (>= 0.0.3) 66 | VignetteBuilder: knitr 67 | License: Artistic-2.0 68 | BugReports: https://github.com/lima1/PureCN/issues 69 | URL: https://github.com/lima1/PureCN 70 | biocViews: CopyNumberVariation, Software, Sequencing, 71 | VariantAnnotation, VariantDetection, Coverage, ImmunoOncology 72 | NeedsCompilation: no 73 | ByteCompile: yes 74 | RoxygenNote: 7.3.1 75 | -------------------------------------------------------------------------------- /man/setMappingBiasVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/setMappingBiasVcf.R 3 | \name{setMappingBiasVcf} 4 | \alias{setMappingBiasVcf} 5 | \title{Set Mapping Bias VCF} 6 | \usage{ 7 | setMappingBiasVcf( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | mapping.bias.file = NULL, 11 | smooth = TRUE, 12 | smooth.n = 5 13 | ) 14 | } 15 | \arguments{ 16 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 17 | function from the VariantAnnotation package.} 18 | 19 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 20 | VCF.} 21 | 22 | \item{mapping.bias.file}{A precomputed mapping bias database 23 | obtained by \code{\link{calculateMappingBiasVcf}}. 24 | Position-specific mapping bias from this database is added to the 25 | \code{INFO} field of the returned
\code{CollapsedVCF}.} 26 | 27 | \item{smooth}{Impute mapping bias of variants not found in the panel by 28 | smoothing of neighboring SNPs. Requires \code{mapping.bias.file}.} 29 | 30 | \item{smooth.n}{Number of neighboring variants used for smoothing.} 31 | } 32 | \value{ 33 | Adds elements to the \code{vcf} \code{INFO} field 34 | \item{bias}{A \code{numeric(nrow(vcf))} 35 | vector with the mapping bias for each 36 | variant in the \code{CollapsedVCF}. Mapping bias is expected as scaling 37 | factor. Adjusted allelic fraction is (observed allelic fraction)/(mapping 38 | bias). Maximum scaling factor is 1 and means no bias.} 39 | \item{pon.count}{A \code{numeric(nrow(vcf))} vector with the number 40 | of hits in the \code{mapping.bias.file}.} 41 | \item{shape1, shape2}{Fit of a beta distribution.} 42 | } 43 | \description{ 44 | Function to set mapping bias for each variant in the provided 45 | \code{CollapsedVCF} object. By default, it returns the same value for all 46 | variants, but a mapping bias file can be provided for position-specific 47 | mapping bias calculation. 48 | } 49 | \examples{ 50 | 51 | # This function is typically only called by runAbsoluteCN via 52 | # fun.setMappingBiasVcf and args.setMappingBiasVcf. 53 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 54 | vcf <- readVcf(vcf.file, "hg19") 55 | vcf.bias <- setMappingBiasVcf(vcf) 56 | 57 | } 58 | \author{ 59 | Markus Riester 60 | } 61 | -------------------------------------------------------------------------------- /man/correctCoverageBias.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/correctCoverageBias.R 3 | \name{correctCoverageBias} 4 | \alias{correctCoverageBias} 5 | \title{Correct for library-specific coverage biases} 6 | \usage{ 7 | correctCoverageBias( 8 | coverage.file, 9 | interval.file, 10 | output.file = NULL, 11 | plot.bias = FALSE, 12 | plot.max.density = 50000, 13 | output.qc.file = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{coverage.file}{Coverage file or coverage data parsed with the 18 | \code{\link{readCoverageFile}} function.} 19 | 20 | \item{interval.file}{File providing GC content for each exon in the coverage 21 | files. First column in format CHR:START-END. Additional optional columns 22 | provide gene symbols, mappability and replication timing. This file is 23 | generated with the \code{\link{preprocessIntervals}} function.} 24 | 25 | \item{output.file}{Optionally, write file with GC corrected coverage. Can be 26 | read with the \code{\link{readCoverageFile}} function.} 27 | 28 | \item{plot.bias}{Optionally, plot profiles of the pre-normalized and 29 | post-normalized coverage. Provides a quick visual check of coverage bias.} 30 | 31 | \item{plot.max.density}{By default, if the number of intervals in the 32 | probe-set is > 50000, uses a kernel density estimate to plot the coverage 33 | distribution. This uses the \code{stat_density} function from the ggplot2 34 | package. Using this parameter, change the threshold at which density 35 | estimation is applied. If the \code{plot.bias} parameter is set as 36 | \code{FALSE}, this will be ignored.} 37 | 38 | \item{output.qc.file}{Write miscellaneous coverage QC metrics to file.} 39 | } 40 | \description{ 41 | Takes as input coverage data and a mapping file for GC content and 42 | optionally replication timing. Will then normalize coverage data for 43 | GC-bias.
Plots the pre and post normalization GC profiles. 44 | } 45 | \examples{ 46 | 47 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 48 | package = "PureCN") 49 | interval.file <- system.file("extdata", "example_intervals.txt", 50 | package = "PureCN") 51 | coverage <- correctCoverageBias(normal.coverage.file, interval.file) 52 | 53 | } 54 | \seealso{ 55 | \code{\link{preprocessIntervals}} 56 | } 57 | \author{ 58 | Angad Singh, Markus Riester 59 | } 60 | -------------------------------------------------------------------------------- /man/calculateBamCoverageByInterval.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateBamCoverageByInterval.R 3 | \name{calculateBamCoverageByInterval} 4 | \alias{calculateBamCoverageByInterval} 5 | \title{Function to calculate coverage from BAM file} 6 | \usage{ 7 | calculateBamCoverageByInterval( 8 | bam.file, 9 | interval.file, 10 | output.file = NULL, 11 | index.file = bam.file, 12 | keep.duplicates = FALSE, 13 | chunks = 20, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{bam.file}{Filename of a BAM file.} 19 | 20 | \item{interval.file}{File specifying the intervals. Interval is expected in 21 | first column in format CHR:START-END.} 22 | 23 | \item{output.file}{Optionally, write minimal coverage file. Can be read with 24 | the \code{\link{readCoverageFile}} function.} 25 | 26 | \item{index.file}{The bai index. This is expected without the .bai file 27 | suffix, see \code{?scanBam}.} 28 | 29 | \item{keep.duplicates}{Keep or remove duplicated reads.} 30 | 31 | \item{chunks}{Split \code{interval.file} into specified number of chunks 32 | to reduce memory usage.} 33 | 34 | \item{...}{Additional parameters passed to \code{ScanBamParam}.} 35 | } 36 | \value{ 37 | Returns total and average coverage by intervals. 38 | } 39 | \description{ 40 | Takes a BAM file and an interval file as input and returns coverage for each 41 | interval. Coverage should be then GC-normalized using the 42 | \code{\link{correctCoverageBias}} function before determining purity and 43 | ploidy with \code{\link{runAbsoluteCN}}. Uses the \code{scanBam} function 44 | and applies low quality, duplicate reads as well as secondary alignment 45 | filters. 46 | } 47 | \examples{ 48 | 49 | bam.file <- system.file("extdata", "ex1.bam", package = "PureCN", 50 | mustWork = TRUE) 51 | interval.file <- system.file("extdata", "ex1_intervals.txt", 52 | package = "PureCN", mustWork = TRUE) 53 | 54 | # Calculate raw coverage from BAM file. These need to be corrected for 55 | # GC-bias using the correctCoverageBias function before determining purity 56 | # and ploidy. 
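# Note: the optional output.file argument would additionally write a minimal
# coverage file that readCoverageFile() can parse later; the file name is
# user-chosen and not part of the shipped example data.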
57 | coverage <- calculateBamCoverageByInterval(bam.file = bam.file, 58 | interval.file = interval.file) 59 | 60 | } 61 | \seealso{ 62 | \code{\link{preprocessIntervals} 63 | \link{correctCoverageBias} \link{runAbsoluteCN}} 64 | } 65 | \author{ 66 | Markus Riester 67 | } 68 | -------------------------------------------------------------------------------- /man/callAmplificationsInLowPurity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callAmplificationsInLowPurity.R 3 | \name{callAmplificationsInLowPurity} 4 | \alias{callAmplificationsInLowPurity} 5 | \title{Calling of amplifications in low purity samples} 6 | \usage{ 7 | callAmplificationsInLowPurity( 8 | res, 9 | normalDB, 10 | pvalue.cutoff = 0.001, 11 | percentile.cutoff = 90, 12 | min.width = 3, 13 | all.genes = FALSE, 14 | purity = NULL, 15 | BPPARAM = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 20 | 21 | \item{normalDB}{Normal database, created with 22 | \code{\link{createNormalDatabase}}.} 23 | 24 | \item{pvalue.cutoff}{Copy numbers log-ratio cutoffs to call 25 | amplifications as calculating using the log-ratios observed in 26 | \code{normalDB}} 27 | 28 | \item{percentile.cutoff}{Only report genes with log2-ratio mean 29 | exceeding this sample-wise cutoff.} 30 | 31 | \item{min.width}{Minimum number of targets} 32 | 33 | \item{all.genes}{If \code{FALSE}, then only return amplifications 34 | passing the thresholds.} 35 | 36 | \item{purity}{If not \code{NULL}, then scale log2-ratios to the 37 | corresponding integer copy number. Useful when accurate ctDNA 38 | fractions (between 4-10 percent) are available.} 39 | 40 | \item{BPPARAM}{\code{BiocParallelParam} object. If \code{NULL}, does not 41 | use parallelization for fitting local optima.} 42 | } 43 | \value{ 44 | A \code{data.frame} with gene-level amplification calls. 45 | } 46 | \description{ 47 | Function to extract amplification from a 48 | \code{\link{runAbsoluteCN}} return object in samples of too low purity 49 | for the standard \code{\link{callAlterations}}. 
50 | } 51 | \examples{ 52 | 53 | data(purecn.example.output) 54 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 55 | package = "PureCN") 56 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 57 | package = "PureCN") 58 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 59 | normalDB <- createNormalDatabase(normal.coverage.files) 60 | callAmplificationsInLowPurity(purecn.example.output, normalDB)["EIF2A", ] 61 | 62 | } 63 | \seealso{ 64 | \code{\link{runAbsoluteCN}} \code{\link{callAlterations}} 65 | } 66 | \author{ 67 | Markus Riester 68 | } 69 | -------------------------------------------------------------------------------- /man/calculateMappingBiasVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateMappingBiasVcf.R 3 | \name{calculateMappingBiasVcf} 4 | \alias{calculateMappingBiasVcf} 5 | \title{Calculate Mapping Bias} 6 | \usage{ 7 | calculateMappingBiasVcf( 8 | normal.panel.vcf.file, 9 | min.normals = 1, 10 | min.normals.betafit = 7, 11 | min.normals.assign.betafit = 3, 12 | min.normals.position.specific.fit = 10, 13 | min.median.coverage.betafit = 5, 14 | num.betafit.clusters = 9, 15 | min.betafit.rho = 1e-04, 16 | max.betafit.rho = 0.2, 17 | yieldSize = 50000, 18 | genome 19 | ) 20 | } 21 | \arguments{ 22 | \item{normal.panel.vcf.file}{\code{character(1)} Combined VCF file of 23 | a panel of normals, reference and alt counts as AD genotype field. 24 | Needs to be compressed and indexed with bgzip and tabix, respectively.} 25 | 26 | \item{min.normals}{Minimum number of normals with heterozygous SNP for 27 | calculating position-specific mapping bias.} 28 | 29 | \item{min.normals.betafit}{Minimum number of normals with heterozygous SNP 30 | fitting a beta binomial distribution} 31 | 32 | \item{min.normals.assign.betafit}{Minimum number of normals with 33 | heterozygous SNPs to assign to a beta binomal fit cluster} 34 | 35 | \item{min.normals.position.specific.fit}{Minimum normals to use 36 | position-specific beta-binomial fits. Otherwise only clustered fits are 37 | used.} 38 | 39 | \item{min.median.coverage.betafit}{Minimum median coverage of normals with 40 | heterozygous SNP for fitting a beta binomial distribution} 41 | 42 | \item{num.betafit.clusters}{Maximum number of beta binomial fit clusters} 43 | 44 | \item{min.betafit.rho}{Minimum dispersion factor rho} 45 | 46 | \item{max.betafit.rho}{Maximum dispersion factor rho} 47 | 48 | \item{yieldSize}{See \code{TabixFile}} 49 | 50 | \item{genome}{See \code{readVcf}} 51 | } 52 | \value{ 53 | A \code{GRanges} object with mapping bias and number of normal 54 | samples with this variant. 55 | } 56 | \description{ 57 | Function calculate mapping bias for each variant in the provided 58 | panel of normals VCF. 
59 | } 60 | \examples{ 61 | 62 | normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", 63 | package = "PureCN") 64 | bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19") 65 | saveRDS(bias, "mapping_bias.rds") 66 | 67 | } 68 | \author{ 69 | Markus Riester 70 | } 71 | -------------------------------------------------------------------------------- /man/callAlterationsFromSegmentation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callAlterations.R 3 | \name{callAlterationsFromSegmentation} 4 | \alias{callAlterationsFromSegmentation} 5 | \title{Calling of amplifications and deletions from segmentations} 6 | \usage{ 7 | callAlterationsFromSegmentation( 8 | sampleid, 9 | chr, 10 | start, 11 | end, 12 | num.mark = NA, 13 | seg.mean, 14 | C, 15 | interval.file, 16 | fun.focal = findFocal, 17 | args.focal = list(), 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{sampleid}{The sampleid column in the segmentation file.} 23 | 24 | \item{chr}{The chromosome column.} 25 | 26 | \item{start}{The start positions of the segments.} 27 | 28 | \item{end}{The end positions of the segments.} 29 | 30 | \item{num.mark}{Optionally, the number of probes or markers in each segment.} 31 | 32 | \item{seg.mean}{The segment mean.} 33 | 34 | \item{C}{The segment integer copy number.} 35 | 36 | \item{interval.file}{A mapping file that assigns GC content and gene symbols 37 | to each exon in the coverage files. Used for generating gene-level calls. 38 | First column in format CHR:START-END. Second column GC content (0 to 1). 39 | Third column gene symbol. This file is generated with the 40 | \code{\link{preprocessIntervals}} function.} 41 | 42 | \item{fun.focal}{Function for identifying focal amplifications. Defaults to 43 | \code{\link{findFocal}}.} 44 | 45 | \item{args.focal}{Arguments for focal amplification function.} 46 | 47 | \item{\dots}{Arguments passed to \code{\link{callAlterations}}.} 48 | } 49 | \value{ 50 | A list of \code{\link{callAlterations}} \code{data.frame} objects, 51 | one for each sample. 52 | } 53 | \description{ 54 | This function can be used to obtain gene-level copy number calls from 55 | segmentations. This is useful for comparing PureCN's segmentations with 56 | segmentations obtained by different tools on the gene-level. Segmentation 57 | file can contain multiple samples. 
58 | } 59 | \examples{ 60 | 61 | data(purecn.example.output) 62 | seg <- purecn.example.output$results[[1]]$seg 63 | interval.file <- system.file("extdata", "example_intervals.txt", 64 | package = "PureCN") 65 | 66 | calls <- callAlterationsFromSegmentation(sampleid = seg$ID, chr = seg$chrom, 67 | start = seg$loc.start, end = seg$loc.end, num.mark = seg$num.mark, 68 | seg.mean = seg$seg.mean, C = seg$C, interval.file = interval.file) 69 | 70 | } 71 | \author{ 72 | Markus Riester 73 | } 74 | -------------------------------------------------------------------------------- /man/calculateMappingBiasGatk4.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateMappingBiasVcf.R 3 | \name{calculateMappingBiasGatk4} 4 | \alias{calculateMappingBiasGatk4} 5 | \title{Calculate Mapping Bias from GATK4 GenomicsDB} 6 | \usage{ 7 | calculateMappingBiasGatk4( 8 | workspace, 9 | reference.genome, 10 | min.normals = 1, 11 | min.normals.betafit = 7, 12 | min.normals.assign.betafit = 3, 13 | min.normals.position.specific.fit = 10, 14 | min.median.coverage.betafit = 5, 15 | num.betafit.clusters = 9, 16 | min.betafit.rho = 1e-04, 17 | max.betafit.rho = 0.2, 18 | AF.info.field = "AF" 19 | ) 20 | } 21 | \arguments{ 22 | \item{workspace}{Path to the GenomicsDB created by \code{GenomicsDBImport}} 23 | 24 | \item{reference.genome}{Reference FASTA file.} 25 | 26 | \item{min.normals}{Minimum number of normals with heterozygous SNP for 27 | calculating position-specific mapping bias.} 28 | 29 | \item{min.normals.betafit}{Minimum number of normals with heterozygous SNP 30 | fitting a beta distribution} 31 | 32 | \item{min.normals.assign.betafit}{Minimum number of normals with 33 | heterozygous SNPs to assign to a beta binomal fit cluster} 34 | 35 | \item{min.normals.position.specific.fit}{Minimum normals to use 36 | position-specific beta-binomial fits. Otherwise only clustered fits are 37 | used.} 38 | 39 | \item{min.median.coverage.betafit}{Minimum median coverage of normals with 40 | heterozygous SNP for fitting a beta distribution} 41 | 42 | \item{num.betafit.clusters}{Maximum number of beta binomial fit clusters} 43 | 44 | \item{min.betafit.rho}{Minimum dispersion factor rho} 45 | 46 | \item{max.betafit.rho}{Maximum dispersion factor rho} 47 | 48 | \item{AF.info.field}{Field in the \code{workspace} that stores the allelic 49 | fraction} 50 | } 51 | \value{ 52 | A \code{GRanges} object with mapping bias and number of normal 53 | samples with this variant. 54 | } 55 | \description{ 56 | Function calculate mapping bias for each variant in the provided 57 | panel of normals GenomicsDB. 
58 | } 59 | \examples{ 60 | 61 | \dontrun{ 62 | resources_file <- system.file("extdata", "gatk4_pon_db.tgz", 63 | package = "PureCN") 64 | tmp_dir <- tempdir() 65 | untar(resources_file, exdir = tmp_dir) 66 | workspace <- file.path(tmp_dir, "gatk4_pon_db") 67 | bias <- calculateMappingBiasGatk4(workspace, "hg19") 68 | saveRDS(bias, "mapping_bias.rds") 69 | unlink(tmp_dir, recursive=TRUE) 70 | } 71 | 72 | } 73 | \author{ 74 | Markus Riester 75 | } 76 | -------------------------------------------------------------------------------- /inst/extdata/example_seg.txt: -------------------------------------------------------------------------------- 1 | ID chrom loc.start loc.end num.mark seg.mean 2 | Sample1 1 1216044 248722319 933 0.133381833060556 3 | Sample1 2 1638036 231775198 707 -0.417889405204461 4 | Sample1 2 236403412 241737117 93 0.0831 5 | Sample1 3 11832017 149470198 436 0.1151 6 | Sample1 3 150264604 151542537 18 1.447 7 | Sample1 3 151545662 195938114 80 0.1254 8 | Sample1 4 843512 70146579 133 0.1301 9 | Sample1 4 75673305 77700146 39 -0.3815 10 | Sample1 4 81188156 108831608 44 0.8534 11 | Sample1 4 110635592 186611721 139 0.0788 12 | Sample1 5 442758 10761153 38 0.1437 13 | Sample1 5 38869183 180687408 359 -0.470303870967742 14 | Sample1 6 2623865 144219759 293 -0.4282 15 | Sample1 6 144224235 170862274 117 0.2096 16 | Sample1 7 938572 14028655 56 0.1258 17 | Sample1 7 23286512 23313764 11 1.5421 18 | Sample1 7 26232167 156469231 309 -0.4089 19 | Sample1 8 6264200 145537891 337 -0.4565 20 | Sample1 9 214953 139440208 369 -0.417889405204461 21 | Sample1 10 323391 72576623 233 0.0987 22 | Sample1 10 72604313 72645621 16 -0.4625 23 | Sample1 10 74768015 75000741 29 0.0539 24 | Sample1 10 82300671 82403793 7 -1.4644 25 | Sample1 10 85982056 88768887 12 0.8815 26 | Sample1 10 91066426 99790218 36 -1.301 27 | Sample1 10 102283640 102289566 5 1.0576 28 | Sample1 10 103541552 121214530 73 -1.4029 29 | Sample1 10 124591880 134121207 24 -0.049 30 | Sample1 11 2291272 34378689 106 -0.470303870967742 31 | Sample1 11 36614927 44081429 15 0.2505 32 | Sample1 11 46880700 57317513 71 -0.388 33 | Sample1 11 57947383 65172437 77 0.0606 34 | Sample1 11 65340286 66335024 33 -0.4986 35 | Sample1 11 66335504 71847083 27 0.7708 36 | Sample1 11 71850100 77907628 66 0.1291 37 | Sample1 11 77909048 77924776 9 0.7195 38 | Sample1 11 82536056 134134828 152 -0.3788 39 | Sample1 12 1740561 99126271 379 0.8812 40 | Sample1 12 113537804 124428836 212 0.1012 41 | Sample1 13 20398996 114438189 329 0.8029 42 | Sample1 14 20757846 101349087 318 0.112465182186235 43 | Sample1 15 27216709 99926271 394 0.133381833060556 44 | Sample1 16 230533 31123514 225 0.0953 45 | Sample1 16 56899289 56947247 25 0.7182 46 | Sample1 16 57507348 57722319 20 -0.4244 47 | Sample1 16 66918983 90038049 183 0.1368 48 | Sample1 17 1399145 76832319 621 0.133381833060556 49 | Sample1 17 77768896 80559277 24 -0.4355 50 | Sample1 18 5394737 71825663 132 -0.4495 51 | Sample1 19 1481982 57301280 496 0.133381833060556 52 | Sample1 20 207959 62610775 330 0.1101 53 | Sample1 21 11098731 47865219 176 0.112465182186235 54 | Sample1 22 17443695 45996257 148 0.1203 55 | Sample1 22 50703417 51066096 20 -0.6394 56 | -------------------------------------------------------------------------------- /man/createNormalDatabase.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createNormalDatabase.R 3 | \name{createNormalDatabase} 4 | 
\alias{createNormalDatabase} 5 | \title{Create database of normal samples} 6 | \usage{ 7 | createNormalDatabase( 8 | normal.coverage.files, 9 | sex = NULL, 10 | coverage.outliers = c(0.25, 4), 11 | min.coverage = 0.25, 12 | max.missing = 0.03, 13 | low.coverage = 15, 14 | optimal.off.target.counts = 120, 15 | plot = FALSE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{normal.coverage.files}{Vector with file names pointing to 21 | coverage files of normal samples.} 22 | 23 | \item{sex}{\code{character(length(normal.coverage.files))} with sex for all 24 | files. \code{F} for female, \code{M} for male. If all chromosomes are 25 | diploid, specify \code{diploid}. If \code{NULL}, determine from coverage.} 26 | 27 | \item{coverage.outliers}{Exclude samples with coverages below or above 28 | the specified cutoffs (fractions of the normal sample coverages median). 29 | Only for databases with more than 5 samples.} 30 | 31 | \item{min.coverage}{Exclude intervals with coverage lower than 32 | the specified fraction of the chromosome median in the pool of normals.} 33 | 34 | \item{max.missing}{Exclude intervals with zero coverage in the 35 | specified fraction of normal samples.} 36 | 37 | \item{low.coverage}{Specifies the maximum number of total reads 38 | (NOT average coverage) to call a target low coverage.} 39 | 40 | \item{optimal.off.target.counts}{Used to suggest an optimal off-target 41 | interval width (BETA).} 42 | 43 | \item{plot}{Diagnostics plot, useful to tune parameters.} 44 | 45 | \item{\dots}{Arguments passed to the \code{prcomp} function.} 46 | } 47 | \value{ 48 | A normal database that can be used in the 49 | \code{\link{calculateTangentNormal}} function to retrieve a coverage 50 | normalization sample for a given tumor sample. 51 | } 52 | \description{ 53 | Function to create a database of normal samples, used to normalize 54 | tumor coverages. 55 | } 56 | \examples{ 57 | 58 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 59 | package = "PureCN") 60 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 61 | package = "PureCN") 62 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 63 | normalDB <- createNormalDatabase(normal.coverage.files) 64 | 65 | } 66 | \seealso{ 67 | \code{\link{calculateTangentNormal}} 68 | } 69 | \author{ 70 | Markus Riester 71 | } 72 | -------------------------------------------------------------------------------- /R/readIntervalFile.R: -------------------------------------------------------------------------------- 1 | #' Read interval file 2 | #' 3 | #' Read file containing coordinates of on- and off-target intervals 4 | #' generated by \code{\link{preprocessIntervals}}. 5 | #' 6 | #' @param interval.file A mapping file that assigns GC content and gene symbols 7 | #' to each exon in the coverage files. Used for generating gene-level calls. 8 | #' First column in format CHR:START-END. Second column GC content (0 to 1). 9 | #' Third column gene symbol. This file is generated with the 10 | #' \code{\link{preprocessIntervals}} function. 11 | #' @param strict Error out with missing columns 12 | #' @param verbose Verbose output 13 | #' @return A \code{GRanges} object with the parsed intervals. 
14 | #' @author Markus Riester 15 | #' @examples 16 | #' 17 | #' interval.file <- system.file("extdata", "example_intervals.txt", 18 | #' package = "PureCN") 19 | #' x <- readIntervalFile(interval.file) 20 | #' 21 | #' @export readIntervalFile 22 | readIntervalFile <- function(interval.file, strict = TRUE, verbose = TRUE) { 23 | con <- file(interval.file, open = "r") 24 | header <- .parseGATKHeader(con) 25 | intervals <- read.delim(con, header = FALSE, stringsAsFactors = FALSE) 26 | colnames(intervals) <- strsplit(header$last_line, "\t")[[1]] 27 | close(con) 28 | if (is.null(intervals$gc_bias) && strict) { 29 | .stopUserError("No gc_bias column in interval.file.") 30 | } 31 | if (is.null(intervals$Gene)) { 32 | if (verbose) flog.info("No Gene column in interval.file. You won't get gene-level calls.") 33 | intervals$Gene <- "." 34 | } 35 | if (is.null(intervals$on_target)) { 36 | if (verbose) flog.info("No on_target column in interval.file. Recreate this file with IntervalFile.R.") 37 | intervals$on_target <- TRUE 38 | } 39 | if (is.null(intervals$mappability)) { 40 | if (verbose) flog.info("No mappability column in interval.file.") 41 | intervals$mappability <- 1 42 | } 43 | if (is.null(intervals$reptiming)) { 44 | if (verbose) flog.info("No reptiming column in interval.file.") 45 | intervals$reptiming <- NA 46 | } 47 | 48 | gr <- GRanges(intervals[, 1], ranges = NULL, strand = NULL, intervals[, -1]) 49 | gr <- sort(sortSeqlevels(gr)) 50 | # TODO cleanup 51 | gr$on.target <- gr$on_target 52 | gr$on_target <- NULL 53 | 54 | if (length(header$sl)) { 55 | header$sl <- sapply(header$sl, as.numeric) 56 | seqlengths(gr) <- header$sl[names(seqlengths(gr))] 57 | } 58 | return(gr) 59 | } 60 | -------------------------------------------------------------------------------- /man/getSexFromVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSex.R 3 | \name{getSexFromVcf} 4 | \alias{getSexFromVcf} 5 | \title{Get sample sex from a VCF file} 6 | \usage{ 7 | getSexFromVcf( 8 | vcf, 9 | tumor.id.in.vcf = NULL, 10 | min.or = 4, 11 | min.or.na = 2.5, 12 | max.pv = 0.001, 13 | homozygous.cutoff = 0.95, 14 | af.cutoff = 0.2, 15 | min.coverage = 15, 16 | use.somatic.status = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{vcf}{CollapsedVCF object, read in with the \code{readVcf} function 21 | from the VariantAnnotation package.} 22 | 23 | \item{tumor.id.in.vcf}{The tumor id in the CollapsedVCF (optional).} 24 | 25 | \item{min.or}{Minimum odds-ratio to call sample as male. If p-value is not 26 | significant due to a small number of SNPs on chromosome X, sample will be 27 | called as NA even when odds-ratio exceeds this cutoff.} 28 | 29 | \item{min.or.na}{Minimum odds-ratio to not call a sample. Odds-ratios in the 30 | range \code{min.or.na} to \code{min.or} define a grey area in which samples 31 | are not called. Contamination can be a source of ambiguous calls.} 32 | 33 | \item{max.pv}{Maximum Fisher's exact p-value to call sample as male.} 34 | 35 | \item{homozygous.cutoff}{Minimum allelic fraction to call position 36 | homozygous.} 37 | 38 | \item{af.cutoff}{Remove all SNVs with allelic fraction lower than the 39 | specified value.} 40 | 41 | \item{min.coverage}{Minimum coverage in tumor. 
Variants with lower coverage 42 | are ignored.} 43 | 44 | \item{use.somatic.status}{If somatic status and germline data is available, 45 | then exclude somatic variants.} 46 | } 47 | \value{ 48 | Returns a \code{character(1)} with \code{M} for male, \code{F} for 49 | female, or \code{NA} if unknown. 50 | } 51 | \description{ 52 | This function detects non-random distribution of homozygous variants on 53 | chromosome X compared to all other chromosomes. A non-significant Fisher's 54 | exact p-value indicates more than one chromosome X copy. This function is 55 | called in runAbsoluteCN as sanity check when a VCF is provided. It is also 56 | useful for determining sex when no sex marker genes on chrY (e.g. AMELY) are 57 | available. 58 | } 59 | \examples{ 60 | 61 | vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") 62 | vcf <- readVcf(vcf.file, "hg19") 63 | # This example vcf is filtered and contains no homozygous calls, 64 | # which are necessary for determining sex from chromosome X. 65 | getSexFromVcf(vcf) 66 | 67 | } 68 | \seealso{ 69 | \code{\link{getSexFromCoverage}} 70 | } 71 | \author{ 72 | Markus Riester 73 | } 74 | -------------------------------------------------------------------------------- /man/setPriorVcf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/setPriorVcf.R 3 | \name{setPriorVcf} 4 | \alias{setPriorVcf} 5 | \title{Set Somatic Prior VCF} 6 | \usage{ 7 | setPriorVcf( 8 | vcf, 9 | prior.somatic = c(0.5, 5e-04, 0.999, 1e-04, 0.995, 0.5), 10 | tumor.id.in.vcf = NULL, 11 | min.cosmic.cnt = 6, 12 | DB.info.flag = "DB", 13 | Cosmic.CNT.info.field = "Cosmic.CNT" 14 | ) 15 | } 16 | \arguments{ 17 | \item{vcf}{\code{CollapsedVCF} object, read in with the \code{readVcf} 18 | function from the VariantAnnotation package.} 19 | 20 | \item{prior.somatic}{Prior probabilities for somatic mutations. First value 21 | is for the case when no matched normals are available and the variant is not 22 | in germline databases (second value). Third value is for variants with MuTect 23 | somatic call. Different from 1, because somatic mutations in segments of copy 24 | number 0 have 0 probability and artifacts can thus have dramatic influence on 25 | likelihood score. Forth value is for variants not labeled as somatic by 26 | MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it 27 | will set the prior probability for variants with CNT > 6 to the first of 28 | those values in case of no matched normal available (0.995 default). Final 29 | value is for the case that variant is in both germline databases and 30 | COSMIC count > 6.} 31 | 32 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 33 | VCF.} 34 | 35 | \item{min.cosmic.cnt}{Minimum number of hits in the COSMIC database to 36 | call variant as likely somatic.} 37 | 38 | \item{DB.info.flag}{Flag in INFO of VCF that marks presence in common 39 | germline databases. Defaults to \code{DB} that may contain somatic variants 40 | if it is from an unfiltered germline database.} 41 | 42 | \item{Cosmic.CNT.info.field}{Info field containing hits in the Cosmic database} 43 | } 44 | \value{ 45 | The \code{vcf} with \code{numeric(nrow(vcf))} vector with the 46 | prior probability of somatic status for each variant in the 47 | \code{CollapsedVCF} added to the \code{INFO} field \code{PR}. 
48 | } 49 | \description{ 50 | Function to set prior for somatic mutation status for each variant in the 51 | provided \code{CollapsedVCF} object. 52 | } 53 | \examples{ 54 | 55 | # This function is typically only called by runAbsoluteCN via the 56 | # fun.setPriorVcf and args.setPriorVcf comments. 57 | vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 58 | vcf <- readVcf(vcf.file, "hg19") 59 | vcf <- setPriorVcf(vcf) 60 | 61 | } 62 | \author{ 63 | Markus Riester 64 | } 65 | -------------------------------------------------------------------------------- /R/findFocal.R: -------------------------------------------------------------------------------- 1 | #' Find focal amplifications 2 | #' 3 | #' Function to find focal amplifications in segmented data. This is 4 | #' automatically called in \code{\link{runAbsoluteCN}}. 5 | #' 6 | #' 7 | #' @param seg Segmentation data. 8 | #' @param max.size Cutoff for focal in base pairs. 9 | #' @param cn.diff Minimum copy number delta between neighboring segments. 10 | #' @param min.amp.cn Minimum amplification integer copy number. Segments with 11 | #' lower copy number are not tested. 12 | #' @return \code{logical(n)}, indicating for all n segments whether they are 13 | #' focally amplified or not. 14 | #' @author Markus Riester 15 | #' @seealso \code{\link{runAbsoluteCN}} 16 | #' @examples 17 | #' 18 | #' normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 19 | #' package = "PureCN") 20 | #' tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 21 | #' package = "PureCN") 22 | #' vcf.file <- system.file("extdata", "example.vcf.gz", 23 | #' package = "PureCN") 24 | #' interval.file <- system.file("extdata", "example_intervals_tiny.txt", 25 | #' package = "PureCN") 26 | #' 27 | #' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 28 | #' # non-default values to speed-up this example. This is not a good idea for real 29 | #' # samples. 
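#' # A minimal direct call on a toy segmentation: findFocal() only needs the
#' # integer copy number (C) and segment size columns; the values below are
#' # made up purely for illustration.
#' seg.toy <- data.frame(C = c(2, 8, 2), size = c(5e7, 1e6, 5e7))
#' findFocal(seg.toy)
#' # (returns FALSE TRUE FALSE: only the small, high-copy middle segment is focal)
#'
#' # In a full run, findFocal is passed to runAbsoluteCN via fun.focal and
#' # args.focal: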
30 | #' ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, 31 | #' tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, 32 | #' genome="hg19", sampleid = "Sample1", interval.file = interval.file, 33 | #' max.candidate.solutions = 1, max.ploidy = 4, 34 | #' test.purity = seq(0.3, 0.7, by = 0.05), 35 | #' args.focal=list(max.size = 2e+06), fun.focal = findFocal) 36 | #' 37 | #' @export findFocal 38 | findFocal <- function(seg, max.size = 3000000, cn.diff = 2, min.amp.cn = 5) { 39 | focal <- rep(FALSE, nrow(seg)) 40 | for (i in seq_len(nrow(seg))) { 41 | if (seg$C[i] < min.amp.cn) next 42 | if (seg$size[i] > max.size) next 43 | size <- seg$size[i] 44 | if (i > 1) { 45 | for (j in (i - 1):1) { 46 | if (seg$C[j] < seg$C[i] - cn.diff) { 47 | break 48 | } 49 | size <- size + seg$size[j] 50 | } 51 | } 52 | if (i < nrow(seg)) { 53 | for (j in (i + 1):nrow(seg)) { 54 | if (seg$C[j] < seg$C[i] - cn.diff) { 55 | break 56 | } 57 | size <- size + seg$size[j] 58 | } 59 | } 60 | focal[i] <- size < max.size 61 | } 62 | focal 63 | } 64 | -------------------------------------------------------------------------------- /tests/testthat/test_correctCoverageBias.R: -------------------------------------------------------------------------------- 1 | context("correctCoverageBias") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 4 | package = "PureCN") 5 | interval.file <- system.file("extdata", "ex2_intervals.txt", 6 | package = "PureCN", mustWork = TRUE) 7 | interval.file2 <- system.file("extdata", "example_intervals.txt", 8 | package = "PureCN") 9 | 10 | test_that("Example data matches after normalization", { 11 | output.file <- tempfile(fileext = ".txt") 12 | coverage <- correctCoverageBias(normal.coverage.file, interval.file2, 13 | output.file = output.file) 14 | expect_equal(class(coverage)[1], "GRanges") 15 | expect_equal(length(coverage), 10049) 16 | correctCoverageBias(normal.coverage.file, interval.file2, plot.max.density = 100, 17 | plot.bias = TRUE) 18 | x <- readCoverageFile(output.file) 19 | expect_equal(x$average.coverage, coverage$average.coverage) 20 | correctCoverageBias(head(x, 200), interval.file2) 21 | gc.data <- read.delim(interval.file2, as.is = TRUE) 22 | gc.data$Gene <- NULL 23 | tmpFile <- tempfile() 24 | write.table(gc.data, file = tmpFile, row.names = FALSE, quote = FALSE, 25 | sep = "\t") 26 | coverage2 <- correctCoverageBias(normal.coverage.file, tmpFile, 27 | output.file = output.file) 28 | corCov <- cor(coverage$average.coverage, coverage2$average.coverage, 29 | use = "complete.obs") 30 | expect_true(corCov > 0.99) 31 | }) 32 | 33 | test_that("Exceptions happen with wrong input", { 34 | expect_error(correctCoverageBias(normal.coverage.file, interval.file)) 35 | coverage <- readCoverageFile(normal.coverage.file) 36 | coverage$average.coverage <- 0 37 | expect_error(correctCoverageBias(coverage, interval.file), "zero") 38 | }) 39 | 40 | test_that("Example data qc matches", { 41 | output.qc.file <- tempfile(fileext = ".txt") 42 | coverage <- correctCoverageBias(normal.coverage.file, interval.file2, 43 | output.qc.file = output.qc.file) 44 | x <- read.delim(output.qc.file, sep=" ") 45 | expect_equal(1, nrow(x)) 46 | expect_equal(10, ncol(x)) 47 | file.remove(output.qc.file) 48 | }) 49 | 50 | test_that("Example data without reptiming works", { 51 | x <- read.delim(interval.file2) 52 | interval.file3 <- tempfile(fileext = ".txt") 53 | x$reptiming <- NULL 54 | write.table(x, file=interval.file3, row.names=FALSE, quote=FALSE, sep="\t") 
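    # The interval file written above lacks the optional reptiming column;
    # correctCoverageBias() should still accept it.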
55 | coverage <- correctCoverageBias(normal.coverage.file, interval.file3, 56 | plot.bias=TRUE) 57 | expect_equal(nrow(x), length(coverage)) 58 | file.remove(interval.file3) 59 | }) 60 | -------------------------------------------------------------------------------- /tests/testthat/test_calculateBamCoverageByInterval.R: -------------------------------------------------------------------------------- 1 | context("calculateBamCoverageByInterval") 2 | 3 | output.file <- tempfile(fileext = ".txt") 4 | 5 | test_that("Coverage from test BAM file matches", { 6 | bam.file <- system.file("extdata", "ex1.bam", package = "PureCN", 7 | mustWork = TRUE) 8 | interval.file <- system.file("extdata", "ex1_intervals.txt", 9 | package = "PureCN", mustWork = TRUE) 10 | coverage <- calculateBamCoverageByInterval(bam.file = bam.file, 11 | interval.file = interval.file, output.file = output.file) 12 | expect_equal(coverage$average.coverage, c(20.95205, 43.78357, 13 | 21.29271), tolerance = 0.01) 14 | expect_equal(coverage$counts, c(610, 1158, 636), tolerance = 0.01) 15 | expect_equal(unlist(coverage$duplication.rate), rep(0, 3), check.names = FALSE) 16 | }) 17 | 18 | test_that("Coverage from test BAM file matches", { 19 | bam.file <- system.file("extdata", "ex1.bam", package = "PureCN", 20 | mustWork = TRUE) 21 | interval.file <- system.file("extdata", "ex1_intervals_headered.txt", 22 | package = "PureCN", mustWork = TRUE) 23 | coverage <- calculateBamCoverageByInterval(bam.file = bam.file, 24 | interval.file = interval.file) 25 | expect_equal(coverage$average.coverage, c(37.49301, 43.78357, 39.10000), 26 | tolerance = 0.01) 27 | expect_equal(coverage$counts, c(568, 1158, 595), tolerance = 0.01) 28 | }) 29 | 30 | 31 | test_that("Coverage output is correct", { 32 | x <- readCoverageFile(output.file) 33 | expect_equal(x$average.coverage, c(20.95205, 43.78357, 21.29271), 34 | tolerance = 0.01) 35 | expect_equal(x$counts, c(610, 1158, 636), tolerance = 0.01) 36 | interval.file <- system.file("extdata", "example_intervals.txt", 37 | package = "PureCN") 38 | expect_error(correctCoverageBias(x, interval.file)) 39 | }) 40 | 41 | test_that("Reading BAM in chunks works", { 42 | fl <- system.file("extdata", "ex1.bam", package = "Rsamtools", 43 | mustWork = TRUE) 44 | res0 <- scanBam(fl)[[1]] # always list-of-lists 45 | idx <- sort(sample(length(res0[[1]]), 300)) 46 | idx <- idx[!is.na(res0$pos[idx])] 47 | x <- GRanges(seqnames = res0$rname[idx], 48 | IRanges(start = res0$pos[idx], end = res0$pos[idx] + 20)) 49 | x$Gene <- "." 
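    # Together with Gene above, the columns below complete the annotation set
    # (on.target, gc_bias, mappability, reptiming) that PureCN interval files carry.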
50 | x$on.target <- TRUE 51 | x$gc_bias <- NA 52 | x$mappability <- NA 53 | x$reptiming <- NA 54 | f2 <- tempfile() 55 | suppressWarnings(PureCN:::.writeIntervals(x, f2)) 56 | r1 <- calculateBamCoverageByInterval(fl, f2) 57 | r2 <- calculateBamCoverageByInterval(fl, f2, chunks = 3) 58 | file.remove(f2) 59 | expect_equal(as.character(r1), as.character(x)) 60 | expect_equal(as.character(r2), as.character(x)) 61 | expect_equivalent(r1$counts, r2$counts) 62 | }) 63 | file.remove(output.file) 64 | -------------------------------------------------------------------------------- /man/calculatePowerDetectSomatic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/powerDetectSomatic.R 3 | \name{calculatePowerDetectSomatic} 4 | \alias{calculatePowerDetectSomatic} 5 | \title{Power calculation for detecting somatic mutations} 6 | \usage{ 7 | calculatePowerDetectSomatic( 8 | coverage, 9 | f = NULL, 10 | purity = NULL, 11 | ploidy = NULL, 12 | cell.fraction = 1, 13 | error = 0.001, 14 | fpr = 5e-07, 15 | verbose = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{coverage}{Mean sequencing coverage.} 20 | 21 | \item{f}{Mean expected allelic fraction. If \code{NULL}, requires purity and 22 | ploidy and then calculates the expected fraction.} 23 | 24 | \item{purity}{Purity of sample. Only required when \code{f} is \code{NULL}.} 25 | 26 | \item{ploidy}{Ploidy of sample. Only required when \code{f} is \code{NULL}.} 27 | 28 | \item{cell.fraction}{Fraction of cells harboring mutation. Ignored if 29 | \code{f} is not \code{NULL}.} 30 | 31 | \item{error}{Estimated sequencing error rate.} 32 | 33 | \item{fpr}{Required false positive rate for mutation vs. sequencing error.} 34 | 35 | \item{verbose}{Verbose output.} 36 | } 37 | \value{ 38 | A list with elements \item{power}{Power to detect somatic 39 | mutations.} \item{k}{Minimum number of supporting reads.} \item{f}{Expected 40 | allelic fraction. } 41 | } 42 | \description{ 43 | This function calculates the probability of correctly rejecting the null 44 | hypothesis that an alt allele is a sequencing error rather than a true 45 | (mono-)clonal mutation. 46 | } 47 | \examples{ 48 | 49 | purity <- c(0.1, 0.15, 0.2, 0.25, 0.4, 0.6, 1) 50 | coverage <- seq(5, 35, 1) 51 | power <- lapply(purity, function(p) sapply(coverage, function(cv) 52 | calculatePowerDetectSomatic(coverage = cv, purity = p, ploidy = 2, 53 | verbose = FALSE)$power)) 54 | 55 | # Figure S7b in Carter et al. 56 | plot(coverage, power[[1]], col = 1, xlab = "Sequence coverage", 57 | ylab = "Detection power", ylim = c(0, 1), type = "l") 58 | 59 | for (i in 2:length(power)) lines(coverage, power[[i]], col = i) 60 | abline(h = 0.8, lty = 2, col = "grey") 61 | legend("bottomright", legend = paste("Purity", purity), 62 | fill = seq_along(purity)) 63 | 64 | # Figure S7c in Carter et al. 65 | coverage <- seq(5, 350, 1) 66 | power <- lapply(purity, function(p) sapply(coverage, function(cv) 67 | calculatePowerDetectSomatic(coverage = cv, purity = p, ploidy = 2, 68 | cell.fraction = 0.2, verbose = FALSE)$power)) 69 | plot(coverage, power[[1]], col = 1, xlab = "Sequence coverage", 70 | ylab = "Detection power", ylim = c(0, 1), type = "l") 71 | 72 | for (i in 2:length(power)) lines(coverage, power[[i]], col = i) 73 | abline(h = 0.8, lty = 2, col = "grey") 74 | legend("bottomright", legend = paste("Purity", purity), 75 | fill = seq_along(purity)) 76 | 77 | } 78 | \references{ 79 | Carter et al. 
(2012), Absolute quantification of somatic DNA 80 | alterations in human cancer. Nature Biotechnology. 81 | } 82 | \author{ 83 | Markus Riester 84 | } 85 | -------------------------------------------------------------------------------- /man/segmentationHclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/segmentationHclust.R 3 | \name{segmentationHclust} 4 | \alias{segmentationHclust} 5 | \title{Minimal segmentation function} 6 | \usage{ 7 | segmentationHclust( 8 | seg, 9 | vcf = NULL, 10 | tumor.id.in.vcf = 1, 11 | normal.id.in.vcf = NULL, 12 | min.logr.sdev = 0.15, 13 | prune.hclust.h = NULL, 14 | prune.hclust.method = "ward.D", 15 | chr.hash = NULL, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{seg}{If segmentation was provided by the user, this data structure 21 | will contain this segmentation. Useful for minimal segmentation functions. 22 | Otherwise PureCN will re-segment the data. This segmentation function 23 | ignores this user provided segmentation.} 24 | 25 | \item{vcf}{Optional \code{CollapsedVCF} object with germline allelic ratios.} 26 | 27 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 28 | VCF.} 29 | 30 | \item{normal.id.in.vcf}{Id of normal in VCF. Currently not used.} 31 | 32 | \item{min.logr.sdev}{Minimum log-ratio standard deviation used in the 33 | model. Useful to make fitting more robust to outliers in very clean 34 | data (currently not used in this segmentation function).} 35 | 36 | \item{prune.hclust.h}{Height in the \code{hclust} pruning step. Increasing 37 | this value will merge segments more aggressively. If NULL, try to find a 38 | sensible default.} 39 | 40 | \item{prune.hclust.method}{Cluster method used in the \code{hclust} pruning 41 | step. See documentation for the \code{hclust} function.} 42 | 43 | \item{chr.hash}{Mapping of non-numerical chromosome names to numerical names 44 | (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromosomes are 45 | properly ordered.} 46 | 47 | \item{...}{Currently unused arguments provided to other segmentation 48 | functions.} 49 | } 50 | \value{ 51 | \code{data.frame} containing the segmentation. 52 | } 53 | \description{ 54 | A minimal segmentation function useful when segmentation was performed by 55 | third-party tools. When a \code{CollapsedVCF} with germline SNPs is provided, 56 | it will cluster segments using \code{hclust}. Otherwise it will use the 57 | segmentation as provided. 58 | This function is called via the 59 | \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The 60 | arguments are passed via \code{args.segmentation}.
61 | } 62 | \examples{ 63 | 64 | vcf.file <- system.file("extdata", "example.vcf.gz", 65 | package="PureCN") 66 | interval.file <- system.file("extdata", "example_intervals_tiny.txt", 67 | package="PureCN") 68 | seg.file <- system.file('extdata', 'example_seg.txt', 69 | package = 'PureCN') 70 | 71 | res <- runAbsoluteCN(seg.file = seg.file, 72 | fun.segmentation = segmentationHclust, 73 | max.ploidy = 4, vcf.file = vcf.file, 74 | test.purity = seq(0.3, 0.7, by = 0.05), 75 | max.candidate.solutions = 1, 76 | genome = 'hg19', interval.file = interval.file) 77 | 78 | } 79 | \seealso{ 80 | \code{\link{runAbsoluteCN}} 81 | } 82 | \author{ 83 | Markus Riester 84 | } 85 | -------------------------------------------------------------------------------- /R/createCurationFile.R: -------------------------------------------------------------------------------- 1 | #' Create file to curate PureCN results 2 | #' 3 | #' Function to create a CSV file that can be used to mark the correct solution 4 | #' in the output of a \code{\link{runAbsoluteCN}} run. 5 | #' 6 | #' 7 | #' @param file.rds Output of the \code{\link{runAbsoluteCN}} function, 8 | #' serialized with \code{saveRDS}. 9 | #' @param overwrite.uncurated Overwrite existing files unless flagged as 10 | #' \sQuote{Curated}. 11 | #' @param overwrite.curated Overwrite existing files even if flagged as 12 | #' \sQuote{Curated}. 13 | #' @return A \code{data.frame} with the tumor purity and ploidy of the maximum 14 | #' likelihood solution. 15 | #' @author Markus Riester 16 | #' @seealso \code{\link{runAbsoluteCN}} 17 | #' @examples 18 | #' 19 | #' data(purecn.example.output) 20 | #' file.rds <- "Sample1_PureCN.rds" 21 | #' saveRDS(purecn.example.output, file = file.rds) 22 | #' createCurationFile(file.rds) 23 | #' 24 | #' @export createCurationFile 25 | #' @importFrom utils write.csv 26 | createCurationFile <- function(file.rds, overwrite.uncurated = TRUE, 27 | overwrite.curated = FALSE) { 28 | rds <- readRDS(file.rds) 29 | res <- rds$results[[1]] 30 | contamination <- res$SNV.posterior$posterior.contamination 31 | contamination <- if (is.null(contamination)) 0 else contamination 32 | d.f.curation <- data.frame( 33 | Sampleid = res$seg$ID[1], 34 | Purity = res$purity, 35 | Ploidy = res$ploidy, 36 | Sex = .getSexFromRds(rds), 37 | Contamination = contamination, 38 | Flagged = res$flag, 39 | Failed = FALSE, 40 | Curated = FALSE, 41 | Comment = res$flag_comment 42 | ) 43 | 44 | filename <- file.path(dirname(file.rds), 45 | paste(gsub(".rds$", "", basename(file.rds)), "csv", sep = ".")) 46 | 47 | if (file.exists(filename)) { 48 | tmp <- read.csv(filename, as.is = TRUE) 49 | if (tmp$Curated[1] && !overwrite.curated) { 50 | warning(filename, 51 | " already exists and seems to be edited.", 52 | " Will not overwrite it.") 53 | } else if (!overwrite.uncurated) { 54 | warning(filename, " already exists. Will not overwrite it.") 55 | } else { 56 | write.csv(d.f.curation, file = filename, row.names = FALSE) 57 | } 58 | } else { 59 | write.csv(d.f.curation, file = filename, row.names = FALSE) 60 | } 61 | invisible(d.f.curation) 62 | } 63 | 64 | .getSexFromRds <- function(rds) { 65 | # if run without VCF, then we don't have sex information from VCF 66 | if (is.null(rds$input$sex.vcf)) return(rds$input$sex) 67 | 68 | # conflict of coverage and snp based sex genotyper? 
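    # If both calls agree, report the consensus sex; otherwise report both
    # values so the discrepancy is visible in the curation file.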
69 | if (!is.na(rds$input$sex) && !is.na(rds$input$sex.vcf)) { 70 | if (rds$input$sex == rds$input$sex.vcf) return(rds$input$sex) 71 | return(paste("Coverage:", rds$input$sex, "VCF:", rds$input$sex.vcf)) 72 | } 73 | # believe coverage based more than VCF in case we have only limited 74 | # number of SNPs on chrX 75 | if (!is.na(rds$input$sex)) { 76 | return(rds$input$sex) 77 | } 78 | return(rds$input$sex.vcf) 79 | } 80 | -------------------------------------------------------------------------------- /man/callMutationBurden.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callMutationBurden.R 3 | \name{callMutationBurden} 4 | \alias{callMutationBurden} 5 | \title{Call mutation burden} 6 | \usage{ 7 | callMutationBurden( 8 | res, 9 | id = 1, 10 | remove.flagged = TRUE, 11 | min.prior.somatic = 0.1, 12 | max.prior.somatic = 1, 13 | min.cellfraction = 0, 14 | fun.countMutation = function(vcf) width(vcf) == 1, 15 | callable = NULL, 16 | exclude = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 21 | 22 | \item{id}{Candidate solution to extract mutation burden from. 23 | \code{id=1} will use the maximum likelihood solution.} 24 | 25 | \item{remove.flagged}{Remove variants flagged by 26 | \code{\link{predictSomatic}}.} 27 | 28 | \item{min.prior.somatic}{Exclude variants with somatic prior 29 | probability lower than this cutoff.} 30 | 31 | \item{max.prior.somatic}{Exclude variants with somatic prior 32 | probability higher than this cutoff. This is useful for removing 33 | hotspot mutations in small panels that might inflate the mutation 34 | burden.} 35 | 36 | \item{min.cellfraction}{Exclude variants with cellular fraction 37 | lower than this cutoff. These are sub-clonal mutations or artifacts 38 | with very low allelic fraction.} 39 | 40 | \item{fun.countMutation}{Function that can be used to filter the 41 | input VCF further for filtering, for example to only keep missense 42 | mutations. Expects a \code{logical} vector indicating whether variant 43 | should be counted (\code{TRUE}) or not (\code{FALSE}). Default 44 | is to keep only single nucleotide variants.} 45 | 46 | \item{callable}{\code{GRanges} object with callable genomic regions, 47 | for example obtained by \sQuote{GATK CallableLoci} BED file, imported 48 | with \code{rtracklayer}.} 49 | 50 | \item{exclude}{\code{GRanges} object with genomic regions that 51 | should be excluded from the \code{callable} regions, for example 52 | intronic regions. Requires \code{callable}.} 53 | } 54 | \value{ 55 | Returns \code{data.frame} with mutation counts and sizes 56 | of callable regions. 57 | } 58 | \description{ 59 | This function provides detailed mutation burden information. 60 | } 61 | \examples{ 62 | 63 | data(purecn.example.output) 64 | callMutationBurden(purecn.example.output) 65 | 66 | # To calculate exact mutations per megabase, we can provide a BED 67 | # file containing all callable regions 68 | callableBed <- import(system.file("extdata", "example_callable.bed.gz", 69 | package = "PureCN")) 70 | 71 | # We can exclude some regions for mutation burden calculation, 72 | # for example intronic regions. 
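# The toy exclude region below simply masks chr1 up to the end of the callable
# regions; a real analysis would supply actual intron coordinates instead.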
73 | exclude <- GRanges(seqnames = "chr1", IRanges(start = 1, 74 | end = max(end(callableBed)))) 75 | 76 | # We can also exclude specific mutations by filtering the input VCF 77 | myVcfFilter <- function(vcf) seqnames(vcf)!="chr2" 78 | 79 | callsCallable <- callMutationBurden(purecn.example.output, 80 | callable = callableBed, exclude = exclude, 81 | fun.countMutation = myVcfFilter) 82 | 83 | } 84 | \seealso{ 85 | \code{\link{runAbsoluteCN}} \code{\link{predictSomatic}} 86 | } 87 | \author{ 88 | Markus Riester 89 | } 90 | -------------------------------------------------------------------------------- /tests/testthat/test_readCoverageFile.R: -------------------------------------------------------------------------------- 1 | context("readCoverageFile") 2 | 3 | test_that("Example data matches and pooling works", { 4 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 5 | package = "PureCN") 6 | coverage <- readCoverageFile(tumor.coverage.file) 7 | expect_equal(length(coverage), 10049) 8 | expect_identical("chr1", as.character(seqnames(coverage)[1])) 9 | expect_equal(sum(!is.na(coverage[seqnames(coverage) == "chr21"]$coverage)), 10 | 179) 11 | pool <- poolCoverage(list(coverage)) 12 | expect_equal(pool$average.coverage, coverage$average.coverage) 13 | pool <- poolCoverage(list(coverage), remove.chrs = "chr21") 14 | expect_equal(sum(is.na(pool[seqnames(coverage) == "chr21"]$coverage)), 15 | 179) 16 | }) 17 | 18 | test_that("Overlapping intervals were merged and warned", { 19 | tumor.overlapping.coverage.file <- system.file("extdata", 20 | "test_coverage_overlapping_intervals.txt", package = "PureCN") 21 | expect_output(coverage <- readCoverageFile(tumor.overlapping.coverage.file), 22 | "WARN") 23 | expect_equal(length(coverage), 3) 24 | expect_equal(start(coverage), c(1216042, 1216606, 1216791)) 25 | expect_equal(end(coverage), c(1216050, 1216678, 1217991)) 26 | }) 27 | 28 | test_that("CNVkit *cnn example data is parsed correctly", { 29 | coverageFile <- system.file("extdata", "example_normal3.cnn", 30 | package = "PureCN") 31 | coverage <- readCoverageFile(coverageFile) 32 | expect_equal(length(coverage), 4) 33 | expect_equal(start(coverage), c(762097, 861281, 865591, 866325) + 34 | 1) 35 | expect_equal(end(coverage), c(762270, 861490, 865791, 866498)) 36 | expect_equal(coverage$on.target, c(TRUE, TRUE, TRUE, TRUE)) 37 | coverage <- readCoverageFile(coverageFile, zero = FALSE) 38 | expect_equal(length(coverage), 4) 39 | expect_equal(start(coverage), c(762097, 861281, 865591, 866325)) 40 | expect_equal(end(coverage), c(762270, 861490, 865791, 866498)) 41 | expect_equal(coverage$on.target, c(TRUE, TRUE, TRUE, TRUE)) 42 | }) 43 | 44 | test_that("CNVkit *cnr example data is parsed correctly", { 45 | coverageFile <- system.file("extdata", "example_normal4.cnr", 46 | package = "PureCN") 47 | coverage <- readCoverageFile(coverageFile) 48 | expect_equal(length(coverage), 5) 49 | expect_equal(start(coverage), c(10500, 70509, 227917, 318219, 50 | 367658) + 1) 51 | expect_equal(end(coverage), c(68590, 176917, 267219, 367158, 52 | 367893)) 53 | expect_equal(coverage$on.target, c(FALSE, FALSE, FALSE, FALSE, 54 | TRUE)) 55 | }) 56 | 57 | test_that("GATK4 *hdf5 example data is parsed correctly", { 58 | coverageFile <- system.file("extdata", "example_normal5.hdf5", 59 | package = "PureCN") 60 | coverage <- readCoverageFile(coverageFile) 61 | expect_equal(length(coverage), 10) 62 | expect_equal(head(start(coverage)), 63 | c(3598833, 3599562, 3607444, 3624039, 3638537, 3639872)) 64 | 
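    # Raw read counts stored in the HDF5 file are exposed via the counts column.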
expect_equal(head(coverage$counts), 65 | c(127, 305, 78, 699, 566, 344)) 66 | 67 | expect_equal(head(coverage$on.target), rep(TRUE, 6)) 68 | }) 69 | -------------------------------------------------------------------------------- /R/processMultipleSamples.R: -------------------------------------------------------------------------------- 1 | #' Multi sample normalization and segmentation 2 | #' 3 | #' This function performs normalization and segmentation when multiple 4 | #' samples for the same patient are available. 5 | #' 6 | #' CURRENTLY DEFUNCT BECAUSE IT DEPENDS ON THE DEFUNCT COPYNUMBER PACKAGE. 7 | #' We are working on a replacement. 8 | #' 9 | #' 10 | #' @param tumor.coverage.files Coverage data for tumor samples. 11 | #' @param sampleids Sample ids, used in output files. 12 | #' @param normalDB Database of normal samples, created with 13 | #' \code{\link{createNormalDatabase}}. 14 | #' @param num.eigen Number of eigen vectors used. 15 | #' @param genome Genome version, for example hg19. Needed to get centromere 16 | #' positions. 17 | #' @param plot.cnv Segmentation plots. 18 | #' @param min.interval.weight Can be used to ignore intervals with low weights. 19 | #' @param w Weight of samples. Can be used to downweight poor quality samples. 20 | #' If \code{NULL}, sets to inverse of median on-target duplication rate if 21 | #' available, otherwise does not do any weighting. 22 | #' @param max.segments If not \code{NULL}, try a higher \code{undo.SD} 23 | #' parameter if number of segments exceeds the threshold. 24 | #' @param chr.hash Mapping of non-numerical chromosome names to numerical names 25 | #' (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromosomes are 26 | #' properly ordered. 27 | #' @param centromeres A \code{GRanges} object with centromere positions. 28 | #' @param ... Arguments passed to the segmentation function. 29 | #' @return \code{data.frame} containing the segmentation. 30 | #' @author Markus Riester 31 | #' @references Nilsen G., Liestol K., Van Loo P., Vollan H., Eide M., Rueda O., 32 | #' Chin S., Russell R., Baumbusch L., Caldas C., Borresen-Dale A., 33 | #' Lingjaerde O. (2012). "Copynumber: Efficient algorithms for single- and 34 | #' multi-track copy number segmentation." BMC Genomics, 13(1), 591. 35 | #' 36 | #' @seealso \code{\link{runAbsoluteCN}} 37 | #' @examples 38 | #' 39 | #' normal1.coverage.file <- system.file("extdata", "example_normal.txt.gz", 40 | #' package = "PureCN") 41 | #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 42 | #' package = "PureCN") 43 | #' tumor1.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 44 | #' package = "PureCN") 45 | #' tumor2.coverage.file <- system.file("extdata", "example_tumor2.txt.gz", 46 | #' package = "PureCN") 47 | #' 48 | #' normal.coverage.files <- c(normal1.coverage.file, normal2.coverage.file) 49 | #' tumor.coverage.files <- c(tumor1.coverage.file, tumor2.coverage.file) 50 | #' 51 | #' normalDB <- createNormalDatabase(normal.coverage.files) 52 | #' 53 | #' # seg <- processMultipleSamples(tumor.coverage.files, 54 | #' # sampleids = c("Sample1", "Sample2"), 55 | #' # normalDB = normalDB, 56 | #' # genome = "hg19") 57 | #' 58 | #' @export processMultipleSamples 59 | processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB, 60 | num.eigen = 20, genome, plot.cnv = TRUE, w = NULL, 61 | min.interval.weight = 1 / 3, 62 | max.segments = NULL, chr.hash = NULL, centromeres = NULL, ...)
{ 63 | .Defunct(msg="processMultipleSamples is temporarily defunct") 64 | } 65 | -------------------------------------------------------------------------------- /R/bootstrapResults.R: -------------------------------------------------------------------------------- 1 | #' Bootstrapping variant fits 2 | #' 3 | #' This function bootstraps variants, then optionally re-ranks solutions by 4 | #' using the bootstrap estimate of the likelihood score, and then optionally 5 | #' removes solutions that never ranked high in any bootstrap replicate. 6 | #' 7 | #' 8 | #' @param res Return object of the \code{\link{runAbsoluteCN}} function. 9 | #' @param n Number of bootstrap replicates. 10 | #' @param top Include a solution if it appears in the top \code{top} solutions of 11 | #' any bootstrap replicate. If \code{NULL}, do not filter solutions. 12 | #' @param reorder Reorder results by bootstrap value. 13 | #' @return Returns a \code{\link{runAbsoluteCN}} object with added bootstrap 14 | #' value to each solution. This value 15 | #' is the fraction of bootstrap replicates in which the solution ranked first. 16 | #' @author Markus Riester 17 | #' @seealso \code{\link{runAbsoluteCN}} 18 | #' @examples 19 | #' 20 | #' data(purecn.example.output) 21 | #' ret.boot <- bootstrapResults(purecn.example.output, n=100) 22 | #' plotAbs(ret.boot, type="overview") 23 | #' 24 | #' @export bootstrapResults 25 | #' @importFrom utils head 26 | bootstrapResults <- function(res, n = 500, top = NULL, reorder = FALSE) { 27 | if (length(res$results) < 2) return(res) 28 | if (is.null(top)) top <- length(res$results) 29 | res$results <- .bootstrapResults(res$results, n = n, top = top, 30 | reorder = reorder) 31 | res 32 | } 33 | 34 | .bootstrapResults <- function(results, n, top, reorder) { 35 | ## Sample SNVs with replacement and recalculate log-likelihood. 36 | .bootstrapResult <- function(result) { 37 | lliks <- log(apply(result$SNV.posterior$likelihoods[ 38 | !result$SNV.posterior$posteriors$FLAGGED, ], 1, max)) 39 | lliks <- sum(sample(lliks, replace = TRUE)) 40 | result$log.likelihood + sum(lliks) - 41 | sum(result$SNV.posterior$posteriors$FLAGGED) 42 | } 43 | best <- replicate(n, head(order(sapply(results, .bootstrapResult), 44 | decreasing = TRUE), top)) 45 | 46 | ## Calculate the bootstrap value as the fraction of replicates in which the solution ranked first. 47 | bootstrap.value <- sapply(seq_along(results), function(i) 48 | sum(best[1, ] == i)) / ncol(best) 49 | for (i in seq_along(results)) { 50 | results[[i]]$bootstrap.value <- bootstrap.value[i] 51 | } 52 | 53 | ## Return only solutions that had ranked high in at least one replicate.
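    ## `best` holds, for each bootstrap replicate, the indices of the solutions
    ## ranked among the top `top`.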
54 | best <- as.vector(best) 55 | results <- results[sort(unique(best))] 56 | if (reorder) { 57 | results <- results[order(sapply(results, function(x) x$bootstrap.value), 58 | decreasing = TRUE)] 59 | } 60 | .flagBootstrap(results) 61 | } 62 | 63 | .flagBootstrap <- function(results) { 64 | if (!is.null(results[[1]]$bootstrap.value)) { 65 | # max should be first, but be safe 66 | maxBootstrap <- max(sapply(results, function(r) r$bootstrap.value), 67 | na.rm = TRUE) 68 | if (maxBootstrap < 0.95) { 69 | for (i in seq_along(results)) { 70 | results[[i]]$flag <- TRUE 71 | results[[i]]$flag_comment <- .appendComment( 72 | results[[i]]$flag_comment, "LOW BOOTSTRAP VALUE") 73 | } 74 | } 75 | } 76 | results 77 | } 78 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM bioconductor/bioconductor_docker:RELEASE_3_19 2 | #FROM bioconductor/bioconductor_docker:devel 3 | 4 | # install base packages 5 | RUN Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")}; \ 6 | BiocManager::install(c("TxDb.Hsapiens.UCSC.hg38.knownGene", "TxDb.Hsapiens.UCSC.hg19.knownGene"))' 7 | RUN Rscript -e 'install.packages(c("optparse", "R.utils")); \ 8 | BiocManager::install(c("remotes", "raerose01/deconstructSigs"));' 9 | RUN Rscript -e 'BiocManager::install(c("GenomicRanges", "IRanges", "DNAcopy", "Biostrings", "GenomicFeatures", "rtracklayer",\ 10 | "S4Vectors", "rhdf5", "VariantAnnotation", "Rsamtools", "BiocGenerics"))' 11 | 12 | # patched PSCBS with support of interval weights 13 | RUN Rscript -e 'BiocManager::install("lima1/PSCBS", ref="add_dnacopy_weighting")' 14 | 15 | RUN apt update \ 16 | && apt install -y --no-install-recommends apt-utils python-is-python3 \ 17 | openjdk-17-jre-headless \ 18 | && apt-get clean \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # tex support for building vignettes 22 | # RUN apt update \ 23 | # && apt install -y --no-install-recommends \ 24 | # texlive \ 25 | # texlive-latex-extra \ 26 | # texlive-fonts-extra \ 27 | # texlive-bibtex-extra \ 28 | # texlive-science \ 29 | # texi2html \ 30 | # texinfo \ 31 | # && apt-get clean \ 32 | # && rm -rf /var/lib/apt/lists/* 33 | 34 | # install GenomicsDB 35 | ENV GENOMICSDB_PATH=/opt/GenomicsDB 36 | ENV GENOMICSDB_BRANCH=master 37 | RUN mkdir $GENOMICSDB_PATH 38 | ENV INSTALL_PREFIX=$GENOMICSDB_PATH 39 | ENV PREREQS_ENV=$GENOMICSDB_PATH/genomicsdb_prereqs.sh 40 | #ARG TARGETPLATFORM 41 | #RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"; else JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64; fi 42 | #ENV MAVEN_VERSION=3.9.5 43 | 44 | WORKDIR /tmp 45 | 46 | RUN git clone --recursive --branch $GENOMICSDB_BRANCH https://github.com/GenomicsDB/GenomicsDB.git && \ 47 | cd GenomicsDB/scripts/prereqs && \ 48 | ./install_prereqs.sh && \ 49 | apt-get clean && \ 50 | rm -rf /var/lib/apt/lists/* 51 | 52 | RUN chmod +x $PREREQS_ENV && \ 53 | $PREREQS_ENV && \ 54 | cmake -DCMAKE_INSTALL_PREFIX=$GENOMICSDB_PATH -DCMAKE_BUILD_TYPE=Release ./GenomicsDB && \ 55 | make && make install && \ 56 | rm -rf /tmp/GenomicsDB 57 | 58 | # install GenomicsDB R bindings 59 | RUN Rscript -e 'library(remotes);\ 60 | remotes::install_github("nalinigans/GenomicsDB-R", ref="master", configure.args="--with-genomicsdb=/opt/GenomicsDB/")' 61 | 62 | # install PureCN 63 | RUN Rscript -e 'BiocManager::install("PureCN", dependencies = TRUE)' 64 | #RUN Rscript -e 
'BiocManager::install("lima1/PureCN", ref = "RELEASE_3_19", dependencies = TRUE)' 65 | ENV PURECN=/usr/local/lib/R/site-library/PureCN/extdata 66 | 67 | # add symbolic link and paths 68 | ENV PATH $GENOMICSDB_PATH/bin:$PATH 69 | WORKDIR /opt 70 | RUN ln -s $PURECN /opt/PureCN 71 | 72 | # install GATK4 73 | ENV GATK_VERSION="4.5.0.0" 74 | RUN wget --no-verbose https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip && \ 75 | unzip gatk-${GATK_VERSION}.zip -d /opt && \ 76 | rm gatk-${GATK_VERSION}.zip 77 | 78 | ENV PATH /opt/gatk-${GATK_VERSION}:$PATH 79 | 80 | CMD ["/bin/bash"] 81 | -------------------------------------------------------------------------------- /man/processMultipleSamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processMultipleSamples.R 3 | \name{processMultipleSamples} 4 | \alias{processMultipleSamples} 5 | \title{Multi sample normalization and segmentation} 6 | \usage{ 7 | processMultipleSamples( 8 | tumor.coverage.files, 9 | sampleids, 10 | normalDB, 11 | num.eigen = 20, 12 | genome, 13 | plot.cnv = TRUE, 14 | w = NULL, 15 | min.interval.weight = 1/3, 16 | max.segments = NULL, 17 | chr.hash = NULL, 18 | centromeres = NULL, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{tumor.coverage.files}{Coverage data for tumor samples.} 24 | 25 | \item{sampleids}{Sample ids, used in output files.} 26 | 27 | \item{normalDB}{Database of normal samples, created with 28 | \code{\link{createNormalDatabase}}.} 29 | 30 | \item{num.eigen}{Number of eigen vectors used.} 31 | 32 | \item{genome}{Genome version, for example hg19. Needed to get centromere 33 | positions.} 34 | 35 | \item{plot.cnv}{Segmentation plots.} 36 | 37 | \item{w}{Weight of samples. Can be used to downweight poor quality samples. 38 | If \code{NULL}, sets to inverse of median on-target duplication rate if 39 | available, otherwise does not do any weighting.} 40 | 41 | \item{min.interval.weight}{Can be used to ignore intervals with low weights.} 42 | 43 | \item{max.segments}{If not \code{NULL}, try a higher \code{undo.SD} 44 | parameter if number of segments exceeds the threshold.} 45 | 46 | \item{chr.hash}{Mapping of non-numerical chromsome names to numerical names 47 | (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromsomes are 48 | properly ordered.} 49 | 50 | \item{centromeres}{A \code{GRanges} object with centromere positions.} 51 | 52 | \item{...}{Arguments passed to the segmentation function.} 53 | } 54 | \value{ 55 | \code{data.frame} containing the segmentation. 56 | } 57 | \description{ 58 | This function performs normalization and segmentation when multiple 59 | for the same patient are available. 60 | } 61 | \details{ 62 | CURRENTLY DEFUNCT BECAUSE IT DEPENDS ON THE DEFUNCT COPYNUMBER PACKAGE. 63 | We are working on a replacement. 
64 | } 65 | \examples{ 66 | 67 | normal1.coverage.file <- system.file("extdata", "example_normal.txt.gz", 68 | package = "PureCN") 69 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 70 | package = "PureCN") 71 | tumor1.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 72 | package = "PureCN") 73 | tumor2.coverage.file <- system.file("extdata", "example_tumor2.txt.gz", 74 | package = "PureCN") 75 | 76 | normal.coverage.files <- c(normal1.coverage.file, normal2.coverage.file) 77 | tumor.coverage.files <- c(tumor1.coverage.file, tumor2.coverage.file) 78 | 79 | normalDB <- createNormalDatabase(normal.coverage.files) 80 | 81 | # seg <- processMultipleSamples(tumor.coverage.files, 82 | # sampleids = c("Sample1", "Sample2"), 83 | # normalDB = normalDB, 84 | # genome = "hg19") 85 | 86 | } 87 | \references{ 88 | Nilsen G., Liestol K., Van Loo P., Vollan H., Eide M., Rueda O., 89 | Chin S., Russell R., Baumbusch L., Caldas C., Borresen-Dale A., 90 | Lingjaerde O. (2012). "Copynumber: Efficient algorithms for single- and 91 | multi-track copy number segmentation." BMC Genomics, 13(1), 591. 92 | } 93 | \seealso{ 94 | \code{\link{runAbsoluteCN}} 95 | } 96 | \author{ 97 | Markus Riester 98 | } 99 | -------------------------------------------------------------------------------- /tests/testthat/test_createNormalDatabase.R: -------------------------------------------------------------------------------- 1 | context("createNormalDatabase") 2 | 3 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 4 | package = "PureCN") 5 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 6 | package = "PureCN") 7 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 8 | package = "PureCN") 9 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 10 | normalDB <- createNormalDatabase(normal.coverage.files) 11 | 12 | test_that("NormalDB of example data matches expectated values", { 13 | expect_identical(normalDB$sex, c(NA, NA)) 14 | pool <- calculateTangentNormal(normal.coverage.files[1], normalDB) 15 | 16 | n <- lapply(normal.coverage.files, readCoverageFile) 17 | expect_equal(length(pool), length(n[[1]])) 18 | expect_equal(as.character(n[[1]]), normalDB$intervals) 19 | }) 20 | 21 | test_that("Provided sex is handled correctly", { 22 | expect_warning( 23 | normalDB2 <- createNormalDatabase(normal.coverage.files, sex = c("A", 24 | NA)) 25 | ) 26 | expect_equal(normalDB2$sex, as.character(c(NA, NA))) 27 | expect_warning( 28 | normalDB2 <- createNormalDatabase(normal.coverage.files, sex = c("A", 29 | "F")) 30 | ) 31 | expect_equal(normalDB2$sex, c(NA, "F")) 32 | expect_equal(normalDB2$normal.coverage.files, 33 | sapply(normal.coverage.files, normalizePath, 34 | USE.NAMES = FALSE)) 35 | 36 | expect_error(createNormalDatabase(normal.coverage.files, sex = "A")) 37 | }) 38 | 39 | test_that("Exceptions happen with wrong input", { 40 | interval.file <- system.file("extdata", "example_intervals.txt", 41 | package = "PureCN") 42 | normal <- readCoverageFile(normal.coverage.file) 43 | correctCoverageBias(normal, interval.file) 44 | output.file <- tempfile(fileext = ".txt") 45 | expect_output(correctCoverageBias(normal[sample(length(normal)), 46 | ], interval.file, output.file), "WARN") 47 | createNormalDatabase(c(normal.coverage.files, output.file)) 48 | best.normal.coverage.file <- calculateTangentNormal(tumor.coverage.file, 49 | normalDB) 50 | normal3.coverage.file <- system.file("extdata", 
"example_normal3.cnn", 51 | package = "PureCN") 52 | expect_error(calculateTangentNormal(normal3.coverage.file, normalDB), 53 | "not align") 54 | expect_error(createNormalDatabase(normal.coverage.file), "At least 2") 55 | expect_output(createNormalDatabase( c(normal.coverage.file, normal.coverage.file, 56 | normal2.coverage.file)), "duplicated") 57 | file.remove(output.file) 58 | }) 59 | 60 | 61 | test_that("Exceptions happen with outdated databases", { 62 | normalDB2 <- normalDB 63 | normalDB2$version <- NULL 64 | expect_error(runAbsoluteCN(normal.coverage.file, tumor.coverage.file, normalDB = normalDB2), 65 | "incompatible") 66 | expect_error( calculateTangentNormal(tumor.coverage.file, normalDB2), "incompatible") 67 | }) 68 | 69 | 70 | test_that("Exception thrown when user mixed gc-normalized and raw coverages.", { 71 | normal.coverage.files.wrong <- c(tempfile(fileext="_coverage.txt"), tempfile(fileext="_loess.txt")) 72 | file.create(normal.coverage.files.wrong) 73 | expect_error( createNormalDatabase(normal.coverage.files.wrong), "suffix") 74 | file.remove(normal.coverage.files.wrong) 75 | }) 76 | -------------------------------------------------------------------------------- /R/readLogRatioFile.R: -------------------------------------------------------------------------------- 1 | #' Read file containing interval-level log2 tumor/normal ratios 2 | #' 3 | #' Read log2 ratio file produced by external tools like The Genome Analysis 4 | #' Toolkit version 4. 5 | #' 6 | #' @param file Log2 coverage file. 7 | #' @param format File format. If missing, derived from the file 8 | #' extension. Currently GATK4 DenoiseReadCounts format supported. 9 | #' A simple GATK3-style format, two columns with coordinates 10 | #' as string in format chr:start-stop in first and log2-ratio 11 | #' in second is also supported. 12 | #' @param zero Start position is 0-based. Default is \code{FALSE} 13 | #' for GATK, \code{TRUE} for BED file based intervals. 14 | #' @return A \code{GRange} with the log2 ratio. 
15 | #' @author Markus Riester 16 | #' @examples 17 | #' 18 | #' logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz", 19 | #' package = "PureCN") 20 | #' logratio <- readLogRatioFile(logratio.file) 21 | #' 22 | #' @export readLogRatioFile 23 | readLogRatioFile <- function(file, format, zero = NULL) { 24 | if (missing(format)) format <- .getLogRatioFormat(file) 25 | if (format == "GATK3") return(.readLogRatioFileGATK3(file, zero = FALSE)) 26 | if (format == "GATK4") return(.readLogRatioFileGATK4(file, zero = FALSE)) 27 | } 28 | 29 | .getLogRatioFormat <- function(file) { 30 | header <- scan(file, what = character(), sep = "\n", nmax = 1, quiet = TRUE) 31 | format <- "GATK4" 32 | if (grepl("^Target", header)[1]) return("GATK3") 33 | format 34 | } 35 | 36 | .readLogRatioFileGATK3 <- function(file, zero = FALSE) { 37 | x <- fread(file, data.table = FALSE) 38 | gr <- GRanges(x[, 1]) 39 | gr$log.ratio <- x[, 2] 40 | gr 41 | } 42 | 43 | .readLogRatioFileGATK4 <- function(file, zero = FALSE) { 44 | con <- file(file, open = "r") 45 | header <- .parseGATKHeader(con) 46 | x <- read.delim(con, header = FALSE, as.is = TRUE) 47 | colnames(x) <- strsplit(header$last_line, "\t")[[1]] 48 | gr <- GRanges(x[, 1], IRanges(start = x[, 2], end = x[, 3])) 49 | gr$log.ratio <- x[, 4] 50 | gr <- sort(sortSeqlevels(gr)) 51 | if (length(header$sl)) { 52 | header$sl <- sapply(header$sl, as.numeric) 53 | seqlengths(gr) <- header$sl[names(seqlengths(gr))] 54 | } 55 | return(gr) 56 | } 57 | 58 | .writeLogRatioFileGATK4 <- function(x, id = 1, file) { 59 | gr <- x$log.ratio 60 | if (is.null(gr$log.ratio)) { 61 | .stopRuntimeError("log.ratio NULL in .writeLogRatioFileGATK4") 62 | } 63 | output <- data.frame( 64 | CONTIG = seqnames(gr), 65 | START = start(gr), 66 | END = end(gr), 67 | LOG2_COPY_RATIO = gr$log.ratio 68 | ) 69 | con <- file(file, open = "w") 70 | .writeGATKHeader(x$vcf, id, con, "log-ratio") 71 | write.table(output, con, row.names = FALSE, quote = FALSE, sep = "\t") 72 | close(con) 73 | invisible(output) 74 | } 75 | 76 | .writeGATKHeader <- function(vcf, id = 1, con, file_type) { 77 | writeLines(paste("@HD", "VN:1.6", sep = "\t"), con) 78 | if (any(is.na(seqlengths(vcf)))) { 79 | flog.warn("Cannot find all contig lengths while exporting %s file.", 80 | file_type) 81 | } else { 82 | sl <- seqlengths(vcf) 83 | writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep = "\t"), con) 84 | } 85 | if (!is.null(id)) { 86 | sampleid <- .getSampleIdFromVcf(vcf, id) 87 | writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep = "\t"), con) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /R/segmentationHclust.R: -------------------------------------------------------------------------------- 1 | #' Minimal segmentation function 2 | #' 3 | #' A minimal segmentation function useful when segmentation was performed by 4 | #' third-pary tools. When a \code{CollapsedVCF} with germline SNPs is provided, 5 | #' it will cluster segments using \code{hclust}. Otherwise it will use the 6 | #' segmentation as provided. 7 | #' This function is called via the 8 | #' \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The 9 | #' arguments are passed via \code{args.segmentation}. 10 | #' 11 | #' 12 | #' @param seg If segmentation was provided by the user, this data structure 13 | #' will contain this segmentation. Useful for minimal segmentation functions. 14 | #' Otherwise PureCN will re-segment the data. 
This segmentation function 15 | #' ignores this user provided segmentation. 16 | #' @param vcf Optional \code{CollapsedVCF} object with germline allelic ratios. 17 | #' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in 18 | #' VCF. 19 | #' @param normal.id.in.vcf Id of normal in in VCF. Currently not used. 20 | #' @param min.logr.sdev Minimum log-ratio standard deviation used in the 21 | #' model. Useful to make fitting more robust to outliers in very clean 22 | #' data (currently not used in this segmentation function). 23 | #' @param prune.hclust.h Height in the \code{hclust} pruning step. Increasing 24 | #' this value will merge segments more aggressively. If NULL, try to find a 25 | #' sensible default. 26 | #' @param prune.hclust.method Cluster method used in the \code{hclust} pruning 27 | #' step. See documentation for the \code{hclust} function. 28 | #' @param chr.hash Mapping of non-numerical chromsome names to numerical names 29 | #' (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromsomes are 30 | #' properly ordered. 31 | #' @param ... Currently unused arguments provided to other segmentation 32 | #' functions. 33 | #' @return \code{data.frame} containing the segmentation. 34 | #' @author Markus Riester 35 | #' 36 | #' @seealso \code{\link{runAbsoluteCN}} 37 | #' @examples 38 | #' 39 | #' vcf.file <- system.file("extdata", "example.vcf.gz", 40 | #' package="PureCN") 41 | #' interval.file <- system.file("extdata", "example_intervals_tiny.txt", 42 | #' package="PureCN") 43 | #' seg.file <- system.file('extdata', 'example_seg.txt', 44 | #' package = 'PureCN') 45 | #' 46 | #' res <- runAbsoluteCN(seg.file = seg.file, 47 | #' fun.segmentation = segmentationHclust, 48 | #' max.ploidy = 4, vcf.file = vcf.file, 49 | #' test.purity = seq(0.3, 0.7, by = 0.05), 50 | #' max.candidate.solutions = 1, 51 | #' genome = 'hg19', interval.file = interval.file) 52 | #' 53 | #' @export segmentationHclust 54 | segmentationHclust <- function(seg, 55 | vcf = NULL, tumor.id.in.vcf = 1, normal.id.in.vcf = NULL, 56 | min.logr.sdev = 0.15, prune.hclust.h = NULL, prune.hclust.method = "ward.D", 57 | chr.hash = NULL, ...) 
{ 58 | if (is.null(seg)) { 59 | .stopUserError("segmentationHclust requires an input segmentation.") 60 | } 61 | .checkParametersSegmentation(alpha = NULL, undo.SD = NULL, 62 | max.segments = NULL, min.logr.sdev = min.logr.sdev, 63 | prune.hclust.h = prune.hclust.h) 64 | 65 | if (!is.null(vcf)) { 66 | if (is.null(chr.hash)) chr.hash <- .getChrHash(seqlevels(vcf)) 67 | seg <- .pruneByHclust(seg, vcf, tumor.id.in.vcf, 68 | h = prune.hclust.h, 69 | method = prune.hclust.method, chr.hash = chr.hash) 70 | } 71 | idx.enough.markers <- seg$num.mark > 1 72 | rownames(seg) <- NULL 73 | seg[idx.enough.markers, ] 74 | } 75 | -------------------------------------------------------------------------------- /tests/testthat/test_segmentation.R: -------------------------------------------------------------------------------- 1 | context("segmentation") 2 | 3 | normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 4 | package = "PureCN") 5 | tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 6 | package = "PureCN") 7 | vcf.file <- system.file("extdata", "example.vcf.gz", 8 | package = "PureCN") 9 | seg.file <- system.file("extdata", "example_seg.txt", 10 | package = "PureCN") 11 | 12 | test_that("Precomputed boudaries are correct", { 13 | data(purecn.DNAcopy.bdry) 14 | alpha <- formals(segmentationCBS)$alpha 15 | eta <- formals(segment)$eta 16 | nperm <- formals(segment)$nperm 17 | max.ones <- floor(nperm * alpha) + 1 18 | set.seed(123) 19 | sbdry <- getbdry(eta, nperm, max.ones) 20 | expect_equal(purecn.DNAcopy.bdry, sbdry) 21 | }) 22 | 23 | 24 | test_that("GATK4 wrapper works for example data.", { 25 | skip_if_not(PureCN:::.checkGATK4Version("4.1.7.0") >= 0, 26 | "gatk binary > 4.1.7.0 required") 27 | 28 | ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file, 29 | tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, 30 | sampleid = "Sample1", genome = "hg19", 31 | fun.segmentation = segmentationGATK4, max.ploidy = 4, 32 | test.purity = seq(0.3, 0.7, by = 0.05), 33 | max.candidate.solutions = 1, plot.cnv = FALSE) 34 | 35 | expect_equal(0.65, ret$results[[1]]$purity, tolerance = 0.02) 36 | expect_equal(1.62, ret$results[[1]]$ploidy, tolerance = 0.2) 37 | }) 38 | 39 | test_that("Hclust segmentation works", { 40 | expect_error(runAbsoluteCN(normal.coverage.file = normal.coverage.file, 41 | tumor.coverage.file = tumor.coverage.file, 42 | sampleid = "Sample1", genome = "hg19", 43 | fun.segmentation = segmentationHclust, 44 | max.candidate.solutions = 1, plot.cnv = FALSE), 45 | "segmentationHclust requires an") 46 | }) 47 | 48 | 49 | test_that("private function .fixBreakpoint.", { 50 | seg <- readSegmentationFile(seg.file, "Sample1") 51 | data(purecn.example.output) 52 | gr <- purecn.example.output$input$log.ratio 53 | lr <- gr$log.ratio 54 | seg_1 <- PureCN:::.fixBreakpointsInBaits(gr, lr, seg, purecn.example.output$input$chr.hash) 55 | expect_equivalent(seg_1$loc.start, seg$loc.start) 56 | expect_equivalent(seg_1$loc.end, seg$loc.end) 57 | 58 | seg[24, "loc.start"] <- 82403793 + 1 59 | seg[44, "loc.end"] <- 57507347 60 | 61 | seg_1 <- PureCN:::.fixBreakpointsInBaits(gr, lr, seg, purecn.example.output$input$chr.hash) 62 | 63 | expect_equivalent(seg[23, "loc.start"], seg_1[23, "loc.start"]) 64 | expect_equivalent(82403838, seg_1[23, "loc.end"]) 65 | expect_equivalent(82403838 + 1, seg_1[24, "loc.start"]) 66 | expect_equivalent(seg[24, "loc.end"], seg_1[24, "loc.end"]) 67 | 68 | expect_equivalent(seg[44, "loc.start"], seg_1[44, "loc.start"]) 69 | 
expect_equivalent(57507289 - 1, seg_1[44, "loc.end"]) 70 | expect_equivalent(57507289, seg_1[45, "loc.start"]) 71 | expect_equivalent(seg[45, "loc.end"], seg_1[45, "loc.end"]) 72 | 73 | expect_equivalent(seg$loc.start[-c(23, 24, 44, 45)], 74 | seg_1$loc.start[-c(23, 24, 44, 45)]) 75 | expect_equivalent(seg$loc.end[-c(23, 24, 44, 45)], 76 | seg_1$loc.end[-c(23, 24, 44, 45)]) 77 | }) 78 | 79 | test_that("issue 201 is fixed.", { 80 | expect_error(runAbsoluteCN(normal.coverage.file = normal.coverage.file, 81 | tumor.coverage.file = tumor.coverage.file, 82 | sampleid = "Sample1", genome = "hg19", 83 | args.segmentation = list(undo.SD = "A"), 84 | max.candidate.solutions = 1, plot.cnv = FALSE), 85 | "undo.SD") 86 | }) 87 | -------------------------------------------------------------------------------- /R/calculateLogRatio.R: -------------------------------------------------------------------------------- 1 | #' Calculate coverage log-ratio of tumor vs. normal 2 | #' 3 | #' This function is automatically called by \code{\link{runAbsoluteCN}} when 4 | #' normal and tumor coverage are provided (and not a segmentation file or 5 | #' target-level log-ratios). This function is therefore normally not called by 6 | #' the user. 7 | #' 8 | #' 9 | #' @param normal Normal coverage read in by the \code{\link{readCoverageFile}} 10 | #' function. 11 | #' @param tumor Tumor coverage read in by the \code{\link{readCoverageFile}} 12 | #' function. 13 | #' @return \code{numeric(length(tumor))}, tumor vs. normal copy number log-ratios 14 | #' for all targets. 15 | #' @author Markus Riester 16 | #' @examples 17 | #' 18 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 19 | #' package = "PureCN") 20 | #' tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 21 | #' package = "PureCN") 22 | #' normal <- readCoverageFile(normal.coverage.file) 23 | #' tumor <- readCoverageFile(tumor.coverage.file) 24 | #' log.ratio <- calculateLogRatio(normal, tumor) 25 | #' 26 | #' @export calculateLogRatio 27 | calculateLogRatio <- function(normal, tumor) { 28 | # make sure that normal and tumor align 29 | if (!identical(as.character(normal), as.character(tumor))) { 30 | .stopUserError("Interval files in normal and tumor different.") 31 | } 32 | if (is.null(tumor$on.target)) tumor$on.target <- TRUE 33 | 34 | avgCovTumor <- mean(tumor$average.coverage[tumor$on.target], na.rm=TRUE) 35 | avgCovNormal <- mean(normal$average.coverage[tumor$on.target], na.rm=TRUE) 36 | 37 | flog.info("Mean target coverages: %.0fX (tumor) %.0fX (normal).", 38 | avgCovTumor, avgCovNormal) 39 | if (avgCovNormal/avgCovTumor < 0.25 || avgCovNormal/avgCovTumor > 4) { 40 | flog.warn("Large difference in coverage of tumor and normal.") 41 | } 42 | 43 | tumor$log.ratio <- 0. 
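    # The loop below computes, separately for on-target and off-target
    # intervals, log2(tumor average coverage / normal average coverage) plus a
    # library size correction log2(total normal counts / total tumor counts),
    # and then re-centers the log-ratios by their interval-width weighted mean
    # (.calibrate_log_ratio).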
44 | 45 | for (on.target in c(FALSE, TRUE)) { 46 | idx <- tumor$on.target==on.target 47 | if (!sum(idx)) next 48 | total.cov.normal <- sum(as.numeric(normal[idx]$coverage), na.rm = TRUE) 49 | total.cov.tumor <- sum(as.numeric(tumor[idx]$coverage), na.rm = TRUE) 50 | 51 | log.ratio <- log2(tumor[idx]$average.coverage/normal[idx]$average.coverage) + 52 | log2(total.cov.normal/total.cov.tumor) 53 | tumor[idx]$log.ratio <- .calibrate_log_ratio(log.ratio, tumor[idx]) 54 | } 55 | if (!all(tumor$on.target)) { 56 | # try to align the off-target and on-target log-ratios better 57 | tumor$log.ratio <- .calibrate_off_target_log_ratio(tumor) 58 | } 59 | tumor$log.ratio 60 | } 61 | 62 | .calibrate_log_ratio <- function(log.ratio, granges) { 63 | idxFinite <- is.finite(log.ratio) 64 | if (!sum(idxFinite)) { 65 | .stopUserError("No finite intervals.") 66 | } 67 | mean.log.ratio <- weighted.mean(log.ratio[idxFinite], 68 | width(granges)[idxFinite]) 69 | # calibrate 70 | flog.debug("Calibrating %i log-ratios by %f.", 71 | sum(idxFinite), mean.log.ratio) 72 | return(log.ratio - mean.log.ratio) 73 | } 74 | 75 | .calibrate_off_target_log_ratio <- function(granges) { 76 | idx <- granges$on.target 77 | g1 <- granges[idx] 78 | g2 <- granges[!idx] 79 | nr <- nearest(g1,g2) 80 | d2 <- median(g1$log.ratio - g2$log.ratio[nr], na.rm = TRUE) / 2 81 | if (d2 > 0.1) { 82 | flog.warn("Large potential mis-calibration of on- and off-target log2 ratios: %.2f", d2) 83 | } 84 | granges$log.ratio[idx] <- granges$log.ratio[idx] - d2 85 | granges$log.ratio[!idx] <- granges$log.ratio[!idx] + d2 86 | return(granges$log.ratio) 87 | } 88 | -------------------------------------------------------------------------------- /man/segmentationGATK4.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/segmentationGATK4.R 3 | \name{segmentationGATK4} 4 | \alias{segmentationGATK4} 5 | \title{GATK4 ModelSegments segmentation function} 6 | \usage{ 7 | segmentationGATK4( 8 | normal, 9 | tumor, 10 | log.ratio, 11 | seg, 12 | vcf = NULL, 13 | tumor.id.in.vcf = 1, 14 | normal.id.in.vcf = NULL, 15 | min.logr.sdev = 0.15, 16 | prune.hclust.h = NULL, 17 | prune.hclust.method = NULL, 18 | changepoints.penality = NULL, 19 | additional.cmd.args = "", 20 | chr.hash = NULL, 21 | ... 22 | ) 23 | } 24 | \arguments{ 25 | \item{normal}{Coverage data for normal sample. Ignored in this function.} 26 | 27 | \item{tumor}{Coverage data for tumor sample.} 28 | 29 | \item{log.ratio}{Copy number log-ratios, one for each exon in coverage file.} 30 | 31 | \item{seg}{If segmentation was provided by the user, this data structure 32 | will contain this segmentation. Useful for minimal segmentation functions. 33 | Otherwise PureCN will re-segment the data. This segmentation function 34 | ignores this user provided segmentation.} 35 | 36 | \item{vcf}{Optional \code{CollapsedVCF} object with germline allelic ratios.} 37 | 38 | \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in 39 | VCF.} 40 | 41 | \item{normal.id.in.vcf}{Id of normal in in VCF. Currently not used.} 42 | 43 | \item{min.logr.sdev}{Minimum log-ratio standard deviation used in the 44 | model. Useful to make fitting more robust to outliers in very clean 45 | data.} 46 | 47 | \item{prune.hclust.h}{Ignored in this function.} 48 | 49 | \item{prune.hclust.method}{Ignored in this function.} 50 | 51 | \item{changepoints.penality}{The \code{--number-of-changepoints-penalty-factor}. 
52 | If \code{NULL}, find a sensible default. Ignored when provided in 53 | \code{additional.cmd.args}.} 54 | 55 | \item{additional.cmd.args}{\code{character(1)}. By default, 56 | \code{ModelSegments} is called with default parameters. Provide additional 57 | arguments here.} 58 | 59 | \item{chr.hash}{Not needed here since \code{ModelSegments} does not 60 | require numbered chromosome names.} 61 | 62 | \item{...}{Currently unused arguments provided to other segmentation 63 | functions.} 64 | } 65 | \value{ 66 | \code{data.frame} containing the segmentation. 67 | } 68 | \description{ 69 | A wrapper for GATK4s ModelSegmentation function, useful when normalization 70 | is performed with other tools than GATK4, for example PureCN. 71 | This function is called via the 72 | \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The 73 | arguments are passed via \code{args.segmentation}. 74 | } 75 | \examples{ 76 | 77 | normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 78 | package="PureCN") 79 | tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 80 | package="PureCN") 81 | vcf.file <- system.file("extdata", "example.vcf.gz", 82 | package="PureCN") 83 | 84 | # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 85 | # non-default values to speed-up this example. This is not a good idea for real 86 | # samples. 87 | \dontrun{ 88 | ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, 89 | tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, 90 | sampleid="Sample1", genome="hg19", 91 | fun.segmentation = segmentationGATK4, max.ploidy=4, 92 | args.segmentation = list(additional.cmd.args = "--gcs-max-retries 19"), 93 | test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1) 94 | } 95 | 96 | } 97 | \seealso{ 98 | \code{\link{runAbsoluteCN}} 99 | } 100 | \author{ 101 | Markus Riester 102 | } 103 | -------------------------------------------------------------------------------- /man/plotAbs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotAbs.R 3 | \name{plotAbs} 4 | \alias{plotAbs} 5 | \title{Plots for analyzing PureCN solutions} 6 | \usage{ 7 | plotAbs( 8 | res, 9 | id = 1, 10 | type = c("hist", "overview", "BAF", "AF", "all"), 11 | chr = NULL, 12 | germline.only = TRUE, 13 | show.contour = FALSE, 14 | purity = NULL, 15 | ploidy = NULL, 16 | alpha = TRUE, 17 | show.segment.means = c("SNV", "segments", "both"), 18 | max.mapping.bias = 0.8, 19 | palette.name = "Paired", 20 | col.snps = "#2b6391", 21 | col.chr.shading = "#f0f0f0", 22 | ... 23 | ) 24 | } 25 | \arguments{ 26 | \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} 27 | 28 | \item{id}{Candidate solutions to be plotted. \code{id=1} will draw the 29 | plot for the maximum likelihood solution.} 30 | 31 | \item{type}{Different types of plots. \code{hist} will plot a histogram, 32 | assigning log-ratio peaks to integer values. \code{overview} will plot all 33 | local optima, sorted by likelihood. \code{BAF} plots 34 | something like a B-allele frequency plot known from SNP arrays: it plots 35 | allele frequencies of germline variants (or most likely germline when status 36 | is not available) against copy number. \code{AF} plots observed allelic 37 | fractions against expected (purity), maximum likelihood (optimal 38 | multiplicity) allelic fractions. 
\code{all} plots types \code{BAF} and 39 | \code{AF} for all local optima, and is useful for generating a PDF for 40 | manual inspection.} 41 | 42 | \item{chr}{If \code{NULL}, show all chromosomes, otherwise only the ones 43 | specified (\code{type="BAF"} only).} 44 | 45 | \item{germline.only}{If \code{TRUE}, show only variants most likely being 46 | germline in BAF plot. Useful to set to \code{FALSE} (in combination with 47 | \code{chr}) to study potential artifacts.} 48 | 49 | \item{show.contour}{For \code{type="overview"}, display contour plot.} 50 | 51 | \item{purity}{Display expected integer copy numbers for purity, defaults to 52 | purity of the solution (\code{type="hist"} and \code{"AF"} only).} 53 | 54 | \item{ploidy}{Display expected integer copy numbers for ploidy, defaults to 55 | ploidy of the solution (\code{type="hist"} and \code{"AF"} only).} 56 | 57 | \item{alpha}{Add transparency to the plot if VCF contains many variants 58 | (>2000, \code{type="AF"} and \code{type="BAF"} only).} 59 | 60 | \item{show.segment.means}{Show segment means in germline allele frequency 61 | plot? If \code{both}, show SNVs and segment means. If \code{SNV} show all 62 | SNVs. Only for \code{type="AF"}.} 63 | 64 | \item{max.mapping.bias}{Exclude variants with high mapping bias from 65 | plotting. Note that bias is reported on an inverse scale; a variant with 66 | mapping bias of 1 has no bias. (\code{type="AF"} and \code{type="BAF"} 67 | only).} 68 | 69 | \item{palette.name}{The default \code{RColorBrewer} palette.} 70 | 71 | \item{col.snps}{The color used for germline SNPs.} 72 | 73 | \item{col.chr.shading}{The color used for shading alternate chromosomes.} 74 | 75 | \item{\dots}{Additonal parameters passed to the \code{plot} function.} 76 | } 77 | \value{ 78 | Returns \code{NULL}. 79 | } 80 | \description{ 81 | This function provides various plots for finding correct purity and ploidy 82 | combinations in the results of a \code{\link{runAbsoluteCN}} call. 
83 | } 84 | \examples{ 85 | 86 | data(purecn.example.output) 87 | plotAbs(purecn.example.output, type="overview") 88 | # plot details for the maximum likelihood solution (rank 1) 89 | plotAbs(purecn.example.output, 1, type="hist") 90 | plotAbs(purecn.example.output, 1, type="BAF") 91 | plotAbs(purecn.example.output, 1, type = "BAF", chr="chr2") 92 | 93 | } 94 | \seealso{ 95 | \code{\link{runAbsoluteCN}} 96 | } 97 | \author{ 98 | Markus Riester 99 | } 100 | -------------------------------------------------------------------------------- /man/filterIntervals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filterIntervals.R 3 | \name{filterIntervals} 4 | \alias{filterIntervals} 5 | \title{Remove low quality intervals} 6 | \usage{ 7 | filterIntervals( 8 | normal, 9 | tumor, 10 | log.ratio, 11 | seg.file, 12 | filter.lowhigh.gc = 0.001, 13 | min.coverage = 15, 14 | min.total.counts = 100, 15 | min.targeted.base = 5, 16 | min.mappability = c(0.6, 0.1), 17 | min.fraction.offtarget = 0.05, 18 | normalDB = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{normal}{Coverage data for normal sample.} 23 | 24 | \item{tumor}{Coverage data for tumor sample.} 25 | 26 | \item{log.ratio}{Copy number log-ratios, one for each interval in the 27 | coverage file.} 28 | 29 | \item{seg.file}{If not \code{NULL}, then do not filter intervals, because data 30 | is already segmented via the provided segmentation file.} 31 | 32 | \item{filter.lowhigh.gc}{Quantile q (defines lower q and upper 1-q) for 33 | removing intervals with outlier GC profile. Assuming that GC correction might 34 | not have been worked on those. Requires \code{interval.file}.} 35 | 36 | \item{min.coverage}{Minimum coverage in both normal and tumor. Intervals with 37 | lower coverage are ignored. If a \code{normalDB} is provided, then this 38 | database already provides information about low quality intervals and the 39 | \code{min.coverage} is set to \code{min.coverage/10000}.} 40 | 41 | \item{min.total.counts}{Exclude intervals with fewer than that many reads 42 | in combined tumor and normal.} 43 | 44 | \item{min.targeted.base}{Exclude intervals with targeted base (size in bp) 45 | smaller than this cutoff. This is useful when the same interval file was 46 | used to calculate GC content. For such small targets, the GC content is 47 | likely very different from the true GC content of the probes.} 48 | 49 | \item{min.mappability}{\code{double(2)} specifying the minimum mappability score 50 | for on-target, off-target in that order.} 51 | 52 | \item{min.fraction.offtarget}{Skip off-target regions when less than the 53 | specified fraction of all intervals passes all filters} 54 | 55 | \item{normalDB}{Normal database, created with 56 | \code{\link{createNormalDatabase}}.} 57 | } 58 | \value{ 59 | \code{logical(length(log.ratio))} specifying which intervals should be 60 | used in segmentation. 61 | } 62 | \description{ 63 | This function determines which intervals in the coverage files should be 64 | included or excluded in the segmentation. It is called via the 65 | \code{fun.filterIntervals} argument of \code{\link{runAbsoluteCN}}. The 66 | arguments are passed via \code{args.filterIntervals}. 
67 | } 68 | \examples{ 69 | 70 | normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 71 | package = "PureCN") 72 | normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 73 | package = "PureCN") 74 | normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) 75 | normalDB <- createNormalDatabase(normal.coverage.files) 76 | 77 | tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz", 78 | package = "PureCN") 79 | vcf.file <- system.file("extdata", "example.vcf.gz", 80 | package = "PureCN") 81 | interval.file <- system.file("extdata", "example_intervals.txt", 82 | package = "PureCN") 83 | 84 | # The max.candidate.solutions, max.ploidy and test.purity parameters are set to 85 | # non-default values to speed-up this example. This is not a good idea for real 86 | # samples. 87 | ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, 88 | tumor.coverage.file = tumor.coverage.file, 89 | genome = "hg19", vcf.file = vcf.file, normalDB = normalDB, 90 | sampleid = "Sample1", interval.file = interval.file, 91 | args.filterIntervals = list(min.targeted.base = 10), max.ploidy = 4, 92 | test.purity = seq(0.3, 0.7, by = 0.05), max.candidate.solutions = 1) 93 | 94 | } 95 | \author{ 96 | Markus Riester 97 | } 98 | -------------------------------------------------------------------------------- /tests/testthat/test_createCurationFile.R: -------------------------------------------------------------------------------- 1 | context("createCurationFile") 2 | 3 | data(purecn.example.output) 4 | file.rds <- tempfile(fileext = ".rds") 5 | saveRDS(purecn.example.output, file = file.rds) 6 | 7 | test_that("Example data is processed correctly", { 8 | ret <- createCurationFile(file.rds) 9 | expect_equal(ret$Purity, purecn.example.output$results[[1]]$purity) 10 | expect_equal(ret$Ploidy, purecn.example.output$results[[1]]$ploidy) 11 | expect_false(ret$Curated) 12 | expect_true(ret$Flagged) 13 | expect_equal(as.character(ret$Sampleid), purecn.example.output$input$sampleid) 14 | }) 15 | 16 | test_that("Default curation file stores the first result", { 17 | retx <- readCurationFile(file.rds) 18 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 19 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 20 | }) 21 | 22 | test_that("min.ploidy=2 ignores the first result", { 23 | retx <- readCurationFile(file.rds, min.ploidy = 2) 24 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[2]]$purity) 25 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[2]]$ploidy) 26 | }) 27 | 28 | test_that("max.ploidy=2 ignores higher ploidy solutions", { 29 | retx <- readCurationFile(file.rds, max.ploidy = 2) 30 | expect_equal(sapply(retx$results, function(x) x$ploidy) < 31 | 2, rep(TRUE, length(retx$results))) 32 | }) 33 | 34 | test_that("report.best.only works as expected", { 35 | retx <- readCurationFile(file.rds, report.best.only = TRUE) 36 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 37 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 38 | expect_equal(length(retx$results), 1) 39 | }) 40 | 41 | test_that("overwriting works as expected", { 42 | retx <- purecn.example.output 43 | retx$results[[1]]$purity <- 0.8 44 | saveRDS(retx, file = file.rds) 45 | filename <- file.path(dirname(file.rds), paste0(gsub(".rds$", 46 | "", basename(file.rds)), ".csv")) 47 | 
expect_warning(createCurationFile(file.rds, overwrite.uncurated = FALSE)) 48 | ret <- read.csv(filename, as.is = TRUE) 49 | expect_equal(ret$Purity, purecn.example.output$results[[1]]$purity) 50 | expect_equal(ret$Ploidy, purecn.example.output$results[[1]]$ploidy) 51 | createCurationFile(file.rds) 52 | ret <- read.csv(filename, as.is = TRUE) 53 | expect_equal(ret$Purity, retx$results[[1]]$purity) 54 | expect_equal(ret$Ploidy, retx$results[[1]]$ploidy) 55 | ret$Curated <- TRUE 56 | write.csv(ret, file = filename, row.names = FALSE) 57 | saveRDS(purecn.example.output, file = file.rds) 58 | expect_warning(createCurationFile(file.rds)) 59 | ret <- read.csv(filename, as.is = TRUE) 60 | expect_true(ret$Curated) 61 | expect_equal(ret$Purity, 0.8) 62 | ret$Ploidy <- 3.4 63 | write.csv(ret, file = filename, row.names = FALSE) 64 | retx <- readCurationFile(file.rds) 65 | expect_equal(ret$Purity, retx$results[[1]]$purity, tolerance=0.2) 66 | expect_equal(ret$Ploidy, retx$results[[1]]$ploidy, tolerance=0.5) 67 | ret$Purity <- "2.2w" 68 | write.csv(ret, file = filename, row.names = FALSE) 69 | expect_error(readCurationFile(file.rds)) 70 | ret$Purity <- 2.2 71 | ret$Failed <- TRUE 72 | write.csv(ret, file = filename, row.names = FALSE) 73 | retx <- readCurationFile(file.rds, remove.failed = TRUE) 74 | expect_true(is.na(retx)) 75 | ret$Failed <- "true" 76 | write.csv(ret, file = filename, row.names = FALSE) 77 | expect_error(readCurationFile(file.rds, remove.failed = TRUE), "logical") 78 | file.remove(filename) 79 | }) 80 | 81 | test_that("warning occurs with missing curation file", { 82 | ret <- createCurationFile(file.rds) 83 | file.remove(gsub(".rds", ".csv", file.rds)) 84 | expect_output(retx <- readCurationFile(file.rds), "does not exist, creating") 85 | expect_equal(retx$results[[1]]$purity, purecn.example.output$results[[1]]$purity) 86 | expect_equal(retx$results[[1]]$ploidy, purecn.example.output$results[[1]]$ploidy) 87 | }) 88 | 89 | file.remove(file.rds) 90 | -------------------------------------------------------------------------------- /R/annotateTargets.R: -------------------------------------------------------------------------------- 1 | #' Annotate targets with gene symbols 2 | #' 3 | #' This function can be used to add a \sQuote{Gene} meta column containing 4 | #' gene symbols to a \code{GRanges} object. 5 | #' It applies heuristics to find the protein coding genes that were 6 | #' likely meant to target in the assay design in case transcripts 7 | #' overlap. 8 | #' 9 | #' @param x A \code{GRanges} object with interals to annotate 10 | #' @param txdb A \code{TxDb} database, e.g. 11 | #' \code{TxDb.Hsapiens.UCSC.hg19.knownGene} 12 | #' @param org A \code{OrgDb} object, e.g. \code{org.Hs.eg.db}. 13 | #' @return A \code{GRanges} object. 
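#' Intervals for which no suitable gene symbol is found are labeled
#' \dQuote{.} in the \sQuote{Gene} metadata column.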
14 | #' @author Markus Riester 15 | #' @examples 16 | #' library(TxDb.Hsapiens.UCSC.hg19.knownGene) 17 | #' library(org.Hs.eg.db) 18 | #' 19 | #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 20 | #' package = "PureCN") 21 | #' x <- head(readCoverageFile(normal.coverage.file), 100) 22 | #' x <- annotateTargets(x,TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db) 23 | #' 24 | #' @importFrom GenomicFeatures transcriptsByOverlaps exonsByOverlaps cdsByOverlaps 25 | #' @export annotateTargets 26 | annotateTargets <- function(x, txdb, org) { 27 | if (!is.null(x$on.target)) { 28 | idx <- x$on.target 29 | } else { 30 | idx <- seq_along(x) 31 | } 32 | txdb <- .checkSeqlevelStyle(x, txdb, "txdb", "interval file") 33 | id <- transcriptsByOverlaps(txdb, ranges = x[idx], columns = "GENEID") 34 | id$SYMBOL <- suppressWarnings( 35 | select(org, vapply(id$GENEID, function(x) x[1], character(1)), 36 | "SYMBOL")[, 2]) 37 | 38 | idCds <- cdsByOverlaps(txdb, ranges = x[idx], columns = "GENEID") 39 | idExons <- exonsByOverlaps(txdb, ranges = x[idx], columns = "GENEID") 40 | idExons$SYMBOL <- suppressWarnings( 41 | select(org, vapply(idExons$GENEID, function(x) x[1], character(1)), 42 | "SYMBOL")[, 2]) 43 | 44 | ov <- findOverlaps(x[idx], id) 45 | ovExons <- findOverlaps(x[idx], idExons) 46 | 47 | # for targets with multiple gene hits, use the one with most overlapping 48 | # targets 49 | d.f <- data.frame(i = queryHits(ov), 50 | GENEID = as.character(id$GENEID[subjectHits(ov)]), 51 | SYMBOL = as.character(id$SYMBOL[subjectHits(ov)])) 52 | d.f <- d.f[!duplicated(d.f), ] 53 | 54 | # remove non-coding transcripts 55 | d.f <- d.f[!grepl("-AS\\d$", d.f$SYMBOL), ] 56 | d.f <- d.f[!grepl("^LOC\\d", d.f$SYMBOL), ] 57 | d.f <- d.f[!grepl("^FLJ\\d+$", d.f$SYMBOL), ] 58 | 59 | d.f$Count <- table(d.f$SYMBOL)[d.f$SYMBOL] 60 | 61 | # in case multiple symbols have the same number of targets, prioritize the 62 | # ones overlapping exons 63 | d.fExons <- data.frame( 64 | i = queryHits(ovExons), 65 | SYMBOL = as.character(idExons$SYMBOL[subjectHits(ovExons)])) 66 | 67 | # downweight orfs 68 | d.fExons <- d.fExons[!grepl("\\dorf\\d", d.fExons$SYMBOL), ] 69 | d.f$CountExons <- table(d.fExons$SYMBOL)[d.f$SYMBOL] 70 | d.f$CountExons[is.na(d.f$CountExons)] <- 0 71 | 72 | d.f$OverlapsExon <- ifelse(paste(d.f$i, d.f$SYMBOL) %in% 73 | paste(d.fExons$i, d.fExons$SYMBOL), 1, 0) 74 | d.f$IsCds <- ifelse(d.f$GENEID %in% unique(unlist(idCds$GENEID)), 1, 0) 75 | 76 | # reorder and pick the best transcript: 77 | # - deprioritize non-protein coding transcripts 78 | # - deprioritize non-exon overlapping intervals 79 | # - deprioritize genes with low total exon count (might not be the main target) 80 | # - in the very unlikely case of a tie, use the total transcript count 81 | d.f <- d.f[order(d.f$i, d.f$IsCds, d.f$OverlapsExon, d.f$CountExons, d.f$Count), ] 82 | d.f$FLAG <- duplicated(d.f$i, fromLast = TRUE) 83 | d.f <- d.f[order(d.f$i, d.f$FLAG), ] 84 | d.f <- d.f[!duplicated(d.f$i), ] 85 | 86 | # Exclude targets for which we have multiple hits, but only one interval 87 | d.f <- d.f[!d.f$FLAG | d.f$Count > 2, ] 88 | if (is.null(x$Gene)) x$Gene <- "." 89 | x[idx]$Gene[d.f$i] <- as.character(d.f$SYMBOL) 90 | x$Gene[is.na(x$Gene)] <- "." 91 | 92 | flog.warn("Attempted adding gene symbols to intervals. 
Heuristics have %s", 93 | "been used to pick symbols for overlapping genes.") 94 | x 95 | } 96 | -------------------------------------------------------------------------------- /man/preprocessIntervals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/preprocessIntervals.R 3 | \name{preprocessIntervals} 4 | \alias{preprocessIntervals} 5 | \title{Preprocess intervals} 6 | \usage{ 7 | preprocessIntervals( 8 | interval.file, 9 | reference.file, 10 | output.file = NULL, 11 | off.target = FALSE, 12 | average.target.width = 400, 13 | min.target.width = 100, 14 | min.off.target.width = 20000, 15 | average.off.target.width = 2e+05, 16 | off.target.padding = -500, 17 | mappability = NULL, 18 | min.mappability = c(0.6, 0.1, 0.7), 19 | reptiming = NULL, 20 | average.reptiming.width = 1e+05, 21 | exclude = NULL, 22 | off.target.seqlevels = c("targeted", "all"), 23 | small.targets = c("resize", "drop") 24 | ) 25 | } 26 | \arguments{ 27 | \item{interval.file}{File specifying the intervals. Interval is expected in 28 | first column in format CHR:START-END. Instead of a file, a \code{GRanges} 29 | object can be provided. This allows the use of BED files for example. Note 30 | that GATK interval files are 1-based (first position of the genome is 1). 31 | Other formats like BED files are often 0-based. The \code{import} function 32 | will automatically convert to 1-based \code{GRanges}.} 33 | 34 | \item{reference.file}{Reference FASTA file.} 35 | 36 | \item{output.file}{Optionally, write GC content file.} 37 | 38 | \item{off.target}{Include off-target regions.} 39 | 40 | \item{average.target.width}{Split large targets to approximately this size.} 41 | 42 | \item{min.target.width}{Make sure that target regions are of at least 43 | this specified width. See \code{small.targets}.} 44 | 45 | \item{min.off.target.width}{Only include off-target regions of that 46 | size} 47 | 48 | \item{average.off.target.width}{Split off-target regions to that} 49 | 50 | \item{off.target.padding}{Pad off-target regions.} 51 | 52 | \item{mappability}{Annotate intervals with mappability score. Assumed on a scale 53 | from 0 to 1, with score being 1/(number alignments). Expected as \code{GRanges} 54 | object with first meta column being the score. Regions outside these ranges are 55 | ignored, assuming that \code{mappability} covers the whole accessible genome.} 56 | 57 | \item{min.mappability}{\code{double(3)} specifying the minimum mappability score 58 | for on-target, off-target, and chrY regions in that order. The chrY regions 59 | are only used for sex determination in \sQuote{PureCN} and are therefore 60 | treated differently. Requires \code{mappability}.} 61 | 62 | \item{reptiming}{Annotate intervals with replication timing score. Expected as 63 | \code{GRanges} object with first meta column being the score.} 64 | 65 | \item{average.reptiming.width}{Tile \code{reptiming} into bins of specified 66 | width.} 67 | 68 | \item{exclude}{Any target that overlaps with this \code{GRanges} object 69 | will be excluded.} 70 | 71 | \item{off.target.seqlevels}{Controls how to deal with chromosomes/contigs 72 | found in the \code{reference.file} but not in the \code{interval.file}.} 73 | 74 | \item{small.targets}{Strategy to deal with targets smaller than 75 | \code{min.target.width}.} 76 | } 77 | \value{ 78 | Returns GC content by interval as \code{GRanges} object. 
79 | } 80 | \description{ 81 | Optimize intervals for copy number calling by tiling long intervals and by 82 | including off-target regions. Uses \code{scanFa} from the Rsamtools package 83 | to retrieve GC content of intervals in a reference FASTA file. If provided, 84 | will annotate intervals with mappability and replication timing scores. 85 | } 86 | \examples{ 87 | 88 | reference.file <- system.file("extdata", "ex2_reference.fa", 89 | package = "PureCN", mustWork = TRUE) 90 | interval.file <- system.file("extdata", "ex2_intervals.txt", 91 | package = "PureCN", mustWork = TRUE) 92 | bed.file <- system.file("extdata", "ex2_intervals.bed", 93 | package = "PureCN", mustWork = TRUE) 94 | preprocessIntervals(interval.file, reference.file, 95 | output.file = "gc_file.txt") 96 | 97 | intervals <- import(bed.file) 98 | preprocessIntervals(intervals, reference.file, 99 | output.file = "gc_file.txt") 100 | 101 | } 102 | \references{ 103 | Talevich et al. (2016). CNVkit: Genome-Wide Copy Number 104 | Detection and Visualization from Targeted DNA Sequencing. PLoS Comput Biol. 105 | } 106 | \author{ 107 | Markus Riester 108 | } 109 | -------------------------------------------------------------------------------- /R/setPriorVcf.R: -------------------------------------------------------------------------------- 1 | #' Set Somatic Prior VCF 2 | #' 3 | #' Function to set prior for somatic mutation status for each variant in the 4 | #' provided \code{CollapsedVCF} object. 5 | #' 6 | #' 7 | #' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf} 8 | #' function from the VariantAnnotation package. 9 | #' @param prior.somatic Prior probabilities for somatic mutations. First value 10 | #' is for the case when no matched normals are available and the variant is not 11 | #' in germline databases (second value). Third value is for variants with MuTect 12 | #' somatic call. Different from 1, because somatic mutations in segments of copy 13 | #' number 0 have 0 probability and artifacts can thus have dramatic influence on 14 | #' likelihood score. Forth value is for variants not labeled as somatic by 15 | #' MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it 16 | #' will set the prior probability for variants with CNT > 6 to the first of 17 | #' those values in case of no matched normal available (0.995 default). Final 18 | #' value is for the case that variant is in both germline databases and 19 | #' COSMIC count > 6. 20 | #' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in 21 | #' VCF. 22 | #' @param min.cosmic.cnt Minimum number of hits in the COSMIC database to 23 | #' call variant as likely somatic. 24 | #' @param DB.info.flag Flag in INFO of VCF that marks presence in common 25 | #' germline databases. Defaults to \code{DB} that may contain somatic variants 26 | #' if it is from an unfiltered germline database. 27 | #' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database 28 | #' @return The \code{vcf} with \code{numeric(nrow(vcf))} vector with the 29 | #' prior probability of somatic status for each variant in the 30 | #' \code{CollapsedVCF} added to the \code{INFO} field \code{PR}. 31 | #' @author Markus Riester 32 | #' @examples 33 | #' 34 | #' # This function is typically only called by runAbsoluteCN via the 35 | #' # fun.setPriorVcf and args.setPriorVcf comments. 
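#' #
#' # The example below stores the priors in an INFO field with suffix "PR"
#' # (the exact key may carry a run-specific prefix). A hedged sketch of how
#' # they could be inspected afterwards:
#' # info(vcf)[[grep("PR$", rownames(info(header(vcf))), value = TRUE)[1]]]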
36 | #' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 37 | #' vcf <- readVcf(vcf.file, "hg19") 38 | #' vcf <- setPriorVcf(vcf) 39 | #' 40 | #' @export setPriorVcf 41 | setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001, 42 | 0.995, 0.5), 43 | tumor.id.in.vcf = NULL, min.cosmic.cnt = 6, 44 | DB.info.flag = "DB", Cosmic.CNT.info.field = "Cosmic.CNT") { 45 | if (is.null(tumor.id.in.vcf)) { 46 | tumor.id.in.vcf <- .getTumorIdInVcf(vcf) 47 | } 48 | if (!is.null(info(vcf)$SOMATIC)) { 49 | tmp <- prior.somatic 50 | prior.somatic <- ifelse(info(vcf)$SOMATIC, 51 | prior.somatic[3],prior.somatic[4]) 52 | 53 | flog.info("Found SOMATIC annotation in VCF.") 54 | flog.info("Setting somatic prior probabilities for somatic variants to %f or to %f otherwise.", 55 | tmp[3], tmp[4]) 56 | } else { 57 | tmp <- prior.somatic 58 | prior.somatic <- ifelse(info(vcf)[[DB.info.flag]], 59 | prior.somatic[2], prior.somatic[1]) 60 | if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) { 61 | flog.info("Found COSMIC annotation in VCF. Requiring %i hits.", 62 | min.cosmic.cnt) 63 | flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and likely germline based on dbSNP membership or population allele frequency.", 64 | tmp[5], tmp[6]) 65 | 66 | prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt)] <- tmp[5] 67 | prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt & 68 | info(vcf)[[DB.info.flag]])] <- tmp[6] 69 | } else { 70 | flog.info("Setting somatic prior probabilities for likely germline hits to %f or to %f otherwise.", 71 | tmp[2], tmp[1]) 72 | } 73 | } 74 | .annotateVcfPrior(vcf, prior.somatic) 75 | } 76 | .annotateVcfPrior <- function(vcf, prior.somatic) { 77 | key <- paste0(.getPureCNPrefixVcf(vcf), "PR") 78 | newInfo <- DataFrame( 79 | Number = 1, Type = "Float", 80 | Description = "Prior probability somatic", 81 | row.names = key) 82 | 83 | info(header(vcf)) <- rbind(info(header(vcf)), newInfo) 84 | info(vcf)[[key]] <- prior.somatic 85 | return(vcf) 86 | } 87 | 88 | -------------------------------------------------------------------------------- /R/filterVcfMuTect.R: -------------------------------------------------------------------------------- 1 | #' Filter VCF MuTect 2 | #' 3 | #' Function to remove artifacts and low confidence/quality calls from a MuTect 4 | #' generated VCF file. Also applies filters defined in \code{filterVcfBasic}. 5 | #' This function will only keep variants listed in the stats file and those not 6 | #' matching the specified failure reasons. 7 | #' 8 | #' 9 | #' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf} 10 | #' function from the VariantAnnotation package. 11 | #' @param tumor.id.in.vcf The tumor id in the VCF file, optional. 12 | #' @param stats.file MuTect stats file. If \code{NULL}, will check if VCF 13 | #' was generated by MuTect2 and if yes will call \code{\link{filterVcfMuTect2}} 14 | #' instead. 15 | #' @param ignore MuTect flags that mark variants for exclusion. 16 | #' @param \dots Additional arguments passed to \code{\link{filterVcfBasic}}. 17 | #' @return A list with elements \code{vcf}, \code{flag} and 18 | #' \code{flag_comment}. \code{vcf} contains the filtered \code{CollapsedVCF}, 19 | #' \code{flag} a \code{logical(1)} flag if problems were identified, further 20 | #' described in \code{flag_comment}. 
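#' Variants without a matching entry in the stats file are removed as
#' unmatched before the failure-reason filter is applied.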
21 | #' @author Markus Riester 22 | #' @seealso \code{\link{filterVcfBasic}} 23 | #' @examples 24 | #' 25 | #' ### This function is typically only called by runAbsolute via the 26 | #' ### fun.filterVcf and args.filterVcf comments. 27 | #' library(VariantAnnotation) 28 | #' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN") 29 | #' vcf <- readVcf(vcf.file, "hg19") 30 | #' vcf.filtered <- filterVcfMuTect(vcf) 31 | #' 32 | #' @export filterVcfMuTect 33 | filterVcfMuTect <- function(vcf, tumor.id.in.vcf = NULL, stats.file = NULL, 34 | ignore=c("clustered_read_position", "fstar_tumor_lod", "nearby_gap_events", 35 | "poor_mapping_region_alternate_allele_mapq", "poor_mapping_region_mapq0", 36 | "possible_contamination", "strand_artifact", "seen_in_panel_of_normals"), 37 | ...){ 38 | if (is.null(stats.file) && .detectCaller(vcf) == "MuTect2/GATK4") { 39 | flog.info("Detected MuTect2 VCF.") 40 | return(filterVcfMuTect2(vcf, tumor.id.in.vcf, ...)) 41 | } 42 | if (is.null(stats.file)) return( 43 | filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 44 | 45 | stats <- read.delim(stats.file, as.is=TRUE, skip=1) 46 | 47 | if (is.null(stats$contig) || is.null(stats$position)) { 48 | flog.warn("MuTect stats file lacks contig and position columns.") 49 | return(filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 50 | } 51 | 52 | # check for excessive flags that can point to input data issues, 53 | # correct variants that were incorrectly flagged 54 | for (flag in c("nearby_gap_events", "seen_in_panel_of_normals")) { 55 | if (flag %in% ignore && 56 | sum(grepl(flag, stats$failure_reasons))/nrow(stats) > 0.2) { 57 | ignore <- ignore[-match(flag, ignore)] 58 | flog.warn("Excessive %s, ignoring this flag. Check your data.", flag) 59 | } 60 | } 61 | gr.stats <- GRanges(seqnames=stats$contig, 62 | IRanges(start=stats$position, end=stats$position)) 63 | 64 | ov <- findOverlaps(vcf, gr.stats) 65 | 66 | if (!identical(queryHits(ov),subjectHits(ov)) || 67 | nrow(vcf) != nrow(stats)) { 68 | n <- .countVariants(vcf) 69 | stats <- stats[subjectHits(ov),] 70 | vcf <- .removeVariants(vcf, !seq(length(vcf)) %in% queryHits(ov), 71 | "MuTect align") 72 | flog.warn("MuTect stats file and VCF file do not align perfectly. Will remove %i unmatched variants.", 73 | n-.countVariants(vcf)) 74 | } 75 | if (is.null(stats$failure_reasons)) { 76 | flog.warn("MuTect stats file lacks failure_reasons column.%s", 77 | " Keeping all variants listed in stats file.") 78 | return(filterVcfBasic(vcf, tumor.id.in.vcf, ...)) 79 | } 80 | 81 | n <- .countVariants(vcf) 82 | 83 | ids <- sort(unique(unlist(sapply(ignore, grep, stats$failure_reasons)))) 84 | vcf <- .removeVariants(vcf, ids, "MuTect") 85 | 86 | flog.info("Removing %i MuTect calls due to blacklisted failure reasons.", 87 | n-.countVariants(vcf)) 88 | filterVcfBasic(vcf, tumor.id.in.vcf, ...) 89 | } 90 | 91 | .detectCaller <- function(vcf) { 92 | gatkVersion <- meta(header(vcf))[["GATKCommandLine"]]$Version[1] 93 | if (!is.null(gatkVersion)) { 94 | gatkVersion <- gsub("\\\"", "", gatkVersion) 95 | if (grepl("^4", gatkVersion)) return("MuTect2/GATK4") 96 | } 97 | return("") 98 | } 99 | -------------------------------------------------------------------------------- /R/readCurationFile.R: -------------------------------------------------------------------------------- 1 | #' Read curation file 2 | #' 3 | #' Function that can be used to read the curated output of the 4 | #' \code{\link{runAbsoluteCN}} function. 
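#' The curation file is expected to provide numeric \code{Purity} and
#' \code{Ploidy} values and logical \code{Curated}, \code{Flagged} and
#' \code{Failed} columns, as written by \code{\link{createCurationFile}}.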
5 | #' 6 | #' 7 | #' @param file.rds Output of the \code{\link{runAbsoluteCN}} function, 8 | #' serialized with \code{saveRDS}. 9 | #' @param file.curation Filename of a curation file that points to the correct 10 | #' tumor purity and ploidy solution. 11 | #' @param remove.failed Do not return solutions that failed. 12 | #' @param report.best.only Only return correct/best solution (useful on low 13 | #' memory machines when lots of samples are loaded). 14 | #' @param min.ploidy Minimum ploidy to be considered. If \code{NULL}, all. Can 15 | #' be used to automatically ignore unlikely solutions. 16 | #' @param max.ploidy Maximum ploidy to be considered. If \code{NULL}, all. Can 17 | #' be used to automatically ignore unlikely solutions. 18 | #' @return The return value of the corresponding \code{\link{runAbsoluteCN}} 19 | #' call, but with the results array manipulated according to the curation CSV file 20 | #' and arguments of this function. 21 | #' @author Markus Riester 22 | #' @seealso \code{\link{runAbsoluteCN} \link{createCurationFile}} 23 | #' @examples 24 | #' 25 | #' data(purecn.example.output) 26 | #' file.rds <- "Sample1_PureCN.rds" 27 | #' createCurationFile(file.rds) 28 | #' # User can change the maximum likelihood solution manually in the generated 29 | #' # CSV file. The correct solution is then loaded with readCurationFile. 30 | #' purecn.curated.example.output <- readCurationFile(file.rds) 31 | #' 32 | #' @export readCurationFile 33 | #' @importFrom utils read.csv 34 | readCurationFile <- function(file.rds, 35 | file.curation = gsub(".rds$", ".csv", file.rds), 36 | remove.failed = FALSE, report.best.only = FALSE, min.ploidy = NULL, 37 | max.ploidy = NULL) { 38 | flog.info("Reading %s...", file.rds) 39 | res <- readRDS(file.rds) 40 | if (!file.exists(file.curation)) { 41 | flog.warn("Curation file %s does not exist, creating one.", file.curation) 42 | output <- try(createCurationFile(file.rds)) 43 | if (is(output, "try-error")) { 44 | flog.warn("Failed to write %s: %s", file.curation, output) 45 | return(res) 46 | } 47 | } 48 | curation <- read.csv(file.curation, as.is=TRUE, nrows=1) 49 | .checkLogical <- function(field) { 50 | if (!is.logical(curation[[field]])) { 51 | .stopUserError("'", field, "' column in ", file.curation, 52 | " not logical(1).") 53 | } 54 | } 55 | .checkLogical("Failed") 56 | .checkLogical("Curated") 57 | .checkLogical("Flagged") 58 | 59 | ## Mark all solutions as failed if sample is curated as failed 60 | if (curation$Failed) { 61 | if (remove.failed) return(NA) 62 | for (i in seq_along(res$results)) res$results[[i]]$failed <- TRUE 63 | } else { 64 | for (i in seq_along(res$results)) res$results[[i]]$failed <- FALSE 65 | } 66 | 67 | # Make sure purity and ploidy are numeric. Stop (rather than warn) if they are not.
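# suppressWarnings() turns non-numeric curation entries into NA; the
# range check below then rejects NA values, purities outside [0, 1] and
# ploidies outside [0, 8] before the closest matching solution is moved
# to the front of the results list.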
68 | curation$Purity <- suppressWarnings(as.numeric(curation$Purity)) 69 | curation$Ploidy <- suppressWarnings(as.numeric(curation$Ploidy)) 70 | 71 | if (is.na(curation$Purity) || is.na(curation$Ploidy) || 72 | curation$Purity < 0 || curation$Purity > 1 || 73 | curation$Ploidy < 0 || curation$Ploidy > 8) { 74 | .stopUserError("Purity or Ploidy not numeric or in expected range.") 75 | } 76 | res$results <- .findClosestSolution(res$results, curation$Purity, 77 | curation$Ploidy) 78 | 79 | ## Filter by ploidy if necessary 80 | ploidy <- sapply(res$results, function(x) x$ploidy) 81 | if (is.null(min.ploidy)) min.ploidy <- min(ploidy) 82 | if (is.null(max.ploidy)) max.ploidy <- max(ploidy) 83 | idxPloidyOk <- which(ploidy>=min.ploidy & ploidy <= max.ploidy) 84 | res$results <- res$results[idxPloidyOk] 85 | 86 | if (report.best.only) { 87 | res$results <- res$results[1] 88 | } 89 | res 90 | } 91 | 92 | .findClosestSolution <- function(results, purity, ploidy, ploidy.div = 6) { 93 | # Find purity/ploidy solution most similar to curation 94 | diffCurated <- vapply(results, function(x) { 95 | abs(x$purity - purity) + (abs(x$ploidy - ploidy) / ploidy.div) 96 | }, double(1)) 97 | idxCurated <- which.min(diffCurated) 98 | if (idxCurated != 1) { 99 | results[c(1, idxCurated)] <- results[c(idxCurated, 1)] 100 | } 101 | results 102 | } 103 | -------------------------------------------------------------------------------- /R/readAllelicCountsFile.R: -------------------------------------------------------------------------------- 1 | #' Read allelic counts file 2 | #' 3 | #' Read file containing counts of ref and alt alleles of common 4 | #' SNPs generated by external tools such as the Genome Analysis 5 | #' Toolkit 4. 6 | #' 7 | #' @param file Input file containing counts of ref and alt alleles 8 | #' @param format File format. If missing, derived from the file 9 | #' extension. Currently only GATK4 CollectAllelicCounts (tsv) 10 | #' format is supported. 11 | #' @param zero Start position is 0-based. Default is \code{FALSE} 12 | #' for GATK, \code{TRUE} for BED file based intervals. 13 | #' @return A \code{CollapsedVCF} with the parsed allelic counts.
14 | #' @author Markus Riester 15 | #' @examples 16 | #' 17 | #' ac.file <- system.file("extdata", "example_allelic_counts.tsv", 18 | #' package="PureCN") 19 | #' vcf_ac <- readAllelicCountsFile(ac.file) 20 | #' 21 | #' @importFrom utils write.table 22 | #' @importFrom Biostrings DNAStringSet DNAStringSetList 23 | #' @export readAllelicCountsFile 24 | readAllelicCountsFile <- function(file, format, zero=NULL) { 25 | if (missing(format)) format <- "tsv" 26 | .readAllelicCountsFileGatk4(file, zero) 27 | } 28 | 29 | .writeAllelicCountsFileGatk <- function(vcf, id = 1, file) { 30 | outputCounts <- data.frame( 31 | CONTIG = seqnames(vcf), 32 | POSITION = start(vcf), 33 | REF_COUNT = sapply(geno(vcf)$AD[,id], function(x) x[1]), 34 | ALT_COUNT = sapply(geno(vcf)$AD[,id], function(x) x[2]), 35 | REF_NUCLEOTIDE = as.character(ref(vcf)), 36 | ALT_NUCLEOTIDE = unlist(CharacterList(alt(vcf))) 37 | ) 38 | con <- file(file, open = "w") 39 | .writeGATKHeader(vcf, id, con, "allelic counts") 40 | write.table(outputCounts, con, row.names = FALSE, quote = FALSE, sep = "\t") 41 | close(con) 42 | invisible(outputCounts) 43 | } 44 | 45 | .parseGATKHeader <- function(con) { 46 | .extractField <- function(line, field) { 47 | fields <- strsplit(line, "\t")[[1]] 48 | key <- paste0("^", field, ":") 49 | fields <- fields[grep(key, fields)] 50 | gsub(key, "", fields[1]) 51 | } 52 | sid <- NULL 53 | sl <- list() 54 | while ( TRUE ) { 55 | line <- readLines(con, n = 1) 56 | if ( length(line) == 0 || !grepl("^@", line)[1]) { 57 | break 58 | } 59 | if (grepl("^@RG", line)[1]) sid <- .extractField(line, "SM") 60 | if (grepl("^@SQ", line)[1]) { 61 | sl[[.extractField(line, "SN")]] <- .extractField(line, "LN") 62 | } 63 | } 64 | return(list(sid = sid, sl = sl, last_line = line)) 65 | } 66 | 67 | .readAllelicCountsFileGatk4 <- function(file, zero) { 68 | if (!is.null(zero)) flog.warn("zero ignored for GATK4 files.") 69 | con <- file(file, open = "r") 70 | header <- .parseGATKHeader(con) 71 | inputCounts <- try(read.delim(con, header = FALSE, stringsAsFactors = FALSE)) 72 | if (is(inputCounts, "try-error")) { 73 | .stopUserError("Error reading AllelicCountsFile ", file) 74 | } 75 | colnames(inputCounts) <- strsplit(header$last_line, "\t")[[1]] 76 | close(con) 77 | gr <- GRanges(seqnames = inputCounts$CONTIG, IRanges(start = inputCounts$POSITION, end = inputCounts$POSITION)) 78 | vcf <- VCF(gr, 79 | colData = DataFrame(Samples = 1, row.names = header$sid), 80 | exptData = list(header = VCFHeader(samples = header$sid))) 81 | ref(vcf) <- DNAStringSet(inputCounts$REF_NUCLEOTIDE) 82 | #alt(vcf) <- DNAStringSetList(split(inputCounts$ALT_NUCLEOTIDE, seq(length(vcf)))) 83 | alt(vcf) <- DNAStringSetList(as.list(inputCounts$ALT_NUCLEOTIDE)) 84 | 85 | info(header(vcf)) <- DataFrame( 86 | Number = "0", 87 | Type = "Flag", 88 | Description = "Likely somatic status, based on SOMATIC or Cosmic.CNT info fields, population allele frequency, or germline database membership", 89 | row.names = "DB") 90 | 91 | geno(header(vcf)) <- DataFrame( 92 | Number =".", 93 | Type = "Integer", 94 | Description = "Allelic depths for the ref and alt alleles in the order listed", 95 | row.names = "AD") 96 | 97 | info(vcf)$DB <- TRUE 98 | geno(vcf)$AD <- matrix(lapply(seq(nrow(inputCounts)), function(i) 99 | c(inputCounts$REF_COUNT[i], inputCounts$ALT_COUNT[i])), 100 | ncol = 1, dimnames = list(NULL, header$sid)) 101 | 102 | names(vcf) <- paste0(seqnames(vcf), ":", start(vcf)) 103 | if (length(header$sl)) { 104 | header$sl <- sapply(header$sl, as.numeric) 105 | 
seqlengths(vcf) <- header$sl[names(seqlengths(vcf))] 106 | } 107 | .readAndCheckVcf(vcf) 108 | } 109 | --------------------------------------------------------------------------------
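A minimal usage sketch (not part of the package sources), assuming the example_allelic_counts.tsv file shipped in inst/extdata and the exported readAllelicCountsFile() shown above; the geno()/info() accessors come from VariantAnnotation:

library(PureCN)
library(VariantAnnotation)  # geno() and info() accessors used below
ac.file <- system.file("extdata", "example_allelic_counts.tsv",
    package = "PureCN")
vcf_ac <- readAllelicCountsFile(ac.file)
vcf_ac                      # CollapsedVCF; sample name taken from the @RG SM tag
head(geno(vcf_ac)$AD[, 1])  # ref/alt read counts per position
table(info(vcf_ac)$DB)      # the parser flags every position as likely germline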