├── .Rbuildignore
├── .gitattributes
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
├── CESAnalysis.R
├── add_variants.R
├── annotate_variants.R
├── assign_group_average_trinuc_rates.R
├── build_RefCDS.R
├── ces_variant.R
├── classes.R
├── compound_variants.R
├── covered_regions_handling.R
├── create_refset.R
├── default_epistasis_pvalue_calc.R
├── detect_mnv.R
├── epistasis_objectives.R
├── epistasis_wrappers.R
├── gene_mutation_rates.R
├── genome_data_handling.R
├── get_TCGA_project_MAF.R
├── imports.R
├── internal_read_maf.R
├── lift_bed.R
├── load_maf.R
├── load_sample_data.R
├── make_PathScore_input.R
├── mutation_rate_calc.R
├── mutational_signature_effects.R
├── pairwise_variant_epistasis.R
├── plot_effects.R
├── plot_epistasis.R
├── plot_signature_effects.R
├── preload_maf.R
├── run_deconstructSigs.R
├── run_mutational_patterns.R
├── select_samples.R
├── select_variants.R
├── set_gene_rates.R
├── set_signature_weights.R
├── set_trinuc_rates.R
├── si_uniroot_conf_int.R
├── single_variant_si_objectives.R
├── suggest_cosmic_signatures_to_remove.R
├── sysdata.rda
├── trinuc_mutation_rates.R
├── validate_optimizer_args.R
├── variant_counts.R
├── variant_id_handling.R
└── vcf_to_maf_table.R
├── README.md
├── _pkgdown.yml
├── data-raw
├── build_ces.refset.hg19.R
├── build_codon_snvs_to_aa.R
├── build_deconstructSigs_stuff.R
├── generate_cosmic_v3.1_hg19_signature_set.R
├── generate_sysdata.R
└── tutorial
│ ├── prep_BRCA_met_tgs.R
│ ├── prep_TCGA_BRCA_clinical.R
│ └── run_tutorial_data.R
├── doc
├── cosmic_cancer_type_note.R
├── cosmic_cancer_type_note.Rmd
└── cosmic_cancer_type_note.html
├── index.md
├── inst
├── CITATION
├── PathScore
│ ├── PathScore_CDS_ranges_hg19.rds
│ └── PathScore_CDS_ranges_hg38.rds
├── WORDLIST
├── extdata
│ ├── COSMIC_SBS_v3-1.txt
│ ├── COSMIC_v3.1_signature_metadata.txt
│ ├── COSMIC_v3.2_signatures_by_cancer_type.txt
│ ├── brca_tcga_clinical_data_via_gdac_cbioportal.tsv
│ ├── cosmic_sbs_signature_summary.txt
│ └── trinuc_snv_to_deconstructSigs_ID.txt
└── tutorial
│ ├── BRCA_cesa_gene_rates.rds
│ ├── BRCA_cesa_samples.rds
│ ├── BRCA_dndscv_out.rds
│ ├── BRCA_effects_in_top_genes.rds
│ ├── BRCA_epistasis_example.rds
│ ├── BRCA_site_rates_example.rds
│ ├── BRCA_snv_counts.rds
│ ├── LUAD_sig_effects.rds
│ ├── TCGA_BRCA_clinical.txt
│ ├── comp_variant_ep.rds
│ ├── gene_ep_example.rds
│ ├── metastatic_breast_2021_hg38.maf
│ ├── metastatic_breast_2021_license.txt
│ ├── sequential_signif_output.rds
│ ├── top_BRCA_effects.rds
│ ├── top_LUAD_effects.rds
│ └── variant_ep_example.rds
├── man
├── CESAnalysis.Rd
├── CompoundVariantSet.Rd
├── aac_to_snv_ids.Rd
├── add_covered_regions.Rd
├── add_variants.Rd
├── annotate_variants.Rd
├── artifact_account.Rd
├── assign_gr_to_coverage.Rd
├── assign_group_average_trinuc_rates.Rd
├── baseline_mutation_rates.Rd
├── build_RefCDS.Rd
├── calculate_trinuc_rates.Rd
├── ces_epistasis.Rd
├── ces_gene_epistasis.Rd
├── ces_variant.Rd
├── check_for_ref_data.Rd
├── check_sample_overlap.Rd
├── clean_granges_for_cesa.Rd
├── clear_effect_output.Rd
├── clear_epistasis_output.Rd
├── clear_gene_rates.Rd
├── clear_sample_data.Rd
├── clear_trinuc_rates_and_signatures.Rd
├── complete_aac_ids.Rd
├── convert_signature_weights_for_mp.Rd
├── copy_cesa.Rd
├── cosmic_signature_info.Rd
├── create_refset.Rd
├── default_epistasis_pvalue_calc.Rd
├── define_compound_variants.Rd
├── detect_mnv.Rd
├── dot-add_covered_regions.Rd
├── dot-variant_counts.Rd
├── epistasis_plot_schematic.Rd
├── epistasis_results.Rd
├── excluded_maf_records.Rd
├── figures
│ └── current_logo.png
├── gene_mutation_rates.Rd
├── get_PathScore_coding_regions.Rd
├── get_TCGA_project_MAF.Rd
├── get_ces_signature_set.Rd
├── get_cesa_bsg.Rd
├── get_dndscv_model_fit.Rd
├── get_gene_rates.Rd
├── get_gr_from_table.Rd
├── get_ref_data.Rd
├── get_refset_dirs.Rd
├── get_sample_info.Rd
├── get_signature_weights.Rd
├── get_trinuc_rates.Rd
├── identify_maf_variants.Rd
├── lift_bed.Rd
├── list_ces_covariates.Rd
├── list_ces_refsets.Rd
├── list_ces_signature_sets.Rd
├── load_cesa.Rd
├── load_maf.Rd
├── load_sample_data.Rd
├── maf_records.Rd
├── make_PathScore_input.Rd
├── mutational_signature_effects.Rd
├── pairwise_epistasis_lik.Rd
├── pairwise_variant_epistasis.Rd
├── plot_effects.Rd
├── plot_epistasis.Rd
├── plot_signature_effects.Rd
├── preload_maf.Rd
├── preload_ref_data.Rd
├── read_in_maf.Rd
├── read_vcf.Rd
├── run_deconstructSigs.Rd
├── run_dndscv.Rd
├── run_mutational_patterns.Rd
├── samples_with.Rd
├── save_cesa.Rd
├── select_samples.Rd
├── select_variants.Rd
├── set_gene_rates.Rd
├── set_refset_dir.Rd
├── set_signature_weights.Rd
├── set_trinuc_rates.Rd
├── snv_results.Rd
├── sort_and_validate_variant_ids.Rd
├── sswm_lik.Rd
├── suggest_cosmic_signature_exclusions.Rd
├── trinuc_mutation_rates.Rd
├── trinuc_snv_counts.Rd
├── univariate_si_conf_ints.Rd
├── update_covered_in.Rd
├── validate_aac_ids.Rd
├── validate_optimizer_args.Rd
├── validate_signature_set.Rd
├── validate_snv_ids.Rd
├── variant_counts.Rd
└── vcfs_to_maf_table.Rd
├── pkgdown
├── extra.css
└── favicon
│ ├── apple-touch-icon-120x120.png
│ ├── apple-touch-icon-152x152.png
│ ├── apple-touch-icon-180x180.png
│ ├── apple-touch-icon-60x60.png
│ ├── apple-touch-icon-76x76.png
│ ├── apple-touch-icon.png
│ ├── favicon-16x16.png
│ ├── favicon-32x32.png
│ └── favicon.ico
├── user_guide
├── cancereffectsizeR_user_guide.Rmd
└── cancereffectsizeR_user_guide.md
└── vignettes
├── .gitignore
├── MAF_filtering_tips.Rmd
├── cancereffectsizeR.Rmd
├── cosmic_cancer_type_note.Rmd
├── create_custom_covariates.Rmd
└── custom_refset_instructions.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^user_guide$
2 | ^cancereffectsizeR\.sublime-workspace$
3 | ^cancereffectsizeR\.sublime-project$
4 | ^cancereffectsizeR\.Rproj$
5 | inst/tutorial/TCGA.BRCA.mutect.995c0111-d90b-4140-bee7-3845436c3b42.DR-10.0.somatic.maf.gz
6 | inst/tutorial/TCGA.LUAD.mutect.0458c57f-316c-4a7c-9294-ccd11c97c2f9.DR-10.0.somatic.maf.gz
7 | luad_cesa.rds
8 | brca_cesa.rds
9 | ^\.Rproj$
10 | ^\.Rproj\.user$
11 | ^data-raw$
12 | ^doc$
13 | ^Meta$
14 | ^index.md$
15 | ^_pkgdown\.yml$
16 | ^docs$
17 | ^pkgdown$
18 | ^vignettes/
19 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # prevent large test data files from being included in release tarballs
2 | tests* export-ignore
3 |
4 | # exclude website files, too
5 | docs* export-ignore
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | .DS_Store
6 | luad_cesa.rds
7 | brca_cesa.rds
8 | data/all_gene_trinuc_data.RData
9 | inst/doc
10 | inst/tutorial/TCGA-LUAD.maf.gz
11 | inst/tutorial/luad_quickstart.rds
12 | inst/tutorial/TCGA-BRCA.maf.gz
13 | inst/tutorial/brca_tutorial_cesa.rds
14 | doc
15 | Meta
16 | tmp.rds
17 | brca.maf.gz
18 | thca.maf.gz
19 | ucec.maf.gz
20 | untracked/*
21 |
22 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: cancereffectsizeR
2 | Type: Package
3 | Title: Calculate Cancer Effect Size
4 | Version: 2.10.2
5 | Authors@R: c(person("Vincent L.", "Cannataro", email = "cannatarov@emmanuel.edu", role = c("aut"),
6 | comment = c(ORCID = "0000-0002-6364-7747")),
7 | person("Jeff", "Mandell", email = "jeff.mandell@yale.edu", role = c("aut", "cre"),
8 | comment = c(ORCID = "0000-0002-3839-2543")))
9 | Description: cancereffectsizeR quantifies selection for somatic variants in cancer from tumor sequencing data.
10 | See the package website for more information.
11 | URL: https://townsend-lab-yale.github.io/cancereffectsizeR, https://github.com/Townsend-Lab-Yale/cancereffectsizeR
12 | Depends:
13 | R (>= 3.5.0)
14 | biocViews:
15 | Remotes: im3sanger/dndscv
16 | Imports:
17 | bbmle,
18 | Biostrings,
19 | BSgenome,
20 | crayon,
21 | data.table (>= 1.15.0),
22 | dndscv,
23 | GenomeInfoDb,
24 | GenomicRanges,
25 | methods,
26 | MutationalPatterns (>= 2.99.4),
27 | IRanges,
28 | pbapply,
29 | rtracklayer,
30 | S4Vectors,
31 | seqinr,
32 | stringi
33 | Suggests:
34 | formattable,
35 | ggplot2,
36 | ggrepel,
37 | knitr,
38 | mockr,
39 | rmarkdown,
40 | styler,
41 | testthat (>= 3.0.0)
42 | License: GPL-3
43 | Encoding: UTF-8
44 | LazyData: true
45 | RoxygenNote: 7.3.2
46 | BugReports: https://github.com/Townsend-Lab-Yale/cancereffectsizeR/issues
47 | Config/testthat/edition: 2
48 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(.DollarNames,CESAnalysis)
4 | S3method(.DollarNames,CompoundVariantSet)
5 | S3method(as.character,CES_Run_History)
6 | export(CESAnalysis)
7 | export(CompoundVariantSet)
8 | export(add_covered_regions)
9 | export(add_variants)
10 | export(assign_group_average_trinuc_rates)
11 | export(baseline_mutation_rates)
12 | export(build_RefCDS)
13 | export(ces_epistasis)
14 | export(ces_gene_epistasis)
15 | export(ces_variant)
16 | export(check_sample_overlap)
17 | export(clear_effect_output)
18 | export(clear_epistasis_output)
19 | export(clear_gene_rates)
20 | export(clear_sample_data)
21 | export(clear_trinuc_rates_and_signatures)
22 | export(convert_signature_weights_for_mp)
23 | export(cosmic_signature_info)
24 | export(create_refset)
25 | export(define_compound_variants)
26 | export(epistasis_plot_schematic)
27 | export(epistasis_results)
28 | export(excluded_maf_records)
29 | export(gene_mutation_rates)
30 | export(get_PathScore_coding_regions)
31 | export(get_TCGA_project_MAF)
32 | export(get_ces_signature_set)
33 | export(get_gene_rates)
34 | export(get_sample_info)
35 | export(get_signature_weights)
36 | export(get_trinuc_rates)
37 | export(lift_bed)
38 | export(list_ces_covariates)
39 | export(list_ces_refsets)
40 | export(list_ces_signature_sets)
41 | export(load_cesa)
42 | export(load_maf)
43 | export(load_sample_data)
44 | export(maf_records)
45 | export(make_PathScore_input)
46 | export(mutational_signature_effects)
47 | export(pairwise_epistasis_lik)
48 | export(plot_effects)
49 | export(plot_epistasis)
50 | export(plot_signature_effects)
51 | export(preload_maf)
52 | export(samples_with)
53 | export(save_cesa)
54 | export(select_variants)
55 | export(set_gene_rates)
56 | export(set_refset_dir)
57 | export(set_signature_weights)
58 | export(set_trinuc_rates)
59 | export(snv_results)
60 | export(sswm_lik)
61 | export(suggest_cosmic_signature_exclusions)
62 | export(trinuc_mutation_rates)
63 | export(trinuc_snv_counts)
64 | export(validate_signature_set)
65 | export(variant_counts)
66 | export(vcfs_to_maf_table)
67 | import(BSgenome)
68 | import(GenomeInfoDb)
69 | import(data.table)
70 | importFrom(methods,is)
71 | importFrom(methods,new)
72 | importFrom(stats,dist)
73 | importFrom(stats,na.omit)
74 | importFrom(stats,predict)
75 | importFrom(stats,setNames)
76 | importFrom(stats,window)
77 | importFrom(utils,.DollarNames)
78 | importFrom(utils,data)
79 | importFrom(utils,download.file)
80 | importFrom(utils,packageVersion)
81 | importFrom(utils,tail)
82 |
--------------------------------------------------------------------------------
/R/assign_group_average_trinuc_rates.R:
--------------------------------------------------------------------------------
1 | #' Skip mutational signature analysis and assign group average relative trinucleotide-context-specific mutation rates to all samples
2 | #'
3 | #' This function calculates the relative rates of trinucleotide-context-specific mutations across
4 | #' all SNV records from whole-exome and whole-genome MAF data and naively assigns these rates to all samples.
5 | #' This can be helpful if you do not have SNV mutational signatures available for your species, or if
6 | #' you want to assume that all samples share the same SNV mutational processes without relying on signatures.
7 | #' Normally, if mutational signatures are available, it is better to use trinuc_mutation_rates().
8 | #'
9 | #' To reduce the influence of selection, only non-recurrent mutations (i.e., mutations that occur
10 | #' in just one sample) are used to calculate the rates. Targeted sequencing data is excluded for
11 | #' the same reason, and also because the trinucleotide composition of targeted regions could be
12 | #' very different from that of the exome/genome.
13 | #'
14 | #' @param cesa CESAnalysis object
15 | #' @return A copy of the input CESAnalysis in which every sample is assigned the same relative
16 | #'   trinucleotide mutation rates and sig_analysis_grp is set to 0.
17 | #' @export
18 | assign_group_average_trinuc_rates = function(cesa) {
19 |   if(! is(cesa, "CESAnalysis")) {
20 |     stop("Expected cesa to be a CESAnalysis object")
21 |   }
22 |   cesa = copy_cesa(cesa)
23 |   cesa = update_cesa_history(cesa, match.call())
24 |   if(cesa@maf[, .N] == 0) {
25 |     stop("No MAF data in the CESAnalysis")
26 |   }
27 |   if(all(cesa@samples$coverage == "targeted")) {
28 |     stop("We can't estimate relative trinucleotide mutation rates without some exome/genome data in the CESAnalysis (all data is targeted sequencing).")
29 |   }
30 |
31 |   # Take just SNVs
32 |   snv_maf = cesa@maf[variant_type == "snv"]
33 |
34 |   # Remove all recurrent SNVs (SNVs appearing in more than one sample). Checking for duplicates
35 |   # from both directions flags every copy of a repeated site, not just the later ones.
36 |   snv_sites = snv_maf[, .(Chromosome, Start_Position, Tumor_Allele)]
37 |   is_recurrent = duplicated(snv_sites) | duplicated(snv_sites, fromLast = TRUE)
38 |   snv_maf = snv_maf[! is_recurrent]
39 |
40 |   # Subset to just WES/WGS data
41 |   non_tgs_samples = cesa@samples[coverage != "targeted", Unique_Patient_Identifier]
42 |   snv_maf = snv_maf[Unique_Patient_Identifier %in% non_tgs_samples]
43 |   if(snv_maf[, .N] == 0) {
44 |     stop("No non-recurrent SNVs are available from exome/genome samples, so relative trinucleotide mutation rates can't be estimated.")
45 |   }
46 |
47 |   # get the trinucleotide context of each SNV from the reference genome
48 |   bsg = .ces_ref_data[[cesa@ref_key]]$genome
49 |   trinuc = BSgenome::getSeq(bsg, snv_maf$Chromosome, snv_maf$Start_Position - 1, snv_maf$Start_Position + 1, as.character = T)
50 |
51 |   # internal dict converts trinuc/mut (e.g., GTA:C) into deconstructSigs format ("G[T>C]A")
52 |   ds_muts = factor(deconstructSigs_notations[.(trinuc, snv_maf$Tumor_Allele), deconstructSigs_ID], levels = deconstructSigs_trinuc_string)
53 |
54 |   # Pool the counts of each trinucleotide change across samples. Every sample receives the same
55 |   # rates, so no per-sample breakdown is needed (and pooling works when just one sample has SNVs).
56 |   pooled_counts = table(ds_muts)
57 |   trinuc_prop = setNames(as.numeric(pooled_counts), names(pooled_counts)) / sum(pooled_counts)
58 |
59 |   # if any rates are 0, add a pseudocount (the lowest nonzero rate) to all rates
60 |   if(any(trinuc_prop == 0)) {
61 |     lowest_nonzero_rate = min(trinuc_prop[trinuc_prop != 0])
62 |     trinuc_prop = trinuc_prop + lowest_nonzero_rate
63 |     trinuc_prop = trinuc_prop / sum(trinuc_prop) # renormalizing
64 |   }
65 |
66 |   # create trinuc_proportion_matrix (1 row per sample, all rows with identical trinuc_prop)
67 |   num_samples = cesa@samples[, .N]
68 |   trinuc_proportion_matrix = matrix(rep(trinuc_prop, num_samples), byrow = T, ncol = length(trinuc_prop),
69 |                                     dimnames = list(cesa@samples$Unique_Patient_Identifier, names(trinuc_prop)))
70 |   cesa@samples[, sig_analysis_grp := 0L]
71 |   cesa@trinucleotide_mutation_weights = list(trinuc_proportion_matrix=trinuc_proportion_matrix)
72 |   return(cesa)
73 | }
74 |
--------------------------------------------------------------------------------
/R/detect_mnv.R:
--------------------------------------------------------------------------------
1 | #' Identify probable multi-nucleotide variants (MNVs) in MAF data
2 | #'
3 | #' Records from the same sample that lie within 2 bp of another record on the same chromosome
4 | #' are treated as likely components of a single multi-nucleotide event.
5 | #'
6 | #' @param maf a valid MAF-style data.table
7 | #' @return a data.table of the MAF records flagged as likely MNVs, with an mnv_group column that
8 | #'   numbers each event
9 | #' @keywords internal
10 | detect_mnv = function(maf) {
11 |   # An MNV requires at least two records within one sample/chromosome combination
12 |   sorted_maf = maf[order(Unique_Patient_Identifier, Chromosome, Start_Position)]
13 |   candidates = sorted_maf[, .SD[.N > 1], by = c("Unique_Patient_Identifier", "Chromosome")]
14 |   if(nrow(candidates) == 0) {
15 |     return(candidates)
16 |   }
17 |
18 |   # Gap between each record and its neighbors (Inf at the edges of each sample/chromosome)
19 |   candidates[, dist_to_prev := c(Inf, diff(Start_Position)), by = c("Unique_Patient_Identifier", "Chromosome")]
20 |   candidates[, dist_to_next := c(dist_to_prev[-1], Inf), by = c("Unique_Patient_Identifier", "Chromosome")]
21 |   candidates[dist_to_prev < 3 | dist_to_next < 3, is_mnv := TRUE]
22 |
23 |   # Keep the flagged records; a gap of more than 2 bp from the previous record starts a new event
24 |   mnv = candidates[is_mnv == TRUE]
25 |   mnv[, start_of_group := dist_to_prev > 2]
26 |   mnv[, mnv_group := cumsum(start_of_group)]
27 |   return(mnv)
28 | }
29 |
--------------------------------------------------------------------------------
/R/epistasis_objectives.R:
--------------------------------------------------------------------------------
1 | #' pairwise_epistasis_lik
2 | #'
3 | #' For a pair of variants (or two groups of variants), creates a likelihood function for a
4 | #' model of pairwise epistasis with a "strong selection, weak mutation" assumption.
5 | #'
6 | #' The arguments to this function are automatically supplied by \code{ces_epistasis()} and \code{ces_gene_epistasis()}.
7 | #'
8 | #' @param with_just_1 two-item list of baseline rates in v1/v2 for tumors with mutation in just the first variant(s)
9 | #' @param with_just_2 two-item list of baseline rates in v1/v2 for tumors with mutation in just the second variant(s)
10 | #' @param with_both two-item list of baseline rates in v1/v2 for tumors with mutation in both
11 | #' @param with_neither two-item list of baseline rates in v1/v2 for tumors with mutation in neither
12 | #'
13 | #' @export
14 | #' @return A likelihood function: it takes a four-element parameter vector and returns the negative log-likelihood
15 | pairwise_epistasis_lik <- function(with_just_1, with_just_2, with_both, with_neither) {
16 |
17 | fn = function(par) {
18 | # sometimes the pars end up as NaNs or NAs, possibly because of inappropriate optimization techniques
19 | if(! all(is.finite(par))) {
20 | return(1e200)
21 | }
22 |
23 | # two points of discontinuity we need to account for
24 | if((par[3] == par[1] + par[2]) |
25 | (par[4] == par[1] + par[2])){return(1e200)}
26 |
27 | sum_log_lik <- 0
28 |
29 | if(! is.null(with_neither)) {
30 | # log(P{wt}) = -(A + B)
31 | A = par[1] * with_neither[[1]]
32 | B = par[2] * with_neither[[2]]
33 | ll = -1 * (A + B)
34 | sum_log_lik = sum_log_lik + sum(ll)
35 | }
36 |
37 | # tumors with mutation in just the first variant(s)
38 | if(! is.null(with_just_1)) {
39 | A = par[1] * with_just_1[[1]]
40 | B = par[2] * with_just_1[[2]]
41 | B_on_A = par[4] * with_just_1[[2]]
42 |
43 | lik = (A / (A + B - B_on_A)) * (exp(-1 * B_on_A) - exp(-1 * (A + B)))
44 | sum_log_lik = sum_log_lik + sum(log(lik))
45 | }
46 |
47 | # tumors with mutation in just the second variant(s)
48 | if(! is.null(with_just_2)) {
49 | A = par[1] * with_just_2[[1]]
50 | B = par[2] * with_just_2[[2]]
51 | A_on_B = par[3] * with_just_2[[1]]
52 |
53 | lik = (B / (A + B - A_on_B)) * (exp(-1 * A_on_B) - exp(-1 * (A + B)))
54 | sum_log_lik = sum_log_lik + sum(log(lik))
55 | }
56 | # tumors with mutation in both: probability is the complement of the other three states
57 | if(! is.null(with_both)) {
58 | A = par[1] * with_both[[1]]
59 | B = par[2] * with_both[[2]]
60 | A_on_B = par[3] * with_both[[1]]
61 | B_on_A = par[4] * with_both[[2]]
62 |
63 | p_wt = exp(-1 * (A+B))
64 | p_A = (A / (A + B - B_on_A)) * (exp(-1 * B_on_A) - exp(-1 * (A + B)))
65 | p_B = (B / (A + B - A_on_B)) * (exp(-1 * A_on_B) - exp(-1 * (A + B)))
66 | p_AB = 1 - p_wt - p_A - p_B
67 | sum_log_lik = sum_log_lik + sum(log(p_AB))
68 | }
69 |
70 | # guard against a non-finite log-likelihood by returning the same large penalty value used above
71 | if(!is.finite(sum_log_lik)){
72 | return(1e200)
73 | }
74 | return(-sum_log_lik)
75 | }
76 |
77 | # Set default values for all parameters, which ces_variant will use to set starting values of optimization
78 | formals(fn)[["par"]] = 1000:1003
79 |
80 | # Optimization tool, bbmle::mle, requires that vector of parameters to optimize have named elements
81 | bbmle::parnames(fn) = c("ces_v1", "ces_v2", "ces_v1_after_v2", "ces_v2_after_v1")
82 | return(fn)
83 | }
84 |
--------------------------------------------------------------------------------
/R/imports.R:
--------------------------------------------------------------------------------
1 | #' @import data.table
2 | #' @import GenomeInfoDb
3 | #' @import BSgenome
4 | #' @importFrom utils .DollarNames data packageVersion download.file tail
5 | #' @importFrom methods is new
6 | #' @importFrom stats na.omit predict setNames dist window
7 |
8 | .datatable.aware = TRUE
9 | .ces_ref_data = new.env() # environment holding reference data, keyed by refset name
10 | options(datatable.prettyprint.char = 40)
11 |
12 | # Data package names and minimum required version
13 | .official_refsets = list(ces.refset.hg19 = as.package_version("1.1.0"), ces.refset.hg38 = as.package_version("1.2.0"))
14 |
15 | # If refset packages are loaded, put their data (the object named after the package) in .ces_ref_data and lock it
16 | for(refset in names(.official_refsets)) {
17 | if(refset %in% loadedNamespaces()) {
18 | .ces_ref_data[[refset]] = get(refset, envir = as.environment(paste0('package:', refset))) # NOTE(review): needs the package attached, not just loaded -- confirm
19 | lockEnvironment(.ces_ref_data[[refset]], bindings = TRUE)
20 | }
21 | }
22 | # Zero-row data.table templates that fix the column names and types of annotation and sample tables
23 | snv_annotation_template = data.table(variant_name = character(), snv_id = character(), chr = character(), pos = numeric(),
24 | ref = character(), alt = character(), genes = list(), intergenic = logical(),
25 | trinuc_mut = character(), essential_splice = logical(),
26 | nearest_pid = list(), covered_in = list())
27 |
28 | aac_annotation_template = data.table(variant_name = character(), aac_id = character(), gene = character(), aachange = character(),
29 | strand = integer(), chr = character(), pid = character(), aa_ref = character(),
30 | aa_pos = numeric(), aa_alt = character(),
31 | nt1_pos = numeric(), nt2_pos = numeric(), nt3_pos = numeric(),
32 | coding_seq = character(), constituent_snvs = list(), essential_splice = logical(),
33 | covered_in = list())
34 |
35 | aac_snv_key_template = data.table(aac_id = character(), snv_id = character(), multi_anno_site = logical(), key = 'aac_id')
36 |
37 | sample_table_template = data.table(Unique_Patient_Identifier = character(), coverage = character(),
38 | covered_regions = character(), sig_analysis_grp = integer(), gene_rate_grp = integer(),
39 | maf_source = character())
40 |
41 | # for use when identifying a column previously handled by cancereffectsizeR
42 | # "merged_into_other_variant" replaced with "merged_with_nearby_variant" in 2.6.4
43 | preload_problems = c('missing_values', 'not_variant', 'duplicate_record', 'failed_liftOver',
44 | 'duplicate_record_after_liftOver', 'unsupported_chr', 'out_of_bounds',
45 | 'reference_mismatch', "merged_into_dbs_variant", "merged_into_other_variant",
46 | "duplicate_from_TCGA_sample_merge", "merged_with_nearby_variant", "invalid_record")
47 |
48 |
49 | # Wrap a string with strwrap() (the way R should automatically), then either feed it to message()
50 | # (emit = TRUE) or return it (emit = FALSE). With black = TRUE, the text is colored with crayon.
51 | pretty_message = function(msg, emit = T, black = emit) {
52 |   msg = paste0(strwrap(msg), collapse = "\n")
53 |   if (black) {
54 |     # If the current RStudio theme is dark, make the message white. rstudioapi is an optional
55 |     # package, so confirm it is installed before calling it; isTRUE() covers a missing dark flag.
56 |     in_dark_rstudio = Sys.getenv("RSTUDIO") == "1" && requireNamespace("rstudioapi", quietly = TRUE) &&
57 |       rstudioapi::isAvailable() && isTRUE(rstudioapi::getThemeInfo()$dark)
58 |     if (in_dark_rstudio) {
59 |       msg = crayon::white(msg) # nocov
60 |     } else {
61 |       msg = crayon::black(msg)
62 |     }
63 |   }
64 |   if (emit) {
65 |     message(msg)
66 |   } else {
67 |     return(msg)
68 |   }
69 | }
70 |
71 | NULL
72 |
--------------------------------------------------------------------------------
/R/lift_bed.R:
--------------------------------------------------------------------------------
1 | #' Convert BED intervals between genome builds
2 | #'
3 | #' Use this utility to convert BED intervals between genome coordinate systems using liftOver. Only
4 | #' the chr/start/end fields of the input BED are used (strand is ignored). The output GRanges
5 | #' will have no associated seqinfo.
6 | #'
7 | #' A warning is given if the lifted intervals are less than 95\% of the size of the original
8 | #' intervals. When the BED input represents sequencing target intervals, most of the input
9 | #' intervals will usually lift successfully.
10 | #'
11 | #' @param bed Pathname of a BED file, or a GRanges (typically loaded from a BED file with \code{rtracklayer::import.bed()}).
12 | #' @param chain A UCSC-style chain file, or a Chain object (such as from \code{rtracklayer::import.chain()}).
13 | #' @param outfile If not NULL, the returned GRanges will be saved to the specified path using \code{rtracklayer::export.bed()}.
14 | #'   The path must end in .bed or .bed.gz, its directory must exist, and the file must not already exist.
15 | #' @return GRanges representing lifted intervals from input \code{bed} (returned invisibly when \code{outfile} is specified).
16 | #' @export
17 | lift_bed = function(bed, chain, outfile = NULL) {
18 |   # base-R check for a length-one character vector (avoids relying on an undeclared package)
19 |   is_single_string = function(x) is.character(x) && length(x) == 1L
20 |
21 |   if(is_single_string(bed)) {
22 |     if(! file.exists(bed)) {
23 |       stop("Specified BED file does not exist.")
24 |     }
25 |     bed = rtracklayer::import.bed(bed)
26 |   } else if(! is(bed, 'GRanges')) {
27 |     stop('Input bed should be the path to a BED file or a GRanges object.')
28 |   }
29 |   if(length(bed) == 0) {
30 |     # without this check, the coverage proportion below would be 0/0 and fail with an unclear error
31 |     stop('Input bed contains no intervals.')
32 |   }
33 |   bed = BiocGenerics::unstrand(bed)
34 |
35 |   if(is_single_string(chain)) {
36 |     if(! file.exists(chain)) {
37 |       stop('Specified chain file does not exist.')
38 |     }
39 |     chain = rtracklayer::import.chain(chain)
40 |   } else if (! is(chain, 'Chain')) {
41 |     stop('Input chain should be the path to a UCSC-style chain file or a Chain object.')
42 |   }
43 |
44 |   # put chain and intervals on the same chromosome naming style (no "chr" prefix) before lifting
45 |   names(chain) = sub("^chr", "", names(chain))
46 |   seqlevelsStyle(bed) = 'NCBI'
47 |   lifted = sort(reduce(unlist(rtracklayer::liftOver(bed, chain))))
48 |   seqlevelsStyle(lifted) = 'NCBI'
49 |
50 |   # compare total width after lifting to the (merged) width of the input
51 |   prop = sum(width(lifted)) / sum(width(reduce(bed)))
52 |   if(prop < .95) {
53 |     percent = round(prop * 100, 1)
54 |     msg = paste0('The lifted intervals cover ', percent, '% of the width of the original intervals.',
55 |                  ' (For BED files representing sequencing target regions, typically most intervals',
56 |                  ' will successfully lift between genome builds. If the percentage is very low,',
57 |                  ' make sure you have the correct genome build for the input file.)')
58 |     warning(pretty_message(msg, emit = F))
59 |   }
60 |   if(! is.null(outfile)) {
61 |     if(! is_single_string(outfile)) {
62 |       stop('outfile should be NULL or a scalar character indicating a valid file path.')
63 |     }
64 |     if(! outfile %like% '\\.bed(\\.gz)?$') {
65 |       stop('outfile must end in .bed or .bed.gz')
66 |     }
67 |     if(! dir.exists(dirname(outfile))) {
68 |       stop('Directory specified in outfile does not exist.')
69 |     }
70 |     if(file.exists(outfile)) {
71 |       stop('Specified outfile already exists.')
72 |     }
73 |     rtracklayer::export.bed(lifted, outfile)
74 |     message('Lifted BED intervals saved to ', outfile, '.')
75 |   }
76 |   if(is.null(outfile)) {
77 |     return(lifted)
78 |   } else {
79 |     return(invisible(lifted))
80 |   }
81 | }
82 |
--------------------------------------------------------------------------------
/R/run_deconstructSigs.R:
--------------------------------------------------------------------------------
1 | #' Fit signature weights for a single tumor with deconstructSigs
2 | #'
3 | #' Internal helper called by trinuc_mutation_rates() once per tumor in a CESAnalysis. It takes a
4 | #' data.frame of mutation counts and gives back a data.frame of signature weights.
5 | #'
6 | #' @param tumor_trinuc_counts one-row data.frame of trinuc variant counts (in deconstructSigs order) for one tumor
7 | #' @param signatures_df data.frame of signatures (see COSMIC v3 signatures included with package for format)
8 | #' @param signatures_to_remove names of signatures in signatures_df that are withheld from deconstructSigs and given zero weight
9 | #' @param tri.counts.method exome/genome trinucleotide content normalization argument to pass to deconstructSigs (see its docs)
10 | #' @keywords internal
11 | #' @return a data.frame of signature weights
12 | run_deconstructSigs = function(tumor_trinuc_counts, signatures_df, signatures_to_remove, tri.counts.method) {
13 |   # drop the excluded signatures before fitting
14 |   is_excluded = rownames(signatures_df) %in% signatures_to_remove
15 |   retained_signatures = signatures_df[! is_excluded, ]
16 |
17 |   # contexts.needed requests trinuc normalization; normalizing already-normalized input is harmless
18 |   ds_result = deconstructSigs::whichSignatures(tumor.ref = tumor_trinuc_counts,
19 |                                                signatures.ref = retained_signatures,
20 |                                                contexts.needed = TRUE,
21 |                                                tri.counts.method = tri.counts.method)
22 |   fitted_weights = ds_result$weights
23 |
24 |   if (is.null(signatures_to_remove)) {
25 |     return(fitted_weights)
26 |   }
27 |
28 |   # give the excluded signatures zero weight, then restore the column order of signatures_df
29 |   zero_weights = as.data.frame(matrix(data = 0, nrow = 1, ncol = length(signatures_to_remove)))
30 |   colnames(zero_weights) = signatures_to_remove
31 |   fitted_weights = cbind(fitted_weights, zero_weights)
32 |   return(fitted_weights[, rownames(signatures_df)])
33 | }
--------------------------------------------------------------------------------
/R/run_mutational_patterns.R:
--------------------------------------------------------------------------------
1 | #' cancereffectsizeR wrapper for fit_to_signatures
2 | #'
3 | #' This function gets called internally by trinuc_mutation_rates() for each tumor in a CESAnalysis, accepting
4 | #' a matrix of mutation counts and returning signature weights. Note: this function supports indels
5 | #' if passed in the same format.
6 | #'
7 | #' @param tumor_trinuc_counts matrix of trinuc variant counts where columns correspond to tumors and
8 | #' order of trinucleotide changes match signatures_df
9 | #' @param signatures_df data.frame of signatures (see COSMIC v3 signatures included with package for format)
10 | #' @param signatures_to_remove names of signatures in signatures_df to keep out of MutationalPatterns
11 | #' and assign zero weights.
12 | #' @param mp_strict_args named list of additional arguments to fit_to_signatures_strict
13 | #' @param bootstrap_mutations T/F (default FALSE) whether to run
14 | #' fit_to_signatures_bootstrapped() with n_boot = 1 instead of fit_to_signatures_strict().
15 | #' @keywords internal
16 | #' @return a data.frame of signature weights
17 | run_mutational_patterns = function(tumor_trinuc_counts, signatures_df, signatures_to_remove,
18 |                                    mp_strict_args = list(), bootstrap_mutations = FALSE) {
19 |
20 |   signatures_to_include = signatures_df[! rownames(signatures_df) %in% signatures_to_remove,]
21 |
22 |   # transpose so that columns correspond to signatures
23 |   signatures_to_include = t(signatures_to_include)
24 |
25 |   # convert signatures to matrix as required by MutationalPatterns
26 |   signatures_to_include <- data.matrix(signatures_to_include)
27 |   # when bootstrapping, use one replicate and default to the 'strict' method unless the caller named one
28 |   if(bootstrap_mutations) {
29 |     mp_strict_args[['n_boot']] = 1
30 |     if(! 'method' %in% names(mp_strict_args)) {
31 |       mp_strict_args[['method']] = 'strict'
32 |     }
33 |   }
34 |
35 |   args = c(mp_strict_args, list(mut_matrix = tumor_trinuc_counts, signatures = signatures_to_include))
36 |
37 |   # have to deal with a rare MP bug
38 |   signatures_output = tryCatch(
39 |     {
40 |       if(bootstrap_mutations) {
41 |         withCallingHandlers(
42 |           {
43 |             do.call(MutationalPatterns::fit_to_signatures_bootstrapped, args)
44 |           },
45 |           warning = function(w) {
46 |             if (conditionMessage(w) %like% "At least one of your samples has less than") {
47 |               invokeRestart("muffleWarning")
48 |             }
49 |           }
50 |         )
51 |       } else {
52 |         do.call(MutationalPatterns::fit_to_signatures_strict, args)$fit_res
53 |       }
54 |
55 |     }, error = function(e) {
56 |       if (grepl('a dimension is zero', conditionMessage(e))) {
57 |         MutationalPatterns::fit_to_signatures(mut_matrix = tumor_trinuc_counts, signatures = signatures_to_include)
58 |       } else {
59 |         stop(e) # re-raise unrelated errors instead of returning the condition object as if it were output
60 |       }
61 |     })
62 |
63 |   # Convert bootstrap output to match fit_to_signatures_strict output,
64 |   # and then produce a weights data.frame.
65 |   if(bootstrap_mutations) {
66 |     # In the bootstrap function, signatures with no weight get left out of output.
67 |     # Add them to signatures_to_remove so they'll get put into table later.
68 |     zeroed_sigs = setdiff(colnames(signatures_to_include), colnames(signatures_output))
69 |     signatures_to_remove = union(zeroed_sigs, signatures_to_remove)
70 |     weights = as.data.frame(signatures_output)
71 |   } else {
72 |     # must convert to data.frame and transpose so that result is compatible with
73 |     # trinuc_mutation_rates
74 |     weights = t(as.data.frame(signatures_output$contribution))
75 |   }
76 |
77 |   # add columns for any removed signatures into output matrix (with zero values)
78 |   if(!is.null(signatures_to_remove)) {
79 |     zeroed_sigs = as.data.frame(matrix(data = 0,nrow = 1,ncol = length(signatures_to_remove)))
80 |     colnames(zeroed_sigs) <- signatures_to_remove
81 |     weights <- cbind(weights, zeroed_sigs)
82 |
83 |     # sort columns to match standard order
84 |     weights <- weights[,rownames(signatures_df)]
85 |   }
86 |
87 |   return(weights)
88 | }
89 |
--------------------------------------------------------------------------------
/R/select_samples.R:
--------------------------------------------------------------------------------
#' Retrieve validated subset of CESAnalysis samples table
#'
#' Checks that the requested samples are well-formed and all present in the CESAnalysis,
#' then returns a copy of the matching rows of the samples table.
#'
#' @param cesa CESAnalysis
#' @param samples Character vector of Unique_Patient_Identifiers, or a data.table with a
#'   Unique_Patient_Identifier column (e.g., rows from a CESAnalysis samples table). If
#'   an empty character vector, the full samples table is returned.
#' @return data.table (a copy) consisting of one or more rows from the CESAnalysis samples table.
#' @keywords internal
select_samples = function(cesa = NULL, samples = character()) {
  if(! is(cesa, "CESAnalysis")) {
    stop("cesa should be CESAnalysis.")
  }

  given_as_table = is(samples, "data.table")
  if(! given_as_table && ! is(samples, "character")) {
    stop("samples should be character or data.table.")
  }

  # Reduce table input to its vector of identifiers.
  if(given_as_table) {
    if(! "Unique_Patient_Identifier" %in% names(samples)) {
      stop("samples should be character or a data.table with a Unique_Patient_Identifier column.")
    }
    if(nrow(samples) == 0) {
      stop("The input samples table is empty")
    }
    samples = samples$Unique_Patient_Identifier
  }

  # Nothing specific requested: hand back the whole table.
  if(length(samples) == 0) {
    return(copy(cesa@samples))
  }

  if(anyNA(samples)) {
    stop("NA values in samples.")
  }
  if(anyDuplicated(samples) > 0) {
    stop("Input samples contains duplicates.")
  }

  # Inner join; any requested sample that is absent shows up as a missing row.
  matched_rows = cesa@samples[samples, on = "Unique_Patient_Identifier", nomatch = NULL]
  if(nrow(matched_rows) != length(samples)) {
    stop("Some input samples not present in CESAnalysis samples table.")
  }
  return(copy(matched_rows))
}
40 |
41 |
#' Find samples with specified variants
#'
#' A convenience function to identify samples with specific variants.
#'
#' @param cesa CESAnalysis
#' @param any_of Select samples with ANY of the given variant names/IDs, such as
#'   c("8:142506482_C>G", "KRAS G12C"). When a gene has multiple transcripts in reference
#'   data, you may wish to use full IDs, such as "KRAS_G12C_ENSP00000256078".
#' @return Vector of unique Unique_Patient_Identifier values from the MAF records that
#'   match any of the specified variants.
#' @export
samples_with = function(cesa, any_of = NULL) {
  if(! is(cesa, "CESAnalysis")) {
    stop("cesa expected to be CESAnalysis.")
  }

  # Validate the input IDs and split them by variant type.
  ids_by_type = sort_and_validate_variant_ids(cesa = cesa, input_ids = any_of, drop_unannotated = TRUE)
  requested_snvs = ids_by_type[['snv_id']]
  requested_aacs = ids_by_type[['aac_id']]

  # Expand each amino acid change into the SNVs that constitute it.
  aac_annotations = cesa@mutations$amino_acid_change
  snvs_via_aac = unique(aac_annotations[requested_aacs, unlist(constituent_snvs), on = 'aac_id'])

  snvs_of_interest = union(snvs_via_aac, requested_snvs)
  return(cesa@maf[snvs_of_interest, unique(Unique_Patient_Identifier), on = 'variant_id', nomatch = NULL])
}
65 |
--------------------------------------------------------------------------------
/R/set_trinuc_rates.R:
--------------------------------------------------------------------------------
#' Assign pre-calculated relative trinucleotide mutation rates
#'
#' This function assigns trinucleotide-context-specific relative SNV mutation rates to
#' tumors in a CESAnalysis. (These could be rates previously generated with
#' \code{trinuc_mutation_rates()}, or they could be calculated using your own methods.)
#' The input rates must be a data.table or matrix. If supplying a data table, there must
#' be a Unique_Patient_Identifier column; if supplying a matrix, the identifiers should
#' be supplied as rownames instead. Either way, all samples in the CESAnalysis must be
#' represented in the input rates. To avoid user error, there cannot be any superfluous
#' samples in the input rates unless \code{ignore_extra_samples = T}. Besides the
#' identifier column (or matrix rownames), there must be 96 columns, with column names
#' exactly matching the deconstructSigs/MutationalPatterns naming and order (run this
#' function with incorrect column names, and the names you need to use will be printed).
#' Since CES uses relative trinuc rates, rows must sum to 1 (within a small tolerance),
#' and negative values are rejected. You'll get a warning if any rate is at or below
#' 1e-9, since (unrealistically) low rates may crash selection model likelihood functions
#' that aren't expecting such small values.
#'
#'
#' @param cesa CESAnalysis object
#' @param trinuc_rates a matrix or data table (see description for format)
#' @param ignore_extra_samples skip samples in the input table that are not in the CESAnalysis (when false, will stop with an error)
#' @return A copy of the input CESAnalysis with the supplied rates assigned.
#' @export
set_trinuc_rates = function(cesa, trinuc_rates, ignore_extra_samples = FALSE) {
  if(! is(cesa, "CESAnalysis")) {
    stop("Expected cesa to be a CESAnalysis object", call. = F)
  }
  cesa = copy_cesa(cesa)
  cesa = update_cesa_history(cesa, match.call())
  if(cesa@samples[, .N] == 0) {
    stop("There are no samples in the CESAnalysis", call. = F)
  }

  # Normalize the input to a matrix with sample identifiers as rownames.
  if(is(trinuc_rates, "matrix")) {
    trinuc_proportion_matrix = trinuc_rates
  } else if(is(trinuc_rates, "data.table")) {
    if(! "Unique_Patient_Identifier" %in% names(trinuc_rates)) {
      stop("If you supply a data.table, there must be a Unique_Patient_Identifier column", call. = F)
    }
    trinuc_proportion_matrix = as.matrix(trinuc_rates[, -"Unique_Patient_Identifier"])
    rownames(trinuc_proportion_matrix) = trinuc_rates$Unique_Patient_Identifier
  } else {
    stop("Expected trinuc rates to be a data.table or matrix", call. = F)
  }

  if(anyNA(trinuc_proportion_matrix)) {
    stop("NA values found in trinuc_proportion_matrix", call. = F)
  }

  if(ncol(trinuc_proportion_matrix) != 96 || ! identical(colnames(trinuc_proportion_matrix), deconstructSigs_trinuc_string)) {
    message("Expected column names:")
    names_with_quotes = paste0('"', deconstructSigs_trinuc_string, '"')
    cat(names_with_quotes, sep = ',')
    cat("\n")
    stop("Incorrect number of columns and/or incorrect column names or column order", call. = F)
  }

  cesa_samples = cesa@samples$Unique_Patient_Identifier
  input_samples = rownames(trinuc_proportion_matrix)
  missing_samples_msg = "Not all samples in the CESAnalysis are present in the input rates."

  if(length(setdiff(input_samples, cesa_samples)) > 0) {
    if(! ignore_extra_samples) {
      stop(paste0("There are samples in the input rates that are not in the CESAnalysis; if this is intentional, re-run with \n",
                  "ignore_extra_samples = TRUE."), call. = F)
    }
    # Confirm every CESAnalysis sample is present before subsetting by rowname;
    # otherwise the subset fails with an unhelpful "subscript out of bounds" error.
    if(! all(cesa_samples %in% input_samples)) {
      stop(missing_samples_msg, call. = F)
    }
    trinuc_proportion_matrix = trinuc_proportion_matrix[cesa_samples, , drop = F]
  }
  if(! identical(sort(cesa_samples), sort(rownames(trinuc_proportion_matrix)))) {
    stop(missing_samples_msg, call. = F)
  }


  # rows must sum to 1, but allow small tolerance
  if(any(abs(rowSums(trinuc_proportion_matrix) - 1) > .001)) {
    stop("row sums of input rates must all be 1", call. = F)
  }
  if (any(trinuc_proportion_matrix <= 1e-9)) {
    # rate < 0 is invalid
    if (any(trinuc_proportion_matrix < 0)) {
      stop("input rates cannot have negative entries.", call. = F)
    }
    msg = paste0("Some relative mutation rates are very low (<1e-9). This could cause problems, especially if",
                 " any \"impossible\" mutations (rate = 0) are in the data set. Very low rates, besides being unrealistic, can also ",
                 "crash selection model likelihood functions due to numerical precision issues. Consider tweaking your method!")
    warning(pretty_message(msg, emit = F))
  }
  cesa@samples[, sig_analysis_grp := 0L]
  cesa@trinucleotide_mutation_weights = list(trinuc_proportion_matrix = trinuc_proportion_matrix)
  return(cesa)
}
89 |
90 |
--------------------------------------------------------------------------------
/R/si_uniroot_conf_int.R:
--------------------------------------------------------------------------------
#' Calculate uniroot CIs on selection intensities
#'
#' Given a model fit, calculate univariate confidence intervals for each parameter.
#' Returns a list of low/high bounds.
#'
#' @param fit From bbmle
#' @param lik_fn likelihood function
#' @param min_si lower limit on SI/CI
#' @param max_si upper limit on SI/CI
#' @param conf e.g., .95 -> 95\% CIs
#' @return Named list with a low and a high bound for each parameter, in that order. A
#'   bound is NA when the parameter estimate is NA or when no root is bracketed on that
#'   side.
#' @keywords internal
univariate_si_conf_ints = function(fit, lik_fn, min_si, max_si, conf) {
  max_ll = -1 * as.numeric(bbmle::logLik(fit))
  offset = stats::qchisq(conf, 1)/2
  selection_intensity = bbmle::coef(fit)
  num_pars = length(selection_intensity)

  # Preallocate two slots (low, high) per parameter. Note that list(n) would instead
  # create a one-element list holding the number n.
  conf_ints = vector("list", num_pars * 2)
  for (i in seq_len(num_pars)) {
    if(is.na(selection_intensity[i])) {
      lower = NA_real_
      upper = NA_real_
    } else {
      # univariate likelihood function freezes all but one SI at MLE
      # offset makes output at MLE negative; function should be positive at lower/upper boundaries,
      # and uniroot will find the zeroes, which should represent the lower/upper CIs
      ulik = function(x) {
        pars = selection_intensity
        pars[i] = x
        return(lik_fn(pars) - max_ll - offset)
      }
      # if ulik() of the floor SI is negative, there is no root on [floor, MLE], so set an NA lower bound
      if(ulik(min_si) < 0) {
        lower = NA_real_
      } else {
        lower = max(stats::uniroot(ulik, lower = min_si, upper = selection_intensity[i])$root, min_si)
      }
      if(ulik(max_si) < 0){
        # this really shouldn't happen
        upper = NA_real_
      } else {
        upper = stats::uniroot(ulik, lower = selection_intensity[i], upper = max_si)$root
      }
    }
    conf_ints[[i * 2 - 1]] = lower
    conf_ints[[i * 2]] = upper
  }

  # Name the bounds; with multiple parameters, append each parameter name and
  # interleave low/high names to match the order of conf_ints.
  si_names = bbmle::parnames(lik_fn)
  ci_high_colname = paste0("ci_high_", conf * 100)
  ci_low_colname = paste0("ci_low_", conf * 100)
  if (num_pars == 1) {
    ci_colnames = c(ci_low_colname, ci_high_colname)
  } else {
    low_colnames = paste(ci_low_colname, si_names, sep = "_")
    high_colnames = paste(ci_high_colname, si_names, sep = "_")
    ci_colnames = unlist(S4Vectors::zipup(low_colnames, high_colnames))
  }
  names(conf_ints) = ci_colnames
  return(conf_ints)
}
--------------------------------------------------------------------------------
/R/single_variant_si_objectives.R:
--------------------------------------------------------------------------------
#' sswm_lik
#'
#' Generates log-likelihood function of site-level selection with "strong selection, weak
#' mutation" assumption. All arguments to this likelihood function factory are
#' automatically supplied by \code{ces_variant()}.
#'
#' @param rates_tumors_with vector of site-specific mutation rates for all tumors with variant
#' @param rates_tumors_without vector of site-specific mutation rates for all eligible tumors without variant
#' @return A function of one parameter, \code{gamma} (default 1), that returns the
#'   negative log-likelihood. Its bbmle parameter name is "selection_intensity".
#' @export
sswm_lik = function(rates_tumors_with, rates_tumors_without) {
  fn = function(gamma) {
    gamma = unname(gamma) # math faster on unnamed vectors
    sum_log_lik = 0
    if (length(rates_tumors_without) > 0) {
      sum_log_lik = -1 * sum(gamma * rates_tumors_without)
    }
    if (length(rates_tumors_with) > 0) {
      # log(1 - exp(-x)), computed as log(-expm1(-x)). The two are mathematically
      # identical, but 1 - exp(-x) loses precision to cancellation when x is small
      # (and evaluates to exactly 0 for very small x).
      sum_log_lik = sum_log_lik + sum(log(-expm1(-1 * gamma * rates_tumors_with)))
    }

    # convert to negative loglikelihood and return
    return(-1 * sum_log_lik)
  }

  # Set default values for gamma (SI), which ces_variant will use to set starting value of optimization
  formals(fn)[["gamma"]] = 1
  bbmle::parnames(fn) = "selection_intensity"
  return(fn)
}
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/R/suggest_cosmic_signatures_to_remove.R:
--------------------------------------------------------------------------------
#' Tissue-specific mutational signature exclusions
#'
#' Get suggestions on signatures_to_remove for trinuc_mutation_rates for COSMIC signatures v3 and later.
#' For details, see [this article](https://townsend-lab-yale.github.io/cancereffectsizeR/articles/cosmic_cancer_type_note.html)
#' on our website.
#'
#' @param cancer_type See [here](https://townsend-lab-yale.github.io/cancereffectsizeR/articles/cosmic_cancer_type_note.html)
#' for supported cancer type labels.
#' @param treatment_naive give TRUE if samples were taken pre-treatment; FALSE or leave
#'   NULL otherwise.
#' @param quiet (default false) for non-interactive use, suppress explanations and advice.
#' @return a vector of signatures to feed to the \code{trinuc_mutation_rates()}
#'   \code{signature_exclusions} argument (returned invisibly).
#' @md
#' @export
suggest_cosmic_signature_exclusions = function(cancer_type = NULL, treatment_naive = NULL, quiet = FALSE) {
  # Validate quiet before its first use, so that bad input gets this message rather
  # than an obscure error from if().
  if (! identical(quiet, TRUE) && ! identical(quiet, FALSE) ) {
    stop("argument quiet should be T/F")
  }

  data_source = paste0(system.file("extdata", package = "cancereffectsizeR"), '/COSMIC_v3.2_signatures_by_cancer_type.txt')
  dt = data.table::fread(data_source)
  to_remove = character()
  if(is.null(cancer_type)) {
    if(! quiet) {
      pretty_message(paste0("SBS25, SBS89, and SBS91 can usually be excluded."), black = F)
      message("Specify cancer_type (see documentation) for suggestions on which signatures do not appear in specific cancers.")
    }
    to_remove = c("SBS25", "SBS89", "SBS91")
  }
  if(is.null(treatment_naive)) {
    if(! quiet) {
      message(paste0("If your samples were taken before treatment (e.g., TCGA data), re-run with treatment_naive = TRUE \nto get a list of ",
                     "signatures that are chemotherapy-associated, which you may want to exclude."))
    }
    treatment_naive = FALSE
  }

  sig_metadata = cosmic_signature_info()
  suppressWarnings({sig_metadata$short_name = NULL})
  original_sig_order = copy(sig_metadata$name)
  setkey(sig_metadata, "name")

  if(! is.null(cancer_type)) {
    if(length(cancer_type) != 1 || ! is.character(cancer_type) || is.na(cancer_type)) {
      stop("cancer_type should be a 1-length character vector")
    }
    # handle the only case where two different TCGA studies are the same PCAWG group
    if(cancer_type == "COAD" || cancer_type == "READ") {
      cancer_type = "ColoRect-AdenoCA"
    }
    # Look for a single matching row by PCAWG label first, then by TCGA label.
    index = which(dt$PCAWG %ilike% cancer_type)
    if (length(index) != 1) {
      index = which(dt$Applicable_TCGA %ilike% cancer_type)
    }
    if (length(index) != 1) {
      # Put the explanation in the error itself, so it is still there when the
      # error is caught or logged.
      stop(paste0("Input cancer_type not recognized.\n",
                  "See \"Cancer type considerations for COSMIC signatures\" on the cancereffectsizeR website and find your cancer type in the table. If ",
                  "it's not there, then there is no cancer-type-specific data available."))
    }
    # Columns with value 0 in the matched row are the signatures to exclude.
    to_remove = c(names(which(unlist(dt[index,]) == 0)))
    # Keep only signatures present in the signature metadata (compatibility with v3.1).
    to_remove = intersect(to_remove, sig_metadata$name)
    if(! quiet) {
      pretty_message(paste0("The following signatures are suggested absent in ", cancer_type, ", either by Alexandrov 2020 or the COSMIC signature website:"))
      print(sig_metadata[to_remove])
      cat("\n")
    }
  }

  treatment_sigs = c("SBS11", "SBS31", "SBS25", "SBS32", "SBS35", "SBS86", "SBS87", "SBS90", "SBS99")
  if(treatment_naive == TRUE) {
    if(! quiet) {
      cat("The following signatures are associated with various treatments:\n")
      print(sig_metadata[treatment_sigs])
    }
    to_remove = c(to_remove, treatment_sigs)
  }

  # COSMIC v3.4 split SBS22 into 22a,b and SBS40 into 40a,b,c. We'll apply the same tissue exclusions.
  if('SBS22' %in% to_remove) {
    to_remove = c(to_remove, c("SBS22a", 'SBS22b'))
  }
  if('SBS40' %in% to_remove) {
    to_remove = c(to_remove, c("SBS40a", 'SBS40b', 'SBS40c'))
  }

  # make unique and put signatures in the order used by the signature metadata
  to_remove = unique(to_remove)

  to_remove = original_sig_order[original_sig_order %in% to_remove]
  if(! quiet) {
    sig_string = paste0("signature_exclusions = c(\"", paste(to_remove, collapse = "\", \""), "\")")
    message(crayon::black("\nSilently returning the following suggested exclusions: "))
    message(crayon::black(sig_string))
  }
  return(invisible(to_remove))
}
99 |
--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/R/sysdata.rda
--------------------------------------------------------------------------------
/R/validate_optimizer_args.R:
--------------------------------------------------------------------------------
#' Check custom optimizer arguments
#'
#' Stops with an error unless \code{optimizer_args} is a list in which every element has
#' a unique, non-empty name, no reserved argument is present, and \code{control} (if
#' given) is a list. An empty list is accepted.
#'
#' @param optimizer_args List of arguments/values to pass to the optimizer.
#' @return NULL, invisibly; called for its validation side effect.
#' @keywords internal
validate_optimizer_args = function(optimizer_args) {
  if (! is(optimizer_args, "list")) {
    stop("optimizer_args should be a named list of arguments to pass.")
  }

  # Every element needs its own non-empty name. Comparing the count of unique names to
  # the list length is not enough: list(a = 1, 2) has names c("a", ""), which are unique.
  arg_names = names(optimizer_args)
  if (length(optimizer_args) > 0 &&
      (is.null(arg_names) || anyNA(arg_names) || ! all(nzchar(arg_names)) || anyDuplicated(arg_names) > 0)) {
    stop("optimizer_args should be a named list of arguments to pass.")
  }

  reserved_args = c('minuslogl', 'start', 'vecpar')
  if(any(reserved_args %in% arg_names)) {
    msg = paste0('Optimizer arguments start, vecpar, and minuslogl cannot be changed here. ',
                 'If you are using a custom model, your likelihood function can declare these ',
                 'values directly (see docs).')
    stop(pretty_message(msg, emit = F))
  }
  if('control' %in% arg_names && ! is(optimizer_args$control, 'list')) {
    stop('Optimizer argument \"control\" should be a list.')
  }
  invisible(NULL)
}
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | *Welcome to cancereffectsizeR!*
2 |
3 | Please visit the [package website](https://townsend-lab-yale.github.io/cancereffectsizeR/) for installation, tutorial, and documentation.
4 |
5 | We welcome contributions and feedback. If you have questions or want to learn more about how to contribute to development, don't hesitate to open an issue or contact us.
6 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | destination: docs
2 | url: https://townsend-lab-yale.github.io/cancereffectsizeR/
3 | template:
4 | bootstrap: 5
5 | bootswatch: simplex
6 | theme: solarized-light
7 | bslib:
8 | pkgdown-nav-height: 78px
9 | heading_font: avenir
10 | repo:
11 | branch: main
12 |
13 | reference:
14 | - title: Create and manage projects
15 | desc: Create, load, and save projects
16 | contents:
17 | - CESAnalysis
18 | - load_cesa
19 | - save_cesa
20 | - set_refset_dir
21 | - title: Obtain and prep MAF data
22 | contents:
23 | - preload_maf
24 | - check_sample_overlap
25 | - get_TCGA_project_MAF
26 | - vcfs_to_maf_table
27 | - make_PathScore_input
28 | - lift_bed
29 | - title: Load and manage variants
30 | contents:
31 | - load_maf
32 | - select_variants
33 | - variant_counts
34 | - samples_with
35 | - add_variants
36 | - add_covered_regions
37 | - baseline_mutation_rates
38 | - title: Load sample-level data
39 | contents:
40 | - load_sample_data
41 | - clear_sample_data
42 | - title: Compound variants
43 | desc: Combine variants into arbitrary batches and test for batch-level selection
44 | contents:
45 | - define_compound_variants
46 | - CompoundVariantSet
47 | - title: Trinucleotide signatures and rates
48 | desc: Mutational signature extraction and inference of context-specific mutation rates
49 | contents:
50 | - trinuc_mutation_rates
51 | - suggest_cosmic_signature_exclusions
52 | - trinuc_snv_counts
53 | - convert_signature_weights_for_mp
54 | - clear_trinuc_rates_and_signatures
55 | - set_signature_weights
56 | - set_trinuc_rates
57 | - assign_group_average_trinuc_rates
58 | - title: Gene mutation rates
59 | desc: Calculate neutral gene mutation rates
60 | contents:
61 | - gene_mutation_rates
62 | - set_gene_rates
63 | - clear_gene_rates
64 | - title: Cancer effect sizes
65 | desc: Quantify selection for somatic variants
66 | contents:
67 | - ces_variant
68 | - ces_epistasis
69 | - ces_gene_epistasis
70 | - mutational_signature_effects
71 | - clear_effect_output
72 | - clear_epistasis_output
73 | - title: Visualization
74 | desc: Display and compare variant effect sizes
75 | contents:
76 | - plot_effects
77 | - plot_signature_effects
78 | - plot_epistasis
79 | - epistasis_plot_schematic
80 | - title: Selection models
81 | desc: Likelihood function generators for various models of selection
82 | contents:
83 | - sswm_lik
84 | - pairwise_epistasis_lik
85 | - title: Explore reference data
86 | contents:
87 | - cosmic_signature_info
88 | - get_PathScore_coding_regions
89 | - list_ces_refsets
90 | - list_ces_covariates
91 | - list_ces_signature_sets
92 | - get_ces_signature_set
93 | - title: Create custom reference data
94 | desc: Build your own reference data set for almost any genome or tissue type
95 | contents:
96 | - create_refset
97 | - build_RefCDS
98 | - validate_signature_set
99 | - title: Accessors
100 | desc: Data accessors that you probably won't need (use cesa\$maf, cesa\$samples, etc. instead)
101 | contents:
102 | - maf_records
103 | - excluded_maf_records
104 | - get_sample_info
105 | - get_trinuc_rates
106 | - get_signature_weights
107 | - get_gene_rates
108 | - snv_results
109 | - epistasis_results
110 |
111 | home:
112 | strip_header: true
113 | description: >
114 | cancereffectsizeR, an R package from the Townsend Lab at Yale School of Public Health
115 | for quantifying the effect size of somatic mutations in cancer
116 | sidebar:
117 | structure: [townsend, custom_citation, authors, links, license]
118 | components:
119 | townsend:
120 | title: Provided by
121 | text: the [Townsend Lab](https://medicine.yale.edu/lab/townsend/) at the
[Yale School of Public Health](https://ysph.yale.edu).
122 | custom_citation:
123 | title: Citation
124 | text: "[How to cite cancereffectsizeR](#citation)"
125 | navbar:
126 | structure:
127 | left: [intro, reference, articles, tutorials, news]
128 | right: [search, github]
129 | components:
130 | intro:
131 | text: Tutorial
132 | href: articles/cancereffectsizeR.html
133 | news:
134 | text: News
135 | href: news/index.html
136 |
137 |
--------------------------------------------------------------------------------
/data-raw/build_ces.refset.hg19.R:
--------------------------------------------------------------------------------
## This GTF, or an updated version, will likely be used to generate version 2 of ces.refset.hg19.
## Version 1 had the same tweaks to CDKN2A and TP53, but the transcripts were pulled from biomaRt.

# Load what the script uses (as.data.table / := and build_RefCDS / create_refset), so it
# also runs in a fresh session.
library(data.table)
library(cancereffectsizeR)

## Read in Gencode GTF (release 35, GRCh37-lifted version) as a data table
gen = as.data.table(rtracklayer::import("gencode.v35lift37.annotation.gtf"))

# Exome capture
exome_bed = "xgen-exome-research-panel-targets.bed"

# Restrict to consensus coding sequences
ccds = gen[tag == "CCDS"]

# Hand-pick the two most significant CDKN2A transcripts and give them new gene names; discard other CDKN2A transcripts
# It's not generally recommended to rename genes; making an exception for historical reasons
ccds[protein_id == "ENSP00000307101.5", gene_name := "CDKN2A.p16INK4a"]
ccds[protein_id == "ENSP00000462950.1", gene_name := "CDKN2A.p14arf"]
ccds = ccds[gene_name != "CDKN2A"]


# One other tweak: specify an additional splice site position in the TP53 transcript that will be used
tp53_extra_splice_pos = list(ENSP00000269305.4 = 7579312)

refcds_output = build_RefCDS(gtf = ccds, genome = "hg19", additional_essential_splice_pos = tp53_extra_splice_pos)

# Call create_refset
create_refset(output_dir = "tmp",
              refcds_output = refcds_output,
              species_name = "human",
              genome_build_name = "hg19",
              BSgenome_name = "hg19",
              supported_chr = c(1:22, 'X', 'Y'),
              default_exome_bed = exome_bed,
              exome_interval_padding = 100)
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/data-raw/build_codon_snvs_to_aa.R:
--------------------------------------------------------------------------------
1 | library(Biostrings)
2 |
3 | # poss_mut: A reference list that, given a particular genomic coding trinucleotide sequence,
4 | # and an amino acid of interest, gives all possible SNVs that would generate the amino acid.
5 | # For each trinucleotide sequence and amino acid, there is a three-item list that gives all
6 | # substitutions in the first, second, and third positions that produce the amino acid.
7 |
8 | # Examples:
9 | # poss_mut$TAG$Trp: TAG (which encodes a stop codon) instead encodes Trp after an A->G substitution at position 2
10 | # poss_mut$GTA$Val: GTA (Valine) continues to code for Valine regardless of the third nucleotide, so
11 | # poss_mut$GTA$Val[[3]] returns c("C", "G", "T"). "A" is not included because it is not a substitution.
12 | # poss_mut$AGC$Arg: Shows that substitutions in first and third positions can cause Ser -> Arg
13 |
14 |
# Build the codon -> amino acid -> position lookup of single-nucleotide substitutions.
# Returns a nested list indexed as poss_mut[[codon]][[amino acid]][[position]], where each
# innermost element is a character vector of the substituted bases at that codon position
# (1-3) that yield the given amino acid (possibly empty).
build_codon_snvs_to_aa = function() {
  # a handy conversion from short AA abbreviation to long (e.g., K -> Lys)
  aa_long = AMINO_ACID_CODE
  aa_long["*"] = "STOP" # add in stop codon

  # long abbreviations of 20 amino acids, plus stop (the same for every codon, so computed once)
  aa_code = aa_long[c(AA_STANDARD, "*")]

  poss_mut = list()
  for (nt1 in DNA_BASES) {
    for (nt2 in DNA_BASES) {
      for (nt3 in DNA_BASES) {
        # for the given three nts, construct a codon
        ref_nts = c(nt1, nt2, nt3)
        codon = DNAString(xscat(nt1, nt2, nt3)) # need to cast BString as DNAString for translation
        codon_str = as.character(codon)

        # make empty list structure to hold possible mutations
        poss_mut[[codon_str]] = list()
        for (aa in aa_code) {
          poss_mut[[codon_str]][[aa]] = list()
          for (pos in 1:3) {
            poss_mut[[codon_str]][[aa]][[pos]] = character()
          }
        }

        # At each codon position, try each of the three alternative bases and record
        # the substitution under the amino acid it produces.
        # see ?Biostrings::translate, but if you don't use no.init.codon, you'll get some mistranslations (e.g., CTG as M instead of L)
        for (pos in 1:3) {
          for (mut in setdiff(DNA_BASES, ref_nts[pos])) {
            new_aa = as.character(translate(replaceLetterAt(codon, pos, mut), no.init.codon = T))
            new_aa = aa_long[new_aa]
            poss_mut[[codon_str]][[new_aa]][[pos]] = c(poss_mut[[codon_str]][[new_aa]][[pos]], mut)
          }
        }
      }
    }
  }
  return(poss_mut)
}
61 |
62 |
--------------------------------------------------------------------------------
/data-raw/build_deconstructSigs_stuff.R:
--------------------------------------------------------------------------------
# The 96 trinuc-context-specific SNVs, in the order used by deconstructSigs data structures
# Note that the central reference trinucleotides are always C/T, as A/G reference are
# represented using the reverse complement trinucleotide context
#
# Order: substitution type varies slowest (C>A, C>G, C>T, T>A, T>C, T>G), then the
# upstream base, then the downstream base (each A, C, G, T).
build_deconstructSigs_trinuc_string = function() {
  substitutions = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
  flanking_bases = c("A", "C", "G", "T")

  # expand.grid varies its first argument fastest, so list the fastest-changing
  # component (downstream base) first and the slowest (substitution) last.
  combos = expand.grid(downstream = flanking_bases,
                       upstream = flanking_bases,
                       substitution = substitutions,
                       stringsAsFactors = FALSE)
  return(paste0(combos$upstream, "[", combos$substitution, "]", combos$downstream))
}
102 |
# data table with context (e.g., A), central mutation (e.g., G), and corresponding deconstructSigs notation (G[C>G]C)
# The table is read from the package's extdata and keyed on (context, mutation).
build_dS_notation_table = function() {
  dt = data.table::fread(system.file("extdata/trinuc_snv_to_deconstructSigs_ID.txt", package = "cancereffectsizeR"))
  # Namespace-qualified like fread above, so this works when data.table is not attached.
  data.table::setkey(dt, "context", "mutation")
  return(dt)
}
109 |
--------------------------------------------------------------------------------
/data-raw/generate_cosmic_v3.1_hg19_signature_set.R:
--------------------------------------------------------------------------------
library(data.table)
library(cancereffectsizeR)

# Fetch these package objects with getFromNamespace so they resolve whether or not they
# are exported.
deconstructSigs_notations = getFromNamespace("deconstructSigs_notations", "cancereffectsizeR")
deconstructSigs_trinuc_string = getFromNamespace("deconstructSigs_trinuc_string", "cancereffectsizeR")

# downloaded 08-09-20
cosmic = fread(system.file("extdata/COSMIC_SBS_v3-1.txt", package = "cancereffectsizeR"))
metadata = fread(system.file("extdata/COSMIC_v3.1_signature_metadata.txt", package = "cancereffectsizeR"))

# column names will be deconstructSigs-style trinuc mutations
# (join on Subtype and the 3rd character of Type against the notation table)
dS_muts = cosmic[, deconstructSigs_notations[.(Subtype, substr(Type, 3, 3)), deconstructSigs_ID]]


# drop non-signature columns
cosmic = cosmic[, .SD, .SDcols = patterns("SBS")]
sig_names = colnames(cosmic)
cosmic[cosmic == '-'] = '0' # some empty values (reflecting something about COSMIC methods?)

# convert all to numeric (those with "-" ended up as character)
cosmic[, (sig_names) := lapply(.SD, as.numeric), .SDcols = sig_names]

cosmic_df = as.data.frame(t(cosmic))
rownames(cosmic_df) = sig_names
colnames(cosmic_df) = dS_muts

# put columns in canonical order (the order used by deconstructSigs, to avoid mistakes later)
cosmic_df = cosmic_df[, deconstructSigs_trinuc_string]
signature_set = list(name = "COSMIC v3.1", signatures = cosmic_df, meta = metadata)

# trigger an error if this signature set isn't valid
validate_signature_set(signature_set)

# save in hg19 reference data collection
out_dir = system.file("ref_sets/ces_hg19_v1/signatures", package = "cancereffectsizeR")
if (out_dir == "") {
  # system.file() returns "" when the path is not found; don't write to "/..." by accident
  stop("Could not find ref_sets/ces_hg19_v1/signatures in the installed cancereffectsizeR package.")
}
out_path = paste0(out_dir, '/COSMIC_v3.1_signatures.rds')
saveRDS(signature_set, out_path)
35 |
--------------------------------------------------------------------------------
/data-raw/generate_sysdata.R:
--------------------------------------------------------------------------------
1 | # Rebuilds the objects that are stored as internal package data.
2 | # The builder scripts are sourced by relative path, so the working directory is
3 | # pointed at data-raw for the duration of the script and restored on the last line.
4 | original_wd = setwd(system.file("data-raw", package = "cancereffectsizeR"))
5 |
6 | # objects produced by the deconstructSigs builder functions
7 | source("build_deconstructSigs_stuff.R")
8 | deconstructSigs_trinuc_string = build_deconstructSigs_trinuc_string()
9 | deconstructSigs_notations = build_dS_notation_table()
10 |
11 | # object produced by build_codon_snvs_to_aa()
12 | source("build_codon_snvs_to_aa.R")
13 | codon_sbs_to_aa = build_codon_snvs_to_aa()
14 |
15 | # signature summary table read from the package's extdata directory
16 | etiology_file = system.file("extdata/cosmic_sbs_signature_summary.txt", package = "cancereffectsizeR")
17 | cosmic_sbs_signature_etiology = data.table::fread(etiology_file)
18 |
19 | # Pulled from COSMIC DBS mutational signature definitions
20 | # (one row per reference doublet; order is unchanged)
21 | cosmic_dbs_classes = c(
22 |   "AC>CA", "AC>CG", "AC>CT", "AC>GA", "AC>GG", "AC>GT", "AC>TA", "AC>TG", "AC>TT",
23 |   "AT>CA", "AT>CC", "AT>CG", "AT>GA", "AT>GC", "AT>TA",
24 |   "CC>AA", "CC>AG", "CC>AT", "CC>GA", "CC>GG", "CC>GT", "CC>TA", "CC>TG", "CC>TT",
25 |   "CG>AT", "CG>GC", "CG>GT", "CG>TA", "CG>TC", "CG>TT",
26 |   "CT>AA", "CT>AC", "CT>AG", "CT>GA", "CT>GC", "CT>GG", "CT>TA", "CT>TC", "CT>TG",
27 |   "GC>AA", "GC>AG", "GC>AT", "GC>CA", "GC>CG", "GC>TA",
28 |   "TA>AT", "TA>CG", "TA>CT", "TA>GC", "TA>GG", "TA>GT",
29 |   "TC>AA", "TC>AG", "TC>AT", "TC>CA", "TC>CG", "TC>CT", "TC>GA", "TC>GG", "TC>GT",
30 |   "TG>AA", "TG>AC", "TG>AT", "TG>CA", "TG>CC", "TG>CT", "TG>GA", "TG>GC", "TG>GT",
31 |   "TT>AA", "TT>AC", "TT>AG", "TT>CA", "TT>CC", "TT>CG", "TT>GA", "TT>GC", "TT>GG"
32 | )
33 |
34 | # save everything as internal data, replacing whatever was saved before
35 | usethis::use_data(
36 |   deconstructSigs_trinuc_string,
37 |   deconstructSigs_notations,
38 |   codon_sbs_to_aa,
39 |   cosmic_dbs_classes,
40 |   cosmic_sbs_signature_etiology,
41 |   internal = TRUE,
42 |   overwrite = TRUE
43 | )
44 | setwd(original_wd)
45 |
--------------------------------------------------------------------------------
/data-raw/tutorial/prep_BRCA_met_tgs.R:
--------------------------------------------------------------------------------
1 | # Download from cBioPortal on 11/11/21
2 | # Prepares the metastatic breast cancer MAF that is saved in the package's tutorial directory.
3 | # data.table supplies fread()/uniqueN()/fwrite(); cancereffectsizeR supplies preload_maf().
4 | # NOTE(review): base load() only restores objects from a saved file, so it cannot be used to
5 | # attach the package. If this script is meant to run against a source checkout, use a
6 | # development loader in place of library(cancereffectsizeR) -- confirm intended workflow.
7 | library(data.table)
8 | library(cancereffectsizeR)
9 | maf_file = 'data_mutations.txt'
10 | chain_file = 'hg19ToHg38.over.chain'
11 |
12 | # Load data, remove patients with more than one sample, lift to hg38, filter germline variants, save
13 | # minimal necessary MAF data for cancereffectsizeR
14 | maf = fread(maf_file)
15 | # patient ID is the sample barcode with everything from "-T" onward removed
16 | maf[, patient_id := gsub('-T.*', '', Tumor_Sample_Barcode)]
17 | multisample_patients = maf[, .(uniqueN(Tumor_Sample_Barcode) > 1), by = "patient_id"][V1 == TRUE, patient_id]
18 | maf = maf[! multisample_patients, on = 'patient_id']
19 | # NOTE(review): ces.refset.hg38 is referenced as a bare symbol; presumably the refset
20 | # package must be attached in the session -- confirm.
21 | maf = preload_maf(maf, refset = ces.refset.hg38, chain_file = chain_file, sample_col = 'patient_id')
22 | maf = maf[germline_variant_site == FALSE]
23 | maf = maf[, .(Unique_Patient_Identifier, Chromosome, Start_Position, Reference_Allele, Tumor_Allele)]
24 |
25 | # system.file() returns "" when the directory is absent; stop rather than
26 | # silently building a path at the filesystem root.
27 | tutorial_dir = system.file("tutorial", package = "cancereffectsizeR")
28 | if (! nzchar(tutorial_dir)) {
29 |   stop("Directory tutorial was not found in cancereffectsizeR.")
30 | }
31 | out_file = paste0(tutorial_dir, '/metastatic_breast_2021_hg38.maf')
32 | fwrite(maf, out_file, sep = "\t")
33 |
--------------------------------------------------------------------------------
/data-raw/tutorial/prep_TCGA_BRCA_clinical.R:
--------------------------------------------------------------------------------
1 | # Derives HER2/ER/PR-based receptor status and metastasis stage (pM) for each patient
2 | # in the TCGA BRCA clinical table, then writes one row per unique
3 | # (patient_id, pM, receptor_status) combination to the tutorial directory.
4 | library(data.table)
5 | clinical_path = system.file("extdata/brca_tcga_clinical_data_via_gdac_cbioportal.tsv", package = 'cancereffectsizeR')
6 | tcga_clinical = fread(clinical_path)
7 |
8 | # HER2+ if either assay is positive; otherwise HER2- if either assay is negative
9 | tcga_clinical[`IHC-HER2` == 'Positive' | `HER2 fish status` == "Positive", HER2 := 'HER2+']
10 | tcga_clinical[(`IHC-HER2` == 'Negative' | `HER2 fish status` == 'Negative') & is.na(HER2), HER2 := 'HER2-']
11 |
12 | # ER and PR labels taken directly from the IHC columns
13 | tcga_clinical[`ER Status By IHC` == 'Negative', ER := 'ER-']
14 | tcga_clinical[`ER Status By IHC` == 'Positive', ER := 'ER+']
15 | tcga_clinical[`PR status by ihc` == 'Negative', PR := 'PR-']
16 | tcga_clinical[`PR status by ihc` == 'Positive', PR := 'PR+']
17 |
18 | # HR+ if either PR+ or ER+
19 | tcga_clinical[ER == 'ER+' | PR == 'PR+', HR := 'HR+']
20 |
21 | # HR- if both PR and ER have been tested and are negative
22 | tcga_clinical[ER == 'ER-' & PR == 'PR-', HR := 'HR-']
23 |
24 | # combined label only where both HR and HER2 are known; double-negative is relabeled TNBC
25 | tcga_clinical[! (is.na(HR) | is.na(HER2)), receptor_status := paste(HR, HER2, sep = '/')]
26 | tcga_clinical[receptor_status == 'HR-/HER2-', receptor_status := 'TNBC']
27 |
28 | # keep only the M0 and M1 stage codes; any other value leaves pM as NA
29 | tcga_clinical[`American Joint Committee on Cancer Metastasis Stage Code` == 'M1', pM := 'M1']
30 | tcga_clinical[`American Joint Committee on Cancer Metastasis Stage Code` == 'M0', pM := 'M0']
31 |
32 | # reduce to the output columns and drop duplicate rows
33 | output_columns = tcga_clinical[, .(patient_id = `Patient ID`, pM, receptor_status)]
34 | tcga_clinical = unique(output_columns)
35 |
36 | clinical_out_path = paste0(system.file(package = "cancereffectsizeR"), '/tutorial/TCGA_BRCA_clinical.txt')
37 | fwrite(tcga_clinical, clinical_out_path, sep = "\t", na = 'NA')
38 |
--------------------------------------------------------------------------------
/doc/cosmic_cancer_type_note.R:
--------------------------------------------------------------------------------
1 | ## ----eval = F-----------------------------------------------------------------
2 | # suggest_cosmic_v3_signatures_to_remove(cancer_type = "BRCA", treatment_naive = TRUE)
3 | # suggest_cosmic_v3_signatures_to_remove(cancer_type = "Kidney-RCC")
4 |
5 | ## ---- echo=FALSE, warning=FALSE, message=FALSE--------------------------------
6 | data_source = paste0(system.file("extdata", package = "cancereffectsizeR"), '/pcawg_tcga_cancer_types.txt') # path to the cancer type table under the package's extdata directory
7 | cancer_type = data.table::fread(data_source) # read the table into a data.table
8 | cancer_type[is.na(cancer_type)] = "(none)" # show "(none)" wherever the table has a missing value
9 | formattable::formattable(cancer_type[, .(PCAWG, Applicable_TCGA, Number_of_tumors, Description)]) # display the four listed columns as a formatted table
10 |
11 |
--------------------------------------------------------------------------------
/doc/cosmic_cancer_type_note.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Cancer type considerations for COSMIC signatures"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{Cancer type considerations for COSMIC signatures}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | \usepackage[utf8]{inputenc}
8 | ---
9 | The cancereffectsizeR function `trinuc_mutation_rates()` uses mutational signature extraction to calculate relative trinucleotide-specific SNV mutation rates for each tumor in the input data. The `signatures_to_remove` option allows some signatures to be excluded from this analysis, which means each tumor will receive a weight of 0 for these signatures (i.e., 0% of the tumor's SNVs are attributable to these signatures). By default, SBS25 is excluded because it's considered dubious and is specific to Hodgkin's lymphoma cell lines. You can override this behavior by setting `signatures_to_remove = "none"`.
10 |
11 | In [Alexandrov 2020](https://doi.org/10.1038/s41586-020-1943-3), the paper introducing the latest COSMIC mutational signatures (COSMICv3),
12 | there are some signatures that do not appear in certain cancer types. Therefore, you may wish to exclude those signatures during
13 | mutational signature analysis of your data.
14 |
15 | There are also a few signatures associated with certain chemotherapy drugs (SBS31, SBS32, SBS35), so if you are confident that your sample
16 | is treatment-naive, you may wish to exclude these.
17 |
18 | Note that while some COSMIC signatures are believed to represent sequencing artifacts, it's not recommended to remove these from analysis, because cancereffectsizeR already has special handling of these signatures to determine relative rates of true mutational processes in tumors.
19 |
20 |
21 |
22 | To get a list of signatures to exclude based on cancer type and treatment status, you can use `suggest_cosmic_v3_signatures_to_remove()`.
23 | ```{r eval = F}
24 | suggest_cosmic_v3_signatures_to_remove(cancer_type = "BRCA", treatment_naive = TRUE)
25 | suggest_cosmic_v3_signatures_to_remove(cancer_type = "Kidney-RCC")
26 | ```
27 |
28 | The cancer type recommendations are based on Extended Data Figure 5 of Alexandrov 2020. The first two columns of the table below, also based on that figure, give the labels accepted by the `cancer_type` argument.
29 |
30 | Before excluding signatures, make sure your data set does not contain tumors from multiple PCAWG categories. For example, TCGA HNSC (head and neck cancer) includes oral cancers, which are listed separately here as Oral-SCC, so excluding all signatures that do not appear in Head-SCC (such as SBS29, tobacco chewing) would not be appropriate.
31 |
32 | ```{r, echo=FALSE, warning=FALSE, message=FALSE}
33 | data_source = paste0(system.file("extdata", package = "cancereffectsizeR"), '/pcawg_tcga_cancer_types.txt') # path to the cancer type table under the package's extdata directory
34 | cancer_type = data.table::fread(data_source) # read the table into a data.table
35 | cancer_type[is.na(cancer_type)] = "(none)" # show "(none)" wherever the table has a missing value
36 | formattable::formattable(cancer_type[, .(PCAWG, Applicable_TCGA, Number_of_tumors, Description)]) # display the four listed columns as a formatted table
37 | ```
38 |
39 |
--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
1 | # (placeholder; leave)
2 |
3 |
cancereffectsizeR
4 | #### Quantify somatic evolution in cancer
5 |
6 | ---
7 |
8 | Welcome to cancereffectsizeR! This R package provides a variety of tools for analyzing somatic variant data and characterizing the evolutionary trajectories of cancers. Key package features and related theory are presented in a [recent article](https://aacrjournals.org/cancerres/article/83/4/500/716429/Estimation-of-Neutral-Mutation-Rates-and) in *Cancer Research*. As we continue development of cancereffectsizeR, we welcome your [feedback, questions, and bug reports](https://github.com/Townsend-Lab-Yale/cancereffectsizeR/issues).
9 |
10 | ## Installation, tutorial, and customizations
11 | For simple installation instructions and a demonstration of a cancer effect analysis, see the [tutorial](articles/cancereffectsizeR.html). The quickstart section offers a condensed introduction to get you running a basic analysis in minutes.
12 |
13 | The package has extensive support for testing specific research questions with customized analyses:
14 |
15 | - Quantify the cancer effects of variants under the default model of selection, test epistatic models of selection, or define and test your own models. Arbitrarily batch variants by position, gene, or functional annotation, and quantify selection by batch. Assess differential selection among patient subgroups.
16 | - Combining mutational signature analysis with cancer effect estimation, compare signatures' relative contributions to oncogenesis and mutagenesis, using either COSMIC signatures or any custom signature set.
17 | - Annotate somatic variants with built-in reference data, or create a [custom reference data set](articles/custom_refset_instructions.html) for almost any species/genome.
18 | - Use provided tissue-specific covariates to inform calculation of gene mutation rates via [dNdScv](https://github.com/im3sanger/dndscv), or build your own [custom covariates](articles/create_custom_covariates.html).
19 |
20 | ## Selected publications
21 |
22 | * **[Estimation of Neutral Mutation Rates and Quantification of Somatic Variant Selection Using cancereffectsizeR](https://aacrjournals.org/cancerres/article/83/4/500/716429/Estimation-of-Neutral-Mutation-Rates-and)**, *Cancer Research* (2023).
This resource report discusses the package's key features and methods and presents analyses validating that cancer effects are a useful quantification of the cancer relevance of somatic variants.
23 |
24 | * **[Attribution of Cancer Origins to Endogenous, Exogenous, and Preventable Mutational Processes](https://academic.oup.com/mbe/article/39/5/msac084/6570859)**, *Molecular Biology and Evolution* (2022).
Cancer effects are incorporated into a novel method to calculate the relative contributions of various mutational processes to oncogenesis. Apply the method yourself with `mutational_signature_effects()`.
25 |
26 | * **[Effect Sizes of Somatic Mutations in Cancer](https://doi.org/10.1093/jnci/djy168)**, *Journal of the National Cancer Institute* (2018).
A pan-cancer analysis of cancer effects employing [version 0.1.0](https://github.com/Townsend-Lab-Yale/cancereffectsizeR/releases/tag/0.1.0) of this package. This original version was developed by Vincent Cannataro, Stephen Gaffney, and Jeffrey Townsend.
27 |
28 | ## Citation
29 | When reporting work that uses cancereffectsizeR, please cite our [resource report](https://aacrjournals.org/cancerres/article/83/4/500/716429/Estimation-of-Neutral-Mutation-Rates-and) published in *Cancer Research*:
30 |
31 | >Mandell JD, Cannataro VL, Townsend JP. Estimation of neutral mutation rates and quantification of somatic variant selection using cancereffectsizeR. Cancer Research. 2023 Feb 15; 83(4):500-505. [doi:10.1158/0008-5472.CAN-22-1508](https://www.doi.org/10.1158/0008-5472.CAN-22-1508).
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | # Citation information for the package: a header line followed by one journal article entry.
2 | citHeader("If you use cancereffectsizeR in your work, please cite the resource report published in Cancer Research:")
3 |
4 | # bibentry() with c()-combined person objects is used in place of the old-style
5 | # citEntry()/personList() calls.
6 | bibentry(bibtype = "Article",
7 |   title = "Estimation of Neutral Mutation Rates and Quantification of Somatic Variant Selection Using cancereffectsizeR",
8 |   author = c(
9 |     as.person("Jeffrey D. Mandell"),
10 |     as.person("Vincent L. Cannataro"),
11 |     as.person("Jeffrey P. Townsend")
12 |   ),
13 |   doi = "10.1158/0008-5472.CAN-22-1508",
14 |   journal = "Cancer Research",
15 |   year = "2023",
16 |   volume = "83",
17 |   number = "4",
18 |   # the BibTeX field for a page range is "pages", written with a double hyphen
19 |   pages = "500--505",
20 |   textVersion = "Mandell JD, Cannataro VL, Townsend JP. Estimation of
21 | neutral mutation rates and quantification of somatic variant selection
22 | using cancereffectsizeR. Cancer Research. 2023 Feb 15; 83(4):500-505. doi:10.1158/0008-5472.CAN-22-1508.")
23 |
--------------------------------------------------------------------------------
/inst/PathScore/PathScore_CDS_ranges_hg19.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/PathScore/PathScore_CDS_ranges_hg19.rds
--------------------------------------------------------------------------------
/inst/PathScore/PathScore_CDS_ranges_hg38.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/PathScore/PathScore_CDS_ranges_hg38.rds
--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
1 | cancereffectsizeR
2 | cesa
3 | CES
4 | CESAnalysis
5 | contigs
6 | COSMIC
7 | curation
8 | deconstructSigs
9 | dNdScv
10 | dndscv
11 | epigenomics
12 | epistasis
13 | epistatic
14 | exome
15 | Gencode
16 | germline
17 | ggrepel
18 | histone
19 | hotspot
20 | hotspots
21 | hypermutation
22 | indel
23 | indels
24 | intergenic
25 | lysine
26 | KRAS
27 | liftOver
28 | MAF
29 | MutationalPatterns
30 | noncoding
31 | nonsilent
32 | nonsynonymous
33 | nucleotide
34 | nucleotides
35 | PCAWG
36 | preload
37 | quickstart
38 | RefCDS
39 | refset
40 | refsets
41 | reproducibility
42 | RStudio
43 | snv
44 | SNV
45 | SNVs
46 | snvs
47 | TCGA
48 | TGS
49 | trinucleotide
50 | WES
51 | WGS
52 | WXS
--------------------------------------------------------------------------------
/inst/extdata/COSMIC_v3.1_signature_metadata.txt:
--------------------------------------------------------------------------------
1 | Signature Etiology Likely_Artifact Exome_Min Genome_Min
2 | SBS1 Deamination with age FALSE
3 | SBS2 APOBEC FALSE
4 | SBS3 Defective homologous recombination FALSE
5 | SBS4 Tobacco smoking FALSE
6 | SBS5 "Unknown, clock-like" FALSE
7 | SBS6 Defective DNA mismatch repair FALSE 200 10000
8 | SBS7a UV light FALSE
9 | SBS7b UV light FALSE
10 | SBS7c UV light FALSE
11 | SBS7d UV light FALSE
12 | SBS8 Unknown FALSE
13 | SBS9 Polymerase eta somatic hypermutation FALSE
14 | SBS10a Polymerase epsilon FALSE 2000 100000
15 | SBS10b Polymerase epsilon FALSE 2000 100000
16 | SBS11 Temozolomide treatment FALSE
17 | SBS12 Unknown FALSE
18 | SBS13 APOBEC FALSE
19 | SBS14 Concurrent polymerase epsilon mutation and defective mismatch repair FALSE 200 10000
20 | SBS15 Defective mismatch repair FALSE 200 10000
21 | SBS16 Unknown FALSE
22 | SBS17a Unknown FALSE
23 | SBS17b Unknown FALSE
24 | SBS18 Damage by reactive oxygen species FALSE
25 | SBS19 Unknown FALSE
26 | SBS20 Concurrent POLD1 mutations and defective mismatch repair FALSE 200 10000
27 | SBS21 DNA mismatch repair deficiency FALSE 200 10000
28 | SBS22 Aristolochic acid exposure FALSE
29 | SBS23 Unknown FALSE
30 | SBS24 Aflatoxin exposure FALSE
31 | SBS25 Chemotherapy treatment FALSE
32 | SBS26 Defective DNA mismatch repair FALSE 200 10000
33 | SBS28 Unknown FALSE
34 | SBS29 Tobacco chewing FALSE
35 | SBS30 Deficiency in base excision repair due to mutations in NTHL1 FALSE
36 | SBS31 Platinum drug chemotherapy FALSE
37 | SBS32 Azathioprine treatment (used for immunosuppression) FALSE
38 | SBS33 Unknown FALSE
39 | SBS34 Unknown FALSE
40 | SBS35 Prior chemotherapy treatment FALSE
41 | SBS36 Defective DNA base excision repair due to MUTYH mutations FALSE
42 | SBS37 Unknown FALSE
43 | SBS38 Indirect effects of UV light exposure FALSE
44 | SBS39 Unknown FALSE
45 | SBS40 Unknown FALSE
46 | SBS41 Unknown FALSE
47 | SBS42 Occupational exposure to haloalkanes FALSE
48 | SBS44 Defective DNA mismatch repair FALSE 200 10000
49 | SBS84 Activity of activation-induced cytidine deaminase (AID) FALSE
50 | SBS85 Indirect effects of activation-induced cytidine deaminase (AID) FALSE
51 | SBS86 Unknown chemotherapy treatment (COSMIC 3.1) FALSE
52 | SBS87 Thiopurine chemotherapy treatment (COSMIC 3.1) FALSE
53 | SBS88 Colibactin exposure (COSMIC 3.1) FALSE
54 | SBS89 Unknown (COSMIC 3.1) FALSE
55 | SBS90 Duocarmycin exposure (COSMIC 3.1) FALSE
56 | SBS27 Possible sequencing artifact TRUE
57 | SBS43 Possible sequencing artifact TRUE
58 | SBS45 Possible artifact due to 8-oxo-guanine introduced during sequencing TRUE
59 | SBS46 Possible sequencing artifact TRUE
60 | SBS47 Possible sequencing artifact TRUE
61 | SBS48 Possible sequencing artifact TRUE
62 | SBS49 Possible sequencing artifact TRUE
63 | SBS50 Possible sequencing artifact TRUE
64 | SBS51 Possible sequencing artifact TRUE
65 | SBS52 Possible sequencing artifact TRUE
66 | SBS53 Possible sequencing artifact TRUE
67 | SBS54 Possible sequencing artifact and/or contamination with germline variants TRUE
68 | SBS55 Possible sequencing artifact TRUE
69 | SBS56 Possible sequencing artifact TRUE
70 | SBS57 Possible sequencing artifact TRUE
71 | SBS58 Potential sequencing artifact TRUE
72 | SBS59 Potential sequencing artifact TRUE
73 | SBS60 Potential sequencing artifact TRUE
--------------------------------------------------------------------------------
/inst/extdata/cosmic_sbs_signature_summary.txt:
--------------------------------------------------------------------------------
1 | name short_name description detail present_in_COSMIC_3.4
2 | SBS1 1 spontaneous 5-methylcytosine deamination TRUE
3 | SBS2 2 APOBEC activity TRUE
4 | SBS3 3 defective homologous recombination TRUE
5 | SBS4 4 tobacco-related tobacco smoking TRUE
6 | SBS5 5 "unknown, clock-like" TRUE
7 | SBS6 6 mismatch repair defects TRUE
8 | SBS7a 7a UV light TRUE
9 | SBS7b 7b UV light TRUE
10 | SBS7c 7c UV light TRUE
11 | SBS7d 7d UV light TRUE
12 | SBS8 8 unknown etiology TRUE
13 | SBS9 9 polymerase eta somatic hypermutation TRUE
14 | SBS10a 10a defective polymerase epsilon polymerase epsilon exonuclease domain mutations TRUE
15 | SBS10b 10b defective polymerase epsilon polymerase epsilon exonuclease domain mutations TRUE
16 | SBS10c 10c defective POLD1 proofreading TRUE
17 | SBS10d 10d defective POLD1 proofreading TRUE
18 | SBS11 11 chemotherapeutic agents temozolomide treatment TRUE
19 | SBS12 12 unknown etiology TRUE
20 | SBS13 13 APOBEC activity TRUE
21 | SBS14 14 mismatch repair defects concurrent polymerase epsilon mutation and mismatch repair defect TRUE
22 | SBS15 15 mismatch repair defects TRUE
23 | SBS16 16 associated with alcohol consumption COSMIC describes etiology as unknown TRUE
24 | SBS17a 17a unknown etiology TRUE
25 | SBS17b 17b unknown etiology TRUE
26 | SBS18 18 reactive oxygen species TRUE
27 | SBS19 19 unknown etiology TRUE
28 | SBS20 20 mismatch repair defects concurrent POLD1 mutation and defective DNA mismatch repair TRUE
29 | SBS21 21 mismatch repair defects TRUE
30 | SBS22 22 aristolochic acid exposure FALSE
31 | SBS22a 22a aristolochic acid exposure TRUE
32 | SBS22b 22b aristolochic acid exposure TRUE
33 | SBS23 23 unknown etiology TRUE
34 | SBS24 24 aflatoxin exposure TRUE
35 | SBS25 25 chemotherapeutic agents COSMIC considers unknown but notes presence in chemotherapy-exposed Hodgkin's lymphoma cell lines TRUE
36 | SBS26 26 mismatch repair defects TRUE
37 | SBS27 27 likely or possible artifact TRUE
38 | SBS28 28 unknown etiology TRUE
39 | SBS29 29 tobacco-related tobacco chewing TRUE
40 | SBS30 30 BER deficiency due to NTHL1 mutation TRUE
41 | SBS31 31 chemotherapeutic agents platinum chemotherapy treatment TRUE
42 | SBS32 32 chemotherapeutic agents azathioprine treatment TRUE
43 | SBS33 33 unknown etiology TRUE
44 | SBS34 34 unknown etiology TRUE
45 | SBS35 35 chemotherapeutic agents platinum chemotherapy treatment TRUE
46 | SBS36 36 BER deficiency due to MUTYH mutation TRUE
47 | SBS37 37 unknown etiology TRUE
48 | SBS38 38 UV light indirect effects of UV exposure TRUE
49 | SBS39 39 unknown etiology TRUE
50 | SBS40 40 unknown etiology FALSE
51 | SBS40a 40a unknown etiology TRUE
52 | SBS40b 40b unknown etiology TRUE
53 | SBS40c 40c unknown etiology TRUE
54 | SBS41 41 unknown etiology TRUE
55 | SBS42 42 occupational exposure to haloalkanes TRUE
56 | SBS43 43 likely or possible artifact TRUE
57 | SBS44 44 mismatch repair defects TRUE
58 | SBS45 45 likely or possible artifact TRUE
59 | SBS46 46 likely or possible artifact TRUE
60 | SBS47 47 likely or possible artifact TRUE
61 | SBS48 48 likely or possible artifact TRUE
62 | SBS49 49 likely or possible artifact TRUE
63 | SBS50 50 likely or possible artifact TRUE
64 | SBS51 51 likely or possible artifact TRUE
65 | SBS52 52 likely or possible artifact TRUE
66 | SBS53 53 likely or possible artifact TRUE
67 | SBS54 54 likely or possible artifact TRUE
68 | SBS55 55 likely or possible artifact TRUE
69 | SBS56 56 likely or possible artifact TRUE
70 | SBS57 57 likely or possible artifact TRUE
71 | SBS58 58 likely or possible artifact TRUE
72 | SBS59 59 likely or possible artifact TRUE
73 | SBS60 60 likely or possible artifact TRUE
74 | SBS84 84 activation-induced cytidine deaminases activity of AIDs TRUE
75 | SBS85 85 activation-induced cytidine deaminases indirect effects of AIDs TRUE
76 | SBS86 86 chemotherapeutic agents unknown chemotherapy treatment TRUE
77 | SBS87 87 chemotherapeutic agents thiopurine treatment TRUE
78 | SBS88 88 colibactin exposure TRUE
79 | SBS89 89 unknown etiology TRUE
80 | SBS90 90 chemotherapeutic agents duocarmycin exposure TRUE
81 | SBS91 91 unknown etiology TRUE
82 | SBS92 92 tobacco-related affects bladder tissue of tobacco smokers TRUE
83 | SBS93 93 unknown etiology TRUE
84 | SBS94 94 unknown etiology TRUE
85 | SBS95 95 likely or possible artifact TRUE
86 | SBS96 96 unknown etiology TRUE
87 | SBS97 97 unknown etiology TRUE
88 | SBS98 98 unknown etiology TRUE
89 | SBS99 99 chemotherapeutic agents melphalan exposure TRUE
--------------------------------------------------------------------------------
/inst/extdata/trinuc_snv_to_deconstructSigs_ID.txt:
--------------------------------------------------------------------------------
1 | context mutation deconstructSigs_ID
2 | ATA A A[T>A]A
3 | TTA A T[T>A]A
4 | GTA A G[T>A]A
5 | CTA A C[T>A]A
6 | AGA A T[C>T]T
7 | TGA A T[C>T]A
8 | GGA A T[C>T]C
9 | CGA A T[C>T]G
10 | ACA A A[C>A]A
11 | TCA A T[C>A]A
12 | GCA A G[C>A]A
13 | CCA A C[C>A]A
14 | ATT A A[T>A]T
15 | TTT A T[T>A]T
16 | GTT A G[T>A]T
17 | CTT A C[T>A]T
18 | AGT A A[C>T]T
19 | TGT A A[C>T]A
20 | GGT A A[C>T]C
21 | CGT A A[C>T]G
22 | ACT A A[C>A]T
23 | TCT A T[C>A]T
24 | GCT A G[C>A]T
25 | CCT A C[C>A]T
26 | ATG A A[T>A]G
27 | TTG A T[T>A]G
28 | GTG A G[T>A]G
29 | CTG A C[T>A]G
30 | AGG A C[C>T]T
31 | TGG A C[C>T]A
32 | GGG A C[C>T]C
33 | CGG A C[C>T]G
34 | ACG A A[C>A]G
35 | TCG A T[C>A]G
36 | GCG A G[C>A]G
37 | CCG A C[C>A]G
38 | ATC A A[T>A]C
39 | TTC A T[T>A]C
40 | GTC A G[T>A]C
41 | CTC A C[T>A]C
42 | AGC A G[C>T]T
43 | TGC A G[C>T]A
44 | GGC A G[C>T]C
45 | CGC A G[C>T]G
46 | ACC A A[C>A]C
47 | TCC A T[C>A]C
48 | GCC A G[C>A]C
49 | CCC A C[C>A]C
50 | AAA T T[T>A]T
51 | TAA T T[T>A]A
52 | GAA T T[T>A]C
53 | CAA T T[T>A]G
54 | AGA T T[C>A]T
55 | TGA T T[C>A]A
56 | GGA T T[C>A]C
57 | CGA T T[C>A]G
58 | ACA T A[C>T]A
59 | TCA T T[C>T]A
60 | GCA T G[C>T]A
61 | CCA T C[C>T]A
62 | AAT T A[T>A]T
63 | TAT T A[T>A]A
64 | GAT T A[T>A]C
65 | CAT T A[T>A]G
66 | AGT T A[C>A]T
67 | TGT T A[C>A]A
68 | GGT T A[C>A]C
69 | CGT T A[C>A]G
70 | ACT T A[C>T]T
71 | TCT T T[C>T]T
72 | GCT T G[C>T]T
73 | CCT T C[C>T]T
74 | AAG T C[T>A]T
75 | TAG T C[T>A]A
76 | GAG T C[T>A]C
77 | CAG T C[T>A]G
78 | AGG T C[C>A]T
79 | TGG T C[C>A]A
80 | GGG T C[C>A]C
81 | CGG T C[C>A]G
82 | ACG T A[C>T]G
83 | TCG T T[C>T]G
84 | GCG T G[C>T]G
85 | CCG T C[C>T]G
86 | AAC T G[T>A]T
87 | TAC T G[T>A]A
88 | GAC T G[T>A]C
89 | CAC T G[T>A]G
90 | AGC T G[C>A]T
91 | TGC T G[C>A]A
92 | GGC T G[C>A]C
93 | CGC T G[C>A]G
94 | ACC T A[C>T]C
95 | TCC T T[C>T]C
96 | GCC T G[C>T]C
97 | CCC T C[C>T]C
98 | AAA G T[T>C]T
99 | TAA G T[T>C]A
100 | GAA G T[T>C]C
101 | CAA G T[T>C]G
102 | ATA G A[T>G]A
103 | TTA G T[T>G]A
104 | GTA G G[T>G]A
105 | CTA G C[T>G]A
106 | ACA G A[C>G]A
107 | TCA G T[C>G]A
108 | GCA G G[C>G]A
109 | CCA G C[C>G]A
110 | AAT G A[T>C]T
111 | TAT G A[T>C]A
112 | GAT G A[T>C]C
113 | CAT G A[T>C]G
114 | ATT G A[T>G]T
115 | TTT G T[T>G]T
116 | GTT G G[T>G]T
117 | CTT G C[T>G]T
118 | ACT G A[C>G]T
119 | TCT G T[C>G]T
120 | GCT G G[C>G]T
121 | CCT G C[C>G]T
122 | AAG G C[T>C]T
123 | TAG G C[T>C]A
124 | GAG G C[T>C]C
125 | CAG G C[T>C]G
126 | ATG G A[T>G]G
127 | TTG G T[T>G]G
128 | GTG G G[T>G]G
129 | CTG G C[T>G]G
130 | ACG G A[C>G]G
131 | TCG G T[C>G]G
132 | GCG G G[C>G]G
133 | CCG G C[C>G]G
134 | AAC G G[T>C]T
135 | TAC G G[T>C]A
136 | GAC G G[T>C]C
137 | CAC G G[T>C]G
138 | ATC G A[T>G]C
139 | TTC G T[T>G]C
140 | GTC G G[T>G]C
141 | CTC G C[T>G]C
142 | ACC G A[C>G]C
143 | TCC G T[C>G]C
144 | GCC G G[C>G]C
145 | CCC G C[C>G]C
146 | AAA C T[T>G]T
147 | TAA C T[T>G]A
148 | GAA C T[T>G]C
149 | CAA C T[T>G]G
150 | ATA C A[T>C]A
151 | TTA C T[T>C]A
152 | GTA C G[T>C]A
153 | CTA C C[T>C]A
154 | AGA C T[C>G]T
155 | TGA C T[C>G]A
156 | GGA C T[C>G]C
157 | CGA C T[C>G]G
158 | AAT C A[T>G]T
159 | TAT C A[T>G]A
160 | GAT C A[T>G]C
161 | CAT C A[T>G]G
162 | ATT C A[T>C]T
163 | TTT C T[T>C]T
164 | GTT C G[T>C]T
165 | CTT C C[T>C]T
166 | AGT C A[C>G]T
167 | TGT C A[C>G]A
168 | GGT C A[C>G]C
169 | CGT C A[C>G]G
170 | AAG C C[T>G]T
171 | TAG C C[T>G]A
172 | GAG C C[T>G]C
173 | CAG C C[T>G]G
174 | ATG C A[T>C]G
175 | TTG C T[T>C]G
176 | GTG C G[T>C]G
177 | CTG C C[T>C]G
178 | AGG C C[C>G]T
179 | TGG C C[C>G]A
180 | GGG C C[C>G]C
181 | CGG C C[C>G]G
182 | AAC C G[T>G]T
183 | TAC C G[T>G]A
184 | GAC C G[T>G]C
185 | CAC C G[T>G]G
186 | ATC C A[T>C]C
187 | TTC C T[T>C]C
188 | GTC C G[T>C]C
189 | CTC C C[T>C]C
190 | AGC C G[C>G]T
191 | TGC C G[C>G]A
192 | GGC C G[C>G]C
193 | CGC C G[C>G]G
194 |
--------------------------------------------------------------------------------
/inst/tutorial/BRCA_cesa_gene_rates.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/BRCA_cesa_gene_rates.rds
--------------------------------------------------------------------------------
/inst/tutorial/BRCA_cesa_samples.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/BRCA_cesa_samples.rds
--------------------------------------------------------------------------------
/inst/tutorial/BRCA_dndscv_out.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/BRCA_dndscv_out.rds
--------------------------------------------------------------------------------
/inst/tutorial/BRCA_effects_in_top_genes.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/BRCA_effects_in_top_genes.rds
--------------------------------------------------------------------------------
/inst/tutorial/BRCA_epistasis_example.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/BRCA_epistasis_example.rds
--------------------------------------------------------------------------------
/inst/tutorial/BRCA_site_rates_example.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/BRCA_site_rates_example.rds
--------------------------------------------------------------------------------
/inst/tutorial/BRCA_snv_counts.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/BRCA_snv_counts.rds
--------------------------------------------------------------------------------
/inst/tutorial/LUAD_sig_effects.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/LUAD_sig_effects.rds
--------------------------------------------------------------------------------
/inst/tutorial/comp_variant_ep.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/comp_variant_ep.rds
--------------------------------------------------------------------------------
/inst/tutorial/gene_ep_example.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/gene_ep_example.rds
--------------------------------------------------------------------------------
/inst/tutorial/metastatic_breast_2021_license.txt:
--------------------------------------------------------------------------------
1 | License included in Metastatic Breast Cancer MAF data source:
2 | The data are available under the ODC Open Database License (ODbL)(http://opendatacommons.org/licenses/odbl/1.0/) (summary available here: http://www.opendatacommons.org/licenses/odbl/1-0/summary/): you are free to share and modify the data so long as you attribute any public use of the database, or works produced from the database; keep the resulting data-sets open; and offer your shared or adapted version of the data-set under the same ODbL license.
3 |
4 |
--------------------------------------------------------------------------------
/inst/tutorial/sequential_signif_output.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/sequential_signif_output.rds
--------------------------------------------------------------------------------
/inst/tutorial/top_BRCA_effects.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/top_BRCA_effects.rds
--------------------------------------------------------------------------------
/inst/tutorial/top_LUAD_effects.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/top_LUAD_effects.rds
--------------------------------------------------------------------------------
/inst/tutorial/variant_ep_example.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/inst/tutorial/variant_ep_example.rds
--------------------------------------------------------------------------------
/man/CESAnalysis.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{CESAnalysis}
4 | \alias{CESAnalysis}
5 | \title{Create a cancereffectsizeR analysis}
6 | \usage{
7 | CESAnalysis(refset = NULL)
8 | }
9 | \arguments{
10 | \item{refset}{Name of reference data set (refset) to use; run \code{list_ces_refsets()} for
11 | available refsets. Alternatively, the path to a custom reference data directory.}
12 | }
13 | \value{
14 | CESAnalysis object
15 | }
16 | \description{
17 | Creates a CESAnalysis, the central data structure of cancereffectsizeR.
18 | }
19 |
--------------------------------------------------------------------------------
/man/CompoundVariantSet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compound_variants.R
3 | \name{CompoundVariantSet}
4 | \alias{CompoundVariantSet}
5 | \title{Create CompoundVariantSet from variant IDs}
6 | \usage{
7 | CompoundVariantSet(cesa, variant_id)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis (used to access variant annotations)}
11 |
12 | \item{variant_id}{Vector of variant IDs to include in one compound variant, or a list of
13 | vectors, each of which defines a separate compound variant. If the vector or list is
14 | named, names will be kept. Otherwise, compound variants will be named sequentially.}
15 | }
16 | \description{
17 | A CompoundVariantSet is a collection of "compound variants". A compound variant is an arbitrary
18 | group of variants that have sequencing coverage across some set of samples. (Any of these samples
19 | with one or more of the constituent SNVs "has the compound variant"--samples with coverage at
20 | only some of the sites are not considered.) The compound variants within a CompoundVariantSet are
21 | always disjoint: that is, no individual variant appears in more than one of the compound
22 | variants.
23 | }
24 | \details{
25 | Example: \code{CompoundVariantSet(cesa, variant_id = list(kras12 = c("KRAS G12C", "KRAS G12D",
26 | "KRAS G12V")))} creates a CompoundVariantSet containing one compound variant. To create
27 | a large set, it's usually easier to use define_compound_variants(), which calls this
28 | function internally to define compound variants from an input variant table.
29 |
30 | If you're using this function because you have a complex use case that
31 | define_compound_variants() can't handle, please let us know so we can try to make
32 | improvements!
33 | }
34 |
--------------------------------------------------------------------------------
/man/aac_to_snv_ids.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/add_variants.R
3 | \name{aac_to_snv_ids}
4 | \alias{aac_to_snv_ids}
5 | \title{Get SNVs that cause an amino acid change}
6 | \usage{
7 | aac_to_snv_ids(refcds_entry_name, aa_pos, aa_alt, bsg, refcds)
8 | }
9 | \arguments{
10 | \item{aa_pos}{Integer position of substitution on the transcript.}
11 |
12 | \item{aa_alt}{Identity of substitution, either a three-letter code ("Lys") or "STOP"}
13 |
14 | \item{bsg}{A BSgenome object for the genome build associated with the RefCDS entry}
15 |
16 | \item{refcds_entry}{A RefCDS entry for the relevant transcript}
17 | }
18 | \description{
19 | An internal function to figure out the SNVs that can cause a given amino acid substitution in a transcript
20 | }
21 | \keyword{internal}
22 |
--------------------------------------------------------------------------------
/man/add_covered_regions.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/covered_regions_handling.R
3 | \name{add_covered_regions}
4 | \alias{add_covered_regions}
5 | \title{add_covered_regions}
6 | \usage{
7 | add_covered_regions(
8 | target_cesa = NULL,
9 | source_cesa = NULL,
10 | covered_regions = NULL,
11 | covered_regions_name = NULL,
12 | coverage_type = NULL,
13 | covered_regions_padding = 0
14 | )
15 | }
16 | \arguments{
17 | \item{target_cesa}{CESAnalysis with annotated variants that the covered regions will be added to}
18 |
19 | \item{source_cesa}{Another CESAnalysis to copy all covered regions from}
20 |
21 | \item{covered_regions}{A GRanges object or BED file path with genome build matching the target_cesa,
22 | if not using source_cesa}
23 |
24 | \item{covered_regions_name}{A name to identify the covered regions, if not using source_cesa}
25 |
26 | \item{coverage_type}{exome, genome, or targeted (if not using source_cesa)}
27 |
28 | \item{covered_regions_padding}{optionally, add +/- this many bp to each interval in covered_regions}
29 | }
30 | \value{
31 | CESAnalysis given in target_cesa, with the new covered regions added
32 | }
33 | \description{
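34 | Add covered regions to a CESAnalysis, either by supplying them directly (as a GRanges object or BED file path) or by copying all covered regions from another CESAnalysis.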
35 | }
36 |
--------------------------------------------------------------------------------
/man/add_variants.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/add_variants.R
3 | \name{add_variants}
4 | \alias{add_variants}
5 | \title{Add variant annotations}
6 | \usage{
7 | add_variants(
8 | target_cesa = NULL,
9 | variant_table = NULL,
10 | snv_id = NULL,
11 | aac_id = NULL,
12 | bed = NULL,
13 | gr = NULL,
14 | source_cesa = NULL,
15 | padding = 0
16 | )
17 | }
18 | \arguments{
19 | \item{target_cesa}{CESAnalysis to receive variant annotations}
20 |
21 | \item{variant_table}{A data.table with chr/start/end positions (1-based closed
22 | coordinates, like MAF format). All possible SNVs overlapping the table's genomic
23 | coordinates (within \code{padding} bases) will be added. The tables returned by
24 | select_variants() and (CESAnalysis)$variants work, and get special handling of
25 | amino-acid-change SNVs: only the precise positions in start, end, and center_nt_pos
26 | are used. (This avoids adding all variants between start/end, which on
27 | splice-site-spanning variants can be many thousands.)}
28 |
29 | \item{snv_id}{Character vector of CES-style SNV IDs to add.}
30 |
31 | \item{aac_id}{Character vector of AAC IDs (or short names, like "KRAS_G12C")}
32 |
33 | \item{bed}{A path to a BED file. All possible SNVs overlapping BED intervals (within
34 | \code{padding} bases) will be added.}
35 |
36 | \item{gr}{A GRanges object. All possible SNVs overlapping the ranges (within \code{padding}
37 | bases) will be added.}
38 |
39 | \item{source_cesa}{Another CESAnalysis from which to copy snv_ids. SNVs will be
40 | re-annotated using the target_cesa's associated reference data.}
41 |
42 | \item{padding}{How many bases (default 0) to expand start and end of each gr range}
43 | }
44 | \description{
45 | Use this function to add variant annotations to your CESAnalysis by specifying variants
46 | to add in one of five ways: a data.table containing genomic coordinates (output from
47 | select_variants(), typically), a GRanges object, a BED file, another CESAnalysis, or
48 | SNV/AAC IDs.
49 | }
50 | \details{
51 | All methods of adding variants work by identifying which SNVs to add and then using the
52 | target_cesa's associated reference data to identify overlapping amino-acid-change
53 | mutations, which are then added as well. (You can't add just SNVs or just AACs.) Note
54 | that if you try to add far more distinct variants than appear in a typical cohort (as
55 | in, millions), annotation will take a while and the annotation tables in the
56 | CESAnalysis may take up significant memory. Please contact us if you have issues.
57 | }
58 |
--------------------------------------------------------------------------------
/man/annotate_variants.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/annotate_variants.R
3 | \name{annotate_variants}
4 | \alias{annotate_variants}
5 | \title{Annotate variants}
6 | \usage{
7 | annotate_variants(refset = NULL, variants = NULL)
8 | }
9 | \arguments{
10 | \item{refset}{CES reference data set (e.g., from the ces.refset.hg19 data package)}
11 |
12 | \item{variants}{MAF-like data.table of variants (e.g., as generated by preload_maf())}
13 | }
14 | \description{
15 | Annotates CESAnalysis MAF data with reference genome and gene data; called by load_maf
16 | }
17 | \keyword{internal}
18 |
--------------------------------------------------------------------------------
/man/artifact_account.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/set_signature_weights.R
3 | \name{artifact_account}
4 | \alias{artifact_account}
5 | \title{Calculate relative rates of biological mutational processes}
6 | \usage{
7 | artifact_account(
8 | weights,
9 | signature_names,
10 | artifact_signatures = NULL,
11 | fail_if_zeroed = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{weights}{data.table of signature weights (can have extra columns)}
16 |
17 | \item{signature_names}{names of signatures in weights (i.e., all column names)}
18 |
19 | \item{artifact_signatures}{vector of artifact signature names (or NULL)}
20 |
21 | \item{fail_if_zeroed}{T/F on whether to exit if a tumor would have all-zero weights.}
22 | }
23 | \description{
24 | Sets artifact signature weights to zero and normalizes so that biologically-associated
25 | weights sum to (1 - unattributed proportion) in each sample.
26 | }
27 | \keyword{internal}
28 |
--------------------------------------------------------------------------------
/man/assign_gr_to_coverage.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/covered_regions_handling.R
3 | \name{assign_gr_to_coverage}
4 | \alias{assign_gr_to_coverage}
5 | \title{assign_gr_to_coverage}
6 | \usage{
7 | assign_gr_to_coverage(cesa, gr, covered_regions_name, coverage_type)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis to receive the gr}
11 |
12 | \item{gr}{GRanges}
13 |
14 | \item{covered_regions_name}{unique name for the covered regions}
15 |
16 | \item{coverage_type}{"exome" or "targeted"}
17 | }
18 | \description{
19 | Adds a validated GRanges object as a CESAnalysis's coverage set. Called by
20 | add_covered_regions() after various checks pass.
21 | }
22 | \details{
23 | Special handling occurs if covered_regions_name is "exome+".
24 | }
25 | \keyword{internal}
26 |
--------------------------------------------------------------------------------
/man/assign_group_average_trinuc_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/assign_group_average_trinuc_rates.R
3 | \name{assign_group_average_trinuc_rates}
4 | \alias{assign_group_average_trinuc_rates}
5 | \title{Skip mutational signature analysis and assign group average relative trinucleotide-context-specific mutation rates to all samples}
6 | \usage{
7 | assign_group_average_trinuc_rates(cesa)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | This function calculates the relative rates of trinucleotide-context-specific mutations across
14 | all SNV records from whole-exome and whole-genome MAF data and naively assigns these rates to all samples.
15 | This can be helpful if you do not have SNV mutational signatures available for your species, or if
16 | you want to assume that all samples share the same SNV mutational processes without relying on signatures.
17 | Normally, if mutational signatures are available, it is better to use trinuc_snv_mutation_rates().
18 | }
19 | \details{
20 | To reduce the influence of selection, only non-recurrent mutations (i.e., mutations that occur
21 | in just one sample) are used to calculate the rates. Targeted sequencing data is excluded for
22 | the same reason, and also because the trinucleotide composition of targeted regions could be
23 | very different from that of the exome/genome.
24 | }
25 |
--------------------------------------------------------------------------------
/man/baseline_mutation_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mutation_rate_calc.R
3 | \name{baseline_mutation_rates}
4 | \alias{baseline_mutation_rates}
5 | \title{Baseline mutation rate calculation}
6 | \usage{
7 | baseline_mutation_rates(
8 | cesa,
9 | aac_ids = NULL,
10 | snv_ids = NULL,
11 | variant_ids = NULL,
12 | samples = character()
13 | )
14 | }
15 | \arguments{
16 | \item{cesa}{CESAnalysis with gene mutation rates and tumor-specific trinucleotide-context-specific mutation rates already calculated}
17 |
18 | \item{aac_ids}{vector of IDs for amino acid change variants}
19 |
20 | \item{snv_ids}{vector of IDs for SNVs}
21 |
22 | \item{variant_ids}{vector of mixed IDs (faster to use snv_ids and aac_ids for large jobs, if already known)}
23 |
24 | \item{samples}{Which samples to calculate rates for. Defaults to all samples. Can be a
25 | vector of Unique_Patient_Identifiers, or a data.table containing rows from the
26 | CESAnalysis sample table.}
27 | }
28 | \value{
29 | a data table of mutation rates with one column per variant, and a Unique_Patient_Identifier column identifying each row
30 | }
31 | \description{
32 | Calculates neutral mutation rates at specific sites based on gene mutation rates and the relative
33 | trinucleotide-context-specific SNV mutation rates of each sample
34 | }
35 |
--------------------------------------------------------------------------------
/man/build_RefCDS.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/build_RefCDS.R
3 | \name{build_RefCDS}
4 | \alias{build_RefCDS}
5 | \title{cancereffectsizeR's RefCDS builder}
6 | \usage{
7 | build_RefCDS(
8 | gtf,
9 | genome,
10 | use_all_transcripts = TRUE,
11 | cds_ranges_lack_stop_codons = TRUE,
12 | cores = 1,
13 | additional_essential_splice_pos = NULL,
14 | numcode = 1,
15 | chromosome_style = "NCBI"
16 | )
17 | }
18 | \arguments{
19 | \item{gtf}{Path of a Gencode-style GTF file, or an equivalently formatted data
20 | table. See details for required columns (features). It's possible to build such a
21 | table using data pulled from biomaRt, but it's easier to use a GTF.}
22 |
23 | \item{genome}{Genome assembly name (e.g., "hg19"); an associated BSgenome object must be
24 | available to load. Alternatively, supply a BSgenome object directly.}
25 |
26 | \item{use_all_transcripts}{T/F (default TRUE): Whether to use all complete transcripts or just the longest
27 | one for each gene.}
28 |
29 | \item{cds_ranges_lack_stop_codons}{The CDS records in Gencode GTFs don't include the
30 | stop codons in their genomic intervals. If your input does include the stop
31 | codons within CDS records, set to FALSE.}
32 |
33 | \item{cores}{how many cores to use for parallel computations}
34 |
35 | \item{additional_essential_splice_pos}{Usually not needed. A list of
36 | additional essential splice site positions to combine with those calculated
37 | automatically by this function. Each element of the list should have a name
38 | matching a protein_id in the input and consist of a numeric vector of
39 | additional positions. This option exists so that mutations at chr17:7579312
40 | on TP53 are treated as splice site mutations in cancereffectsizeR's default
41 | hg19 reference data set. (Variants at this coding position, which are
42 | always synonymous, have validated effects on splicing, even though the
43 | position misses automatic "essential splice" annotation by 1 base.)}
44 |
45 | \item{numcode}{(don't use) NCBI genetic code number; currently only code 1, the
46 | standard genetic code, is supported}
47 |
48 | \item{chromosome_style}{Chromosome naming style to use. Defaults to "NCBI". For the human
49 | genome, that means 1, 2,..., as opposed to "UCSC" style (chr1, chr2, ...). Value gets
50 | passed to GenomeInfoDb's seqlevelsStyle().}
51 | }
52 | \value{
53 | A two-item list: RefCDS (which is itself a big list, with each entry containing
54 | information on one coding sequence (CDS)), and a GRanges object that defines the
55 | genomic intervals covered by each CDS.
56 | }
57 | \description{
58 | Based on the buildref function in Inigo Martincorena's package dNdScv, this function
59 | takes in gene/transcript/CDS definitions and creates a dNdScv-style RefCDS object and
60 | an associated GenomicRanges object also required to run dNdScv.
61 | }
62 | \details{
63 | Required columns are seqnames, start, end, strand, gene_name, gene_id, protein_id, and
64 | type. Only rows that have type == "CDS" will be used. Strand should be
65 | "+" or "-".
66 |
67 | By default (use_all_transcripts = TRUE), all complete transcripts from each gene in the
68 | input are used, resulting in multiple RefCDS entries for some genes. With this setting, you
69 | may want to first eliminate low-confidence or superfluous transcripts from the input data.
70 | If you set use_all_transcripts = FALSE, only the longest complete transcript is used from each gene.
71 | }
72 |
--------------------------------------------------------------------------------
/man/calculate_trinuc_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/set_signature_weights.R
3 | \name{calculate_trinuc_rates}
4 | \alias{calculate_trinuc_rates}
5 | \title{Calculate trinuc rates}
6 | \usage{
7 | calculate_trinuc_rates(weights, signatures, tumor_names)
8 | }
9 | \arguments{
10 | \item{weights}{matrix of signature weights}
11 |
12 | \item{signatures}{matrix of signatures}
13 |
14 | \item{tumor_names}{names of tumors corresponding to rows of weights}
15 | }
16 | \value{
17 | matrix of trinuc rates where each row corresponds to a tumor
18 | }
19 | \description{
20 | Used internally to calculate trinuc rates from signature weights
21 | }
22 | \details{
23 | If any relative rate is less than 1e-9, we add the lowest above-threshold rate to all
24 | rates and renormalize rates so that they sum to 1.
25 | }
26 | \keyword{internal}
27 |
--------------------------------------------------------------------------------
/man/ces_epistasis.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/epistasis_wrappers.R
3 | \name{ces_epistasis}
4 | \alias{ces_epistasis}
5 | \title{Variant-level pairwise epistasis}
6 | \usage{
7 | ces_epistasis(
8 | cesa = NULL,
9 | variants = NULL,
10 | samples = character(),
11 | run_name = "auto",
12 | cores = 1,
13 | model = "default",
14 | lik_args = list(),
15 | pval_calc_fn = NULL,
16 | optimizer_args = list(),
17 | conf = 0.95,
18 | return_fit = FALSE
19 | )
20 | }
21 | \arguments{
22 | \item{cesa}{CESAnalysis}
23 |
24 | \item{variants}{To test pairs of variants, supply a list where each element is a
25 | 2-length vector of CES-style variant IDs. Alternatively (and often more usefully),
26 | supply a CompoundVariantSet (see \code{define_compound_variants()}) to test all pairs
27 | of compound variants in the set.}
28 |
29 | \item{samples}{Which samples to include in inference. Defaults to all samples.
30 | Can be a vector of Unique_Patient_Identifiers, or a data.table containing rows from
31 | the CESAnalysis sample table.}
32 |
33 | \item{run_name}{Optionally, a name to identify the current run.}
34 |
35 | \item{cores}{number of cores for parallel processing of variant pairs}
36 |
37 | \item{model}{Set to "default" to use built-in
38 | model of epistatic selection, or supply a custom function factory (see details).}
39 |
40 | \item{lik_args}{Extra arguments, given as a list, to pass to custom likelihood functions.}
41 |
42 | \item{pval_calc_fn}{For use with custom models; optional. A function that takes an epistasis
43 | model fit as input and returns p-values and other descriptives.}
44 |
45 | \item{optimizer_args}{List of arguments to pass to the optimizer (bbmle::mle2).}
46 |
47 | \item{conf}{confidence interval size from 0 to 1 (.95 -> 95\%); NULL skips calculation,
48 | reducing runtime.}
49 |
50 | \item{return_fit}{TRUE/FALSE (default FALSE): Embed epistatic model fits for each variant pair in
51 | a "fit" attribute of the epistasis results table. Use \code{attr(my_results, 'fit')} to access
52 | the list of fitted models.}
53 | }
54 | \value{
55 | CESAnalysis with a table of epistatic inferences appended to list \code{[CESAnalysis]$epistasis}. Some column definitions:
56 | \itemize{
57 | \item variant_A, variant_B: Names for the two variants or merged sets of variants in each
58 | epistatic inference. For brevity in the case of merged variant sets, we say that a sample with
59 | any variant in variant set A "has variant A."
60 | \item ces_A0: Cancer effect (scaled selection coefficient) of variant A that acts in the absence of variant B.
61 | \item ces_B0: Cancer effect of variant B that acts in the absence of variant A.
62 | \item ces_A_on_B: Cancer effect of variant A that acts when a sample already has variant B.
63 | \item ces_B_on_A: Cancer effect of variant B that acts when a sample already has variant A.
64 | \item p_A_change: P-value of likelihood ratio test (LRT) that informs whether selection for
65 | variant A significantly changes after acquiring variant B. The LRT compares the likelihood of
66 | the full epistatic model to that of a reduced model in which ces_A0 and ces_A_on_B are set
67 | equal. The p-value is the probability, under the reduced model, of the likelihood ratio being
68 | greater than or equal to the ratio observed.
69 | \item p_B_change: P-value of likelihood ratio test (LRT) that informs whether selection for
70 | variant B significantly changes after acquiring variant A. The LRT compares the likelihood of
71 | the full epistatic model to that of a reduced model in which ces_B0 and ces_B_on_A are set
72 | equal. The p-value is the probability, under the reduced model, of the likelihood ratio being
73 | greater than or equal to the ratio observed.
74 | \item p_epistasis: P-value of likelihood ratio test that informs whether the epistatic model
75 | better explains the mutation data than a non-epistatic model in which selection for mutations
76 | in each variant is independent of the mutation status of the other variant. Quite often, p_epistasis
77 | will suggest a significant epistatic effect even though p_A_change and p_B_change do not suggest
78 | significant changes in selection for either variant individually. This is because the degree of
79 | co-occurrence can often be explained equally well by a strong change in selection for either variant.
80 | \item expected_nAB_epistasis: The expected number of samples with both A and B mutated under the fitted epistatic model.
81 | Typically, this will be very close to the actual number of AB samples (nAB).
82 | \item expected_nAB_null: The expected number of samples with both A and B mutated under a no-epistasis model.
83 | \item AB_epistatic_ratio: The ratio \code{expected_nAB_epistasis/expected_nAB_null}. Useful to gauge the overall
84 | impact of epistatic interactions on the co-occurrence of variants A and B. Since the expectations take mutation rates into account,
85 | this ratio is a better indicator than the relative frequencies of A0, B0, AB, 00 in the data set.
86 | \item nA0, nB0, nAB, n00: Number of (included) samples with mutations in just A, just B, both A and B, and neither.
87 | \item ces_A_null, ces_B_null: Cancer effects of A and B when estimated independently; that is,
88 | effects under a no-epistasis model.
89 | }
90 | }
91 | \description{
92 | Calculate selection intensity under an assumption of pairwise epistasis between pairs of variants.
93 | CompoundVariantSets are supported.
94 | }
95 |
--------------------------------------------------------------------------------
/man/ces_variant.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ces_variant.R
3 | \name{ces_variant}
4 | \alias{ces_variant}
5 | \title{Calculate cancer effects of variants}
6 | \usage{
7 | ces_variant(
8 | cesa = NULL,
9 | variants = select_variants(cesa, min_freq = 2),
10 | samples = character(),
11 | model = "default",
12 | run_name = "auto",
13 | lik_args = list(),
14 | optimizer_args = if (identical(model, "default")) list(method = "L-BFGS-B", lower =
15 | 0.001, upper = 1e+09) else list(),
16 | return_fit = FALSE,
17 | hold_out_same_gene_samples = "auto",
18 | cores = 1,
19 | conf = 0.95
20 | )
21 | }
22 | \arguments{
23 | \item{cesa}{CESAnalysis object}
24 |
25 | \item{variants}{Which variants to estimate effects for, specified with a variant table such as
26 | from \code{[CESAnalysis]$variants} or \code{select_variants()}, or a \code{CompoundVariantSet}
27 | from \code{define_compound_variants()}. Defaults to all recurrent mutations; that is,
28 | \code{[CESAnalysis]$variants[maf_prevalence > 1]}. To include all variants, set to
29 | \code{[CESAnalysis]$variants}.}
30 |
31 | \item{samples}{Which samples to include in inference. Defaults to all samples. Can be a vector of
32 | Unique_Patient_Identifiers, or a data.table containing rows from the CESAnalysis sample table.}
33 |
34 | \item{model}{Set to "default" (the default) or "sequential" (not yet available) to use built-in
35 | models of selection, or supply a custom function factory (see details).}
36 |
37 | \item{run_name}{Optionally, a name to identify the current run.}
38 |
39 | \item{lik_args}{Extra arguments, given as a list, to pass to custom likelihood functions.}
40 |
41 | \item{optimizer_args}{Named list of arguments to pass to the optimizer, bbmle::mle2. Use, for example,
42 | to choose optimization algorithm or parameter boundaries on custom models.}
43 |
44 | \item{return_fit}{TRUE/FALSE (default FALSE): Embed model fit for each variant in a "fit"
45 | attribute of the selection results table. Use \code{attr(selection_table, 'fit')} to access the
46 | list of fitted models. Defaults to FALSE to save memory. Model fit objects can be of moderate
47 | or large size. If you run thousands of variants at once, you may exhaust your system memory.}
48 |
49 | \item{hold_out_same_gene_samples}{When finding likelihood of each variant, hold out samples that
50 | lack the variant but have any other mutations in the same gene. By default, TRUE when running
51 | with single variants, FALSE with a CompoundVariantSet.}
52 |
53 | \item{cores}{Number of cores to use for processing variants in parallel (not useful for Windows
54 | systems).}
55 |
56 | \item{conf}{Cancer effect confidence interval width (NULL skips calculation, speeds runtime). Ignored
57 | when running custom models.}
58 | }
59 | \value{
60 | CESAnalysis object with selection results appended to the selection output list
61 | }
62 | \description{
63 | This function calculates variant effect sizes under the chosen model of selection. Under the
64 | default model, a variant is assumed to have a consistent scaled selection coefficient (cancer
65 | effect) across all included samples.
66 | }
67 | \details{
68 | Definitions of the sample count columns in the effects output:
69 | \itemize{
70 | \item included_with_variant: Number of samples that have the variant and were included in the inference.
71 | \item included_total: Number of samples that have coverage at the site and were included in the inference.
72 | \item held_out: Samples that have coverage at the site, but were held out of the inference due to \code{hold_out_same_gene_samples = TRUE}.
73 | \item uncovered: Samples that were not included in the inference because their sequencing did not cover the variant site.
74 | }
75 | Note that if a table of samples to include in the inference is specified with \code{samples}, any
76 | CESAnalysis samples not present in the table will not be included in any of the above counts.
77 |
78 | It's possible to pass in your own selection model. You'll need to create a "function factory"
79 | that, for any variant, produces a likelihood function that can be evaluated on the data. The
80 | first two arguments must be \code{rates_tumors_with} and \code{rates_tumors_without}, which give the baseline
81 | site mutation rates in samples with and without the variant. The third argument must be
82 | \code{sample_index}, a data.table that associates \code{Unique_Patient_Identifier} with group names and
83 | indices. (Your function factory must accept this argument, but it doesn't have to use its value.)
84 | Values for all three of these arguments will be calculated by ces_variant and passed to your
85 | function factory automatically. Your function can take whatever additional arguments you like,
86 | and you can pass in values using \code{lik_args}. The likelihood function parameters that
87 | ces_variant will optimize should be named and have default values. See the source code of
88 | \code{sswm_lik()} for an example.
89 | }
90 |
--------------------------------------------------------------------------------
/man/check_for_ref_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{check_for_ref_data}
4 | \alias{check_for_ref_data}
5 | \title{check_for_ref_data}
6 | \usage{
7 | check_for_ref_data(data_dir_or_cesa, datatype)
8 | }
9 | \arguments{
10 | \item{data_dir_or_cesa}{CESAnalysis, or file path for reference data directory}
11 | }
12 | \description{
13 | checks if the requested reference data exists and returns T/F
14 | }
15 | \keyword{internal}
16 |
--------------------------------------------------------------------------------
/man/check_sample_overlap.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preload_maf.R
3 | \name{check_sample_overlap}
4 | \alias{check_sample_overlap}
5 | \title{Catch duplicate samples}
6 | \usage{
7 | check_sample_overlap(maf_list)
8 | }
9 | \arguments{
10 | \item{maf_list}{A list of data.tables (or a single data.table) with MAF data and cancereffectsizeR-style column names,
11 | as generated by \code{preload_maf()}.}
12 | }
13 | \value{
14 | a data.table with overlap statistics
15 | }
16 | \description{
17 | Takes in a data.table of MAF data (produced, typically, with \code{preload_maf()}) and
18 | identifies samples with relatively high proportions of shared SNV mutations. Some
19 | flagged sample pairs may reflect shared driver mutations or chance overlap of variants
20 | in SNV or sequencing error hotspots. Very high overlap may indicate sample duplication,
21 | re-use of samples across data sources, or within-experiment sample contamination. To limit
22 | the influence of shared calling error, it's recommended to run this function after
23 | any quality filtering of MAF records, as a final step.
24 | }
25 | \details{
26 | Sample pairs are flagged when...
27 | \itemize{
28 | \item Both samples have <6 total SNVs and any shared SNVs.
29 | \item Both samples have <21 total SNVs and >1 shared mutation.
30 | \item One sample has just 1 or 2 total SNVs and has any overlaps with the other sample.
31 | \item The samples have >2 shared SNVs and at least one percent of SNVs are shared (in the sample with fewer SNVs).
32 | }
33 | These thresholds err on the side of reporting too many possible duplicates. In general,
34 | and especially when dealing with targeted sequencing data, the presence of 1 or 2
35 | shared mutations between a pair of samples is not strong evidence of sample
36 | duplication. It's up to the user to filter and interpret the output.
37 |
38 | In addition to reporting SNV counts, this function divides the genome into 1000-bp
39 | windows and reports the following:
40 | \itemize{
41 | \item variant_windows_A: Number of windows in which sample A has a variant.
42 | \item variant_windows_B: Same for B.
43 | \item windows_shared: Number of windows that contain a variant shared between both samples.
44 | }
45 | Sometimes, samples have little overlap except for a few hotspots that may derive from
46 | shared calling error or highly mutable regions. These window counts can help
47 | distinguish such samples from those with more pervasive SNV overlap.
48 | }
49 |
--------------------------------------------------------------------------------
/man/clean_granges_for_cesa.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{clean_granges_for_cesa}
4 | \alias{clean_granges_for_cesa}
5 | \title{clean_granges_for_cesa}
6 | \usage{
7 | clean_granges_for_cesa(
8 | cesa = NULL,
9 | gr = NULL,
10 | padding = 0,
11 | refset_env = NULL,
12 | reduce_sort_strip = TRUE
13 | )
14 | }
15 | \arguments{
16 | \item{cesa}{CESAnalysis; required unless refset_env is supplied directly.}
17 |
18 | \item{gr}{GRanges object}
19 |
20 | \item{padding}{How many bases to expand start and end of each position}
21 |
22 | \item{refset_env}{A reference data set environment. Required if cesa is not specified.}
23 |
24 | \item{reduce_sort_strip}{Unstrands and calls reduce(gr) which also drops all metadata columns,
25 | and then sorts the final gr. Default TRUE; use FALSE to preserve original gr structure.}
26 | }
27 | \description{
28 | Tries to format an input GRanges object to be compatible with a CESAnalysis reference
29 | genome. Optionally, also applies padding to start and end positions of ranges, stopping
30 | at chromosome ends. Either stops with an error or returns a clean granges object.
31 | }
32 | \keyword{internal}
33 |
--------------------------------------------------------------------------------
/man/clear_effect_output.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ces_variant.R
3 | \name{clear_effect_output}
4 | \alias{clear_effect_output}
5 | \title{Clear variant effect output}
6 | \usage{
7 | clear_effect_output(cesa, run_names = names(cesa$selection))
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{run_names}{Which previous runs to remove; defaults to removing all.}
13 | }
14 | \description{
15 | Remove output from previous ces_variant() runs from CESAnalysis
16 | }
17 |
--------------------------------------------------------------------------------
/man/clear_epistasis_output.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ces_variant.R
3 | \name{clear_epistasis_output}
4 | \alias{clear_epistasis_output}
5 | \title{Clear epistasis output}
6 | \usage{
7 | clear_epistasis_output(cesa, run_names = names(cesa$epistasis))
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis.}
11 |
12 | \item{run_names}{Which previous runs to remove; defaults to removing all.}
13 | }
14 | \description{
15 | Remove previous epistatic effect estimations from CESAnalysis.
16 | }
17 |
--------------------------------------------------------------------------------
/man/clear_gene_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gene_mutation_rates.R
3 | \name{clear_gene_rates}
4 | \alias{clear_gene_rates}
5 | \title{Clear regional mutation rates}
6 | \usage{
7 | clear_gene_rates(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 | }
12 | \value{
13 | The CESAnalysis with rates cleared
14 | }
15 | \description{
16 | Remove all gene/coding region mutation rates, usually in order to re-run with different
17 | parameters without having to create a new CESAnalysis.
18 | }
19 |
--------------------------------------------------------------------------------
/man/clear_sample_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/load_sample_data.R
3 | \name{clear_sample_data}
4 | \alias{clear_sample_data}
5 | \title{Clear sample data}
6 | \usage{
7 | clear_sample_data(cesa, cols)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{cols}{names of data columns to clear}
13 | }
14 | \description{
15 | Remove data columns by name from CESAnalysis sample table. You can't clear
16 | cancereffectsizeR-generated columns, such as coverage.
17 | }
18 |
--------------------------------------------------------------------------------
/man/clear_trinuc_rates_and_signatures.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/trinuc_mutation_rates.R
3 | \name{clear_trinuc_rates_and_signatures}
4 | \alias{clear_trinuc_rates_and_signatures}
5 | \title{Clear mutational signature attributions and related mutation rate information}
6 | \usage{
7 | clear_trinuc_rates_and_signatures(cesa)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 | }
12 | \description{
13 | Removes all data calculated or supplied via trinuc_mutation_rates,
14 | set_signature_weights, set_trinuc_rates, etc. This function can be used if you want to
15 | re-run signature analysis with different sample groupings or parameters.
16 | }
17 |
--------------------------------------------------------------------------------
/man/complete_aac_ids.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/variant_id_handling.R
3 | \name{complete_aac_ids}
4 | \alias{complete_aac_ids}
5 | \title{Create full AAC ID}
6 | \usage{
7 | complete_aac_ids(partial_ids, refset)
8 | }
9 | \arguments{
10 | \item{partial_ids}{AAC variant id prefixes, such as "KRAS_G12C" or "MIB2 G395C"}
11 |
12 | \item{refset}{reference data set (environment object)}
13 | }
14 | \description{
15 | For example, KRAS_G12C -> KRAS_G12C_ENSP00000256078 (ces.refset.hg19). In cases of
16 | multiple protein IDs per gene, will return more IDs than input. Otherwise,
17 | input/output will maintain order.
18 | }
19 | \keyword{internal}
20 |
--------------------------------------------------------------------------------
/man/convert_signature_weights_for_mp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/set_signature_weights.R
3 | \name{convert_signature_weights_for_mp}
4 | \alias{convert_signature_weights_for_mp}
5 | \title{Get MutationalPatterns contributions matrix}
6 | \usage{
7 | convert_signature_weights_for_mp(signature_weight_table)
8 | }
9 | \arguments{
10 | \item{signature_weight_table}{As created by trinuc_mutation_rates(); typically accessed
11 | via (CESAnalysis)$mutational_signatures.}
12 | }
13 | \description{
14 | Reformat a signature weights table from mutational signature analysis into the
15 | contributions matrix required for MutationalPatterns functions, including
16 | visualizations.
17 | }
18 |
--------------------------------------------------------------------------------
/man/copy_cesa.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{copy_cesa}
4 | \alias{copy_cesa}
5 | \title{Create an independent copy of a CESAnalysis}
6 | \usage{
7 | copy_cesa(cesa)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 | }
12 | \description{
13 | Used internally to "copy" CESAnalysis objects while keeping memory use to a minimum.
14 | }
15 | \details{
16 | The trick is to use data.table's copy function on all data.tables (and lists of
17 | data.tables) within CESAnalysis slots. (If you just call copy on the whole object, the
18 | data tables won't be handled in a memory-efficient way. And if you call copy on
19 | non-data.tables, it's actually less efficient since it forces an immediate full copy
20 | instead of the usual copy-on-modify.)
21 | }
22 | \keyword{internal}
23 |
--------------------------------------------------------------------------------
/man/cosmic_signature_info.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_signature_effects.R
3 | \name{cosmic_signature_info}
4 | \alias{cosmic_signature_info}
5 | \title{Get COSMIC signature descriptions}
6 | \usage{
7 | cosmic_signature_info()
8 | }
9 | \description{
10 | Returns a table describing COSMIC signatures. All signatures from v3.0 to the latest release
11 | (v3.4) are included, with information derived from the most recent information on the COSMIC
12 | website. (Exception: A reported association between SBS16 and alcohol consumption, noted here, is
13 | not mentioned on the COSMIC website.)
14 | }
15 |
--------------------------------------------------------------------------------
/man/create_refset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/create_refset.R
3 | \name{create_refset}
4 | \alias{create_refset}
5 | \title{Create a custom refset}
6 | \usage{
7 | create_refset(
8 | output_dir,
9 | refcds_dndscv,
10 | refcds_anno = NULL,
11 | species_name,
12 | genome_build_name,
13 | BSgenome_name,
14 | supported_chr = c(1:22, "X", "Y"),
15 | default_exome = NULL,
16 | exome_interval_padding = 0,
17 | transcripts = NULL,
18 | cores = 1
19 | )
20 | }
21 | \arguments{
22 | \item{output_dir}{Name/path of an existing, writable output directory where all data
23 | will be saved. The name of this directory will serve as the name of the custom refset.}
24 |
25 | \item{refcds_dndscv}{Transcript information in the two-item list (consisting of RefCDS
26 | and gr_genes) that is output by \code{build_RefCDS}. This transcript information will be used with dNdScv.}
27 |
28 | \item{refcds_anno}{Transcript information in the two-item list (consisting of RefCDS
29 | and gr_genes) that is output by \code{build_RefCDS}. This transcript information will be used for
30 | cancereffectsizeR's annotations. If unspecified, the same reference information as supplied for dNdScv will be used.}
31 |
32 | \item{species_name}{Name of the species, primarily for display (e.g., "human").}
33 |
34 | \item{genome_build_name}{Name of the genome build, primarily for display (e.g., "hg19").}
35 |
36 | \item{BSgenome_name}{The name of the BSgenome package to use (e.g., "hg19"); will be used by
37 | cancereffectsizeR to load the reference genome via BSgenome::getBSgenome().}
38 |
39 | \item{supported_chr}{Character vector of supported chromosomes. Note that cancereffectsizeR uses
40 | NCBI-style chromosome names, which means no chr prefixes ("X", not "chrX"). Mitochondrial
41 | contigs shouldn't be included since they would require special handling that hasn't been
42 | implemented.}
43 |
44 | \item{default_exome}{A BED file or GRanges object that defines coding regions in the genome as
45 | might be used by an exome capture kit. This file (or GRanges) might be acquired or generated
46 | from exome capture kit documentation, or alternatively, coding regions defined in a GTF file
47 | (or the granges output by build_RefCDS()).}
48 |
49 | \item{exome_interval_padding}{Number of bases to pad start/end of each covered interval, to allow
50 | for some variants to be called just outside of targeted regions, where there still may be
51 | pretty good sequencing coverage.}
52 |
53 | \item{transcripts}{Additional information about coding (and, optionally, noncoding)
54 | transcripts from a Gencode GTF, supplied as a data.table. See the format provided in
55 | ces.refset.hg38. You'll have to match the format (including column names) pretty closely to get
56 | expected behavior. Noncoding transcripts are represented only by records with transcript_type =
57 | "transcript", and protein-coding transcripts are represented by transcript, CDS, and UTR
58 | records. Note that in Gencode format.}
59 |
60 | \item{cores}{How many cores to use (default 1).}
61 | }
62 | \description{
63 | Use this function to create and save a directory of custom reference data that can be
64 | used with cancereffectsizeR instead of supplied refsets like \code{ces.refset.hg19}. All
65 | arguments are required except default_exome/exome_interval_padding, which are recommended.
66 | }
67 | \details{
68 | To run this function, you'll need to have output from \code{build_RefCDS()}.
69 | }
70 |
--------------------------------------------------------------------------------
/man/default_epistasis_pvalue_calc.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/default_epistasis_pvalue_calc.R
3 | \name{default_epistasis_pvalue_calc}
4 | \alias{default_epistasis_pvalue_calc}
5 | \title{Calculate epistasis p-values}
6 | \usage{
7 | default_epistasis_pvalue_calc(fit)
8 | }
9 | \arguments{
10 | \item{fit}{fitted epistatic model}
11 | }
12 | \description{
13 | Calculates p-values and other interesting information.
14 | }
15 | \keyword{internal}
16 |
--------------------------------------------------------------------------------
/man/define_compound_variants.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compound_variants.R
3 | \name{define_compound_variants}
4 | \alias{define_compound_variants}
5 | \title{Divide batches of variants into a CompoundVariantSet}
6 | \usage{
7 | define_compound_variants(cesa, variant_table, by = NULL, merge_distance = 0)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{variant_table}{Data table of variants, in the style generated by select_variants().}
13 |
14 | \item{by}{One or more column names to use for initial splitting of the input table into variant
15 | groups. Each distinct group will then be further divided into compound variants based on \code{merge_distance}}
16 |
17 | \item{merge_distance}{maximum genomic distance between a given variant and the nearest
18 | variant in a compound variant for the variant to be merged into the compound
19 | variant (as opposed to being assigned to its own compound variant).}
20 | }
21 | \description{
22 | A CompoundVariantSet is a collection of "compound variants". A compound variant is an arbitrary
23 | group of variants that have sequencing coverage across some set of samples. (Any of these samples
24 | with one or more of the constituent SNVs "has the compound variant"--samples with coverage at
25 | only some of the sites are not considered.) The compound variants within a CompoundVariantSet are
26 | always disjoint: that is, no individual variant appears in more than one of the compound
27 | variants. After collecting variants of interest into a table using select_variants()--and further
28 | subsetting or annotating the table as desired--use this function to produce a CompoundVariantSet
29 | that combines variants into distinct compound variants based on your criteria.
30 | }
31 | \details{
32 | This function works first by splitting the input table by the columns given in
33 | \code{by}. For example, splitting on "gene" will split the table into gene-specific
34 | subtables. Then, each subtable is divided into compound variants based on
35 | \code{merge_distance}. All variants in each subtable within the specified genomic
36 | distance of each other will be merged into a candidate compound variant, and then
37 | compound variants will be repeatedly merged until the nearest two variants in each pair
38 | of compound variants are not within \code{merge_distance}. Note that overlapping
39 | variants will always be merged unless you use \code{by} to separate them into different
40 | subtables (for example, by splitting on alt or aa_alt). If you use \code{by} to split
41 | variants by some functional annotation, you can set \code{merge_distance} very high to
42 | merge all same-chromosome sites (e.g., 1e9 on human genome). To merge sites across chromosomes,
43 | set \code{merge_distance = Inf}.
44 | }
45 |
--------------------------------------------------------------------------------
/man/detect_mnv.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/detect_mnv.R
3 | \name{detect_mnv}
4 | \alias{detect_mnv}
5 | \title{Find likely MNVs in an MAF table}
6 | \usage{
7 | detect_mnv(maf)
8 | }
9 | \arguments{
10 | \item{maf}{a valid MAF-style data.table}
11 | }
12 | \value{
13 | a table with MAF records
14 | }
15 | \description{
16 | Same-sample variants within 2bp of other variants are likely MNVs.
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/man/dot-add_covered_regions.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/covered_regions_handling.R
3 | \name{.add_covered_regions}
4 | \alias{.add_covered_regions}
5 | \title{.add_covered_regions}
6 | \usage{
7 | .add_covered_regions(
8 | cesa,
9 | coverage_type,
10 | covered_regions,
11 | covered_regions_name,
12 | covered_regions_padding
13 | )
14 | }
15 | \arguments{
16 | \item{coverage_type}{exome or targeted, if not using source_cesa}
17 |
18 | \item{covered_regions}{A GRanges object or BED file path with genome build matching the target_cesa,
19 | if not using source_cesa}
20 |
21 | \item{covered_regions_name}{A name to identify the covered regions, if not using source_cesa}
22 |
23 | \item{covered_regions_padding}{optionally, add +/- this many bp to each interval in covered_regions}
24 | }
25 | \value{
26 | CESAnalysis given in target_cesa, with the new covered regions added
27 | }
28 | \description{
29 | .add_covered_regions
30 | }
31 | \keyword{internal}
32 |
--------------------------------------------------------------------------------
/man/dot-variant_counts.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/variant_counts.R
3 | \name{.variant_counts}
4 | \alias{.variant_counts}
5 | \title{Internal variant prevalence and coverage calculation}
6 | \usage{
7 | .variant_counts(
8 | cesa,
9 | samples,
10 | snv_from_aac,
11 | noncoding_snv_id,
12 | by_cols = character()
13 | )
14 | }
15 | \arguments{
16 | \item{cesa}{CESAnalysis}
17 |
18 | \item{samples}{validated samples table}
19 |
20 | \item{snv_from_aac}{data.table with columns aac_id, snv_id (validated and with annotations in CESAnalysis)}
21 |
22 | \item{noncoding_snv_id}{vector of snv_ids to treat as noncoding variants}
23 |
24 | \item{by_cols}{validated column names from sample table that are suitable to use for counting by.}
25 | }
26 | \description{
27 | Called by variant_counts() (and select_variants()) with validated inputs.
28 | }
29 | \keyword{internal}
30 |
--------------------------------------------------------------------------------
/man/epistasis_plot_schematic.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_epistasis.R
3 | \name{epistasis_plot_schematic}
4 | \alias{epistasis_plot_schematic}
5 | \title{Get epistatic effect schematic}
6 | \usage{
7 | epistasis_plot_schematic(
8 | title = "Types of effects",
9 | schematic_label_size = 3,
10 | with_border = TRUE
11 | )
12 | }
13 | \arguments{
14 | \item{title}{Schematic title text.}
15 |
16 | \item{schematic_label_size}{Text size of labels in the schematic (title gets size + 1).}
17 |
18 | \item{with_border}{TRUE/FALSE on the appearance of a thin visible border around the schematic.}
19 | }
20 | \value{
21 | The schematic as a ggplot.
22 | }
23 | \description{
24 | Get a copy of the explanatory schematic that appears in epistatic effect plots (see
25 | \code{plot_epistasis()}). May be useful for putting the schematic in custom locations when
26 | assembling complex figures.
27 | }
28 |
--------------------------------------------------------------------------------
/man/epistasis_results.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{epistasis_results}
4 | \alias{epistasis_results}
5 | \title{View output from epistasis functions}
6 | \usage{
7 | epistasis_results(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | returns a list of data tables with results from epistasis functions
14 | }
15 |
--------------------------------------------------------------------------------
/man/excluded_maf_records.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{excluded_maf_records}
4 | \alias{excluded_maf_records}
5 | \title{View excluded MAF data}
6 | \usage{
7 | excluded_maf_records(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | returns a data.table containing MAF records that were excluded from the given CESAnalysis
14 | }
15 |
--------------------------------------------------------------------------------
/man/figures/current_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/man/figures/current_logo.png
--------------------------------------------------------------------------------
/man/gene_mutation_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gene_mutation_rates.R
3 | \name{gene_mutation_rates}
4 | \alias{gene_mutation_rates}
5 | \title{Use dNdScv with tissue-specific covariates to calculate gene-level mutation rates}
6 | \usage{
7 | gene_mutation_rates(
8 | cesa,
9 | covariates = NULL,
10 | samples = character(),
11 | dndscv_args = list(),
12 | save_all_dndscv_output = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{cesa}{CESAnalysis object}
17 |
18 | \item{covariates}{Tissue-specific mutation rate covariates. Typically, supply the
19 | covariates object from your refset (e.g., ces.refset.hg19$covariates$bladder), or the
20 | object name ("bladder"). Run list_ces_covariates() to see choices. For hg19 data
21 | only, set to "hg19" to use dNdScv's non-tissue-specific covariates. If no appropriate
22 | covariates data are available, set to NULL to run without. Finally, you can also
23 | supply custom covariates data in the form of a matrix or prcomp object (see website
24 | for details).}
25 |
26 | \item{samples}{Which samples to include in the current run. Defaults to all samples. Can be a
27 | vector of Unique_Patient_Identifiers, or a data.table containing rows from the CESAnalysis
28 | sample table.}
29 |
30 | \item{dndscv_args}{Custom arguments to pass to dNdScv. (The arguments \code{mutations},
31 | \code{gene_list}, \code{cv}, and \code{refdb} are supplied by cancereffectsizeR and can't be
32 | substituted.)}
33 |
34 | \item{save_all_dndscv_output}{Default FALSE; when TRUE, saves all dndscv output, not
35 | just what's needed by cancereffectsizeR. (Full output can be very large, in the gigabytes.)}
36 | }
37 | \value{
38 | CESAnalysis object with gene-level mutation rates calculated
39 | }
40 | \description{
41 | This function calculates gene-level neutral mutation rates based on counts
42 | of nonsynonymous and synonymous mutations per gene under the dNdScv package's model,
43 | as described in \href{https://doi.org/10.1016/j.cell.2017.09.042}{Martincorena et al.}
44 | }
45 |
--------------------------------------------------------------------------------
/man/get_PathScore_coding_regions.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/make_PathScore_input.R
3 | \name{get_PathScore_coding_regions}
4 | \alias{get_PathScore_coding_regions}
5 | \title{Get PathScore coding regions}
6 | \usage{
7 | get_PathScore_coding_regions(genome = "hg38")
8 | }
9 | \arguments{
10 | \item{genome}{Genome build: Either "hg38" (default) or "hg19".}
11 | }
12 | \description{
13 | Returns GRanges that represent the coding sequence (CDS) definitions used by PathScore. The hg19
14 | version was created by running liftOver on the hg38 intervals.
15 | }
16 |
--------------------------------------------------------------------------------
/man/get_TCGA_project_MAF.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_TCGA_project_MAF.R
3 | \name{get_TCGA_project_MAF}
4 | \alias{get_TCGA_project_MAF}
5 | \title{Get MAF data from TCGA cohort}
6 | \usage{
7 | get_TCGA_project_MAF(
8 | project = NULL,
9 | filename = NULL,
10 | test_run = FALSE,
11 | exclude_TCGA_nonprimary = TRUE
12 | )
13 | }
14 | \arguments{
15 | \item{project}{TCGA project name (e.g., "TCGA-BRCA").}
16 |
17 | \item{filename}{Output filename where MAF data should be saved. Must end in '.maf'
18 | (plaintext) or '.maf.gz' (gzip compressed).}
19 |
20 | \item{test_run}{Default FALSE. When TRUE, gets MAF data for a few samples instead of the whole cohort.}
21 |
22 | \item{exclude_TCGA_nonprimary}{Default TRUE. For TCGA projects, exclude samples not
23 | associated with a patient's initial primary tumor. (In many TCGA projects, a small
24 | handful of patients have metastatic, recurrent, or additional primary samples.)}
25 | }
26 | \description{
27 | This convenience function queries the Genomic Data Commons API to get MAF data
28 | generated with the Aliquot Ensemble Somatic Variant Merging and Masking workflow for
29 | the specified project, and writes an MAF file. The API always provides data from the
30 | latest data release. This function might work with non-TCGA MAF data hosted on GDC
31 | (e.g., TARGET and GENIE-MSK), but it hasn't been tested and users should proceed with
32 | caution.
33 | }
34 | \details{
35 | TCGA cohort MAFs will be structured as downloaded, with a Unique_Patient_Identifier
36 | column generated from the first 12 characters of Tumor_Sample_Barcode. When passed to
37 | preload_maf() or load_maf(), this column will supersede Tumor_Sample_Barcode. In the
38 | handful of patients with multiple Tumor_Sample_Barcodes (essentially replicated
39 | sequencing, with very high variant overlap), these functions will effectively take the
40 | union of these samples for each patient. Relatedly, the small number of TCGA
41 | non-primary tumor samples should not be handled this way (and such samples are by
42 | default removed by this function).
43 |
44 | Temporary aliquot MAF files downloaded by this function are deleted after they are read.
45 | }
46 |
--------------------------------------------------------------------------------
/man/get_ces_signature_set.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{get_ces_signature_set}
4 | \alias{get_ces_signature_set}
5 | \title{get_ces_signature_set}
6 | \usage{
7 | get_ces_signature_set(refset, name)
8 | }
9 | \arguments{
10 | \item{refset}{name of refset (if using a custom refset, it must be loaded into a CESAnalysis already)}
11 |
12 | \item{name}{name of signature set}
13 | }
14 | \description{
15 | For a given CES reference data collection and signature set name, returns
16 | cancereffectsizeR's internal data for the signature set in a three-item list:
17 | the signature set name, a data table of signature metadata, and a signature
18 | definition data frame
19 | }
20 |
--------------------------------------------------------------------------------
/man/get_cesa_bsg.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{get_cesa_bsg}
4 | \alias{get_cesa_bsg}
5 | \title{get_cesa_bsg}
6 | \usage{
7 | get_cesa_bsg(cesa)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 | }
12 | \description{
13 | Loads the right BSgenome for a CESAnalysis
14 | }
15 | \keyword{internal}
16 |
--------------------------------------------------------------------------------
/man/get_dndscv_model_fit.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gene_mutation_rates.R
3 | \name{get_dndscv_model_fit}
4 | \alias{get_dndscv_model_fit}
5 | \title{This little function called by gene_mutation_rates() is separated for testing purposes.}
6 | \usage{
7 | get_dndscv_model_fit(dndscv_output)
8 | }
9 | \description{
10 | This little function called by gene_mutation_rates() is separated for testing purposes.
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/get_gene_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{get_gene_rates}
4 | \alias{get_gene_rates}
5 | \title{Get table of neutral gene mutation rates}
6 | \usage{
7 | get_gene_rates(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | Get table of neutral gene mutation rates
14 | }
15 |
--------------------------------------------------------------------------------
/man/get_gr_from_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/select_variants.R
3 | \name{get_gr_from_table}
4 | \alias{get_gr_from_table}
5 | \title{Get GRanges from chr/start/end table}
6 | \usage{
7 | get_gr_from_table(variant_table)
8 | }
9 | \arguments{
10 | \item{variant_table}{data.table}
11 | }
12 | \description{
13 | Mainly built for select_variants() output, and uses the
14 | center_nt_pos on AACs (rather than all from start-end). Assumes
15 | MAF-like coordinates (1-based, closed).
16 | }
17 | \keyword{internal}
18 |
--------------------------------------------------------------------------------
/man/get_ref_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{get_ref_data}
4 | \alias{get_ref_data}
5 | \title{get_ref_data}
6 | \usage{
7 | get_ref_data(data_dir_or_cesa, datatype)
8 | }
9 | \description{
10 | reads in the requested reference data for the ref set associated with the CESAnalysis
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/get_refset_dirs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{get_refset_dirs}
4 | \alias{get_refset_dirs}
5 | \title{get_refset_dirs}
6 | \usage{
7 | get_refset_dirs()
8 | }
9 | \description{
10 | returns a character vector mapping ref set names to their data directories
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/get_sample_info.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{get_sample_info}
4 | \alias{get_sample_info}
5 | \title{View sample metadata}
6 | \usage{
7 | get_sample_info(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | returns a data.table with info on all samples in the CESAnalysis
14 | }
15 |
--------------------------------------------------------------------------------
/man/get_signature_weights.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{get_signature_weights}
4 | \alias{get_signature_weights}
5 | \title{Get table of signature attributions}
6 | \usage{
7 | get_signature_weights(cesa = NULL, raw = F, artifacts_zeroed = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 |
12 | \item{raw}{Default FALSE. When TRUE, return raw signature attributions as found by the
13 | signature extraction tool. Format may vary by tool.}
14 |
15 | \item{artifacts_zeroed}{Deprecated.}
16 | }
17 | \value{
18 | A data.table of signature attributions for each sample. By default, these are
19 | estimated relative weights of biologically-associated signatures (i.e., non-artifact
20 | signatures) that sum to 1.
21 | }
22 | \description{
23 | View SNV signature attributions associated with CESAnalysis samples.
24 | }
25 | \details{
26 | Use raw = TRUE to get signature attributions as produced by the signature extraction
27 | tool (or as provided by the user with set_signature_weights()), without any of the
28 | adjustments that are made by cancereffectsizeR's trinuc_mutation_rates().
29 | }
30 |
--------------------------------------------------------------------------------
/man/get_trinuc_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{get_trinuc_rates}
4 | \alias{get_trinuc_rates}
5 | \title{Get estimated relative rates of trinucleotide-specific SNV mutation}
6 | \usage{
7 | get_trinuc_rates(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | Get estimated relative rates of trinucleotide-specific SNV mutation
14 | }
15 |
--------------------------------------------------------------------------------
/man/identify_maf_variants.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/internal_read_maf.R
3 | \name{identify_maf_variants}
4 | \alias{identify_maf_variants}
5 | \title{Annotate MAF data with variant types and variant IDs}
6 | \usage{
7 | identify_maf_variants(maf)
8 | }
9 | \arguments{
10 | \item{maf}{a validated data.table with MAF-like columns}
11 | }
12 | \value{
13 | input table with variant_type and variant_id columns
14 | }
15 | \description{
16 | Annotate MAF data with variant types and variant IDs
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/man/lift_bed.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lift_bed.R
3 | \name{lift_bed}
4 | \alias{lift_bed}
5 | \title{Convert BED intervals between genome builds}
6 | \usage{
7 | lift_bed(bed, chain, outfile = NULL)
8 | }
9 | \arguments{
10 | \item{bed}{Pathname of a BED file, or a GRanges (typically loaded from a BED file with \code{rtracklayer::import.bed()}).}
11 |
12 | \item{chain}{A UCSC-style chain file, or a Chain object (such as from \code{rtracklayer::import.chain()}).}
13 |
14 | \item{outfile}{If not NULL, the returned GRanges will be saved to the specified path using \code{rtracklayer::export.bed()}.}
15 | }
16 | \value{
17 | GRanges representing lifted intervals from input \code{bed}.
18 | }
19 | \description{
20 | Use this utility to convert BED intervals between genome coordinate systems using liftOver. Only
21 | the chr/start/end fields of the input BED are used (strand is ignored). The output GRanges
22 | will have no associated seqinfo.
23 | }
24 | \details{
25 | A warning is given if the lifted intervals are less than 95\% of the size of the original
26 | intervals. When the BED input represents sequencing target intervals, most of the input
27 | intervals will usually lift successfully.
28 | }
29 |
--------------------------------------------------------------------------------
/man/list_ces_covariates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{list_ces_covariates}
4 | \alias{list_ces_covariates}
5 | \title{list_ces_covariates}
6 | \usage{
7 | list_ces_covariates()
8 | }
9 | \description{
10 | Prints names of available built-in covariate data sets for all loaded CES genomes
11 | }
12 |
--------------------------------------------------------------------------------
/man/list_ces_refsets.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{list_ces_refsets}
4 | \alias{list_ces_refsets}
5 | \title{list_ces_refsets}
6 | \usage{
7 | list_ces_refsets()
8 | }
9 | \description{
10 | Prints names of built-in reference data sets
11 | }
12 |
--------------------------------------------------------------------------------
/man/list_ces_signature_sets.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{list_ces_signature_sets}
4 | \alias{list_ces_signature_sets}
5 | \title{list_ces_signature_sets}
6 | \usage{
7 | list_ces_signature_sets()
8 | }
9 | \description{
10 | Prints names of available mutational signature sets. Just to be clear, we're calling
11 | them ces_signature_sets because they're ready to use with cancereffectsizeR. We didn't
12 | derive any of these signature sets.
13 | }
14 |
--------------------------------------------------------------------------------
/man/load_cesa.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{load_cesa}
4 | \alias{load_cesa}
5 | \title{Load a previously saved CESAnalysis}
6 | \usage{
7 | load_cesa(file)
8 | }
9 | \arguments{
10 | \item{file}{filename/path of CESAnalysis that has been saved with saveRDS, expected to end in .rds}
11 | }
12 | \description{
13 | Load a previously saved CESAnalysis
14 | }
15 |
--------------------------------------------------------------------------------
/man/load_maf.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/load_maf.R
3 | \name{load_maf}
4 | \alias{load_maf}
5 | \title{Load MAF somatic mutation data}
6 | \usage{
7 | load_maf(
8 | cesa = NULL,
9 | maf = NULL,
10 | maf_name = character(),
11 | coverage = "exome",
12 | covered_regions = NULL,
13 | covered_regions_name = NULL,
14 | covered_regions_padding = 0,
15 | sample_data_cols = character(),
16 | enforce_default_exome_coverage = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{cesa}{CESAnalysis.}
21 |
22 | \item{maf}{Path of tab-delimited text file in MAF format, or an MAF in data.table or
23 | data.frame format.}
24 |
25 | \item{maf_name}{Optionally, a name to identify samples coming from the current MAF. Used to
26 | populate the maf_source field of the CESAnalysis samples table.}
27 |
28 | \item{coverage}{exome, genome, or targeted (default exome).}
29 |
30 | \item{covered_regions}{optional for exome, required for targeted: a GRanges object or a
31 | BED file of covered intervals matching the CESAnalysis genome.}
32 |
33 | \item{covered_regions_name}{a name describing the covered regions (e.g.,
34 | "my_custom_targeted_regions"); required when covered_regions are supplied.}
35 |
36 | \item{covered_regions_padding}{How many bases (default 0) to expand start and end of
37 | each covered_regions interval, to include variants called just outside of targeted
38 | regions. Consider setting from 0-100bp, or up to the sequencing read length. If the
39 | input data has been trimmed to the targeted regions, leave set to 0.}
40 |
41 | \item{sample_data_cols}{MAF columns containing sample-level data (e.g., tumor grade)
42 | that you would like to have copied into the CESAnalysis samples table.}
43 |
44 | \item{enforce_default_exome_coverage}{When loading default exome data, exclude records
45 | that aren't covered in the default exome capture intervals included with CES genome
46 | reference data (default FALSE).}
47 | }
48 | \value{
49 | CESAnalysis with the specified MAF data loaded. The MAF data table includes
50 | CES-generated variant IDs, a list of all genes overlapping the site, and top_gene and
51 | top_consequence columns that give the most significant annotated coding changes for
52 | each mutation record. Annotation precedence is determined by MAF prevalence (usually
53 | equal), essential splice status, premature stop codon, nonsilent status, MAF mutation
54 | prevalence across the transcript (often favors longer transcripts), and finally
55 | alphabetical order. The columns are recalculated when more data is loaded, so changes
56 | in MAF prevalence can change which variants are highlighted. Note that
57 | \code{[CESAnalysis]$variants} contains more information about all top_consequence
58 | variants and all noncoding variants from the MAF.
59 | }
60 | \description{
61 | Load MAF data from a text file or data table into your CESAnalysis. Column names are
62 | expected to match MAF format specifications (Chromosome, Start_Position, etc.). It's
63 | recommended to use preload_maf() to prep the input (including, optionally, liftOver
64 | conversion of genomic coordinates), but if you have clean MAF data, you can run this
65 | function directly. By default, data is assumed to be derived from whole-exome
66 | sequencing. Whole-genome data and targeted sequencing data are also supported when the
67 | \code{coverage} option is specified.
68 | }
69 |
--------------------------------------------------------------------------------
/man/load_sample_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/load_sample_data.R
3 | \name{load_sample_data}
4 | \alias{load_sample_data}
5 | \title{Add sample data}
6 | \usage{
7 | load_sample_data(cesa, sample_data)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{sample_data}{data.table or data.frame with a Unique_Patient_Identifier column to
13 | match the CESAnalysis samples table, with one row per sample. (It's okay if some
14 | samples aren't present in the table.)}
15 | }
16 | \description{
17 | Insert sample-level data into a CESAnalysis samples table.
18 | }
19 |
--------------------------------------------------------------------------------
/man/maf_records.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{maf_records}
4 | \alias{maf_records}
5 | \title{View data loaded into CESAnalysis}
6 | \usage{
7 | maf_records(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | returns a data.table containing MAF records used in the given CESAnalysis
14 | }
15 |
--------------------------------------------------------------------------------
/man/make_PathScore_input.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/make_PathScore_input.R
3 | \name{make_PathScore_input}
4 | \alias{make_PathScore_input}
5 | \title{Make a PathScore input file from MAF data}
6 | \usage{
7 | make_PathScore_input(maf, file = NULL, genome = "hg38")
8 | }
9 | \arguments{
10 | \item{maf}{An MAF-like data.table, as from preload_maf(). If the MAF has a column named \code{annot},
11 | this column will be preserved in output (since PathScore supports an optional annotation column
12 | with this name).}
13 |
14 | \item{file}{Name/path where PathScore-formatted data should be written. Will be a tab-delimited text file.}
15 |
16 | \item{genome}{The genome build associated with the MAF file. Must be "hg38" (default) or "hg19".}
17 | }
18 | \value{
19 | A copy of the PathScore-formatted data as a data.table.
20 | }
21 | \description{
22 | Produce a file from MAF data for use with PathScore, a web tool for identifying significantly
23 | altered pathways in cancer. See https://pathscore.publichealth.yale.edu/ and
24 | https://doi.org/10.1093/bioinformatics/btw512.
25 | }
26 |
--------------------------------------------------------------------------------
/man/mutational_signature_effects.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mutational_signature_effects.R
3 | \name{mutational_signature_effects}
4 | \alias{mutational_signature_effects}
5 | \title{Attribute cancer effects to mutational signatures}
6 | \usage{
7 | mutational_signature_effects(cesa = cesa, effects = NULL, samples = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis with cancer effects calculated for the variants of interest.}
11 |
12 | \item{effects}{A table of cancer effect estimates for a set of variants, as produced with
13 | \code{ces_variant()}. Different sets of variants (or parameter choices in the
14 | \code{ces_variant} run) will affect output.}
15 |
16 | \item{samples}{Samples for which to calculate mutational sources and effect shares; defaults to all
17 | samples. Reported averages apply to the samples included.}
18 | }
19 | \value{
20 | A nested list containing...
21 | \itemize{
22 | \item \strong{mutational_sources} (list):
23 | \itemize{
24 | \item \strong{source_probabilities} (data.table): For each variant in each sample, the probability that each signature was the source of the variant.
25 | \item \strong{average_by_variant} (data.table): For each distinct variant, the source probabilities averaged over all samples with the variant.
26 | \item \strong{average_source_shares} (numeric): For each signature, the average proportion of each sample's mutations that are attributable to the signature.
27 | Calculated by averaging \code{[CESAnalysis]$mutational_signatures$biological_weights} of included samples. Compare
28 | with average_effect_shares (described below) to identify signatures with disproportionate contributions to oncogenesis.
29 | }
30 | \item \strong{effect_shares} (list):
31 | \itemize{
32 | \item \strong{by_sample} (data.table): The share of each sample's cancer effect (summed across the sample's variants) attributable to each signature.
33 | \item \strong{average_effect_shares} (numeric): For each signature, the average proportion of each sample's cancer effects that are attributable to the signature. Compare with
34 | average_source_shares, above, to identify signatures with disproportionate contributions to oncogenesis. Only variants
35 | present in \code{effects} are included in this calculation.
36 | }
37 | }
38 | }
39 | \description{
40 | Within patients and across the cohort, calculate mutational source probabilities and the share
41 | of cancer effects attributable to each source, where sources are biologically-associated
42 | mutational signatures. See \href{https://doi.org/10.1093/molbev/msac084}{Attribution of Cancer Origins to Endogenous, Exogenous, and Preventable Mutational Processes}
43 | for background and applications.
44 | }
45 |
--------------------------------------------------------------------------------
/man/pairwise_epistasis_lik.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/epistasis_objectives.R
3 | \name{pairwise_epistasis_lik}
4 | \alias{pairwise_epistasis_lik}
5 | \title{pairwise_epistasis_lik}
6 | \usage{
7 | pairwise_epistasis_lik(with_just_1, with_just_2, with_both, with_neither)
8 | }
9 | \arguments{
10 | \item{with_just_1}{two-item list of baseline rates in v1/v2 for tumors with mutation in just the first variant(s)}
11 |
12 | \item{with_just_2}{two-item list of baseline rates in v1/v2 for tumors with mutation in just the second variant(s)}
13 |
14 | \item{with_both}{two-item list of baseline rates in v1/v2 for tumors with mutation in both}
15 |
16 | \item{with_neither}{two-item list of baseline rates in v1/v2 for tumors with mutation in neither}
17 | }
18 | \value{
19 | A likelihood function
20 | }
21 | \description{
22 | For a pair of variants (or two groups of variants), creates a likelihood function for a
23 | model of pairwise epistasis with a "strong mutation, weak selection" assumption.
24 | }
25 | \details{
26 | The arguments to this function are automatically supplied by \code{ces_epistasis()} and \code{ces_gene_epistasis()}.
27 | }
28 |
--------------------------------------------------------------------------------
/man/pairwise_variant_epistasis.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/pairwise_variant_epistasis.R
3 | \name{pairwise_variant_epistasis}
4 | \alias{pairwise_variant_epistasis}
5 | \title{Estimate selection under pairwise epistasis model}
6 | \usage{
7 | pairwise_variant_epistasis(
8 | cesa,
9 | variant_pair,
10 | samples,
11 | conf,
12 | compound_variants = NULL,
13 | model = "default",
14 | lik_args = list(),
15 | optimizer_args = list(),
16 | pval_calc_fn = NULL
17 | )
18 | }
19 | \arguments{
20 | \item{cesa}{CESAnalysis}
21 |
22 | \item{variant_pair}{2-length character of variant IDs, or 2-length numeric giving
23 | indices of CompoundVariantSet for the current two compound variants}
24 |
25 | \item{samples}{Validated samples data.table (as from select_samples())}
26 |
27 | \item{compound_variants}{If testing a pair of compound variants, the CompoundVariantSet defining them}
28 |
29 | \item{model}{Passed from ces_epistasis or ces_gene_epistasis. Set to "default" to use built-in
30 | model of epistatic selection, or supply a custom function factory (see details).}
31 |
32 | \item{lik_args}{Extra arguments, given as a list, passed from ces_epistasis or ces_gene_epistasis}
33 |
34 | \item{optimizer_args}{List of arguments to pass to the optimizer (bbmle::mle2).}
35 |
36 | \item{pval_calc_fn}{For use with custom models; optional. A function that takes an epistasis
37 | model fit as input and returns p-values and other descriptives.}
38 | }
39 | \description{
40 | Estimate selection under pairwise epistasis model
41 | }
42 | \keyword{internal}
43 |
--------------------------------------------------------------------------------
/man/plot_effects.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_effects.R
3 | \name{plot_effects}
4 | \alias{plot_effects}
5 | \title{Plot cancer effects}
6 | \usage{
7 | plot_effects(
8 | effects,
9 | topn = 30,
10 | group_by = "variant",
11 | title = "",
12 | x_title = NULL,
13 | y_title = NULL,
14 | y_label = "auto",
15 | color_by = "darkseagreen4",
16 | color_label = NULL,
17 | legend.position = "right",
18 | legend_size_name = "auto",
19 | legend_color_name = NULL,
20 | viridis_option = "cividis",
21 | legend_size_breaks = "auto",
22 | label_individual_variants = TRUE,
23 | order_by_effect = TRUE,
24 | prevalence_method = "auto",
25 | show_ci = TRUE
26 | )
27 | }
28 | \arguments{
29 | \item{effects}{Cancer effects table, as produced by \code{ces_variant()}. You can combine multiple tables via rbind()
30 | to plot multiple effects per variant, such as to compare effects across subgroups.}
31 |
32 | \item{topn}{Include up to this many variants. The highest-effect variants are plotted. (Or, if
33 | \code{group_by} is gene, include up to this many groups. Groups are ranked by their
34 | highest-effect variants.)}
35 |
36 | \item{group_by}{If 'variant' (the default), one variant per row in the plot. If "gene" or some
37 | other column name, variants will be plotted together accordingly.}
38 |
39 | \item{title}{Main title for the plot (by default, no title)}
40 |
41 | \item{x_title}{Text for the X-axis label.}
42 |
43 | \item{y_title}{Text for the Y-axis label.}
44 |
45 | \item{y_label}{Y-axis labels for each group of variants. By default ("auto"), will be variant names
46 | when \code{group_by = "variant"}, and the values in the group_by column otherwise.}
47 |
48 | \item{color_by}{A single color to use for geom_point fill (default "darkseagreen4"). Or, the name of
49 | a column that specifies color groupings. Can be used to distinguish points when multiple effects
50 | are plotted per variant (for example, when comparing effects between subgroups), or to
51 | highlight related groups of variants. A viridis color scale will be applied, unless every single value
52 | in the color column is interpretable as an R color, in which case the given colors will be used.}
53 |
54 | \item{color_label}{If color_by is supplying color names for scale_color_identity(), optionally
55 | include color_label so that colors can be labeled in the plot legend.}
56 |
57 | \item{legend.position}{Passed to ggplot's legend.position (none, left, right, top, bottom, or
58 | coordinates). Use "none" to eliminate the legend. Defaults to "right".}
59 |
60 | \item{legend_size_name}{The title for the point size scale (larger points = more prevalent variants).}
61 |
62 | \item{legend_color_name}{The title for the point fill color scale.}
63 |
64 | \item{viridis_option}{If using \code{color_by}, this argument
65 | specifies which viridis color map to use. Ignored if you specify your own colors.}
66 |
67 | \item{legend_size_breaks}{Vector of specific mutation counts (or percentages) to depict in the point size legend.
68 | Specify numeric values if you don't like what gets produced by the default ("auto"). Set to
69 | FALSE or to a single desired point size to turn off size scaling.}
70 |
71 | \item{label_individual_variants}{When TRUE (default), individual variants within groups will be
72 | labeled when group_by is not 'variant'. Set FALSE to not label variants, or specify a column
73 | name that supplies a label for each row in the effects table. By default, variant names will be
74 | used for labels. If group_by is exactly "gene", labels will be shortened to just the amino acid
75 | changes. Some labels will be omitted (with a warning) if it seems there are too many to display
76 | in the plot space.}
77 |
78 | \item{order_by_effect}{When TRUE (default), variants are plotted in order of effect. When FALSE,
79 | variants are plotted top-down in the order they are supplied.}
80 |
81 | \item{prevalence_method}{Show each variant's prevalence as a raw mutation count ("count", the default), or as
82 | a percentage of samples with sequencing coverage at the site ("percent"). If the effects table
83 | has the same number of samples covering every inference, you can choose "both".}
84 |
85 | \item{show_ci}{TRUE/FALSE to depict confidence intervals in plot (default TRUE).}
86 | }
87 | \value{
88 | A ggplot
89 | }
90 | \description{
91 | Visualize and compare cancer effects for variants of interest.
92 | }
93 |
--------------------------------------------------------------------------------
/man/plot_epistasis.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_epistasis.R
3 | \name{plot_epistasis}
4 | \alias{plot_epistasis}
5 | \title{Plot pairwise epistasis}
6 | \usage{
7 | plot_epistasis(
8 | epistatic_effects,
9 | pairs_per_row = 8,
10 | x_title = "Site pairs",
11 | variant_label_size = 6.5,
12 | dodge_labels = "auto",
13 | alternating_colors = c("#7cb4de", "#7DD3AF"),
14 | include_schematic = TRUE,
15 | schematic_label_size = 2.5,
16 | significance_levels = c(0.05, 0.01, 0.001),
17 | significance_cols = list(A_change = "p_A_change", B_change = "p_B_change", model =
18 | "p_epistasis"),
19 | inference_floor = 0.001
20 | )
21 | }
22 | \arguments{
23 | \item{epistatic_effects}{Epistatic effects inference table, as produced by \code{ces_epistasis()}
24 | or \code{ces_gene_epistasis()}.}
25 |
26 | \item{pairs_per_row}{How many epistatic pairs to show in each plot row. The provided value is
27 | incremented if needed to prevent the legend schematic from being isolated on its own row.}
28 |
29 | \item{x_title}{X-axis label. Set NULL for no label.}
30 |
31 | \item{variant_label_size}{Text size for the variant labels.}
32 |
33 | \item{dodge_labels}{TRUE/FALSE on using n.dodge (height staggering) on variant labels. Defaults
34 | to "auto"; you can try setting manually if labels are not looking good.}
35 |
36 | \item{alternating_colors}{Colors, provided as character vector, to use on epistatic effect
37 | arrows. It's recommended to supply one or two colors, but more will work.}
38 |
39 | \item{include_schematic}{TRUE/FALSE on whether to include the schematic that shows how to interpret the plot.
40 | If you need to put the schematic somewhere special, set to FALSE and then get your own copy of it
41 | with \code{epistasis_plot_schematic()}.}
42 |
43 | \item{schematic_label_size}{Text size of labels in the schematic.}
44 |
45 | \item{significance_levels}{A vector of 1-3 distinct numeric values on (0, 1) in descending order
46 | to use for significance annotations.}
47 |
48 | \item{significance_cols}{A named list of column names that give significance values for nonzero
49 | change in selection for each pair of sites A and B, and for the performance of the epistatic
50 | selection model over a model that ignores epistatic interactions. List elements must be named
51 | A_change (default "p_A_change"), B_change ("p_B_change"), and model ("p_epistasis"). The purpose
52 | of this option is to support the use of different columns when multiple testing correction is
53 | performed.}
54 |
55 | \item{inference_floor}{Numeric value of the optimization floor used in epistatic effect
56 | inference. Typically, should be left at the default value, which matches cancereffectsizeR
57 | epistatic inference defaults. For plot legibility, there will be a dashed horizontal line in
58 | the output plot, higher than \code{inference_floor} and lower than any non-minimized parameter
59 | inference, which indicates that all arrows pointing below the line have estimates at the
60 | optimization floor.}
61 | }
62 | \description{
63 | Visualize pairwise epistatic scaled selection coefficients, as calculated by
64 | \code{ces_epistasis()} or \code{ces_gene_epistasis()}. For each variant, the isolated site
65 | effect--the overall scaled selection at the site without regard for the mutation status of the
66 | other site--is also depicted.
67 | }
68 | \details{
69 | Variant pairs for which the epistatic selection model is not significantly better than a
70 | no-epistasis model (in which \code{ces_A0 = ces_A_on_B} and \code{ces_B0 = ces_B_on_A}) are depicted
71 | with faded arrows; the threshold is \code{p_epistasis < .05} (or, instead of p_epistasis,
72 | whatever corrected significance column is specified with \code{significance_cols}).
73 | }
74 |
--------------------------------------------------------------------------------
/man/plot_signature_effects.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_signature_effects.R
3 | \name{plot_signature_effects}
4 | \alias{plot_signature_effects}
5 | \title{Plot mutational source and effect attributions}
6 | \usage{
7 | plot_signature_effects(
8 | mutational_effects = NULL,
9 | signature_groupings = "auto",
10 | viridis_option = NULL,
11 | num_sig_groups = 7,
12 | other_color = "white"
13 | )
14 | }
15 | \arguments{
16 | \item{mutational_effects}{Output from \link{mutational_signature_effects}(). To compare
17 | groups of samples, supply a named list with each element corresponding to output
18 | from a separate run of mutational_signature_effects().}
19 |
20 | \item{signature_groupings}{A data.table of signature names and descriptions; signatures with
21 | identical descriptions are grouped together. Only signatures present in the data get displayed.
22 | Setting to "auto" (the default) uses the table returned by \code{\link{cosmic_signature_info}()}, which only
23 | makes sense when using COSMIC signatures. A custom table should have columns "name",
24 | "short_name", and "description". Additional options:
25 | \itemize{
26 | \item To force a signature group to appear in the plot even if it
27 | has a low effect share, add a column called "prioritize" and set to TRUE where desired.
28 | \item To make a signature appear in its own group, make its description unique.
29 | \item Add a "color" column to manually specify colors for each group.
30 | }
31 | Alternatively, setting \code{signature_groupings = "cannataro"} applies the same signature
32 | groupings and color palette as
33 | \href{https://academic.oup.com/mbe/article/39/5/msac084/6570859}{Cannataro et al. 2022}. You can use
34 | Cannataro signature groupings with a different color palette by specifying \code{viridis_option}.}
35 |
36 | \item{viridis_option}{A viridis color mapping, specified with a single letter ('A' to 'H'). By
37 | default, map 'G' (mako) unless using Cannataro signature groupings.}
38 |
39 | \item{num_sig_groups}{How many groups of signatures to display. Groups are ordered by their
40 | highest effect shares, and the rest get lumped into an "other signatures" group.}
41 |
42 | \item{other_color}{Color to use for signatures in the "other" group, supplied as a scalar character.}
43 | }
44 | \description{
45 | Compare the extent to which mutational signatures contribute mutations (mutational source share)
46 | to the degree to which they contribute high-effect mutations (cancer effect share).
47 | }
48 |
--------------------------------------------------------------------------------
/man/preload_maf.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preload_maf.R
3 | \name{preload_maf}
4 | \alias{preload_maf}
5 | \title{Read and verify MAF somatic mutation data}
6 | \usage{
7 | preload_maf(
8 | maf = NULL,
9 | refset = NULL,
10 | coverage_intervals_to_check = NULL,
11 | chain_file = NULL,
12 | sample_col = "Unique_Patient_Identifier",
13 | chr_col = "Chromosome",
14 | start_col = "Start_Position",
15 | ref_col = "Reference_Allele",
16 | tumor_allele_col = "guess",
17 | keep_extra_columns = FALSE,
18 | detect_hidden_mnv = TRUE
19 | )
20 | }
21 | \arguments{
22 | \item{maf}{Path of tab-delimited text file in MAF format, or a data.table/data.frame with MAF data}
23 |
24 | \item{refset}{name of reference data set (refset) to use; run \code{list_ces_refsets()} for
25 | available refsets. Alternatively, the path to a custom reference data directory.}
26 |
27 | \item{coverage_intervals_to_check}{If available, a BED file or GRanges object
28 | representing the expected coverage intervals of the sequencing method used to generate
29 | the MAF data. Unless the coverage intervals are incorrect, most records will be
30 | covered. Output will show how far away uncovered records are from covered regions,
31 | which can inform whether to use the covered_regions_padding option in load_maf().
32 | (For example, some variant callers will identify variants up to 100bp out of the
33 | target regions, and you may want to pad the covered intervals to allow these variants
34 | to remain in your data. Alternatively, if all records are already covered, then the
35 | calls have probably already been trimmed to the coverage intervals, which means no
36 | padding should be added.)}
37 |
38 | \item{chain_file}{a LiftOver chain file (text format, name ends in .chain) to convert MAF
39 | records to the genome build used in the CESAnalysis.}
40 |
41 | \item{sample_col}{column name with patient ID; defaults to
42 | Unique_Patient_Identifier, or, in its absence, Tumor_Sample_Barcode}
43 |
44 | \item{chr_col}{column name with chromosome data (Chromosome)}
45 |
46 | \item{start_col}{column name with start position (Start_Position)}
47 |
48 | \item{ref_col}{column name with reference allele data (Reference_Allele)}
49 |
50 | \item{tumor_allele_col}{column name with alternate allele data; by default,
51 | values from Tumor_Seq_Allele2 and Tumor_Seq_Allele1 columns are used}
52 |
53 | \item{keep_extra_columns}{TRUE/FALSE to load data columns not needed by cancereffectsizeR,
54 | or a vector of column names to keep.}
55 |
56 | \item{detect_hidden_mnv}{Find same-sample adjacent SNVs and replace these records with
57 | DBS (doublet base substitution) records. Also, find groups of same-sample variants
58 | within 2 bp of each other and replace these records with MNV (multi-nucleotide
59 | variant) records.}
60 | }
61 | \value{
62 | a data.table of MAF data, with any problematic records flagged and a few
63 | quality-control annotations (if available with the chosen refset data).
64 | }
65 | \description{
66 | Reads MAF-formatted data from a text file or data table, checks for problems, and
67 | provides a few quality check annotations (if available). If core MAF columns don't have
68 | standard names (Chromosome, Start_Position, etc., with Tumor_Sample_Barcode used as the
69 | sample ID column), you can supply your own column names. If the data you are loading is
70 | from a different genome build than the chosen reference data set (refset), you can use
71 | the \code{chain_file} option to supply a UCSC-style chain file, and your MAF coordinates
72 | will be automatically converted with rtracklayer's version of liftOver.
73 | }
74 | \details{
75 | The \code{ces.refset.hg19} and \code{ces.refset.hg38} refsets provide three annotations
76 | that you may consider using for quality filtering of MAF records:
77 | \itemize{
78 | \item cosmic_site_tier Indicates if the variant's position overlaps a mutation in
79 | COSMIC v92's Cancer Mutation Census. Mutations are classified as Tier 1, Tier 2, Tier
80 | 3, and Other. Note that the MAF mutation itself is not necessarily in the census. See
81 | COSMIC's website for tier definitions.
82 | \item germline_variant_site The variant's position overlaps a site of common germline
83 | variation. Roughly, this means that gnomAD 2.1.1 shows an overlapping germline variant at
84 | greater than 1\% prevalence in some population.
85 | \item repetitive_region The variant overlaps a site marked as repetitive sequence by
86 | the RepeatMasker tool (data taken from UCSC Table Browser). Variant calls in repetitive
87 | sites frequently reflect sequencing or calling error.
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/man/preload_ref_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{preload_ref_data}
4 | \alias{preload_ref_data}
5 | \title{preload_ref_data}
6 | \usage{
7 | preload_ref_data(data_dir)
8 | }
9 | \description{
10 | Used when loading or creating a CESAnalysis to load reference data into an environment for quick access
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/read_in_maf.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/internal_read_maf.R
3 | \name{read_in_maf}
4 | \alias{read_in_maf}
5 | \title{Load MAF somatic mutation data}
6 | \usage{
7 | read_in_maf(
8 | maf,
9 | refset_env,
10 | chr_col = "Chromosome",
11 | start_col = "Start_Position",
12 | ref_col = "Reference_Allele",
13 | tumor_allele_col = "guess",
14 | sample_col = "Unique_Patient_Identifier",
15 | more_cols = NULL,
16 | chain_file = NULL,
17 | separate_old_problems = FALSE
18 | )
19 | }
20 | \arguments{
21 | \item{maf}{Path of tab-delimited text file in MAF format, or an MAF in data.table or data.frame format}
22 |
23 | \item{refset_env}{a refset data environment}
24 |
25 | \item{chr_col}{column name with chromosome data (Chromosome)}
26 |
27 | \item{start_col}{column name with start position (Start_Position)}
28 |
29 | \item{ref_col}{column name with reference allele data (Reference_Allele)}
30 |
31 | \item{tumor_allele_col}{column name with alternate allele data; by default,
32 | values from Tumor_Seq_Allele2 and Tumor_Seq_Allele1 columns are used.}
33 |
34 | \item{sample_col}{column name with sample ID data (Tumor_Sample_Barcode or Unique_Patient_Identifier)}
35 |
36 | \item{chain_file}{a LiftOver chain file (text format, name ends in .chain) to convert MAF
37 | records to the genome build used in the CESAnalysis.}
38 |
39 | \item{separate_old_problems}{When TRUE (as used by load_maf), respect old problems that
40 | look like they came from cancereffectsizeR (typically from preload_maf). These get
41 | separated as "old_problem", and the records won't be checked. chain_file must be
42 | NULL.}
43 | }
44 | \value{
45 | data.table with core MAF columns, any other requested columns, and a "problem" column
46 | }
47 | \description{
48 | Load MAF data from a text file or data table into your CESAnalysis. If column names
49 | don't match MAF format specifications (Chromosome, Start_Position, etc., with
50 | Tumor_Sample_Barcode used as the sample ID column), you can supply your own column
51 | names. When your CESAnalysis has defined sample groups (see \code{?CESAnalysis}),
52 | specify "group_col". By default, data is assumed to be derived from whole-exome
53 | sequencing. Whole-genome data and targeted sequencing data are also supported when the
54 | "coverage" option is specified. If the data you are loading is from a different genome
55 | build than your CESAnalysis, you can use the "chain_file" option to supply a UCSC-style
56 | chain file, and your MAF coordinates will be automatically converted with
57 | rtracklayer's version of liftOver.
58 | }
59 | \keyword{internal}
60 |
--------------------------------------------------------------------------------
/man/read_vcf.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/vcf_to_maf_table.R
3 | \name{read_vcf}
4 | \alias{read_vcf}
5 | \title{Internal VCF parser}
6 | \usage{
7 | read_vcf(vcf, sample_id, vcf_name = sample_id)
8 | }
9 | \arguments{
10 | \item{vcf}{VCF filename or VCF-like data.table.}
11 |
12 | \item{sample_id}{1-length sample identifier.}
13 |
14 | \item{vcf_name}{1-length identifier used in some user messages.}
15 | }
16 | \value{
17 | MAF-like data.table
18 | }
19 | \description{
20 | Used by vcfs_to_maf_table()
21 | }
22 | \keyword{internal}
23 |
--------------------------------------------------------------------------------
/man/run_deconstructSigs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_deconstructSigs.R
3 | \name{run_deconstructSigs}
4 | \alias{run_deconstructSigs}
5 | \title{cancereffectsizeR wrapper for deconstructSigs}
6 | \usage{
7 | run_deconstructSigs(
8 | tumor_trinuc_counts,
9 | signatures_df,
10 | signatures_to_remove,
11 | tri.counts.method
12 | )
13 | }
14 | \arguments{
15 | \item{tumor_trinuc_counts}{one-row data.frame of trinuc variant counts (in deconstructSigs order) for one tumor}
16 |
17 | \item{signatures_df}{data.frame of signatures (see COSMIC v3 signatures included with package for format)}
18 |
19 | \item{signatures_to_remove}{names of signatures in signatures_df to keep out of deconstructSigs and assign zero weights}
20 |
21 | \item{tri.counts.method}{exome/genome trinucleotide content normalization argument to pass to deconstructSigs (see its docs)}
22 | }
23 | \value{
24 | a data.frame of signature weights
25 | }
26 | \description{
27 | This function gets called internally by trinuc_mutation_rates() for each tumor in a CESAnalysis, accepting
28 | a data.frame of mutation counts and returning a data.frame of signature weights.
29 | }
30 | \keyword{internal}
31 |
--------------------------------------------------------------------------------
/man/run_dndscv.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gene_mutation_rates.R
3 | \name{run_dndscv}
4 | \alias{run_dndscv}
5 | \title{Internal function to run dNdScv}
6 | \usage{
7 | run_dndscv(mutations, gene_list, cv, refdb, gr_genes, ...)
8 | }
9 | \description{
10 | Internal function to run dNdScv
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/run_mutational_patterns.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_mutational_patterns.R
3 | \name{run_mutational_patterns}
4 | \alias{run_mutational_patterns}
5 | \title{cancereffectsizeR wrapper for fit_to_signatures}
6 | \usage{
7 | run_mutational_patterns(
8 | tumor_trinuc_counts,
9 | signatures_df,
10 | signatures_to_remove,
11 | mp_strict_args = list(),
12 | bootstrap_mutations = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{tumor_trinuc_counts}{matrix of trinuc variant counts where columns correspond to tumors and
17 | order of trinucleotide changes matches signatures_df}
18 |
19 | \item{signatures_df}{data.frame of signatures (see COSMIC v3 signatures included with package for format)}
20 |
21 | \item{signatures_to_remove}{names of signatures in signatures_df to keep out of MutationalPatterns
22 | and assign zero weights. Only occurs when strict == FALSE}
23 |
24 | \item{mp_strict_args}{named list of additional arguments to fit_to_signatures_strict}
25 |
26 | \item{bootstrap_mutations}{T/F (default FALSE) whether to run
27 | fit_to_signatures_bootstrapped() with n_boot = 1 instead of fit_to_signatures_strict().}
28 | }
29 | \value{
30 | a data.frame of signature weights
31 | }
32 | \description{
33 | This function gets called internally by trinuc_mutation_rates() for each tumor in a CESAnalysis, accepting
34 | a data.frame of mutation counts and returning fit_to_signatures output. Note: this function supports indels
35 | if passed in the same format.
36 | }
37 | \keyword{internal}
38 |
--------------------------------------------------------------------------------
/man/samples_with.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/select_samples.R
3 | \name{samples_with}
4 | \alias{samples_with}
5 | \title{Find samples with specified variants}
6 | \usage{
7 | samples_with(cesa, any_of = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{any_of}{Select samples with ANY of the given variant names/IDs, such as
13 | c("8:142506482_C>G", "KRAS G12C"). When a gene has multiple transcripts in reference
14 | data, you may wish to use full IDs, such as "KRAS_G12C_ENSP00000256078".}
15 | }
16 | \description{
17 | A convenience function to identify samples with specific variants.
18 | }
19 |
--------------------------------------------------------------------------------
/man/save_cesa.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{save_cesa}
4 | \alias{save_cesa}
5 | \title{Save a CESAnalysis in progress}
6 | \usage{
7 | save_cesa(cesa, file)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis to save}
11 |
12 | \item{file}{filename to save to (must end in .rds)}
13 | }
14 | \description{
15 | Saves a CESAnalysis to a file by calling base R's saveRDS function. Also updates
16 | run history for reproducibility. Files saved should be reloaded with \code{load_cesa()}.
17 | }
18 | \details{
19 | Note that the genome reference data associated with a CESAnalysis (refset) is not
20 | actually part of the CESAnalysis, so it is not saved here. (Saving this data with the
21 | analysis would make file sizes too large.) When you reload the CESAnalysis, you can
22 | re-associate the correct reference data.
23 | }
24 |
--------------------------------------------------------------------------------
/man/select_samples.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/select_samples.R
3 | \name{select_samples}
4 | \alias{select_samples}
5 | \title{Retrieve validated subset of CESAnalysis samples table}
6 | \usage{
7 | select_samples(cesa = NULL, samples = character())
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{samples}{Vector of Unique_Patient_Identifiers, or data.table consisting
13 | of rows from a CESAnalysis samples table. If empty, returns full sample table.}
14 | }
15 | \value{
16 | data.table consisting of one or more rows from the CESAnalysis samples table.
17 | }
18 | \description{
19 | Retrieve validated subset of CESAnalysis samples table
20 | }
21 | \keyword{internal}
22 |
--------------------------------------------------------------------------------
/man/set_gene_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/set_gene_rates.R
3 | \name{set_gene_rates}
4 | \alias{set_gene_rates}
5 | \title{Assign pre-calculated regional mutation rates}
6 | \usage{
7 | set_gene_rates(
8 | cesa = NULL,
9 | rates = NULL,
10 | samples = character(),
11 | missing_genes_take_nearest = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{cesa}{CESAnalysis object}
16 |
17 | \item{rates}{A two-column data.table with either gene name or protein_id in column 1
18 | and rate in column 2}
19 |
20 | \item{samples}{Which samples the input rates apply to. Defaults to all samples. Can be
21 | a vector of Unique_Patient_Identifiers, or a data.table containing rows from the
22 | CESAnalysis sample table.}
23 |
24 | \item{missing_genes_take_nearest}{Set to TRUE to have each gene/protein_id missing from
25 | rates take the rate of the nearest non-missing gene/protein.}
26 | }
27 | \description{
28 | This function allows you to specify regional rates of mutation--calculated
29 | however you like--to samples in your CESAnalysis. Rates can be assigned to all samples
30 | or to specified samples.
31 | }
32 | \details{
33 | Provide rates in a data.table with two columns: gene name or protein ID (character) and
34 | rate (numeric, non-negative). Gene names or protein IDs must match those in CESAnalysis
35 | reference data. (Some reference data sets, such as ces.refset.hg19, only allow
36 | gene-level rates.) If you don't want to supply rates for every gene, set
37 | \code{missing_genes_take_nearest = T} to have each missing gene or coding region take
38 | the rate of the nearest non-missing one.
39 | }
40 |
--------------------------------------------------------------------------------
/man/set_refset_dir.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{set_refset_dir}
4 | \alias{set_refset_dir}
5 | \title{Set reference data directory}
6 | \usage{
7 | set_refset_dir(cesa, dir)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{dir}{path to data directory}
13 | }
14 | \description{
15 | When working with custom reference data or loading a previously saved CESAnalysis in a
16 | new environment, use this function to reassociate the location of reference data with
17 | the analysis. (If \code{load_cesa()} didn't give you a warning when loading your
18 | analysis, you probably don't need to use this function.)
19 | }
20 |
--------------------------------------------------------------------------------
/man/set_signature_weights.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/set_signature_weights.R
3 | \name{set_signature_weights}
4 | \alias{set_signature_weights}
5 | \title{Set SNV signature weights}
6 | \usage{
7 | set_signature_weights(
8 | cesa,
9 | signature_set,
10 | weights,
11 | ignore_extra_samples = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{cesa}{CESAnalysis}
16 |
17 | \item{signature_set}{signature set name (see \code{list_ces_signature_sets()}), or a
18 | custom signature set (see documentation in \code{trinuc_mutation_rates()})}
19 |
20 | \item{weights}{data.table of relative signature weights for each sample (see details)}
21 |
22 | \item{ignore_extra_samples}{skip samples in the input table that are not in the
23 | CESAnalysis (when false, will stop with an error)}
24 | }
25 | \description{
26 | If you wish to use your own method to calculate sample-specific SNV signature weights
27 | (as opposed to the signature extraction built into trinuc_mutation_rates()), you can use
28 | this function to load them into the CESAnalysis. Your input signatures will be used to
29 | infer relative trinucleotide-context-specific mutation rates for all tumors. (This
30 | means you can run set_signature_weights() or set_trinuc_rates(), but not both.) As in
31 | trinuc_mutation_rates(), you can use a built-in set of signatures, such as COSMIC_v3.1,
32 | or you can supply your own signature set definitions as documented in
33 | ?trinuc_mutation_rates.
34 | }
35 | \details{
36 | The input data table must have a Unique_Patient_Identifier column and one column per
37 | signature in the signature set. All samples in the CESAnalysis must be included in the
38 | input table, and each sample's weights should have a sum on (0, 1]. Since these weights
39 | are used by cancereffectsizeR to infer trinucleotide-context-specific relative rates
40 | of SNV mutations, each sample must have at least one non-artifact signature with
41 | nonzero weight. (In the unlikely event that this is a problem, consider assigning
42 | group-average signature weights to the artifact-only samples.)
43 | }
44 |
--------------------------------------------------------------------------------
/man/set_trinuc_rates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/set_trinuc_rates.R
3 | \name{set_trinuc_rates}
4 | \alias{set_trinuc_rates}
5 | \title{Assign pre-calculated relative trinucleotide mutation rates}
6 | \usage{
7 | set_trinuc_rates(cesa, trinuc_rates, ignore_extra_samples = FALSE)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 |
12 | \item{trinuc_rates}{a matrix or data table (see description for format)}
13 |
14 | \item{ignore_extra_samples}{skip samples in the input table that are not in the CESAnalysis (when false, will stop with an error)}
15 | }
16 | \description{
17 | This function assigns trinucleotide-context-specific relative SNV mutation rates to
18 | tumors in a CESAnalysis. (These could be rates previously generated with
19 | \code{trinuc_mutation_rates()}, or they could be calculated using your own methods.) The
20 | input rates must be a data.table or matrix. If supplying a data table, there must be a
21 | Unique_Patient_Identifier column; if supplying a matrix, the identifiers should be
22 | supplied as rownames instead. Either way, all samples in the CESAnalysis must be
23 | represented in the input rates. To avoid user error, there cannot be any superfluous
24 | samples in the input rates unless \code{ignore_extra_samples = T}. Besides the
25 | identifier column (or matrix rownames), there must be 96 columns, with column names
26 | exactly matching the deconstructSigs/MutationalPatterns naming and order (run this
27 | function with incorrect column names, and the names you need to use will be printed).
28 | Since CES uses relative trinuc rates, rows must sum to 1, with all values greater than
29 | 0. You'll get a warning if any rate is less than 1e-9, since (unrealistically) low
30 | rates may crash selection model likelihood functions that aren't expecting such small
31 | values.
32 | }
33 |
--------------------------------------------------------------------------------
/man/snv_results.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CESAnalysis.R
3 | \name{snv_results}
4 | \alias{snv_results}
5 | \title{View results from ces_variant}
6 | \usage{
7 | snv_results(cesa = NULL)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis object}
11 | }
12 | \description{
13 | returns a list of ces_variant() results tables, with variant annotations added
14 | }
15 |
--------------------------------------------------------------------------------
/man/sort_and_validate_variant_ids.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/variant_counts.R
3 | \name{sort_and_validate_variant_ids}
4 | \alias{sort_and_validate_variant_ids}
5 | \title{Sort and validate input variant IDs}
6 | \usage{
7 | sort_and_validate_variant_ids(cesa, input_ids, drop_unannotated = FALSE)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{input_ids}{Variant names/IDs, typically from user.}
13 |
14 | \item{drop_unannotated}{Whether to drop variants that are not annotated from the output.}
15 | }
16 | \value{
17 | List of variant_ids, with each element corresponding to one variant_type.
18 | }
19 | \description{
20 | Sorts input variant IDs by type, completes IDs by adding protein ID to plain variant
21 | names (e.g. "KRAS G12C"), and ensures that IDs are valid even if not present in
22 | annotations. This includes verifying that reference alleles are correct in SNV IDs and
23 | that amino acid changes are possible given the coding sequence.
24 | }
25 | \keyword{internal}
26 |
--------------------------------------------------------------------------------
/man/sswm_lik.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/single_variant_si_objectives.R
3 | \name{sswm_lik}
4 | \alias{sswm_lik}
5 | \title{sswm_lik}
6 | \usage{
7 | sswm_lik(rates_tumors_with, rates_tumors_without)
8 | }
9 | \arguments{
10 | \item{rates_tumors_with}{vector of site-specific mutation rates for all tumors with variant}
11 |
12 | \item{rates_tumors_without}{vector of site-specific mutation rates for all eligible tumors without variant}
13 | }
14 | \description{
15 | Generates log-likelihood function of site-level selection with "strong selection, weak
16 | mutation" assumption. All arguments to this likelihood function factory are
17 | automatically supplied by \code{ces_variant()}.
18 | }
19 |
--------------------------------------------------------------------------------
/man/suggest_cosmic_signature_exclusions.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/suggest_cosmic_signatures_to_remove.R
3 | \name{suggest_cosmic_signature_exclusions}
4 | \alias{suggest_cosmic_signature_exclusions}
5 | \title{Tissue-specific mutational signature exclusions}
6 | \usage{
7 | suggest_cosmic_signature_exclusions(
8 | cancer_type = NULL,
9 | treatment_naive = NULL,
10 | quiet = FALSE
11 | )
12 | }
13 | \arguments{
14 | \item{cancer_type}{See \href{https://townsend-lab-yale.github.io/cancereffectsizeR/articles/cosmic_cancer_type_note.html}{here}
15 | for supported cancer type labels.}
16 |
17 | \item{treatment_naive}{give TRUE if samples were taken pre-treatment; FALSE or leave
18 | NULL otherwise.}
19 |
20 | \item{quiet}{(default false) for non-interactive use, suppress explanations and advice.}
21 | }
22 | \value{
23 | a vector of signatures to feed to the \code{trinuc_mutation_rates()}
24 | \code{signature_exclusions} argument.
25 | }
26 | \description{
27 | Get suggestions on signature_exclusions for trinuc_mutation_rates() for COSMIC signatures v3 and later.
28 | For details, see \href{https://townsend-lab-yale.github.io/cancereffectsizeR/articles/cosmic_cancer_type_note.html}{this article}
29 | on our website.
30 | }
31 |
--------------------------------------------------------------------------------
/man/trinuc_snv_counts.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/trinuc_mutation_rates.R
3 | \name{trinuc_snv_counts}
4 | \alias{trinuc_snv_counts}
5 | \title{Tabulate SNVs by trinucleotide context}
6 | \usage{
7 | trinuc_snv_counts(
8 | maf,
9 | genome,
10 | exclude_recurrent = FALSE,
11 | style = "MutationalPatterns"
12 | )
13 | }
14 | \arguments{
15 | \item{maf}{a cancereffectsizeR-style MAF data table}
16 |
17 | \item{genome}{BSgenome reference genome (for looking up trinucleotide contexts)}
18 |
19 | \item{exclude_recurrent}{Default FALSE. When TRUE, only mutations private to each sample are included in counts, in order to
20 | reduce the influence of selection. (If you load more MAF data into the CESAnalysis later, recurrency may change.)}
21 |
22 | \item{style}{"MutationalPatterns" or "deconstructSigs"}
23 | }
24 | \value{
25 | Matrix or data frame of SNV counts, suitable for use with MutationalPatterns or
26 | deconstructSigs. Samples with zero passing SNVs will not appear.
27 | }
28 | \description{
29 | This function produces trinucleotide-context-specific SNV counts from MAF data for
30 | input to mutational signature extraction tools. Output can be tailored to meet
31 | formatting requirements of MutationalPatterns or deconstructSigs, which are probably
32 | similar to formats used by other tools.
33 | }
34 |
--------------------------------------------------------------------------------
/man/univariate_si_conf_ints.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/si_uniroot_conf_int.R
3 | \name{univariate_si_conf_ints}
4 | \alias{univariate_si_conf_ints}
5 | \title{Calculate uniroot CIs on selection intensities}
6 | \usage{
7 | univariate_si_conf_ints(fit, lik_fn, min_si, max_si, conf)
8 | }
9 | \arguments{
10 | \item{fit}{From bbmle}
11 |
12 | \item{lik_fn}{likelihood function}
13 |
14 | \item{min_si}{lower limit on SI/CI}
15 |
16 | \item{max_si}{upper limit on SI/CI}
17 |
18 | \item{conf}{e.g., .95 -> 95\% CIs}
19 | }
20 | \description{
21 | Given a model fit, calculate univariate confidence intervals for each parameter.
22 | Returns a list of low/high bounds.
23 | }
24 | \keyword{internal}
25 |
--------------------------------------------------------------------------------
/man/update_covered_in.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/covered_regions_handling.R
3 | \name{update_covered_in}
4 | \alias{update_covered_in}
5 | \title{update_covered_in}
6 | \usage{
7 | update_covered_in(cesa)
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 | }
12 | \value{
13 | CESAnalysis with regenerated covered-in annotations
14 | }
15 | \description{
16 | Updates the covered_in annotation for all variants
17 | to include all covered regions in the CESAnalysis
18 | }
19 | \details{
20 | Also updates internal cached output of select_variants().
21 | }
22 | \keyword{internal}
23 |
--------------------------------------------------------------------------------
/man/validate_aac_ids.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/variant_id_handling.R
3 | \name{validate_aac_ids}
4 | \alias{validate_aac_ids}
5 | \title{Ensure AAC IDs are valid for a given reference data set}
6 | \usage{
7 | validate_aac_ids(aac_ids, refset)
8 | }
9 | \arguments{
10 | \item{aac_ids}{AAC variant IDs}
11 |
12 | \item{refset}{reference data set (environment object)}
13 | }
14 | \description{
15 | Given a vector of AAC IDs, determines whether each is valid/possible, returning
16 | NULL if all are valid, or a list of problems, or an error if input isn't parseable.
17 | }
18 | \details{
19 | An ID is invalid if the gene and/or protein_id are not in the reference data, or if the
20 | reference amino acid ("G" in KRAS_G12C) is incorrect, or if there is no possible SNV
21 | that can create the proposed change. For example, KRAS G12C is possible when the codon
22 | (GGT) acquires a G>T substitution in the first position, while G12K is not possible
23 | because no single substitution can transform GGT into a lysine codon.
24 | }
25 | \keyword{internal}
26 |
--------------------------------------------------------------------------------
/man/validate_optimizer_args.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/validate_optimizer_args.R
3 | \name{validate_optimizer_args}
4 | \alias{validate_optimizer_args}
5 | \title{Check custom optimizer arguments}
6 | \usage{
7 | validate_optimizer_args(optimizer_args)
8 | }
9 | \arguments{
10 | \item{optimizer_args}{List of arguments/values to pass to the optimizer.}
11 | }
12 | \description{
13 | Check custom optimizer arguments
14 | }
15 | \keyword{internal}
16 |
--------------------------------------------------------------------------------
/man/validate_signature_set.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/genome_data_handling.R
3 | \name{validate_signature_set}
4 | \alias{validate_signature_set}
5 | \title{validate_signature_set}
6 | \usage{
7 | validate_signature_set(signature_set)
8 | }
9 | \arguments{
10 | \item{signature_set}{signature set list (see docs for format)}
11 | }
12 | \description{
13 | Checks if a custom CES signature is properly formatted; stops with an error if not
14 | }
15 |
--------------------------------------------------------------------------------
/man/validate_snv_ids.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/variant_id_handling.R
3 | \name{validate_snv_ids}
4 | \alias{validate_snv_ids}
5 | \title{validate_snv_ids}
6 | \usage{
7 | validate_snv_ids(snv_ids, bsg)
8 | }
9 | \arguments{
10 | \item{snv_ids}{character vector of snv_ids}
11 |
12 | \item{bsg}{BSgenome for getting reference sequence}
13 | }
14 | \description{
15 | Ensures SNV IDs are valid for the given genome
16 | }
17 | \keyword{internal}
18 |
--------------------------------------------------------------------------------
/man/variant_counts.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/variant_counts.R
3 | \name{variant_counts}
4 | \alias{variant_counts}
5 | \title{Assess variant prevalence and coverage}
6 | \usage{
7 | variant_counts(cesa, variant_ids = character(), by = character())
8 | }
9 | \arguments{
10 | \item{cesa}{CESAnalysis}
11 |
12 | \item{variant_ids}{variant names ("KRAS G12C") or full variant IDs. If left empty, uses
13 | non-overlapping variants as returned by `select_variants()` with \code{min_freq = 1}.}
14 |
15 | \item{by}{Optionally, a vector of one or more sample table columns. Variant prevalence
16 | and coverage data will be broken down by the groups defined by unique combinations of
17 | values in these columns.}
18 | }
19 | \description{
20 | Determine variant prevalence (and how many samples have sequencing coverage) across your MAF data,
21 | or within different groups of samples.
22 | }
23 |
--------------------------------------------------------------------------------
/man/vcfs_to_maf_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/vcf_to_maf_table.R
3 | \name{vcfs_to_maf_table}
4 | \alias{vcfs_to_maf_table}
5 | \title{Read a VCF into an MAF-like table}
6 | \usage{
7 | vcfs_to_maf_table(vcfs, sample_ids)
8 | }
9 | \arguments{
10 | \item{vcfs}{Vector of VCF file paths, or a list of VCF-like data.tables, or a single data.table.}
11 |
12 | \item{sample_ids}{Identifiers to populate Tumor_Sample_Barcode, one per VCF.}
13 | }
14 | \value{
15 | A single data.table with MAF-style fields, suitable for use with cancereffectsizeR.
16 | }
17 | \description{
18 | This function loads VCF files into MAF-like tables. A Tumor_Sample_Barcode column is
19 | added, and the contents of the POS/REF/ALT fields are converted to match the style used
20 | by MAF files for Start_Position/Reference_Allele/Tumor_Seq_Allele2. The VCF file
21 | should represent high-confidence somatic variant calls for a single tumor sample.
22 | }
23 |
--------------------------------------------------------------------------------
/pkgdown/extra.css:
--------------------------------------------------------------------------------
1 | .grad1 {
2 | background: -webkit-linear-gradient(5deg, #8d2014, #d41818);
3 | -webkit-background-clip: text;
4 | -webkit-text-fill-color: transparent;
5 | }
6 |
7 | .grad2 {
8 | background: -webkit-linear-gradient(5deg, #f22121, #e20e0e);
9 | -webkit-background-clip: text;
10 | -webkit-text-fill-color: transparent;
11 | }
12 | .grad3 {
13 | background: -webkit-linear-gradient(5deg, #d41818, #8d2014);
14 | -webkit-background-clip: text;
15 | -webkit-text-fill-color: transparent;
16 | }
17 |
18 | @media (min-width: 1000px) {
19 | .container .row {
20 | justify-content: center
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-120x120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/apple-touch-icon-120x120.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-152x152.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/apple-touch-icon-152x152.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-180x180.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/apple-touch-icon-180x180.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-60x60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/apple-touch-icon-60x60.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-76x76.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/apple-touch-icon-76x76.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/apple-touch-icon.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/favicon-16x16.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/favicon-32x32.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Townsend-Lab-Yale/cancereffectsizeR/c0f5503e2beca7726bc535caf4f1fcee918afec3/pkgdown/favicon/favicon.ico
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/vignettes/MAF_filtering_tips.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "MAF filtering and sample validation"
3 | ---
4 |
5 | Since our goal is to quantify somatic selection, we want MAF data to represent a complete set of high-confidence somatic variants for the sample set. We accordingly expect the following:
6 |
7 | * There should be few to no mutations at sites where population databases show common germline variation.
8 | * There should be few to no mutations in repetitive or poorly mapped regions of the genome.
9 | * Samples should have little mutational overlap, especially at sites without known cancer association.
10 |
11 | #### Filtering to reduce false positive variants
12 |
13 | Well-curated data, such as MAF files produced using the Genomic Data Commons Aliquot Ensemble Somatic Variant Merging and Masking workflow, should not need quality filtering. For data produced with other or unknown somatic calling methods, reading an MAF file with `preload_maf()` provides three relevant annotation columns:
14 |
15 | * germline_variant_site: The variant overlaps a region that contains a common germline variant according to gnomAD (common being >1% prevalence in some population).
16 | * repetitive_region: The variant is in a region of the genome marked as repetitive by the RepeatMasker tool.
17 | * cosmic_site_tier: Indicates if the variant overlaps a site annotated as cancer-related (tiers 1, 2, and 3) by COSMIC.
18 |
19 | A simple strategy to reduce false positive calls is to filter out all germline site records, as well as records from repetitive regions except for the few with COSMIC annotations. We can apply this filtering like this:
20 | ```{r echo = T, eval = F}
21 | library(cancereffectsizeR)
22 | maf = preload_maf("my_data.maf", refset = "ces.refset.hg38") # also works with ces.refset.hg19
23 | maf = maf[germline_variant_site == F][repetitive_region == F | cosmic_site_tier %in% 1:3]
24 | ```
25 |
26 |
27 | #### Sample re-use, contamination, and multi-sample sequencing
28 |
29 | When combining data sources, it's important to verify that a patient's mutation data is not duplicated. Since it can be hard to be sure, we recommend both careful manual curation and the use of `check_sample_overlap()` to flag possible sample overlap.
30 |
31 | Sometimes, patients from the same data source will show suspiciously high mutational overlap. This could be due to shared calling error, or worse, contamination between samples during sequencing. If the latter appears likely, the data should not be used.
32 |
33 | Relatedly, patients with multiple distinct sequenced samples (multi-region sequencing, or multiple timepoints) should contribute just one sample to an effect analysis, unless there is evidence that the tissues evolved independently (unusual).
34 |
35 |
36 | #### What not to filter
37 |
38 | We don't apply the above filters to targeted gene sequencing data sets, since they presumably come from high-depth sequencing of cancer hotspots.
39 |
40 | To allow a complete picture of the mutational processes present in tissues for mutation rate estimation, whole-exome/whole-genome variants should not be filtered on any sort of functional criteria. One thing to watch out for: Occasionally, researchers will leave out synonymous variants when publishing their study data. As synonymous variants are essential for calculating neutral gene mutation rates, they must be included in any WXS/WGS data.
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/vignettes/cosmic_cancer_type_note.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Cancer type considerations for COSMIC signature extraction"
3 | ---
4 | The `trinuc_mutation_rates()` function uses mutational signature extraction to calculate relative trinucleotide-specific SNV mutation rates in tumors. The "signatures_to_remove" option allows some signatures to be excluded from this analysis, which means each tumor will receive a weight of 0 for these signatures, indicating that none of the tumor's SNVs are attributable to these signatures. This page describes the behavior of the helper function `suggest_cosmic_signature_exclusions()` and the reasoning behind it.
5 |
6 | As reported in [Alexandrov 2020](https://doi.org/10.1038/s41586-020-1943-3), and in COSMIC v3.1/v3.2 mutational signature releases, some signatures are only expected to appear in certain cancer types. For more reliable signature extraction, consider excluding implausible signatures when running `trinuc_mutation_rates()`.
7 |
8 | There are also some signatures associated with various drug treatments (SBS11, SBS31, SBS32, SBS35, SBS86, SBS87, SBS90), so if you know that your samples are treatment-naive or haven't been exposed to the implicated drugs, some or all of these signatures can be excluded.
9 |
10 | Note that while some COSMIC signatures are attributed to sequencing artifacts, you shouldn't exclude these because cancereffectsizeR already handles these signatures specially.
11 |
12 |
13 | The `suggest_cosmic_signature_exclusions()` function will identify possible signature exclusions based on
14 | cancer type and treatment status.
15 | ```{r eval = F}
16 | suggest_cosmic_signature_exclusions(cancer_type = "BRCA", treatment_naive = TRUE)
17 | suggest_cosmic_signature_exclusions(cancer_type = "Kidney-RCC")
18 | ```
19 |
20 | The cancer type recommendations are based on Extended Data Figure 5 of Alexandrov 2020 and the COSMIC website (for signatures released after the paper's publication). The first two columns of the table below give the labels accepted by the `cancer_type` argument.
21 |
22 | Before excluding signatures, make sure your data set does not contain tumors from multiple PCAWG categories. For example, TCGA HNSC (head and neck cancer) includes oral cancers, which are listed separately here as Oral-SCC, so excluding all signatures that do not appear in Head-SCC (such as SBS29, tobacco chewing) would not be appropriate.
23 |
24 | (You can access a text version of this table [here](https://github.com/Townsend-Lab-Yale/cancereffectsizeR/blob/main/inst/extdata/COSMIC_v3.2_signatures_by_cancer_type.txt).)
25 |
26 | ```{r, echo=FALSE, warning=FALSE, message=FALSE}
27 | data_source = paste0(system.file("extdata", package = "cancereffectsizeR"), '/COSMIC_v3.2_signatures_by_cancer_type.txt')
28 | cancer_type = data.table::fread(data_source)
29 | cancer_type[is.na(cancer_type)] = "(none)"
30 | cancer_type[, "SBS signatures found" := sapply(apply(cancer_type, 1, function(x) names(cancer_type)[x == 1]), paste, collapse = ", ")]
31 | formattable::formattable(cancer_type[, .(PCAWG, "Applicable TCGA" = Applicable_TCGA, Description, `SBS signatures found`)])
32 | ```
33 |
34 |
--------------------------------------------------------------------------------
/vignettes/custom_refset_instructions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Running with custom reference data"
3 | ---
4 |
5 | A refset is a collection of all the reference data required to run a cancereffectsizeR analysis. Currently, refsets for the hg38 and hg19 human genome builds are available as the [ces.refset.hg38](https://github.com/Townsend-Lab-Yale/ces.refset.hg38) and [ces.refset.hg19](https://github.com/Townsend-Lab-Yale/ces.refset.hg19) data packages. It's possible to generate your own reference data for pretty much any species or genome build. Here is the general method:
6 |
7 | 1. Find a GTF file containing coding sequence (CDS) definitions, filter to high-quality and desired CDS regions, and run `build_RefCDS()` to get a collection of gene/transcript/CDS information.
8 |
9 | 2. Run `create_refset()`, which will require your RefCDS output and some additional information, to save reference data to an output directory.
10 |
11 | The output of `create_refset` is usable with cancereffectsizeR, but there are two additional sources of data that you will probably want to either add to your refset directory, or at least have available to supply to cancereffectsizeR functions:
12 |
13 | 3. A CES Signature Set, containing SNV signature definitions and metadata, is required for mutational signature extraction. As long as you have a matrix of signature definitions (e.g., see those published by COSMIC), you can create your own set. You can include one or more signature sets in your refset, or you can pass a signature set as an argument to `trinuc_mutation_rates()`. (In either case, you can also copy a signature set from an existing refset if the genome build is compatible.) See the details in `trinuc_mutation_rates()` for how to make a signature set, then validate with `validate_signature_set()`. Finally, to save with your refset, create a subdirectory called "signatures" in your refset directory, and then use `saveRDS` to save your signature set with the name "[set_name]_signatures.rds".
14 |
15 | 4. Tissue covariates data inform the calculation of gene mutation rates when cancereffectsizeR calls dNdScv. Covariates data can be saved in a refset subdirectory called "covariates", or you can supply them as an argument to `gene_mutation_rates()`. (Example: "covariates/lung.rds" can be specified with `covariates = "lung"`.) [This guide](create_custom_covariates.html) shows how to generate covariates data by combining and processing tissue-specific experimental data from several sources. As a last resort, `gene_mutation_rates` can be run without covariates, but it's best to use them if available.
16 |
17 | For an example refset, see the ces.refset.hg38 data package's refset directory [here](https://github.com/Townsend-Lab-Yale/ces.refset.hg38/tree/main/inst/refset). If your refset also uses the hg19 or hg38 genome build, you can copy signatures and covariates from the refset packages into your own refset directory if desired.
18 |
--------------------------------------------------------------------------------