├── AUTHORS ├── data ├── input_table.rda └── results │ ├── CRUKTOY001.tree.RDS │ ├── pytree_and_bar.pdf │ └── pytree_multipletrees.pdf ├── man ├── unfold_tree.Rd ├── prune.tree.Rd ├── conipher_run.Rd ├── color.tree.Rd ├── extract_consensus_relationships.Rd ├── get_tree_level.Rd ├── is.there.ccf.issue.Rd ├── remove_clustered_clones.Rd ├── treebuilding_plot.Rd ├── clusterDistributionAcrossGenome.Rd ├── get_terminal_clusters.Rd ├── correct.clonality.nesting.Rd ├── createAllPathsList.Rd ├── process_mean_cluster_ccfs.Rd ├── extract_daughters.Rd ├── input_table.Rd ├── conipher_treebuilding.Rd ├── permute.clusters.to.remove.Rd ├── compute_tree_edge_probability.Rd ├── compute_sum_condition_error.Rd ├── treebuilding_preprocess.Rd ├── calc.pyclone.ci.Rd ├── conipher_clustering.Rd ├── test.distributions.Rd ├── grow.multi.trees.Rd ├── clustering_run.Rd ├── grow.trees.Rd ├── compute_subclone_proportions.Rd ├── clustering_preprocess.Rd ├── clonality.function.Rd ├── clustering_postprocess.Rd ├── determine.cluster.nesting.Rd ├── compute_subclonal_expansion_score.Rd └── treebuilding_run.Rd ├── R ├── example.R ├── main_conipher_run.R ├── sequenza_functions.R ├── functionsForSimpleClustering.v13.R ├── main_treebuilding_functions.R └── main_clustering_functions.R ├── inst └── extdata │ └── template.config.yaml ├── DESCRIPTION ├── NAMESPACE ├── LICENSE └── README.md /AUTHORS: -------------------------------------------------------------------------------- 1 | Nicholas McGranahan, 2 | Ariana Huebner, 3 | Kristiana Grigoriadis -------------------------------------------------------------------------------- /data/input_table.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/input_table.rda -------------------------------------------------------------------------------- /data/results/CRUKTOY001.tree.RDS: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/results/CRUKTOY001.tree.RDS -------------------------------------------------------------------------------- /data/results/pytree_and_bar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/results/pytree_and_bar.pdf -------------------------------------------------------------------------------- /data/results/pytree_multipletrees.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/results/pytree_multipletrees.pdf -------------------------------------------------------------------------------- /man/unfold_tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{unfold_tree} 4 | \alias{unfold_tree} 5 | \title{Function to unfold tree} 6 | \usage{ 7 | unfold_tree(edgelist, lower, trunk) 8 | } 9 | \description{ 10 | Function to unfold tree 11 | } 12 | -------------------------------------------------------------------------------- /man/prune.tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{prune.tree} 4 | \alias{prune.tree} 5 | \title{Function to prune the ancestral graph} 6 | \usage{ 7 | prune.tree(edgelist, nestedclust) 8 | } 9 | \description{ 10 | Function to prune the ancestral graph 11 | } 12 | -------------------------------------------------------------------------------- /man/conipher_run.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_conipher_run.R
3 | \name{conipher_run} 4 | \alias{conipher_run} 5 | \title{Full CONIPHER run} 6 | \usage{ 7 | conipher_run(case_id, prefix, out_dir, input_tsv_loc, ...) 8 | } 9 | \arguments{ 10 | \item{case_id, prefix, out_dir, input_tsv_loc, ...}{case identifier, sample prefix, base output directory, input table location, and further options passed on to the clustering and tree building steps} 11 | } 12 | \description{ 13 | This function takes all the input options and runs the two main steps: 14 | clustering followed by tree building 15 | } 16 | -------------------------------------------------------------------------------- /man/color.tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{color.tree} 4 | \alias{color.tree} 5 | \title{Plotting function to colour the nodes on the phylogenetic tree} 6 | \usage{ 7 | color.tree(edgelength, opacity = 255) 8 | } 9 | \arguments{ 10 | \item{edgelength}{A named vector containing number of mutations of each cluster} 11 | } 12 | \description{ 13 | Plotting function to colour the nodes on the phylogenetic tree 14 | } 15 | -------------------------------------------------------------------------------- /man/extract_consensus_relationships.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{extract_consensus_relationships} 4 | \alias{extract_consensus_relationships} 5 | \title{Function to take a list of trees and identify the consensus relationships} 6 | \usage{ 7 | extract_consensus_relationships(tree_list, output_as_table = FALSE) 8 | } 9 | \description{ 10 | Function to take a list of trees and identify the consensus relationships 11 | } 12 | -------------------------------------------------------------------------------- /man/get_tree_level.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | %
Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{get_tree_level} 4 | \alias{get_tree_level} 5 | \title{Function to compute the tree level of a cluster} 6 | \usage{ 7 | get_tree_level(tree_graph, cluster) 8 | } 9 | \arguments{ 10 | \item{tree_graph}{A matrix of a tree structure} 11 | 12 | \item{cluster}{Name of a cluster for which you want to get the tree level} 13 | } 14 | \description{ 15 | Function to compute the tree level of a cluster 16 | } 17 | -------------------------------------------------------------------------------- /man/is.there.ccf.issue.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{is.there.ccf.issue} 4 | \alias{is.there.ccf.issue} 5 | \title{Function to check whether there is a CCF level issue} 6 | \usage{ 7 | is.there.ccf.issue( 8 | nestedclust, 9 | directed_input_graph, 10 | ccf_ci_lower, 11 | trunk_cluster, 12 | clusters_to_remove, 13 | clusters_to_use, 14 | max_per_level = 115 15 | ) 16 | } 17 | \description{ 18 | Function to check whether there is a CCF level issue 19 | } 20 | -------------------------------------------------------------------------------- /man/remove_clustered_clones.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{remove_clustered_clones} 4 | \alias{remove_clustered_clones} 5 | \title{Function to remove mutation clusters with genomically clustered mutations} 6 | \usage{ 7 | remove_clustered_clones( 8 | test.pyclone, 9 | clonal_cluster, 10 | p_value_cut = 0.01, 11 | clustering_estimate_cut = 2 12 | ) 13 | } 14 | \description{ 15 | Function to remove mutation clusters with genomically clustered mutations 16 | } 17 | 
-------------------------------------------------------------------------------- /man/treebuilding_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_treebuilding_functions.R 3 | \name{treebuilding_plot} 4 | \alias{treebuilding_plot} 5 | \title{TRACERx tree plotting function} 6 | \usage{ 7 | treebuilding_plot(sample_pyclone_tree) 8 | } 9 | \arguments{ 10 | \item{sample_pyclone_tree}{A list containing all information about the 11 | tree inferred using function tracerx.tree.building()} 12 | } 13 | \description{ 14 | This function is the CONIPHER function to plot the inferred phylogenetic tree. 15 | } 16 | -------------------------------------------------------------------------------- /man/clusterDistributionAcrossGenome.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{clusterDistributionAcrossGenome} 4 | \alias{clusterDistributionAcrossGenome} 5 | \title{Function to determine the distribution of location of mutations within a cluster} 6 | \usage{ 7 | clusterDistributionAcrossGenome( 8 | cluster, 9 | clonal_cluster, 10 | test.pyclone, 11 | iterations = 10000 12 | ) 13 | } 14 | \description{ 15 | Function to determine the distribution of location of mutations within a cluster 16 | } 17 | -------------------------------------------------------------------------------- /man/get_terminal_clusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{get_terminal_clusters} 4 | \alias{get_terminal_clusters} 5 | \title{Function to extract the terminal nodes of a phylogenetic tree} 6 | \usage{ 7 | 
get_terminal_clusters(tree_structure) 8 | } 9 | \arguments{ 10 | \item{tree_structure}{A matrix of a tree structure (edge matrix)} 11 | } 12 | \value{ 13 | A vector of the terminal nodes in the tree 14 | } 15 | \description{ 16 | Function to extract the terminal nodes of a phylogenetic tree 17 | } 18 | -------------------------------------------------------------------------------- /man/correct.clonality.nesting.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{correct.clonality.nesting} 4 | \alias{correct.clonality.nesting} 5 | \title{Function to correct nesting based on cluster clonality} 6 | \usage{ 7 | correct.clonality.nesting( 8 | nestedlist, 9 | pyclone, 10 | clonality_table, 11 | pval_cutoff = 0.01, 12 | min_cluster_size = 5, 13 | use_boot = TRUE, 14 | min_ccf = 0.05, 15 | prefix = prefix 16 | ) 17 | } 18 | \description{ 19 | Function to correct nesting based on cluster clonality 20 | } 21 | -------------------------------------------------------------------------------- /man/createAllPathsList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{createAllPathsList} 4 | \alias{createAllPathsList} 5 | \title{Function to create a list of all tree paths} 6 | \usage{ 7 | createAllPathsList(tree.structure, trunk) 8 | } 9 | \arguments{ 10 | \item{tree.structure}{An R matrix describing the tree structure with two columns 11 | specifying 'parent' (column 1) and child (column 2)} 12 | } 13 | \value{ 14 | An R list of all tree paths from trunk to leaves 15 | } 16 | \description{ 17 | Function to create a list of all tree paths 18 | } 19 | -------------------------------------------------------------------------------- /R/example.R:
-------------------------------------------------------------------------------- 1 | #' Example input table 2 | #' 3 | #' Example input table to input to CONIPHER tree building. The input table should 4 | #' have the following columns: CASE_ID, SAMPLE, CHR, POS, REF, 5 | #' ALT, REF_COUNT, VAR_COUNT, DEPTH, CLUSTER, CCF_PHYLO, CCF_OBS, 6 | #' MUT_COPY, COPY_NUMBER_A, COPY_NUMBER_B, ACF, and PLOIDY. Full description of 7 | #' the input format can be found in our companion manuscript. 8 | #' 9 | #' @docType data 10 | #' 11 | #' @usage data(input_table) 12 | #' 13 | #' @format An object of class \code{"data.frame"} 14 | #' 15 | #' @keywords datasets 16 | #' 17 | #' @examples 18 | #' data(input_table) 19 | #' head(input_table) 20 | "input_table" 21 | -------------------------------------------------------------------------------- /inst/extdata/template.config.yaml: -------------------------------------------------------------------------------- 1 | num_iters: 10000 2 | 3 | base_measure_params: 4 | alpha: 1 5 | beta: 1 6 | 7 | concentration: 8 | value: 1.0 9 | 10 | prior: 11 | shape: 1.0 12 | rate: 0.001 13 | 14 | density: pyclone_beta_binomial 15 | 16 | beta_binomial_precision_params: 17 | value: 1000 18 | 19 | prior: 20 | shape: 1.0 21 | rate: 0.0001 22 | 23 | proposal: 24 | precision: 0.01 25 | 26 | working_dir: working.directory.location 27 | 28 | trace_dir: trace 29 | 30 | samples: 31 | TCGA.barcode: 32 | mutations_file: mutations.yaml 33 | 34 | tumour_content: 35 | value: 1.0 36 | 37 | error_rate: 0.001 38 | 39 | 40 | -------------------------------------------------------------------------------- /man/process_mean_cluster_ccfs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{process_mean_cluster_ccfs} 4 | \alias{process_mean_cluster_ccfs} 5 | \title{Function to extract the terminal nodes of a 
phylogenetic tree} 6 | \usage{ 7 | process_mean_cluster_ccfs(ccf_table_pyclone_clean) 8 | } 9 | \arguments{ 10 | \item{ccf_table_pyclone_clean}{A mutation table with PhyloCCF values. This 11 | table is an item in the R list object list sample_pyclone_tree, which is the 12 | output of the CONIPHER treebuilding_run() function.} 13 | } 14 | \description{ 15 | Function to extract the terminal nodes of a phylogenetic tree 16 | } 17 | -------------------------------------------------------------------------------- /man/extract_daughters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{extract_daughters} 4 | \alias{extract_daughters} 5 | \title{Function to extract all daughter clones from a parent using a phylogenetic tree} 6 | \usage{ 7 | extract_daughters(tree, parent.clones) 8 | } 9 | \arguments{ 10 | \item{tree}{A phylogenetic tree matrix with two columns specifying 11 | 'parent' (column 1) and child (column 2)} 12 | 13 | \item{parent.clones}{The name of the parent clone(s) for which you wish to find all daughters} 14 | } 15 | \description{ 16 | Function to extract all daughter clones from a parent using a phylogenetic tree 17 | } 18 | -------------------------------------------------------------------------------- /man/input_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/example.R 3 | \docType{data} 4 | \name{input_table} 5 | \alias{input_table} 6 | \title{Example input table} 7 | \format{ 8 | An object of class \code{"data.frame"} 9 | } 10 | \usage{ 11 | data(input_table) 12 | } 13 | \description{ 14 | Example input table to input to CONIPHER tree building. 
The input table should 15 | have the following columns: CASE_ID, SAMPLE, CHR, POS, REF, 16 | ALT, REF_COUNT, VAR_COUNT, DEPTH, CLUSTER, CCF_PHYLO, CCF_OBS, 17 | MUT_COPY, COPY_NUMBER_A, COPY_NUMBER_B, ACF, and PLOIDY. Full description of 18 | the input format can be found in our companion manuscript. 19 | } 20 | \examples{ 21 | data(input_table) 22 | head(input_table) 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/conipher_treebuilding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_treebuilding_functions.R 3 | \name{conipher_treebuilding} 4 | \alias{conipher_treebuilding} 5 | \title{Full tree building run function} 6 | \usage{ 7 | conipher_treebuilding( 8 | input_tsv_loc, 9 | out_dir, 10 | prefix, 11 | ccf_buffer = 10, 12 | pval_cutoff = 0.01, 13 | use_boot = TRUE, 14 | merge_clusters = TRUE, 15 | correct_cpn_clusters = TRUE, 16 | adjust_noisy_clusters = FALSE, 17 | adjust_noisy_clusters_prop = 0.05, 18 | min_ccf = 0.01, 19 | min_cluster_size = 5, 20 | multi_trees = TRUE, 21 | ... 
22 | ) 23 | } 24 | \arguments{ 25 | \item{opt}{a list of options} 26 | } 27 | \description{ 28 | This function takes all the input options and runs the three main steps: 29 | preprocess, tree building run and postprocess 30 | } 31 | -------------------------------------------------------------------------------- /man/permute.clusters.to.remove.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{permute.clusters.to.remove} 4 | \alias{permute.clusters.to.remove} 5 | \title{Function to test which clusters are best to remove from tree 6 | This function permutes through clusters and checks which are the best to remove 7 | from the tree - takes into account number of mutations} 8 | \usage{ 9 | permute.clusters.to.remove( 10 | test_out, 11 | nestedclust, 12 | max_per_level, 13 | tlevels, 14 | trunk_cluster, 15 | cluster_qc, 16 | ccf_ci_lower, 17 | nclusters, 18 | speed_cluster = 15 19 | ) 20 | } 21 | \description{ 22 | Function to test which clusters are best to remove from tree 23 | This function permutes through clusters and checks which are the best to remove 24 | from the tree - takes into account number of mutations 25 | } 26 | -------------------------------------------------------------------------------- /man/compute_tree_edge_probability.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{compute_tree_edge_probability} 4 | \alias{compute_tree_edge_probability} 5 | \title{Function to compute the edge probability score for a list of trees} 6 | \usage{ 7 | compute_tree_edge_probability(tree_list, edgelength, trunk) 8 | } 9 | \arguments{ 10 | \item{tree_list}{A list of tree matrices} 11 | 12 | \item{edgelength}{A named vector containing number of 
mutations of each cluster} 13 | 14 | \item{trunk}{The name of the truncal cluster} 15 | } 16 | \value{ 17 | The edge probability score for each 18 | tree structure in the input tree list 19 | } 20 | \description{ 21 | This function takes a list of phylogenetic tree structures and the number of mutations in each cluster and 22 | computes the edge probability score for each tree. 23 | } 24 | -------------------------------------------------------------------------------- /man/compute_sum_condition_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{compute_sum_condition_error} 4 | \alias{compute_sum_condition_error} 5 | \title{Function to compute the Sum Condition Error for a list of trees} 6 | \usage{ 7 | compute_sum_condition_error(tree_list, ccf_cluster_table, trunk) 8 | } 9 | \arguments{ 10 | \item{tree_list}{A list of tree matrices} 11 | 12 | \item{ccf_cluster_table}{A matrix of mean PhyloCCF of each cluster in 13 | each tumour region} 14 | 15 | \item{trunk}{The name of the truncal cluster} 16 | } 17 | \value{ 18 | sce_vec, A named vector of the sum condition error (SCE) for each 19 | tree structure in the input tree list 20 | } 21 | \description{ 22 | This function takes a list of phylogenetic tree structures and the PhyloCCF cluster table and 23 | computes the sum condition error for each tree.
24 | } 25 | -------------------------------------------------------------------------------- /man/treebuilding_preprocess.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_treebuilding_functions.R 3 | \name{treebuilding_preprocess} 4 | \alias{treebuilding_preprocess} 5 | \title{Input data preprocessing function} 6 | \usage{ 7 | treebuilding_preprocess(input_table, prefix, out_dir) 8 | } 9 | \arguments{ 10 | \item{input_table}{A dataframe of the input mutation table in the correct 11 | format. For more information on the input table format, please see our 12 | tree building protocol.} 13 | 14 | \item{prefix}{A tumour case and sample prefix, e.g. 'CRUK'.} 15 | 16 | \item{out_dir}{A file path to the desired output directory} 17 | } 18 | \description{ 19 | This function takes the input tsv and formats the data to be compatible with 20 | the main CONIPHER tree building function. NOTE: it is assumed that 21 | clustering has been carried out prior to running tree building. 22 | } 23 | -------------------------------------------------------------------------------- /R/main_conipher_run.R: -------------------------------------------------------------------------------- 1 | #' Full CONIPHER run 2 | #' 3 | #' This function takes all the input options and runs the two main steps: 4 | #' clustering followed by tree building 5 | #' @param case_id,prefix,out_dir,input_tsv_loc,... case identifier, sample prefix, base output directory, input table location, and further options passed on to the clustering and tree building steps 6 | #' @returns NULL 7 | #' @export conipher_run 8 | 9 | conipher_run <- function(case_id, prefix, out_dir, input_tsv_loc, ...) { 10 | out_dir_tmp <- paste0(out_dir, "/Clustering/") 11 | conipher_clustering(case_id = case_id, 12 | out_dir = out_dir_tmp, 13 | input_tsv_loc = input_tsv_loc, 14 | ...)
15 | tree_input_tsv_loc <- paste0(out_dir_tmp, case_id, ".SCoutput.CLEAN.tsv") 16 | out_dir_tmp <- paste0(out_dir, "/Trees/") 17 | conipher_treebuilding(input_tsv_loc = tree_input_tsv_loc, 18 | out_dir = out_dir_tmp, 19 | prefix = prefix, 20 | ...) 21 | } 22 | -------------------------------------------------------------------------------- /man/calc.pyclone.ci.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{calc.pyclone.ci} 4 | \alias{calc.pyclone.ci} 5 | \title{Function to compute confidence intervals of each cluster in each region} 6 | \usage{ 7 | calc.pyclone.ci(pyclone, pyclust, nclusters, prefix = "LTX", lower_min = 0) 8 | } 9 | \arguments{ 10 | \item{pyclone}{An R list object containing information about the PhyloCCF 11 | of each mutation in each tumour region} 12 | } 13 | \value{ 14 | An R list containing elements: 'ccf_cluster_table', 'mean_phylo_ccf', 15 | 'median_pyclone_ccf', 'median_phylo_ccf', 'ccf_ci_upper', 'ccf_ci_lower', 16 | 'ccf_ci_boot_upper', 'ccf_ci_boot_lower' 17 | } 18 | \description{ 19 | This function takes in an R list containing mutation PhyloCCF in each region, 20 | and mutation assignments to a cluster and computes bootstrapped 21 | confidence intervals. 22 | } 23 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: CONIPHER 2 | Type: Package 3 | Title: R package for tumour phylogenetic tree reconstruction 4 | Version: 2.1.0 5 | Author: Nicholas McGranahan 6 | Maintainers: Kristiana Grigoriadis ; Ariana Huebner 7 | Description: CONIPHER is an R package for clustering mutation data and reconstruction of tumour phylogenetic 8 | trees from DNA sequencing. 9 | A full description of CONIPHER can be found in our pre-print XXX. 
10 | Depends: 11 | R (>= 3.6.1) 12 | Imports: 13 | stats, 14 | utils, 15 | graphics, 16 | grDevices, 17 | plyr, 18 | dplyr, 19 | tidyr, 20 | parallel, 21 | boot, 22 | coin, 23 | RColorBrewer, 24 | wordcloud, 25 | data.table, 26 | beeswarm, 27 | mapplots, 28 | igraph, 29 | gplots 30 | Suggests: 31 | devtools 32 | License: BSD_3_clause + file LICENSE 33 | Encoding: UTF-8 34 | LazyData: true 35 | RoxygenNote: 7.2.3.9000 36 | -------------------------------------------------------------------------------- /man/conipher_clustering.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_clustering_functions.R 3 | \name{conipher_clustering} 4 | \alias{conipher_clustering} 5 | \title{Full clustering run function} 6 | \usage{ 7 | conipher_clustering( 8 | case_id, 9 | out_dir, 10 | input_tsv_loc, 11 | input_seg_tsv_loc = NULL, 12 | subclonal_copy_correction = TRUE, 13 | only_truncal_subclonal_copy_correction = TRUE, 14 | pyclone_yaml_loc = NULL, 15 | min_cluster_size = 5, 16 | multiple_test_correction = TRUE, 17 | clean_clusters = TRUE, 18 | clonal_cutOff = 0.9, 19 | propClonal_threshold = 0.25, 20 | fix_absentCCFs = TRUE, 21 | driver_filter = "1A,1,2A", 22 | burn_in = 1000, 23 | seed = 1024, 24 | nProcs = 1, 25 | ...
26 | ) 27 | } 28 | \arguments{ 29 | \item{opt}{a list of options} 30 | } 31 | \description{ 32 | This function takes all the input options and runs the three main steps: 33 | preprocess, clustering run and postprocess 34 | } 35 | -------------------------------------------------------------------------------- /R/sequenza_functions.R: -------------------------------------------------------------------------------- 1 | ### functions from sequenza version 2.1.2 2 | ### copied over by Ariana Huebner 3 | 4 | types.matrix <- function (CNt.min, CNt.max, CNn = 2) { 5 | cn.ratio.vect <- seq(from = CNt.min / CNn, to = CNt.max / CNn, by = 1 / CNn) 6 | CNt <- cn.ratio.vect * CNn 7 | mut.comb <- lapply(CNt, FUN = function(x) seq(from = 0, to = x)) 8 | times.muts <- sapply(mut.comb, length) 9 | data.frame(CNn = CNn, CNt = rep(CNt, times = times.muts), Mt = unlist(mut.comb)) 10 | } 11 | 12 | theoretical.mufreq <- function (Mt, CNt, CNn = 2, cellularity) { 13 | normal.alleles <- (CNt - Mt) * cellularity + CNn * (1 - cellularity) 14 | all.alleles <- (CNt * cellularity) + CNn * (1 - cellularity) 15 | 1 - (normal.alleles / all.alleles) 16 | } 17 | 18 | mufreq.dpois <- function (mufreq, mufreq.model, depth.t, seq.errors = 0.01, ...) { 19 | mufreq.model[mufreq.model == 0] <- seq.errors 20 | n.success <- round(mufreq * depth.t, 0) 21 | dpois(x = n.success, lambda = mufreq.model * depth.t, ...) 
22 | } -------------------------------------------------------------------------------- /man/test.distributions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{test.distributions} 4 | \alias{test.distributions} 5 | \title{Test difference in PhyloCCF distributions of each pair of mutation clusters} 6 | \usage{ 7 | test.distributions(pyclone, nclusters, pval_cutoff = 0.05) 8 | } 9 | \arguments{ 10 | \item{pyclone}{An R list object containing information about the PhyloCCF 11 | of each mutation in each tumour region.} 12 | 13 | \item{nclusters}{Number of clusters} 14 | 15 | \item{pval_cutoff}{A p-value significance threshold for testing whether 16 | clusters can be nested. (i.e. a p-value < pval_cutoff is significant)} 17 | } 18 | \value{ 19 | This function returns list of nesting matrices. Each element of the 20 | list is a nesting matrix for one tumour region, that describes whether a 21 | cluster A (row) can be nested within a cluster B (column). 22 | } 23 | \description{ 24 | This function compares the distributions of the PhyloCCF of each pair of 25 | mutation clusters in the dataset and outputs a nesting matrix. 
26 | } 27 | -------------------------------------------------------------------------------- /man/grow.multi.trees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{grow.multi.trees} 4 | \alias{grow.multi.trees} 5 | \title{Function to determine all possible alternative phylogenies} 6 | \usage{ 7 | grow.multi.trees( 8 | nestedlist, 9 | graph_pyclone, 10 | pyclone, 11 | ccf_buffer = 10, 12 | n_clusters_to_move = 5 13 | ) 14 | } 15 | \arguments{ 16 | \item{nestedlist}{An R list containing information about the nesting 17 | structure of mutation clusters in each region.} 18 | 19 | \item{graph_pyclone}{An R list containing information about the tree structure} 20 | 21 | \item{pyclone}{An R list containing information about mutation PhyloCCF} 22 | 23 | \item{ccf_buffer}{PhyloCCF buffer permitted when checking tree level issue} 24 | 25 | \item{n_clusters_to_move}{Maximum number of clusters to move simultaneously} 26 | } 27 | \value{ 28 | An R list containing all possible alternative tree structures and 29 | information about which branches are consensus across multiple trees 30 | } 31 | \description{ 32 | Function to determine all possible alternative phylogenies 33 | } 34 | -------------------------------------------------------------------------------- /man/clustering_run.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_clustering_functions.R 3 | \name{clustering_run} 4 | \alias{clustering_run} 5 | \title{Main clustering function} 6 | \usage{ 7 | clustering_run( 8 | input_list, 9 | nProcs, 10 | new.dir, 11 | burn_in, 12 | pyclone_seed, 13 | template.config.yaml 14 | ) 15 | } 16 | \arguments{ 17 | \item{input_list}{A list created by the clustering preprocess function 18 | including 
patient id, regions to use, phylo region list and others.} 19 | 20 | \item{nProcs}{A value referring to how many parallel processes 21 | of pyclone should be run.} 22 | 23 | \item{new.dir}{A character specifying the directory where the pyclone 24 | output should be saved.} 25 | 26 | \item{burn_in}{Burn-in for DP clustering.} 27 | 28 | \item{pyclone_seed}{Seed for PyClone run.} 29 | 30 | \item{template.config.yaml}{Location of the template yaml file used to run PyClone.} 31 | } 32 | \value{ 33 | sample.results which is the location of the pyclone output table. 34 | } 35 | \description{ 36 | This function takes the input list created in the preprocessing along with 37 | the number of cores and output directory to run the main clustering. 38 | } 39 | -------------------------------------------------------------------------------- /man/grow.trees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{grow.trees} 4 | \alias{grow.trees} 5 | \title{Function to determine default tree structure} 6 | \usage{ 7 | grow.trees( 8 | nestedlist, 9 | pyclone, 10 | min_cluster_size = 5, 11 | ccf_buffer = 10, 12 | force_trunk = TRUE, 13 | skip_size = 20 14 | ) 15 | } 16 | \arguments{ 17 | \item{nestedlist}{An R list containing information about the nesting 18 | structure of mutation clusters in each region.} 19 | 20 | \item{pyclone}{An R list object containing information about the PhyloCCF 21 | of each mutation in each tumour region.} 22 | 23 | \item{min_cluster_size}{Threshold for minimum number of mutations required in 24 | a mutation cluster} 25 | 26 | \item{ccf_buffer}{PhyloCCF buffer permitted when checking tree level issue} 27 | 28 | \item{prefix}{A character string indicating the sample and tumour case prefix} 29 | } 30 | \description{ 31 | This function takes as input an R list containing information about 32 | PhyloCCF 
of each mutation (pyclone) and an R list containing a cluster ccf table 33 | and nesting matrix (nestedlist), and returns an R list containing the default tree 34 | structure. 35 | } 36 | -------------------------------------------------------------------------------- /man/compute_subclone_proportions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{compute_subclone_proportions} 4 | \alias{compute_subclone_proportions} 5 | \title{Function to compute clone proportions on a selected alternative tree structure} 6 | \usage{ 7 | compute_subclone_proportions( 8 | tree_list, 9 | ccf_cluster_table, 10 | clonality_table, 11 | trunk, 12 | force_clonal_100 = TRUE, 13 | tree_id = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{tree_list}{A list of tree matrices} 18 | 19 | \item{ccf_cluster_table}{A matrix of mean PhyloCCF of each cluster in 20 | each tumour region} 21 | 22 | \item{clonality_table}{A matrix of clonality calls for each cluster in 23 | each tumour region} 24 | 25 | \item{trunk}{The name of the truncal cluster} 26 | 27 | \item{force_clonal_100}{A logical indicating whether to force clusters that are 28 | 'clonal' in a region to have CCF==100} 29 | 30 | \item{tree_id}{The tree index of the selected alternative tree for which you 31 | want to compute the clone proportions} 32 | } 33 | \value{ 34 | clone_proportion_table, a matrix containing the clone proportions of 35 | each clone (rows) in each tumour sample (columns) 36 | } 37 | \description{ 38 | Function to compute clone proportions on a selected alternative tree structure 39 | } 40 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(clustering_postprocess) 4 |
export(clustering_preprocess) 5 | export(clustering_run) 6 | export(compute_subclonal_expansion_score) 7 | export(compute_subclone_proportions) 8 | export(conipher_clustering) 9 | export(conipher_run) 10 | export(conipher_treebuilding) 11 | export(get_terminal_clusters) 12 | export(treebuilding_plot) 13 | export(treebuilding_preprocess) 14 | export(treebuilding_run) 15 | import(plyr) 16 | importFrom(RColorBrewer,"brewer.pal") 17 | importFrom(boot,"boot") 18 | importFrom(boot,"boot.ci") 19 | importFrom(data.table,":=") 20 | importFrom(dplyr,"%>%") 21 | importFrom(grDevices,"colorRampPalette") 22 | importFrom(grDevices,"dev.off") 23 | importFrom(grDevices,"pdf") 24 | importFrom(graphics,"abline") 25 | importFrom(graphics,"axis") 26 | importFrom(graphics,"barplot") 27 | importFrom(graphics,"layout") 28 | importFrom(graphics,"legend") 29 | importFrom(graphics,"par") 30 | importFrom(graphics,"plot.new") 31 | importFrom(graphics,"segments") 32 | importFrom(graphics,"text") 33 | importFrom(graphics,"title") 34 | importFrom(igraph,"get.edgelist") 35 | importFrom(igraph,"shortest.paths") 36 | importFrom(parallel,mclapply) 37 | importFrom(stats,"median") 38 | importFrom(stats,"qnorm") 39 | importFrom(stats,"sd") 40 | importFrom(stats,"setNames") 41 | importFrom(stats,"wilcox.test") 42 | importFrom(utils,"combn") 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, the respective contributors, as shown by the AUTHORS file. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /man/clustering_preprocess.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_clustering_functions.R 3 | \name{clustering_preprocess} 4 | \alias{clustering_preprocess} 5 | \title{Input data preprocessing function} 6 | \usage{ 7 | clustering_preprocess( 8 | input_table, 9 | new.dir, 10 | subclonal_copy_correction = TRUE, 11 | multiple_test_correction = TRUE, 12 | only_truncal_subclonal_copy_correction = TRUE, 13 | fix_absentCCFs = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{input_table}{A data frame of the input mutation table in the correct 18 | format. For more information on the input table format, please see our 19 | tree building protocol.} 20 | 21 | \item{new.dir}{A character specifying the directory where the pyclone 22 | output should be saved.} 23 | 24 | \item{subclonal_copy_correction}{A logical value that specifies whether subclonal 25 | copy number correction should be performed. 26 | Default is set to TRUE} 27 | 28 | \item{multiple_test_correction}{A logical value that specifies whether multiple 29 | testing correction should be applied for the copy number correcting mutations. 30 | Default is set to TRUE} 31 | 32 | \item{only_truncal_subclonal_copy_correction}{A logical value that specifies 33 | whether only truncal subclonal copy number correction should be used. 34 | Default is set to TRUE} 35 | 36 | \item{fix_absentCCFs}{A logical value that specifies whether CCF 37 | of absent mutations should be set to zero. 38 | Default is set to TRUE} 39 | } 40 | \value{ 41 | list including patient, regions.to.use, mut.table, seg.mat.copy, 42 | seg.mat.phylo, phylo.region.list, simpleClusterList 43 | } 44 | \description{ 45 | This function takes the input tsv and formats the data to be compatible with 46 | the main CONIPHER clustering function.
47 | } 48 | -------------------------------------------------------------------------------- /man/clonality.function.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{clonality.function} 4 | \alias{clonality.function} 5 | \title{Function to determine cluster clonality in each tumour region} 6 | \usage{ 7 | clonality.function( 8 | pyclone, 9 | trunk, 10 | ccf_buffer = 10, 11 | prefix = "LTX", 12 | min_cluster_size = 5, 13 | pval_cutoff = 0.01, 14 | min_ccf = 0.05, 15 | use_boot = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{pyclone}{An R list object containing information about the PhyloCCF 20 | of each mutation in each tumour region.} 21 | 22 | \item{trunk}{truncal cluster name} 23 | 24 | \item{ccf_buffer}{PhyloCCF buffer permitted when checking tree level issue} 25 | 26 | \item{min_cluster_size}{Threshold for minimum number of mutations required in 27 | a mutation cluster} 28 | 29 | \item{pval_cutoff}{A p-value significance threshold for testing whether 30 | clusters can be nested. (i.e. a p-value < pval_cutoff is significant)} 31 | 32 | \item{min_ccf}{Minimum threshold for cluster PhyloCCF allowed to be classified 33 | as present} 34 | 35 | \item{use_boot}{Whether to use bootstrapping to determine confidence intervals 36 | for each mutation cluster} 37 | } 38 | \value{ 39 | a matrix of dimensions (n_clusters x n_regions) classifying each 40 | cluster as 'clonal', 'subclonal', or 'absent' in each tumour region. 41 | } 42 | \description{ 43 | This function takes as input an R list containing information about PhyloCCF 44 | of each mutation (pyclone) and computes confidence intervals for each mutation 45 | cluster, of the PhyloCCF distributions of the mutations in that cluster. If 46 | use_boot==TRUE, then confidence intervals are computed using bootstrapping. 
47 | The function then performs a statistical test (Wilcoxon) for every pair of 48 | clusters to determine whether one cluster can be nested within another. 49 | } 50 | -------------------------------------------------------------------------------- /man/clustering_postprocess.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_clustering_functions.R 3 | \name{clustering_postprocess} 4 | \alias{clustering_postprocess} 5 | \title{Postprocessing of clustering function} 6 | \usage{ 7 | clustering_postprocess( 8 | input_list, 9 | sample.results, 10 | new.dir, 11 | input_tsv, 12 | input_seg_tsv_loc = NULL, 13 | min_cluster_size = 5, 14 | driver_cat = "1", 15 | clean_clusters = TRUE, 16 | min_ccf_present = 0.1, 17 | clonal_cutOff = 0.9, 18 | propClonal_threshold = 0.25 19 | ) 20 | } 21 | \arguments{ 22 | \item{input_list}{A list created by the clustering preprocess function 23 | including patient id, regions to use, phylo region list and others.} 24 | 25 | \item{sample.results}{which is the location of the pyclone output table.} 26 | 27 | \item{new.dir}{A character specifying the directory where the pyclone 28 | output should be saved.} 29 | 30 | \item{input_tsv}{the input mutation tsv.} 31 | 32 | \item{input_seg_tsv_loc}{path to a copy number segment tsv file that is used for 33 | across genome copy number plotting. 34 | Default NULL} 35 | 36 | \item{min_cluster_size}{Minimum number of mutations needed for a cluster to be considered. 37 | Default 5} 38 | 39 | \item{driver_cat}{Which categories to use as driver mutations 40 | Default "1"} 41 | 42 | \item{clean_clusters}{should clusters be cleaned and merged? 43 | Default TRUE} 44 | 45 | \item{min_ccf_present}{minimum CCF to consider a mutation as present. 46 | Default 0.1} 47 | 48 | \item{clonal_cutOff}{lower threshold CCF to consider mutations as clonal. 
49 | Default 0.9} 50 | 51 | \item{propClonal_threshold}{Proportion of mutations in cluster which needs to be 52 | considered clonal to merge. 53 | Default 0.25} 54 | } 55 | \description{ 56 | This function takes the input tsv and formats the data to be compatible with 57 | the main CONIPHER clustering function. 58 | } 59 | -------------------------------------------------------------------------------- /man/determine.cluster.nesting.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{determine.cluster.nesting} 4 | \alias{determine.cluster.nesting} 5 | \title{Function to determine cluster nesting structure.} 6 | \usage{ 7 | determine.cluster.nesting( 8 | pyclone, 9 | prefix = "LTX", 10 | min_cluster_size = 5, 11 | pval_cutoff = 0.01, 12 | min_ccf = 0.01, 13 | use_boot = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{pyclone}{An R list object containing information about the PhyloCCF 18 | of each mutation in each tumour region.} 19 | 20 | \item{prefix}{A character string indicating the sample and tumour case prefix} 21 | 22 | \item{min_cluster_size}{Threshold for minimum number of mutations required in 23 | a mutation cluster} 24 | 25 | \item{pval_cutoff}{A p-value significance threshold for testing whether 26 | clusters can be nested. (i.e. a p-value < pval_cutoff is significant)} 27 | 28 | \item{min_ccf}{Minimum threshold for cluster PhyloCCF allowed to be classified 29 | as present} 30 | 31 | \item{use_boot}{Whether to use bootstrapping to determine confidence intervals 32 | for each mutation cluster} 33 | } 34 | \value{ 35 | 'nestedlist', an R list containing information about the nesting 36 | structure of mutation clusters in each region. Elements of the list include: 37 | 'nestedclust', 'ccf_ci_lower', 'ccf_ci_upper', 'ccf_cluster_table', 'cluster_qc'. 
38 | } 39 | \description{ 40 | This function takes as input an R list containing information about PhyloCCF 41 | of each mutation (pyclone) and computes confidence intervals for each mutation 42 | cluster, of the PhyloCCF distributions of the mutations in that cluster. If 43 | use_boot==TRUE, then confidence intervals are computed using bootstrapping. 44 | The function then performs a statistical test (Wilcoxon) for every pair of 45 | clusters to determine whether one cluster can be nested within another. 46 | } 47 | -------------------------------------------------------------------------------- /man/compute_subclonal_expansion_score.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treebuilding_helper_functions.R 3 | \name{compute_subclonal_expansion_score} 4 | \alias{compute_subclonal_expansion_score} 5 | \title{Function to compute subclonal expansion score on a selected alternative tree, 6 | on a tumour sample and whole tumour level. 7 | The subclonal expansion score for each tumour sample is computed as the 8 | maximum CCF of any of the terminal (leaf) nodes present in that tumour sample. 9 | Note, for multi-sample cases, there may exist a sample with no terminal nodes 10 | present, in which case the subclonal expansion score for this sample is set 11 | to 0. 
The tumour level subclonal expansion score is taken as the maximum 12 | subclonal expansion score across tumour samples.} 13 | \usage{ 14 | compute_subclonal_expansion_score(tree_list, tree_id, ccf_table_pyclone_clean) 15 | } 16 | \arguments{ 17 | \item{tree_list}{A list of tree matrices} 18 | 19 | \item{tree_id}{The tree index of the selected alternative tree for which you 20 | want to compute the subclonal expansion score} 21 | 22 | \item{ccf_table_pyclone_clean}{The output mutation PhyloCCF data frame that is 23 | computed as part of CONIPHER tree building} 24 | } 25 | \value{ 26 | subclonal_exp_score_df, a data frame with the subclonal expansion 27 | score computed for each tumour sample (column subclonal_expansion_score), 28 | and across the whole tumour (column subclonal_expansion_score_tumour). 29 | each clone (rows) in each tumour sample (columns) 30 | } 31 | \description{ 32 | Function to compute subclonal expansion score on a selected alternative tree, 33 | on a tumour sample and whole tumour level. 34 | The subclonal expansion score for each tumour sample is computed as the 35 | maximum CCF of any of the terminal (leaf) nodes present in that tumour sample. 36 | Note, for multi-sample cases, there may exist a sample with no terminal nodes 37 | present, in which case the subclonal expansion score for this sample is set 38 | to 0. The tumour level subclonal expansion score is taken as the maximum 39 | subclonal expansion score across tumour samples. 
40 | } 41 | -------------------------------------------------------------------------------- /man/treebuilding_run.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/main_treebuilding_functions.R 3 | \name{treebuilding_run} 4 | \alias{treebuilding_run} 5 | \title{TRACERx tree building function} 6 | \usage{ 7 | treebuilding_run( 8 | sample_input_list, 9 | ccf_buffer = 10, 10 | pval_cutoff = 0.01, 11 | use_boot = TRUE, 12 | merge_clusters = TRUE, 13 | correct_cpn_clusters = TRUE, 14 | adjust_noisy_clusters = FALSE, 15 | adjust_noisy_clusters_prop = 0.05, 16 | min_ccf = 0.01, 17 | min_cluster_size = 5, 18 | run.multi.trees = TRUE, 19 | n_clusters_to_move = 5 20 | ) 21 | } 22 | \arguments{ 23 | \item{sample_input_list}{An R list object. This object contains information 24 | about each mutation in each tumour region sampled, including which cluster 25 | each mutation was assigned to in mutation clustering. 26 | This object can be generated by running the data preprocessing function} 27 | 28 | \item{ccf_buffer}{PhyloCCF buffer allowance for testing tree level issue 29 | (default=10)} 30 | 31 | \item{pval_cutoff}{P-value cut off for testing cluster nesting (default=0.01)} 32 | 33 | \item{use_boot}{Should bootstrapping be used to compute confidence interval? 34 | (default=TRUE)} 35 | 36 | \item{merge_clusters}{Should similar clusters be merged if possible? 37 | (default=TRUE)} 38 | 39 | \item{correct_cpn_clusters}{Should clusters driven by copy number errors be 40 | removed? (default=TRUE)} 41 | 42 | \item{adjust_noisy_clusters}{Should noisy clusters be adjusted? (default=FALSE)} 43 | 44 | \item{adjust_noisy_clusters_prop}{What is the minimum proportion of mutations 45 | required to be present in a region to avoid cluster adjustment? (default=0.05)} 46 | 47 | \item{min_ccf}{What is the minimum CCF threshold to consider a mutation as 48 | present?
(default=0.01)} 49 | 50 | \item{min_cluster_size}{What is the minimum number of mutations required in a 51 | cluster to be included in analysis? (default=5)} 52 | 53 | \item{run.multi.trees}{Should alternative tumour phylogenies be explored? 54 | (default=TRUE)} 55 | 56 | \item{n_clusters_to_move}{When running multiple trees specify the maximum 57 | number of clusters to attempt moving. (default=5)} 58 | } 59 | \value{ 60 | sample_pyclone_tree, an R list object containing output information 61 | from CONIPHER tree building 62 | } 63 | \description{ 64 | This function is the main CONIPHER wrapper function to run phylogenetic 65 | tree building from mutation clustering output. NOTE: it is assumed that 66 | clustering has been carried out prior to running tree building. 67 | } 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CONIPHER 2 | 3 | ## CONIPHER mutation clustering and phylogenetic tree building R package 4 | 5 | This is the official github repository for the R package to perform mutation clustering and phylogenetic tree building using CONIPHER. For details on how to run mutation clustering and phylogenetic tree building consecutively with one wrapper script from the command line, please refer to the github repository [CONIPHER-wrapper](https://github.com/McGranahanLab/CONIPHER-wrapper). For full details of all the inputs into CONIPHER clustering and tree building, refer to our protocol (https://doi.org/10.1038/s41596-023-00913-9). 6 | 7 | ### Software 8 | The current implementation of CONIPHER is written in `R>=3.6.1` and is distributed as an R package. 9 | 10 | --- 11 | ## CONIPHER installation 12 | 13 | 14 | CONIPHER can be installed and run in multiple modes. 15 | 16 | 1) To run mutation clustering + phylogenetic tree building, install the `conipher` conda environment from bioconda using the command below.
This environment contains R version 4.1.3, the CONIPHER R package, PyClone v0.13.1 (which is required for mutation clustering), and all other dependencies. 17 | 18 | ``` 19 | conda create -n conipher -c conda-forge -c bioconda conipher 20 | ``` 21 | 22 | 2) To run phylogenetic tree building only, the conda environment can also be used but is not strictly necessary. Alternatively, just the CONIPHER R package can be installed, provided the R package `devtools >= 2.4.1` is installed. 23 | 24 | To install the CONIPHER R package from an R console, run the following command: 25 | 26 | ``` 27 | library(devtools) 28 | devtools::install_github("McGranahanLab/CONIPHER") 29 | ``` 30 | 31 | --- 32 | ## Quick start 33 | 34 | ### Running clustering + tree building end-to-end 35 | To get started quickly, you can install CONIPHER and perform mutation clustering and phylogenetic tree reconstruction on the example data provided using the following instructions. 36 | 37 | 38 | **Step 1.** 39 | Install the `conipher` conda environment using the instructions above. 40 | 41 | **Step 2.** 42 | Start R and load the 'CONIPHER' and 'tidyverse' R packages using the following command: 43 | ``` 44 | library(CONIPHER) 45 | library(tidyverse) 46 | ``` 47 | 48 | **Step 3.** 49 | Specify a parent output directory where the clustering and tree building results will be saved in individual subfolders, for example using the following command: 50 | ``` 51 | out_dir <- "conipher_results/" 52 | ``` 53 | 54 | **Step 4.** 55 | Specify the location of the input table .tsv file.
For example, the file path of the toy input table provided in this package is specified using the following command: 56 | ``` 57 | input_tsv_loc <- system.file("extdata", "input_table.tsv", package = "CONIPHER", mustWork = TRUE) 58 | ``` 59 | 60 | **Step 5.** 61 | Run clustering + tree building end-to-end (interactively) using the following command: 62 | ``` 63 | conipher_run(case_id = "CRUKTOY001", 64 | prefix = "CRUK", 65 | out_dir = out_dir, 66 | input_tsv_loc = input_tsv_loc) 67 | ``` 68 | 69 | ### Running clustering only 70 | Run steps 1 - 4 as described in "Running clustering + tree building end-to-end" above. 71 | 72 | **Step 5a.** 73 | Run clustering (interactively) using the following command: 74 | 75 | ``` 76 | conipher_clustering(case_id = "CRUKTOY001", 77 | out_dir = out_dir, 78 | input_tsv_loc = input_tsv_loc) 79 | ``` 80 | 81 | ### Running tree building only 82 | Run steps 1 - 4 as described in "Running clustering + tree building end-to-end" above. 83 | 84 | **Step 1b.** 85 | Alternatively, instead of installing the `conipher` conda environment, install the CONIPHER R package only using the instructions described above. 
86 | 87 | 88 | **Step 5b.** 89 | Run tree building (interactively) using the following command: 90 | 91 | ``` 92 | conipher_treebuilding(prefix = "CRUK", 93 | out_dir = out_dir, 94 | input_tsv_loc = tree_input_tsv_loc) 95 | ``` 96 | 97 | 98 | --- 99 | ### Anticipated results 100 | The clustering output will include the following output files (examples are in "conipher_results/Clustering"): 101 | 102 | 103 | The tree building output will include 3 output files (examples are in "conipher_results/Trees"): 104 | - .tree.RDS: an R list object containing tree building output information 105 | - pytree_and_bar.pdf: a plot of the default reconstructed tree and barplot 106 | - pytree_multipletrees.pdf: a plot showing all possible alternative phylogenetic trees found by CONIPHER 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /R/functionsForSimpleClustering.v13.R: -------------------------------------------------------------------------------- 1 | findSimpleClusters <- function(input_list, runType = "WES") { 2 | ###################################################################################################################################################### 3 | ### summary of what's required for function 4 | ### phylo.region.list - output as part of the initial steps of pyclone script (this contains the CCF of each mutation in each region prior to cluster) 5 | ### mut.table - the mutation table for the patient 6 | ###################################################################################################################################################### 7 | new.dir <- input_list$new.dir 8 | phylo.region.list <- input_list$phylo.region.list 9 | mut.table <- input_list$mut.table 10 | seg.mat.phylo <- input_list$seg.mat.phylo 11 | patient <- input_list$patient 12 | 13 | ### define the output 14 | SmallClusters <- list() 15 | 16 | regionsToUse <- names(phylo.region.list) 17 | CCFtable <- c() 18 | 19 | 
sharedMutations <- unlist(phylo.region.list[[1]]$mutation_id) 20 | for (region in regionsToUse) { 21 | sharedMutations <- intersect(sharedMutations, unlist(phylo.region.list[[region]]$mutation_id)) 22 | } 23 | new.phylo.region.list <- lapply(phylo.region.list, function(x) { 24 | x$cov <- (unlist(x$var_counts) + unlist(x$ref_counts)) 25 | x$var_count <- unlist(x$var_counts) 26 | x$VAF <- x$var_count / x$cov * 100 27 | x$presence <- ifelse(x$var_count >= 1 & x$VAF > 1, TRUE, FALSE) 28 | return(x) 29 | }) 30 | for (region in regionsToUse) { 31 | CCFtable <- cbind(CCFtable, unlist(new.phylo.region.list[[region]][sharedMutations, "presence"])) 32 | } 33 | rownames(CCFtable) <- sharedMutations 34 | colnames(CCFtable) <- regionsToUse 35 | BinaryTable <- ifelse(CCFtable, 1, 0) 36 | MutClusters <- apply(BinaryTable, 1, PasteVector, sep = ":") 37 | 38 | ### filter out mutations that are in small clusters 39 | UniqCluster <- unique(MutClusters) 40 | 41 | ### set Names for the UniqClusters 42 | names(UniqCluster) <- as.character(1:length(UniqCluster)) 43 | ### reverse the names as well 44 | UniqClusterNumber <- names(UniqCluster) 45 | names(UniqClusterNumber) <- UniqCluster 46 | MutClusterNum <- UniqClusterNumber[MutClusters] 47 | names(MutClusterNum) <- names(MutClusters) 48 | 49 | ### check whether evidence for multi-modality 50 | ClusterEvidenceForMultiModal <- rep(0, length(UniqCluster)) 51 | names(ClusterEvidenceForMultiModal) <- names(UniqCluster) 52 | 53 | for (cluster in names(UniqCluster)) { 54 | SmallClusters[[cluster]]$clusterID <- cluster 55 | SmallClusters[[cluster]]$clusterBinary <- UniqCluster[cluster] 56 | SmallClusters[[cluster]]$MutationsWithCluster <- names(MutClusterNum[MutClusterNum %in% cluster]) 57 | 58 | ### next, make pyclone tables so we can also run pyclone on these samples. 
59 | ClusterMutationsIDs <- names(MutClusterNum[MutClusterNum %in% cluster]) 60 | specClusterTable <- BinaryTable[ClusterMutationsIDs, , drop = FALSE] 61 | RegionsInCluster <- colnames(specClusterTable)[which(specClusterTable[1, ] == 1)] 62 | ClusterTable <- mut.table[mut.table$mutation_id %in% ClusterMutationsIDs, , drop = FALSE] 63 | SmallClusters[[cluster]]$RegionsInCluster <- RegionsInCluster 64 | 65 | ### check which regions are in the cluster 66 | for (region in RegionsInCluster) { 67 | 68 | region.mut.table <- ClusterTable 69 | region.seg.copy <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region, , drop = FALSE] 70 | pyclone.table <- data.frame(t(sapply(1:nrow(region.mut.table), identify.subclonal.mut.copy.number.ascat, region.mut.table, region.seg.copy, region, patient)), stringsAsFactors = FALSE) 71 | 72 | na.mutations <- pyclone.table[is.na(pyclone.table$minor_cn), , drop = FALSE] 73 | loss.mutations <- pyclone.table[as.numeric(pyclone.table$major_cn) == 0 | (as.numeric(pyclone.table$var_counts) + as.numeric(pyclone.table$ref_counts) == 0), ] 74 | error.muts <- rbind(na.mutations, loss.mutations) 75 | error.muts <- unlist(na.mutations$mutation_id, loss.mutations$mutation_id) 76 | error.muts.table <- paste0(new.dir, "/", region, ".error.muts.tsv") 77 | 78 | if (runType == "sim") { 79 | #CMR for sim data, modify NA CN 80 | print("Running CN updates for simulation data.") 81 | pyclone.table$major_cn[is.na(pyclone.table$minor_cn)] <- 1 82 | pyclone.table$major_raw[is.na(pyclone.table$minor_cn)] <- 1 83 | pyclone.table$minor_raw[is.na(pyclone.table$minor_cn)] <- 1 84 | pyclone.table$fracA[is.na(pyclone.table$minor_cn)] <- 1 85 | pyclone.table$nMaj_A[is.na(pyclone.table$minor_cn)] <- 1 86 | pyclone.table$nMin_A[is.na(pyclone.table$minor_cn)] <- 1 87 | pyclone.table$fracB[is.na(pyclone.table$minor_cn)] <- 0 88 | pyclone.table$nMaj_B[is.na(pyclone.table$minor_cn)] <- 1 89 | pyclone.table$nMin_B[is.na(pyclone.table$minor_cn)] <- 1 90 | 
pyclone.table$minor_cn[is.na(pyclone.table$minor_cn)] <- 1 91 | } 92 | 93 | ### a few sanity checks 94 | pyclone.table <- pyclone.table[!is.na(pyclone.table$minor_cn), ] 95 | pyclone.table <- pyclone.table[!is.na(pyclone.table$ref_counts), ] 96 | pyclone.table <- pyclone.table[!duplicated(pyclone.table$mutation_id), ] 97 | pyclone.table <- pyclone.table[as.numeric(pyclone.table$major_cn) >= 1, ] 98 | pyclone.table <- pyclone.table[!is.na(pyclone.table$minor_cn), ] 99 | 100 | ### now, let's check what the cancer cell fraction estimates are for this region 101 | region.ccf <- phylo.region.list[[region]] 102 | region.ccf <- data.frame(region.ccf, stringsAsFactors = FALSE) 103 | rownames(region.ccf) <- region.ccf$mutation_id 104 | tmp <- intersect(unlist(pyclone.table$mutation_id), unlist(region.ccf$mutation_id)) 105 | rownames(pyclone.table) <- pyclone.table$mutation_id 106 | pyclone.table <- pyclone.table[tmp, , drop = FALSE] 107 | region.ccf <- region.ccf[tmp, , drop = FALSE] 108 | 109 | if (nrow(pyclone.table) > 0) { 110 | tmp <- round(((unlist(pyclone.table$var_counts)) / (unlist(region.ccf$phyloCCF) / 2)) - unlist(pyclone.table$var_counts)) 111 | tmp[is.na(tmp)] <- unlist(pyclone.table$ref_counts[(is.na(tmp))]) 112 | pyclone.table$ref_counts <- tmp 113 | pyclone.table$minor_cn <- 0 114 | pyclone.table$major_cn <- 2 115 | pyclone.table$ref_counts <- apply(cbind(pyclone.table$ref_counts, 2), 1, max) 116 | } 117 | 118 | SmallClusters[[cluster]]$PyCloneTables[[region]]$pyclone.table <- pyclone.table 119 | } 120 | } 121 | return(SmallClusters) 122 | } 123 | 124 | RunPyCloneWithSimpleClusters <- function(clusterName, patientID, SmallClusters, patientDirToUse = new.dir, yamlConfigLoc = template.config.yaml, pyclone.burnin = 1000, pyclone.seed = 1024, run.pyclone = TRUE, pyclone.module = "PyClone/0.12.3-foss-2016b-Python-2.7.12-tkinter") { 125 | PyClone <- "PyClone" 126 | ### give a name to the sample 127 | PyCloneRunName <- paste0(patientID, "_cluster", clusterName) 128 
| ### create a specific subDirectory for this analysis 129 | ClusterDir <- paste0(patientDirToUse, "/", PyCloneRunName, "/") 130 | if (!dir.exists(ClusterDir)) { 131 | dir.create(ClusterDir, recursive = TRUE) 132 | } 133 | 134 | ### make the mutation files for each region 135 | RegionsInClustering <- SmallClusters[[clusterName]]$RegionsInCluster 136 | for (region in RegionsInClustering) { 137 | pyclone.tsv <- paste0(ClusterDir, "/", region, ".tsv") 138 | pyclone.table <- SmallClusters[[clusterName]]$PyCloneTables[[region]]$pyclone.table 139 | if (nrow(pyclone.table) == 1) { 140 | pyclone.out <- matrix(apply(pyclone.table, 2, as.character), nrow = 1) 141 | colnames(pyclone.out) <- colnames(pyclone.table) 142 | } else { 143 | pyclone.out <- apply(pyclone.table, 2, as.character) 144 | } 145 | write.table(pyclone.out, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE, file = pyclone.tsv) 146 | 147 | ### Run PyClone build_mutations_file TSV_FILE where TSV_FILE is the input file you have created. 
148 | pyclone.yaml <- paste0(ClusterDir, "/", region, ".yaml") 149 | 150 | cmd <- paste0(PyClone 151 | , " build_mutations_file" 152 | , " --in_file ", pyclone.tsv 153 | , " --out_file ", pyclone.yaml) 154 | cat('\n') 155 | 156 | if (run.pyclone) { 157 | cat(cmd) 158 | system(cmd) 159 | 160 | ### AH edit change so it works with states separated over multiple lines as well 161 | #exclude state including g_v=AB from yaml file 162 | yaml <- readLines(pyclone.yaml) 163 | rm.indx <- grep("AB", yaml) 164 | if (length(grep("prior_weight", grep("AB", yaml, value = TRUE))) > 0) { 165 | yaml <- yaml[-rm.indx] 166 | } else { 167 | yaml <- yaml[-c(rm.indx-2, rm.indx-1, rm.indx, rm.indx+1)] 168 | } 169 | write.table(yaml, file = pyclone.yaml, col.names = FALSE, row.names = FALSE, quote = FALSE) 170 | } 171 | } 172 | 173 | 174 | pyclone.config.yaml <- paste0(ClusterDir, "/", PyCloneRunName, ".config.yaml") 175 | pyclone.config <- readLines(yamlConfigLoc) 176 | start.samples <- (grep("samples", pyclone.config) + 1) 177 | end.samples <- length(pyclone.config) 178 | 179 | sample.lines <- pyclone.config[start.samples:end.samples] 180 | pyclone.config <- pyclone.config[-c(start.samples:end.samples)] 181 | pyclone.config <- c(pyclone.config, "init_method: connected", "", "samples:") 182 | pyclone.config <- gsub("working.directory.location", ClusterDir, pyclone.config) 183 | 184 | write.table(pyclone.config, file = pyclone.config.yaml, col.names = FALSE, row.names = FALSE, quote = FALSE) 185 | 186 | RegionsInClustering <- SmallClusters[[clusterName]]$RegionsInCluster 187 | for (region in RegionsInClustering) { 188 | sample.config <- gsub("TCGA.barcode", region, sample.lines) 189 | # pyclone.yaml <- paste0(new.dir, "/", region, ".yaml") 190 | pyclone.yaml <- paste0(region, ".yaml") 191 | sample.config <- gsub("mutations.yaml", pyclone.yaml, sample.config) 192 | region.purity <- 0.5 193 | 194 | sample.config <- gsub("value: 1.0", paste0("value: ", signif(region.purity, 3)), 
sample.config) 195 | sample.config <- sample.config[1:8] 196 | 197 | if (run.pyclone) { 198 | write.table(sample.config, file = pyclone.config.yaml, append = TRUE, quote = FALSE, col.names = FALSE, row.names = FALSE) 199 | } 200 | } 201 | 202 | ### next, run pyclone 203 | cmd <- paste0(PyClone 204 | , " run_analysis --config_file " 205 | , pyclone.config.yaml 206 | , " --seed " 207 | , pyclone.seed) 208 | cat('\n') 209 | 210 | if (run.pyclone) { 211 | cat(cmd) 212 | system(cmd) 213 | } 214 | cat('\n') 215 | 216 | sample.results <- paste0(ClusterDir, "/", patientID, '.results.tsv') 217 | cmd <- paste0(PyClone 218 | , " build_table --config_file " 219 | , pyclone.config.yaml 220 | , " --table_type old_style --out_file " 221 | , sample.results 222 | , " --max_clusters ", min(max(1, floor(length(SmallClusters[[clusterName]]$MutationsWithCluster) / 5)), 10) 223 | , " --burnin " 224 | , pyclone.burnin) 225 | cat('\n') 226 | 227 | if(run.pyclone) { 228 | cat(cmd) 229 | system(cmd) 230 | } 231 | cat('\n') 232 | } 233 | 234 | CreateOutputNoPyCloneRun <- function(clusterName, patientID, SmallClusters, patientDirToUse = new.dir) { 235 | 236 | ### give a name to the sample 237 | PyCloneRunName <- paste0(patientID, "_cluster", clusterName) 238 | ### create a specific subDirectory for this analysis 239 | ClusterDir <- paste0(patientDirToUse, "/", PyCloneRunName, "/") 240 | if (!dir.exists(ClusterDir)) { 241 | dir.create(ClusterDir, recursive = TRUE) 242 | } 243 | 244 | sample.results <- paste0(ClusterDir, "/", patientID, '.results.tsv') 245 | tmp <- matrix(0, nrow = length(SmallClusters[[clusterName]]$MutationsWithCluster), ncol = length(SmallClusters[[clusterName]]$RegionsInCluster)) 246 | rownames(tmp) <- SmallClusters[[clusterName]]$MutationsWithCluster 247 | colnames(tmp) <- SmallClusters[[clusterName]]$RegionsInCluster 248 | tmp <- data.frame(tmp, cluster_id = clusterName) 249 | write.table(tmp, sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE, file = 
sample.results)
}

####################################################################################################
######################################### Helper functions #########################################
####################################################################################################

### get modified VAFs (extracted from findSimpleClustersWithSciClone)
###
### Builds a one-row character matrix (chr, pos, ref_count, var_count, vaf)
### for row `rowNumber` of `regionClusterTable`.
###   * chr / pos are the 2nd and 3rd ":"-separated fields of `mutation_id`.
###   * With zero variant reads the depth is taken from `ref_counts`;
###     otherwise it is back-computed as round(var_counts / (phyloCCF / 2)).
###   * When the resulting VAF exceeds 0.75 the row is rewritten so that
###     ref_count equals var_count and vaf is fixed at 0.5.
### The row number is echoed to stdout with cat() on every call.
getModifiedVAF <- function(rowNumber, regionClusterTable) {
  cat(rowNumber)

  record <- regionClusterTable[rowNumber, ]
  n_var <- unlist(record$var_counts)

  if (n_var == 0) {
    depth <- as.numeric(record$ref_counts)
  } else {
    depth <- round(n_var / unlist(record$phyloCCF / 2))
  }

  id_fields <- unlist(strsplit(unlist(record$mutation_id), split = ":"))
  chrom <- id_fields[2]
  position <- as.numeric(id_fields[3])
  vaf <- as.numeric(n_var / depth)

  if (vaf > 0.75) {
    # implausibly high VAF: report var_count for both counts and a VAF of 0.5
    result <- cbind(chrom, position, as.numeric(n_var), as.numeric(n_var), 0.5)
  } else {
    result <- cbind(chrom, position, as.numeric(depth - n_var), as.numeric(n_var), vaf)
  }

  colnames(result) <- c('chr', 'pos', 'ref_count', 'var_count', 'vaf')
  return(result)
}



--------------------------------------------------------------------------------
/R/main_treebuilding_functions.R:
--------------------------------------------------------------------------------
################################################################################
################## MAIN FUNCTIONS ##################
################################################################################

#' Full tree building run function
#'
#' This function takes
#' all the input options and runs the three main steps:
#' preprocess, tree building run and postprocess
#'
#' The output files (allTrees.txt, consensusBranches.txt,
#' consensusRelationships.txt, clusterInfo.txt,
#' cloneProportionsMinErrorTrees.txt, subclonalExpansionScoreMinErrorTrees.txt,
#' treeTable.tsv and, when alternative trees exist,
#' alternativeTreeMetrics.txt) are written under the `generalSave` prefix
#' stored in the tree building result, and only when a tree was built.
#'
#' @param input_tsv_loc Path to the tab-delimited input mutation table.
#' @param out_dir Output directory; created (recursively) if it does not exist.
#' @param prefix Tumour case prefix, passed on to `treebuilding_preprocess()`.
#' @param ccf_buffer,pval_cutoff,use_boot,merge_clusters,correct_cpn_clusters,adjust_noisy_clusters,adjust_noisy_clusters_prop,min_ccf,min_cluster_size
#' Passed unchanged to the arguments of the same name in `treebuilding_run()`.
#' @param multi_trees Passed to `treebuilding_run()` as `run.multi.trees`.
#' @param ... Currently unused.
#' @returns NULL (invisibly); called for its side effect of writing files.
#' @export conipher_treebuilding

conipher_treebuilding <- function(input_tsv_loc,
                                  out_dir,
                                  prefix,
                                  ccf_buffer = 10,
                                  pval_cutoff = 0.01,
                                  use_boot = TRUE,
                                  merge_clusters = TRUE,
                                  correct_cpn_clusters = TRUE,
                                  adjust_noisy_clusters = FALSE,
                                  adjust_noisy_clusters_prop = 0.05,
                                  min_ccf = 0.01,
                                  min_cluster_size = 5,
                                  multi_trees = TRUE,
                                  ...) {
  out_dir <- paste0(out_dir, "/")

  if (!file.exists(out_dir)) {
    if (!dir.create(out_dir, recursive = TRUE)) {
      stop("Unable to create root directory.\n")
    }
  }

  if (!file.exists(input_tsv_loc)) {
    stop("Unable to find input_tsv.\n")
  }
  input_tsv <- read.delim(input_tsv_loc, sep = "\t", stringsAsFactors = FALSE,
                          header = TRUE, fill = TRUE, quote = "")
  if (nrow(input_tsv) == 0) {
    stop('No mutations passed filtering, stopping PyClone phylo clustering')
  }

  #### =========== PREPROCESS INPUT DATA ========= ####

  # preprocess input data into correct form for tree building
  input_list <- treebuilding_preprocess(input_tsv, prefix, out_dir)

  #### =========== RUN TREE BUILDING ========= ####

  # run main CONIPHER tree building function
  sample_pyclone_tree <- treebuilding_run(
    sample_input_list = input_list,
    ccf_buffer = ccf_buffer,
    pval_cutoff = pval_cutoff,
    use_boot = use_boot,
    merge_clusters = merge_clusters,
    correct_cpn_clusters = correct_cpn_clusters,
    adjust_noisy_clusters = adjust_noisy_clusters,
    adjust_noisy_clusters_prop = adjust_noisy_clusters_prop,
    min_ccf = min_ccf,
    min_cluster_size = min_cluster_size,
    run.multi.trees = multi_trees
  )

  #### =========== SAVE OUTPUT ========= ####

  # The guard must cover the whole save section: previously only the cat()
  # call was conditional and the braced block below it always executed.
  if (!is.na(sample_pyclone_tree$graph_pyclone[1])) {
    cat('\nSaving all treebuilding output\n')

    graph <- sample_pyclone_tree$graph_pyclone
    save_prefix <- sample_pyclone_tree$parameters$generalSave
    has_alt_trees <- "alt_trees" %in% names(graph)

    # header-less, unquoted, tab-separated writer shared by the plain files
    write_plain <- function(x, file, append = FALSE) {
      write.table(x, file = file, append = append, row.names = FALSE,
                  col.names = FALSE, quote = FALSE, sep = "\t")
    }

    # wide cluster x region table -> long (clusterID, Region, <value_name>)
    to_long <- function(tbl, value_name) {
      wide <- data.frame(tbl, stringsAsFactors = FALSE)
      wide$clusterID <- rownames(wide)
      tidyr::pivot_longer(wide, !clusterID, names_to = "Region", values_to = value_name)
    }

    ### Plotting tree
    treebuilding_plot(sample_pyclone_tree)

    ### Creating human readable format
    ### writing all trees (only the default tree when no alternatives exist)
    treeFile <- paste0(save_prefix, "allTrees.txt")
    trees <- if (has_alt_trees) graph$alt_trees else list(graph$default_tree)
    write_plain(paste0("### ", length(trees), " trees"), treeFile)
    for (tree_idx in seq_along(trees)) {
      write_plain(paste0("# tree ", tree_idx), treeFile, append = TRUE)
      write_plain(trees[[tree_idx]], treeFile, append = TRUE)
    }

    ### writing consensus branches
    consensusBranchesFile <- paste0(save_prefix, "consensusBranches.txt")
    write_plain(Reduce(rbind, strsplit(graph$consensus_branches, split = ":")),
                consensusBranchesFile)

    ### writing consensus relationships
    consensusRelationshipsFile <- paste0(save_prefix, "consensusRelationships.txt")
    write_plain(Reduce(rbind, strsplit(graph$consensus_relationships, split = ":")),
                consensusRelationshipsFile)

    ### writing cluster information
    clusterInfoFile <- paste0(save_prefix, "clusterInfo.txt")

    clusterInfoDF <- data.frame(clusterID = names(graph$edgelength), stringsAsFactors = FALSE)
    clusterInfoDF$truncal <- clusterInfoDF$clusterID %in% graph$trunk
    clusterInfoDF$treeClust <- clusterInfoDF$clusterID %in% unique(c(graph$default_tree))
    clusterInfoDF$cpnRemClust <- clusterInfoDF$clusterID %in% sample_pyclone_tree$cpn_removed_clusters
    clusterInfoDF$nMuts <- as.numeric(graph$edgelength)

    per_region_tables <- list(
      meanCCF = sample_pyclone_tree$nested_pyclone$ccf_cluster_table,
      CCF_CI_low = sample_pyclone_tree$nested_pyclone$ccf_ci_lower,
      CCF_CI_high = sample_pyclone_tree$nested_pyclone$ccf_ci_upper,
      clonality = sample_pyclone_tree$clonality_out$clonality_table_corrected,
      clone_proportions_default = sample_pyclone_tree$clone_proportion_out$clone_proportion_table
    )
    for (value_name in names(per_region_tables)) {
      # the first join introduces Region; later joins match on both keys
      join_keys <- intersect(c("clusterID", "Region"), colnames(clusterInfoDF))
      clusterInfoDF <- dplyr::full_join(
        clusterInfoDF,
        to_long(per_region_tables[[value_name]], value_name),
        by = join_keys
      )
    }

    clusterInfoDF <- dplyr::rename(clusterInfoDF, SAMPLE = Region)
    write.table(clusterInfoDF, file = clusterInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")

    ### writing clone proportion information
    cloneproportionInfoFile <- paste0(save_prefix, "cloneProportionsMinErrorTrees.txt")

    cp_min_sce_trees <- sample_pyclone_tree$clone_proportion_out$clone_proportions_min_sce_trees
    cloneproportionInfoList <- lapply(seq_along(cp_min_sce_trees), function(i) {
      cp_table <- data.frame(cp_min_sce_trees[[i]], stringsAsFactors = FALSE)
      cp_table$clusterID <- rownames(cp_table)
      cp_table$treeID <- names(cp_min_sce_trees)[i]
      cp_table
    })
    cloneproportionInfoDF <- do.call(rbind, cloneproportionInfoList)
    write.table(cloneproportionInfoDF, file = cloneproportionInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")

    ### writing subclonal expansion score data
    subcloneExpansionInfoFile <- paste0(save_prefix, "subclonalExpansionScoreMinErrorTrees.txt")

    ses_min_sce_trees <- sample_pyclone_tree$subclonal_expansion_score_out$subclonal_exp_score_min_sce_trees
    subcloneExpansionInfoList <- lapply(seq_along(ses_min_sce_trees), function(i) {
      ses_table <- data.frame(ses_min_sce_trees[[i]], stringsAsFactors = FALSE)
      ses_table$treeID <- names(ses_min_sce_trees)[i]
      ses_table
    })
    subcloneExpansionInfoDF <- do.call(rbind, subcloneExpansionInfoList)
    write.table(subcloneExpansionInfoDF, file = subcloneExpansionInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")

    ### writing output muttable - similar to input
    input_tsv <- dplyr::rename(input_tsv, originalCLUSTER = CLUSTER)
    input_tsv$treeCLUSTER <- input_tsv$originalCLUSTER
    merged <- sample_pyclone_tree$merged_clusters
    if (!is.null(nrow(merged))) {
      # Relabel by exact match. The former gsub() replaced substrings, so
      # merging cluster "1" also rewrote "10", "11", ...
      tree_cluster <- as.character(input_tsv$treeCLUSTER)
      for (i in seq_len(nrow(merged))) {
        tree_cluster[tree_cluster %in% as.character(merged[i, 1])] <- as.character(merged[i, 3])
      }
      input_tsv$treeCLUSTER <- tree_cluster
    }
    write.table(input_tsv, file = paste0(save_prefix, "treeTable.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")

    ### writing alternative trees summary metrics
    # Only meaningful when alternative trees were produced; without them the
    # ranking code below would operate on empty input.
    if (has_alt_trees) {
      altTreeInfoFile <- paste0(save_prefix, "alternativeTreeMetrics.txt")

      altTreeInfoDF <- data.frame(treeID = seq_along(graph$alt_trees), stringsAsFactors = FALSE)

      altTreeInfoDF$sum_condition_error <- sapply(altTreeInfoDF$treeID, function(i) graph$alt_trees_sum_condition_error[i])
      # rank 1 = lowest sum condition error
      altTreeInfoDF$SCE_ranking <- match(altTreeInfoDF$sum_condition_error, sort(unique(altTreeInfoDF$sum_condition_error)))
      altTreeInfoDF$lowest_SCE <- ifelse(altTreeInfoDF$sum_condition_error == min(altTreeInfoDF$sum_condition_error), 'Lowest SCE tree', 'Alternative tree')

      altTreeInfoDF$edge_probability_score <- sapply(altTreeInfoDF$treeID, function(i) graph$alt_trees_edge_probability[i])
      # rank 1 = highest edge probability
      altTreeInfoDF$edge_probability_ranking <- match(altTreeInfoDF$edge_probability_score, rev(sort(unique(altTreeInfoDF$edge_probability_score))))
      altTreeInfoDF$highest_edge_probability <- ifelse(altTreeInfoDF$edge_probability_score == max(altTreeInfoDF$edge_probability_score), 'Highest edge probability tree', 'Alternative tree')
      write.table(altTreeInfoDF, file = altTreeInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")
    }
  }

  invisible(NULL)
}


#' Input data preprocessing function
#'
#' This function takes the input tsv and formats the data to be compatible with
#' the main CONIPHER tree building function. NOTE: it is assumed that
#' clustering has been carried out prior to running tree building.
#' @param input_table A data frame of the input mutation table in the correct
#' format. For more information on the input table format, please see our
#' tree building protocol.
#' @param prefix A tumour case and sample prefix, e.g. 'CRUK'.
#' @param out_dir A file path to the desired output directory
#' @returns A list with elements `pyclone` and `pyclone_absolute` (one row per
#' mutation, eleven columns per region plus `PycloneCluster` and
#' `CleanCluster`; CCF columns are filled from `CCF_PHYLO` and `CCF_OBS`
#' respectively), `sampleID`, `prefix`, `generalSave` and `merged_clusters`.
#' @export treebuilding_preprocess

treebuilding_preprocess <- function(input_table, prefix, out_dir) {
  cat('\n Preprocessing input data \n')

  # check if the correct columns are included; the numeric columns converted
  # or read further down are validated as well so the failure is explicit
  required_cols <- c("CASE_ID", "SAMPLE", "CHR", "POS", "REF", "ALT", "CLUSTER",
                     "CCF_PHYLO", "CCF_OBS", "MUT_COPY", "COPY_NUMBER_A", "COPY_NUMBER_B")
  numeric_cols <- c("POS", "CCF_PHYLO", "CCF_OBS", "MUT_COPY", "COPY_NUMBER_A",
                    "COPY_NUMBER_B", "REF_COUNT", "VAR_COUNT", "DEPTH", "ACF", "PLOIDY")
  missing_cols <- setdiff(union(required_cols, numeric_cols), colnames(input_table))
  if (length(missing_cols) > 0) {
    stop("The following required columns are missing from input_tsv: ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # add mutation id column
  input_table$mutation_id <- paste(input_table$CASE_ID,
                                   input_table$CHR,
                                   input_table$POS,
                                   input_table$REF,
                                   input_table$ALT,
                                   sep = ":")

  # Raise an error if prefix is not specified, or incorrectly specified
  if (is.null(prefix)) {
    stop("No prefix specified. Please indicate a prefix for the current tumour case.")
  } else if (!grepl(prefix, input_table$CASE_ID[1])) {
    stop("Incorrect prefix specified. Please input the correct prefix for the current tumour case.")
  }

  # make sure all columns are the correct class
  for (col in numeric_cols) {
    input_table[[col]] <- as.numeric(input_table[[col]])
  }

  # make sure all mutations have a cluster assigned: a mutation with a cluster
  # in at least one row gets that cluster on all of its rows
  tmp <- input_table[!is.na(input_table$CLUSTER), ]
  removed_mutations <- c()
  for (mutation_id in unique(input_table$mutation_id)) {
    known_cluster <- unique(tmp[tmp$mutation_id %in% mutation_id, ]$CLUSTER)
    if (length(known_cluster) == 0) {
      removed_mutations <- c(removed_mutations, mutation_id)
      cat('\nwarning:')
      cat('', paste(mutation_id), 'does not have a CLUSTER assigned, will remove')
      next
    }
    input_table[input_table$mutation_id %in% mutation_id, ]$CLUSTER <- known_cluster
  }

  if (length(removed_mutations) >= 1) {
    cat(paste0('\nwarning: ', length(removed_mutations), ' mutations removed due to lack of cluster assignment'))
  }

  input_table <- input_table[!is.na(input_table$CLUSTER), , drop = FALSE]

  mutation_ids <- unique(input_table$mutation_id)
  nr_unique_muts <- length(mutation_ids)
  regions <- unique(input_table$SAMPLE)
  nr_regions <- length(regions)

  # Build one mutation x (region-specific columns) table. `ccf_col` names the
  # input column that fills the four CCF-derived columns of every region.
  build_ccf_table <- function(ccf_col) {
    suffixes <- c("_cov", "_var_count", "_VAF", "_PhyloCCF", "_PycloneCCF",
                  "_Pyclone_0.05", "_Pyclone_0.95", "_cpn.copies",
                  "_mut.cpn.num", "_nAraw", "_nBraw")
    tab <- data.frame(matrix(data = NA,
                             nrow = nr_unique_muts,
                             ncol = length(suffixes) * nr_regions + 2),
                      stringsAsFactors = FALSE)
    colnames(tab) <- c(unlist(lapply(suffixes, function(s) paste0(regions, s))),
                       "PycloneCluster",
                       "CleanCluster")
    rownames(tab) <- mutation_ids

    # one input row = one (mutation, region) observation; later rows for the
    # same cell overwrite earlier ones
    for (row_idx in seq_len(nrow(input_table))) {
      obs <- input_table[row_idx, , drop = FALSE]
      mut <- obs$mutation_id
      region <- obs$SAMPLE

      tab[mut, paste0(region, "_cov")] <- obs$DEPTH
      tab[mut, paste0(region, "_var_count")] <- obs$VAR_COUNT
      tab[mut, paste0(region, "_VAF")] <- obs$VAR_COUNT / obs$DEPTH

      tab[mut, paste0(region, "_PhyloCCF")] <- obs[[ccf_col]]
      tab[mut, paste0(region, "_PycloneCCF")] <- obs[[ccf_col]]
      tab[mut, paste0(region, "_Pyclone_0.05")] <- obs[[ccf_col]]
      tab[mut, paste0(region, "_Pyclone_0.95")] <- obs[[ccf_col]]
      tab[mut, paste0(region, "_cpn.copies")] <- 1

      tab[mut, paste0(region, "_mut.cpn.num")] <- obs$MUT_COPY
      tab[mut, paste0(region, "_nAraw")] <- obs$COPY_NUMBER_A
      tab[mut, paste0(region, "_nBraw")] <- obs$COPY_NUMBER_B
      tab[mut, "PycloneCluster"] <- obs$CLUSTER
      tab[mut, "CleanCluster"] <- 1
    }
    tab
  }

  # Now create list object for input to treebuilding. Both tables come from
  # the same builder, so the depth / variant-count / VAF columns of
  # pyclone_absolute are populated too (they used to be written into the
  # pyclone table by mistake and stayed NA here).
  input_list <- list()
  input_list$pyclone <- build_ccf_table("CCF_PHYLO")
  # non-subclonal-copy-number-corrected version
  input_list$pyclone_absolute <- build_ccf_table("CCF_OBS")
  input_list$sampleID <- input_table$CASE_ID[1]
  input_list$prefix <- prefix
  input_list$generalSave <- out_dir
  input_list$merged_clusters <- NA

  if (!file.exists(input_list$generalSave)) {
    dir.create(input_list$generalSave, showWarnings = TRUE, recursive = TRUE, mode = "0775")
  }

  return(input_list)
}


#' TRACERx tree building function
#'
#' This function is the main CONIPHER wrapper function to run phylogenetic
#' tree building from mutation clustering
output. NOTE: it is assumed that 382 | #' clustering has been carried out prior to running tree building. 383 | #' @param sample_input_list An R list object. This object contains information 384 | #' about each mutation in each tumour region sampled, including which cluster 385 | #' each mutation was assigned to in mutation clustering. 386 | #' This object can be generated by running the data preprocessing function 387 | #' @param ccf_buffer PhyloCCF buffer allowance for testing tree level issue 388 | #' (default=10) 389 | #' @param pval_cutoff P-value cut off for testing cluster nesting (default=0.01) 390 | #' @param use_boot Should bootstrapping be used to compute confidence interval? 391 | #' (default=TRUE) 392 | #' @param merge_clusters Should similar clusters be merged if possible? 393 | #' (default=TRUE) 394 | #' @param correct_cpn_clusters Should clusters driven by copy number errors be 395 | #' removed? (default=TRUE) 396 | #' @param adjust_noisy_clusters Should noisy clusters be adjusted? (default=TRUE) 397 | #' @param adjust_noisy_clusters_prop What is the minimum proportion of mutations 398 | #' required to be present in a region to avoid cluster adjustment? (default=0.05) 399 | #' @param min_ccf What is the minimum CCF threshold to consider a mutation as 400 | #' present? (default=0.01) 401 | #' @param min_cluster_size What is the minimum number of mutations required in a 402 | #' cluster to be included in analysis? (default=5) 403 | #' @param run.multi.trees Should alternative tumour phylogenies be explored? 404 | #' (default=TRUE) 405 | #' @param n_clusters_to_move When running multiple trees specify the maximum 406 | #' number of clusters to attempt moving. 
(default=5) 407 | #' @returns sample_pyclone_tree, an R list object containing output information 408 | #' from CONIPHER tree building 409 | #' @export treebuilding_run 410 | 411 | treebuilding_run <- function(sample_input_list 412 | , ccf_buffer = 10 413 | , pval_cutoff = 0.01 414 | , use_boot = TRUE 415 | , merge_clusters = TRUE 416 | , correct_cpn_clusters = TRUE 417 | , adjust_noisy_clusters = FALSE 418 | , adjust_noisy_clusters_prop = 0.05 419 | , min_ccf = 0.01 420 | , min_cluster_size = 5 421 | , run.multi.trees = TRUE 422 | , n_clusters_to_move = 5 423 | ) { 424 | suppressPackageStartupMessages(require(igraph)) 425 | suppressPackageStartupMessages(require(mapplots)) 426 | 427 | cat('\n\nStarting Tree Building') 428 | #first keep track of parameters used for this 429 | input_parameter_list <- list() 430 | input_parameter_list$sampleID <- sampleID <- sample_input_list$sampleID 431 | input_parameter_list$prefix <- prefix <- sample_input_list$prefix 432 | input_parameter_list$generalSave <- generalSave <- sample_input_list$generalSave 433 | input_parameter_list$ccf_buffer <- ccf_buffer 434 | input_parameter_list$pval_cutoff <- pval_cutoff 435 | input_parameter_list$use_boot <- use_boot 436 | input_parameter_list$merge_clusters <- merge_clusters 437 | input_parameter_list$correct_cpn_clusters <- correct_cpn_clusters 438 | input_parameter_list$adjust_noisy_clusters <- adjust_noisy_clusters 439 | input_parameter_list$adjust_noisy_clusters_prop <- adjust_noisy_clusters_prop 440 | input_parameter_list$min_ccf <- min_ccf 441 | input_parameter_list$min_cluster_size <- min_cluster_size 442 | 443 | cat('\nFollowing parameters used for tree building:\n') 444 | print(do.call(rbind,input_parameter_list)) 445 | 446 | # prepare the output 447 | output_list <- list() 448 | output_list$ccf_table_pyclone <- sample_input_list$pyclone 449 | output_list$ccf_table_absolute <- sample_input_list$pyclone_absolute 450 | output_list$ccf_table_pyclone_clean <- sample_input_list$pyclone 
451 | output_list$ccf_table_absolute_clean <- sample_input_list$pyclone_absolute 452 | 453 | output_list$merged_clusters <- sample_input_list$merged_clusters 454 | 455 | output_list$noisy_clusters_adjusted <- NA 456 | output_list$cpn_removed_clusters <- NA 457 | output_list$tree_removed_clusters <- NA 458 | 459 | test_pyclone <- sample_input_list$pyclone 460 | test_pyclone_absolute <- sample_input_list$pyclone_absolute 461 | 462 | if(adjust_noisy_clusters) 463 | { 464 | cat('\nAdjusting noisy clusters\n') 465 | pyclone_adj <- clean.noisy.clusters(pyclone = test_pyclone,max.absent.prop = adjust_noisy_clusters_prop) 466 | test_pyclone <- pyclone_adj$corrected_pyclone 467 | 468 | output_list$ccf_table_pyclone_clean <- test_pyclone 469 | 470 | if(!is.na(pyclone_adj$corrected_cluster)[1]) 471 | { 472 | tmp <- pyclone_adj$corrected_cluster 473 | colnames(tmp) <- c('region','cluster') 474 | output_list$noisy_clusters_adjusted <- tmp 475 | cat('\nThe following clusters were adjusted:\n') 476 | print(tmp) 477 | } 478 | 479 | pyclone_adj_absolute <- clean.noisy.clusters(test_pyclone_absolute,max.absent.prop = adjust_noisy_clusters_prop) 480 | test_pyclone_absolute <- pyclone_adj_absolute$corrected_pyclone 481 | 482 | output_list$ccf_table_absolute_clean <- test_pyclone_absolute 483 | } 484 | 485 | # make sure you only use clean clusters 486 | test_pyclone <- test_pyclone[test_pyclone[, "CleanCluster"] %in% 1, -ncol(test_pyclone)] 487 | test_pyclone_absolute <- test_pyclone_absolute[test_pyclone_absolute[, "CleanCluster"] %in% 1, -ncol(test_pyclone_absolute)] 488 | 489 | if (nrow(test_pyclone) < min_cluster_size) { stop('too few mutations to run tree building') } 490 | if (sort(table(test_pyclone[, 'PycloneCluster']), decreasing = T)[1] < min_cluster_size) { stop('too few mutations to run tree building') } 491 | 492 | clusters_with_min_cluster_sizeations <- table(test_pyclone[,'PycloneCluster'])[table(test_pyclone[,'PycloneCluster'])>=min_cluster_size] 493 | 494 | 495 | 
cat('\n\n\nDetermining nesting of clusters\n') 496 | nested_pyclone <- determine.cluster.nesting(pyclone = test_pyclone 497 | , prefix = prefix 498 | , min_cluster_size = max(c(2, min_cluster_size)) 499 | , pval_cutoff = pval_cutoff 500 | , use_boot =use_boot 501 | , min_ccf = min_ccf 502 | ) 503 | 504 | # NM additional step, taking clonality into account when nesting [05/04/2022] 505 | directedGraph_input_full <- matrix(0, 0, 2) 506 | colsums <- colSums(nested_pyclone$nestedclust) 507 | rowsums <- rowSums(nested_pyclone$nestedclust) 508 | trunk_cluster <- names(colsums[which(colsums == max(colsums))]) 509 | if (length(trunk_cluster) > 1) { 510 | trunk_cluster <- names(sort(rowMeans(nested_pyclone$ccf_cluster_table[trunk_cluster,, drop = F]), decreasing = T))[1] 511 | } 512 | 513 | clonality_table <- clonality.function(pyclone = test_pyclone 514 | ,trunk =trunk_cluster 515 | ,prefix = prefix 516 | , min_cluster_size = max(c(2, min_cluster_size)) 517 | ,pval_cutoff = pval_cutoff 518 | ,use_boot =use_boot ) 519 | 520 | 521 | nested_pyclone <- correct.clonality.nesting(nestedlist = nested_pyclone 522 | , pyclone = test_pyclone 523 | , clonality_table = clonality_table 524 | , pval_cutoff = pval_cutoff 525 | , min_cluster_size = min_cluster_size 526 | , min_ccf = min_ccf 527 | , prefix = prefix 528 | ) 529 | # finish additional step NM [05/04/2022] 530 | cat('\nThe following nesting identified:\n') 531 | print(nested_pyclone$nestedclust[,order(colSums(nested_pyclone$nestedclust),decreasing=T)]) 532 | cat('\n') 533 | 534 | nested_pyclone_absolute <- determine.cluster.nesting(pyclone = test_pyclone_absolute 535 | , prefix = prefix 536 | , min_cluster_size = max(c(2, min_cluster_size)) 537 | , pval_cutoff = pval_cutoff 538 | , use_boot =use_boot 539 | , min_ccf = min_ccf 540 | ) 541 | 542 | 543 | # we only merge clusters if we're not using absolute. 
544 | if(merge_clusters %in% TRUE & 545 | nrow(nested_pyclone$nestedclust) > 1 & 546 | nrow(nested_pyclone_absolute$nestedclust) > 1) 547 | { 548 | cat('\nChecking for cluster merging\n') 549 | out <- merge.clusters.full(test_pyclone = test_pyclone 550 | ,test_pyclone_absolute = test_pyclone_absolute 551 | ,nested_pyclone = nested_pyclone 552 | ,nested_pyclone_absolute = nested_pyclone_absolute 553 | ,prefix = prefix 554 | ,min_ccf = min_ccf 555 | ,p_value_cut = pval_cutoff 556 | ,min_cluster_size = min_cluster_size 557 | ,use_boot = use_boot 558 | ) 559 | 560 | nested_pyclone <- out$nested_pyclone 561 | test_pyclone <- out$test_pyclone 562 | 563 | if(!is.na(out$mergedclusters[1]) & !is.na(output_list$merged_clusters)) 564 | { 565 | output_list$merged_clusters <- rbind(out$mergedclusters,output_list$merged_clusters) 566 | cat('\nThe following clusters were merged:\n') 567 | print(out$mergedclusters) 568 | cat('\n') 569 | } 570 | else if (!is.na(out$mergedclusters[1])) { 571 | output_list$merged_clusters <- out$mergedclusters 572 | cat('\nThe following clusters were merged:\n') 573 | print(out$mergedclusters) 574 | cat('\n') 575 | } 576 | output_list$ccf_table_pyclone_clean <- test_pyclone 577 | } 578 | 579 | 580 | # remove the genomically clustered clones which may be driven by undetected subclonal copy number 581 | if(correct_cpn_clusters) 582 | { 583 | cat('\nChecking for chromosome clustered clusters') 584 | # determine clonal/trunk cluster using same method as for tree building 585 | directedGraph_input_full <- matrix(0, 0, 2) 586 | colsums <- colSums(nested_pyclone$nestedclust) 587 | rowsums <- rowSums(nested_pyclone$nestedclust) 588 | trunk_cluster <- names(colsums[which(colsums == max(colsums))]) 589 | if (length(trunk_cluster) > 1) { 590 | trunk_cluster <- names(sort(rowMeans(nested_pyclone$ccf_cluster_table[trunk_cluster,, drop = F]), decreasing = T))[1] 591 | } 592 | #remove the genomically clustered clones which may be driven by undetected subclonal copy 
number 593 | new_test_pyclone <- remove_clustered_clones(test_pyclone, 594 | clonal_cluster = trunk_cluster, 595 | p_value_cut = 0.01, 596 | clustering_estimate_cut = 2 ) 597 | 598 | if(identical(new_test_pyclone,test_pyclone)) 599 | { 600 | cat('\nNo clusters removed\n') 601 | } 602 | if(!identical(new_test_pyclone,test_pyclone)) 603 | { 604 | # 605 | cat('\nThe following clusters removed due to genomic clustering:\n') 606 | # sort out if copy number cluster removed. 607 | cpn_removed_clusters <- names(nested_pyclone$cluster_qc[,'ClusterName'])[!nested_pyclone$cluster_qc[,'ClusterName']%in%unique(new_test_pyclone[,'PycloneCluster'])] 608 | nested_pyclone$cluster_qc[nested_pyclone$cluster_qc[,'ClusterName']%in%cpn_removed_clusters,'CopyNumRemove'] <- 1 609 | output_list$cpn_removed_clusters <- cpn_removed_clusters 610 | 611 | 612 | 613 | cat(cpn_removed_clusters) 614 | cat('\n') 615 | } 616 | 617 | test_pyclone <- new_test_pyclone 618 | output_list$ccf_table_pyclone_clean <- test_pyclone 619 | } 620 | 621 | cat('\nBuilding trees...') 622 | # check whether this means all the clusters are removed. 
623 | graph_pyclone <- grow.trees( nestedlist = nested_pyclone 624 | , pyclone = test_pyclone 625 | , min_cluster_size = min_cluster_size 626 | , force_trunk = TRUE 627 | , ccf_buffer = ccf_buffer 628 | ) 629 | 630 | output_list$tree_removed_clusters <- graph_pyclone$Clusters_with_issues 631 | 632 | cat('\n------------------\n') 633 | cat('\nTree identified\n') 634 | 635 | clonality_table <- clonality.function(pyclone = test_pyclone 636 | ,trunk = graph_pyclone$trunk 637 | ,prefix = prefix 638 | , min_cluster_size = max(c(2, min_cluster_size)) 639 | ,pval_cutoff = pval_cutoff 640 | ,use_boot = use_boot ) 641 | 642 | clonality_out <- correct.clonality.table(clonality_table = clonality_table 643 | , graph_pyclone = graph_pyclone 644 | , trunk_cluster = graph_pyclone$trunk) #TODO still may need correcting for one region cases #EC 20210509 645 | 646 | ### AH edit set CCF in ccf cluster table as well as upper and lower CIs to 0 if cluster is defined as absent 647 | for (region in colnames(clonality_table)) { 648 | tmp.absentClust <- rownames(clonality_table)[clonality_table[,region] == "absent"] 649 | if (any(nested_pyclone$ccf_cluster_table[tmp.absentClust, region] != 0)) { 650 | print("Absent clusters with meanCCF > 0. 
Resetting to 0 in ccf cluster table") 651 | nested_pyclone$ccf_cluster_table[tmp.absentClust, region] <- 0 652 | nested_pyclone$ccf_ci_lower[tmp.absentClust, region] <- 0 653 | nested_pyclone$ccf_ci_upper[tmp.absentClust, region] <- 0 654 | } 655 | } 656 | ### AH edit done 657 | 658 | if (run.multi.trees) { 659 | cat('\nExploring presence of multiple alternate trees') 660 | multi.trees <- grow.multi.trees(nestedlist = nested_pyclone 661 | ,pyclone = test_pyclone 662 | ,graph_pyclone = graph_pyclone 663 | ,ccf_buffer = ccf_buffer 664 | ,n_clusters_to_move = n_clusters_to_move 665 | ) 666 | 667 | } else { 668 | multi.trees <- NULL 669 | } 670 | 671 | graph_pyclone$alt_trees <- multi.trees$good.trees 672 | 673 | 674 | if(length(multi.trees)==0) 675 | { 676 | graph_pyclone$consensus_branches <- paste(graph_pyclone$default_tree[,1],graph_pyclone$default_tree[,2],sep=":") 677 | graph_pyclone$nested_clust <- nested_pyclone[[1]] 678 | 679 | # list all clone - clone relationships which are common to all alternative trees 680 | # This captures some tree info for clones where the exact tree position is uncertain 681 | graph_pyclone$consensus_relationships <- extract_consensus_relationships( list(graph_pyclone$default_tree ) ) 682 | graph_pyclone$alt_trees <- list(graph_pyclone$default_tree) 683 | } 684 | 685 | if(length(multi.trees)!=0) 686 | { 687 | # check whether any repeats in alt_trees [this can happen due to level issue] 688 | tree_vector <- c() 689 | for (i in 1:length(graph_pyclone$alt_trees)) 690 | { 691 | tree_vector <- c(tree_vector,PasteVector(sort(paste(graph_pyclone$alt_trees[[i]][,1],graph_pyclone$alt_trees[[i]][,2],sep=":")),sep=",")) 692 | 693 | } 694 | 695 | alt_trees <- list() 696 | trees_to_use <- c(1:length(graph_pyclone$alt_trees))[!duplicated(tree_vector)] 697 | for (i in 1:length(trees_to_use)) 698 | { 699 | alt_trees[[i]] <- graph_pyclone$alt_trees[[trees_to_use[i]]] 700 | } 701 | 702 | graph_pyclone$alt_trees <- alt_trees 703 | 704 | 
graph_pyclone$consensus_branches <- multi.trees$consensus.branches 705 | graph_pyclone$nested_clust <- multi.trees$consensus.nestedclust 706 | 707 | # list all clone - clone relationships which are common to all alternative trees 708 | # This captures some tree info for clones where the exact tree position is uncertain 709 | graph_pyclone$consensus_relationships <- extract_consensus_relationships( alt_trees ) 710 | 711 | } 712 | 713 | 714 | ### Compute alternative tree metrics: 715 | # 1) Compute sum condition error SCE for each alternative tree + find trees with lowest SCE: 716 | 717 | cat('\n\nComputing sum condition error for each alternative tree') 718 | graph_pyclone$alt_trees_sum_condition_error <- compute_sum_condition_error(tree_list = graph_pyclone$alt_trees, ccf_cluster_table = nested_pyclone$ccf_cluster_table, trunk = trunk_cluster) 719 | graph_pyclone$min_sce_trees <- names(which(graph_pyclone$alt_trees_sum_condition_error == min(graph_pyclone$alt_trees_sum_condition_error))) 720 | cat('\nTrees with minimum sum condition error: ', graph_pyclone$min_sce_trees, '\n') 721 | 722 | # 2) Compute edge probability for each alternative tree + find trees with highest edge probability: 723 | 724 | cat('\n\nComputing edge probability score for each alternative tree\n') 725 | graph_pyclone$alt_trees_edge_probability <- compute_tree_edge_probability(tree_list = graph_pyclone$alt_trees, edgelength = graph_pyclone$edgelength, trunk = trunk_cluster) 726 | graph_pyclone$max_edge_probability_trees <- names(which(graph_pyclone$alt_trees_edge_probability == max(graph_pyclone$alt_trees_edge_probability))) 727 | cat('\nTrees with maximum edge probability: ', graph_pyclone$max_edge_probability_trees, '\n') 728 | 729 | 730 | ### Compute clone proportions output: 731 | # 1) Compute subclone proportions from default tree: 732 | cat('\n\nComputing clone proportions from default tree\n') 733 | clone_proportion_table <- compute_subclone_proportions(tree_list = 
graph_pyclone$alt_trees, 734 | ccf_cluster_table = nested_pyclone$ccf_cluster_table, 735 | clonality_table = clonality_out$clonality_table_corrected, 736 | trunk = trunk_cluster, 737 | force_clonal_100 = TRUE, 738 | tree_id = 1) 739 | 740 | # 2) Compute subclone proportions from lowest error tree: 741 | cat('\n\nComputing clone proportions from tree with lowest sum condition error\n') 742 | clone_proportions_min_sce_trees <- lapply(graph_pyclone$min_sce_trees, function(i){ 743 | compute_subclone_proportions(tree_list = graph_pyclone$alt_trees, 744 | ccf_cluster_table = nested_pyclone$ccf_cluster_table, 745 | clonality_table = clonality_out$clonality_table_corrected, 746 | trunk = trunk_cluster, 747 | force_clonal_100 = TRUE, 748 | tree_id = as.numeric(i)) 749 | }) 750 | names(clone_proportions_min_sce_trees) <- graph_pyclone$min_sce_trees 751 | clone_proportion_out <- list(clone_proportion_table = clone_proportion_table, clone_proportions_min_sce_trees = clone_proportions_min_sce_trees) 752 | 753 | 754 | ### Compute subclonal expansion score: 755 | # 1) Compute subclonal expansion score from default tree: 756 | cat('\n\nComputing subclonal expansion score from default tree\n') 757 | subclonal_exp_score <- compute_subclonal_expansion_score(tree_list = graph_pyclone$alt_trees, 758 | tree_id = 1, 759 | ccf_table_pyclone_clean = output_list$ccf_table_pyclone_clean) 760 | 761 | # 2) Compute subclonal expansion score from lowest error tree: 762 | cat('\n\nComputing subclonal expansion score from tree with lowest sum condition error\n') 763 | subclonal_exp_score_min_sce_trees <- lapply(graph_pyclone$min_sce_trees, function(i){ 764 | compute_subclonal_expansion_score(tree_list = graph_pyclone$alt_trees, 765 | tree_id = as.numeric(i), 766 | ccf_table_pyclone_clean = output_list$ccf_table_pyclone_clean) 767 | 768 | }) 769 | names(subclonal_exp_score_min_sce_trees) <- graph_pyclone$min_sce_trees 770 | subclonal_exp_score_out <- list(subclonal_exp_score = subclonal_exp_score, 
subclonal_exp_score_min_sce_trees = subclonal_exp_score_min_sce_trees) 771 | 772 | 773 | ### Finally, save all tree output 774 | # Save sample ID 775 | graph_pyclone$sampleID <- sampleID 776 | graph_pyclone$long_sampleID <- trx_rename.fn(sampleID, trialID = prefix) 777 | 778 | # Saving all output to list 779 | output_list$graph_pyclone <- graph_pyclone 780 | output_list$parameters <- input_parameter_list 781 | output_list$nested_pyclone <- nested_pyclone 782 | output_list$clonality_table <- clonality_table 783 | output_list$clonality_out <- clonality_out 784 | output_list$clone_proportion_out <- clone_proportion_out 785 | output_list$subclonal_expansion_score_out <- subclonal_exp_score_out 786 | 787 | #let's save the output_list 788 | output_rds <- file.path(generalSave, paste0(sampleID, ".tree.RDS")) 789 | saveRDS(output_list, file = output_rds) 790 | 791 | return(output_list) 792 | } 793 | 794 | 795 | #' TRACERx tree plotting function 796 | #' 797 | #' This function is the CONIPHER function to plot the inferred phylogenetic tree. 
#' @param sample_pyclone_tree A list containing all information about the
#' tree inferred using function tracerx.tree.building(). The elements read
#' here are parameters (sampleID, generalSave), nested_pyclone,
#' graph_pyclone, clonality_out$clonality_table_corrected,
#' ccf_table_pyclone_clean, ccf_table_pyclone and cpn_removed_clusters.
#' @returns Called for its side effects: writes pytree_and_bar.pdf and, when
#' graph_pyclone$alt_trees is not empty, pytree_multipletrees.pdf into the
#' directory parameters$generalSave. The value is that of the final dev.off()
#' call, or invisible NULL when there are no alternative trees to plot.
#' @importFrom grDevices "colorRampPalette" "dev.off" "pdf"
#' @importFrom graphics "abline" "axis" "barplot" "layout" "legend"
#' "par" "plot.new" "segments" "text" "title"
#' @importFrom igraph "get.edgelist"
#' @export treebuilding_plot

treebuilding_plot <- function(sample_pyclone_tree) {
  cat('\n Plotting inferred phylogenetic tree \n')

  # legend.pie() (mapplots) and beeswarm() (beeswarm) are called unqualified
  # below, so both packages have to be attached. require() merely returns
  # FALSE when a package is missing; turn that into an immediate error rather
  # than failing later with a half-written PDF.
  for (pkg in c("mapplots", "beeswarm")) {
    if (!require(pkg, character.only = TRUE)) {
      stop("Package '", pkg, "' is required for tree plotting but could not be loaded.",
           call. = FALSE)
    }
  }
  # NOTE(review): no gplots function is called directly in this function; the
  # soft require() is kept in case a helper such as color.tree() relies on it.
  suppressPackageStartupMessages(require(gplots))

  sampleID    <- sample_pyclone_tree$parameters$sampleID
  generalSave <- sample_pyclone_tree$parameters$generalSave

  nested_pyclone  <- sample_pyclone_tree$nested_pyclone
  pyclone_tree    <- sample_pyclone_tree$graph_pyclone
  clonality_table <- sample_pyclone_tree$clonality_out$clonality_table_corrected
  test_pyclone    <- sample_pyclone_tree$ccf_table_pyclone_clean

  # NA is the sentinel for "no copy-number driven clusters were removed".
  cpn_removed_clusters <- sample_pyclone_tree$cpn_removed_clusters
  if (length(cpn_removed_clusters) == 0) {
    cpn_removed_clusters <- NA
  }
  has_cpn_removed <- !is.na(cpn_removed_clusters[1])

  ccf_table  <- nested_pyclone$ccf_cluster_table
  n_clusters <- nrow(ccf_table)

  # Make sure an open PDF device is closed even if plotting stops with an
  # error; the flag is cleared right before each regular dev.off().
  pdf_open <- FALSE
  on.exit(if (pdf_open) dev.off(), add = TRUE)

  ### Plot trees -- AUTOMATIC
  pdfname <- file.path(generalSave, 'pytree_and_bar.pdf')

  # The page grows by one unit for every 25 clusters.
  size_factor <- ceiling(n_clusters / 25)

  pdf(pdfname, width = 22 * size_factor, height = 12 * size_factor)
  pdf_open <- TRUE

  # Left column: title panel, one bar panel per cluster and a spare panel;
  # the two right-hand columns are merged into one large panel for the tree.
  par(mar = c(0, 0, 0, 0))
  layout(cbind(seq_len(n_clusters + 2),
               rep(n_clusters + 3, n_clusters + 2),
               rep(n_clusters + 3, n_clusters + 2)))

  # Region labels: strip "<first 8 characters of the first column name>_".
  region_prefix <- paste0(substr(colnames(ccf_table)[1], 1, 8), "_")
  ccf_short <- ccf_table
  main <- paste(substr(colnames(ccf_short)[1], 1, 8), '\ Phylo CCF values', sep = '')
  colnames(ccf_short) <- gsub(region_prefix, "", colnames(ccf_short))

  plot.new()
  par(mar = c(2, 2, 2, 2))
  title(main, cex = 2)

  colours.to.use <- color.tree(seq_len(n_clusters))
  phylo_ccf_cols <- grep('PhyloCCF', colnames(test_pyclone))

  par(mar = c(0.1, 5, 0.1, 2), lend = 1)

  for (j in seq_len(n_clusters)) {
    cluster_name <- rownames(ccf_table)[j]

    # Black bar border where the cluster is clonal in a region, grey otherwise.
    border.col <- ifelse(clonality_table[j, ] == 'clonal', 'black', 'grey')
    bp <- barplot(ccf_table[j, ],
                  las = 1,
                  col = colours.to.use[j],
                  border = border.col,
                  names.arg = "",
                  ylab = paste("Cl", cluster_name, sep = " "),
                  ylim = c(0, 115),
                  yaxt = 'n',
                  cex.axis = 1.25)
    axis(side = 2, at = c(0, 50, 100), labels = c(0, 50, 100), las = 1)

    # Region names are only drawn underneath the last cluster panel.
    if (j == n_clusters) {
      axis(side = 1,
           at = bp,
           labels = gsub(region_prefix, "", colnames(ccf_table)),
           tick = FALSE,
           cex.axis = 1.25)
    }
    abline(h = 0)
    abline(h = 100, lty = 'dashed')
    abline(h = 50, lty = 'dashed')

    cluster_muts <- test_pyclone[, 'PycloneCluster'] %in% cluster_name
    for (bar in seq_along(bp)) {
      # Per-mutation PhyloCCF values (scaled by 100) drawn on top of the bar.
      beeswarm(test_pyclone[cluster_muts, phylo_ccf_cols[bar]] * 100,
               at = bp[bar],
               add = TRUE,
               corralWidth = 0.5,
               method = 'swarm',
               corral = 'wrap',
               pch = 21,
               col = colours.to.use[j],
               bg = 'grey')
      segments(x0 = bp[bar], x1 = bp[bar],
               y0 = nested_pyclone$ccf_ci_lower[j, bar],
               y1 = nested_pyclone$ccf_ci_upper[j, bar],
               lwd = 5)
      text(x = bp[bar], y = 25, labels = ccf_table[j, bar], cex = 1.5)
    }
  }

  plot.new()
  par(mar = c(2.1, 2.1, 4.1, 38), xpd = TRUE)

  g <- graph.data.frame(pyclone_tree$default_tree, directed = FALSE)
  indx <- V(g)$name
  vcol <- setNames(color.tree(pyclone_tree$edgelength),
                   names(pyclone_tree$edgelength))[indx]

  l <- layout_as_tree(g, root = pyclone_tree$trunk)

  # Every node is a pie with one equally sized slice per region.
  pie.size <- ncol(ccf_table)
  node.shape <- setNames(rep('pie', length(vcol)), names(vcol))
  pie.slices <- lapply(seq_along(vcol), function(x) rep(1, pie.size))
  empty.col <- '#bdbdbd'

  # Node size is log2(edgelength), scaled so that the largest node has size 30.
  node_size_factor <- log2(max(pyclone_tree$edgelength)) / 30
  node.size <- (log2(pyclone_tree$edgelength) / node_size_factor)[names(node.shape)]

  pie.colors <- ccf_table[match(names(vcol), rownames(ccf_table)), , drop = FALSE]
  pie.colors <- ifelse(pie.colors >= 90, 99, pie.colors)
  pie.colors <- ifelse(pie.colors < 10 & pie.colors >= 1, 10, pie.colors)
  pie.colors <- lapply(seq_len(nrow(pie.colors)), function(x) {
    if (!all(is.na(pie.colors[x, ]))) {
      ccf_values <- pie.colors[x, ]
      # 100-step ramp from white to the cluster colour, indexed by the CCF.
      shades <- colorRampPalette(c("white", vcol[rownames(pie.colors)[x]]))(100)
      slice_cols <- ccf_values
      positive <- ccf_values > 0
      # Clamp the index to >= 1: a CCF in (0, 1) would otherwise truncate to
      # index 0, be dropped, and make the replacement shorter than the target.
      slice_cols[positive] <- shades[pmax(1, floor(ccf_values[positive]))]
      slice_cols[ccf_values == 0] <- empty.col
      slice_cols
    }
  })

  g_dir <- graph.data.frame(pyclone_tree$default_tree, directed = TRUE)
  edges <- get.edgelist(g_dir)
  # Baseline: every edge grey; consensus edges are recoloured black below.
  ecol <- setNames(rep('#bdbdbd', nrow(edges)), edges[, 2])
  ewidth <- rep(1, length(ecol))

  is_consensus <- paste(edges[, 1], edges[, 2], sep = ":") %in% pyclone_tree$consensus_branches
  ecol[is_consensus] <- '#000000'
  ewidth[is_consensus] <- 150

  # NOTE(review): 'edge.size' does not appear among igraph's documented edge
  # plotting parameters ('edge.width' does), so ewidth probably has no effect
  # here. Left unchanged because a width of 150 would alter the figure --
  # confirm the intended value before switching to 'edge.width'.
  plot(g,
       layout = l,
       main = sampleID,
       vertex.color = vcol[indx],
       vertex.frame.color = vcol[indx],
       vertex.shape = node.shape,
       vertex.lwd = 5,
       vertex.pie.lwd = 3,
       vertex.pie = pie.slices,
       vertex.pie.color = lapply(pie.colors, rev),
       vertex.size = node.size,
       edge.color = ecol,
       edge.size = ewidth,
       vertex.label.cex = 2,
       vertex.label.pos = 2,
       vertex.label.dist = 0,
       vertex.label.family = 'Helvetica',
       vertex.label.font = 2,
       vertex.label.color = 'black')

  legend.pie(1, 1,
             labels = gsub(paste0(substr(colnames(ccf_short)[1], 1, 8), "_"), "", colnames(ccf_short)),
             radius = 0.2,
             bty = "n",
             col = '#bdbdbd',
             cex = 1.25,
             label.dist = 0.8,
             border = 'white')

  # Clusters that are nodes of the default tree vs. clusters that are not.
  snv_clusters <- sort(pyclone_tree$edgelength[indx], decreasing = TRUE)

  snv_clusters_removed <- pyclone_tree$edgelength
  snv_clusters_removed <- sort(snv_clusters_removed[!names(snv_clusters_removed) %in% indx],
                               decreasing = TRUE)
  if (has_cpn_removed) {
    # NOTE(review): `$` partial-matches list names, so if the list has no
    # element called exactly 'ccf_table_pyclone' this may resolve to
    # 'ccf_table_pyclone_clean' -- confirm which table is intended.
    cpn_cluster_sizes <- table(sample_pyclone_tree$ccf_table_pyclone[, 'PycloneCluster'])[cpn_removed_clusters]
    snv_clusters_removed <- c(snv_clusters_removed, cpn_cluster_sizes)
  }

  # inset controls how far the legend sits from the x and y margins.
  included_legend <- legend('topright',
                            inset = c(-0.3, 0),
                            legend = paste0(names(snv_clusters), ' (', snv_clusters, ' SNVs)'),
                            col = vcol[names(snv_clusters)],
                            pch = 19,
                            title = 'Clusters included:',
                            bty = 'n')
  if (length(snv_clusters_removed) > 0) {
    if (has_cpn_removed) {
      legend(x = included_legend$rect$left,
             y = 0,
             inset = c(-0.3, 0),
             legend = paste0(names(cpn_cluster_sizes), ' (', cpn_cluster_sizes, ' SNVs)'),
             col = vcol[names(cpn_cluster_sizes)],
             pch = 19,
             title = 'Copy# clusters removed:',
             bty = 'n')
    }
    legend('bottomright',
           inset = c(-0.3, 0),
           legend = paste0(names(snv_clusters_removed), ' (', snv_clusters_removed, ' SNVs)'),
           col = vcol[names(snv_clusters_removed)],
           pch = 19,
           title = 'Clusters removed:',
           bty = 'n')
  }

  pdf_open <- FALSE
  dev.off()

  # next, plot all the possible trees
  trees.to.plot <- pyclone_tree$alt_trees
  if (length(trees.to.plot) != 0) {
    pdfname <- file.path(generalSave, 'pytree_multipletrees.pdf')

    # The page grows by one unit for every 50 trees.
    mult.factor <- ceiling(length(trees.to.plot) / 50)

    pdf(pdfname, width = 12 * mult.factor, height = 12 * mult.factor)
    pdf_open <- TRUE

    # Choose the panel grid: among the divisor pairs of nr.to.use, take the
    # one with the smallest rows + columns.
    nr.trees <- length(trees.to.plot)
    columnnum <- 1
    rownum <- nr.trees / columnnum
    nr.to.use <- if (nr.trees <= 50) nr.trees else signif(nr.trees + 5, 2)

    for (i in seq_len(nr.to.use)) {
      if ((nr.to.use %% i) == 0 && (i + (nr.to.use / i)) < (columnnum + rownum)) {
        columnnum <- i
        rownum <- nr.to.use / columnnum
      }
    }

    # Still a single column: fall back to two columns.
    if (columnnum == 1) {
      columnnum <- 2
      rownum <- ceiling(rownum / 2)
    }

    par(mfrow = c(rownum, columnnum), xpd = TRUE, mar = c(1, 1, 1, 1))

    empty.col <- 'gray85'
    pie.size <- ncol(ccf_table)
    node_size_factor <- log2(max(pyclone_tree$edgelength)) / 30
    all_node_sizes <- log2(pyclone_tree$edgelength) / node_size_factor

    for (i in seq_len(nr.trees)) {
      auto_tree <- trees.to.plot[[i]]
      g <- graph.data.frame(auto_tree, directed = FALSE)
      indx <- V(g)$name
      vcol <- setNames(color.tree(pyclone_tree$edgelength),
                       names(pyclone_tree$edgelength))[indx]

      l <- layout_as_tree(g, root = pyclone_tree$trunk)

      node.shape <- setNames(rep('pie', length(vcol)), names(vcol))
      pie.slices <- lapply(seq_along(vcol), function(x) rep(1, pie.size))
      node.size <- all_node_sizes[names(node.shape)]

      # Slice colour: the cluster colour where the CCF is above 0 in a
      # region, the neutral colour where it is 0.
      pie.colors <- ccf_table[match(names(vcol), rownames(ccf_table)), , drop = FALSE]
      pie.colors <- lapply(seq_len(nrow(pie.colors)), function(x) {
        if (!all(is.na(pie.colors[x, ]))) {
          ccf_values <- pie.colors[x, ]
          slice_cols <- ccf_values
          slice_cols[ccf_values > 0] <- vcol[rownames(pie.colors)[x]]
          slice_cols[ccf_values == 0] <- empty.col
          slice_cols
        }
      })

      g_dir <- graph.data.frame(auto_tree, directed = TRUE)
      edges <- get.edgelist(g_dir)
      # Baseline: every edge grey and thin; consensus edges become black and
      # slightly wider.
      ecol <- setNames(rep('#bdbdbd', nrow(edges)), edges[, 2])
      ewidth <- rep(1, length(ecol))

      is_consensus <- paste(edges[, 1], edges[, 2], sep = ":") %in% pyclone_tree$consensus_branches
      ecol[is_consensus] <- '#000000'
      ewidth[is_consensus] <- 2

      # NOTE(review): 'arrow.size', 'arrow.width' and 'arrow.mode' lack the
      # 'edge.' prefix igraph documents for arrow parameters; kept unchanged
      # -- confirm whether 'edge.arrow.*' was intended.
      plot(g,
           main = sampleID,
           layout = l,
           vertex.color = vcol[indx],
           vertex.shape = node.shape,
           vertex.pie = pie.slices,
           vertex.pie.color = pie.colors,
           vertex.pie.lty = 0,
           vertex.size = node.size,
           edge.width = ewidth,
           edge.color = ecol,
           arrow.size = 0,
           arrow.width = 0,
           arrow.mode = 0)
    }

    pdf_open <- FALSE
    dev.off()
  }
}




--------------------------------------------------------------------------------
/R/main_clustering_functions.R:
--------------------------------------------------------------------------------
#' Full clustering run function
#'
#' This function takes all the input options and runs the three main steps:
#' preprocess, clustering run and postprocess
#' @param case_id Identifier of the tumour case; printed at start-up.
#' @param out_dir Output directory. It is created (recursively) if it does not
#' exist and is handed to the three steps as new.dir with a trailing "/".
#' @param input_tsv_loc Path to the tab-separated input mutation table. The
#' file must exist and contain at least one row.
#' @param input_seg_tsv_loc Passed on to clustering_postprocess(). Default NULL.
#' @param subclonal_copy_correction A logical value that specifies whether
#' subclonal copy number correction should be performed (passed on to
#' clustering_preprocess()). Default is set to TRUE
#' @param only_truncal_subclonal_copy_correction A logical value that specifies
#' whether only truncal subclonal copy number correction should be used
#' (passed on to clustering_preprocess()). Default is set to TRUE
#' @param pyclone_yaml_loc Path to a PyClone template yaml file. If NULL, the
#' template.config.yaml shipped in the extdata folder of the package is used.
#' @param min_cluster_size Passed on to clustering_postprocess(). Default 5.
#' @param multiple_test_correction A logical value that specifies whether
#' multiple testing correction should be applied for the copy number
#' correcting mutations (passed on to clustering_preprocess()).
#' Default is set to TRUE
#' @param clean_clusters Passed on to clustering_postprocess(). Default TRUE.
#' @param clonal_cutOff Passed on to clustering_postprocess(). Default 0.9.
#' @param propClonal_threshold Passed on to clustering_postprocess().
#' Default 0.25.
#' @param fix_absentCCFs A logical value that specifies whether CCF of absent
#' mutations should be set to zero (passed on to clustering_preprocess()).
#' Default is set to TRUE
#' @param driver_filter A single comma-separated string; it is split on ","
#' and passed on to clustering_postprocess() as driver_cat.
#' Default "1A,1,2A".
#' @param burn_in Passed on to clustering_run(). Default 1000.
#' @param seed Passed on to clustering_run() as pyclone_seed. Default 1024.
#' @param nProcs Passed on to clustering_run(). Default 1.
#' @param ... Accepted but not used in the body of this function.
#' @returns The value of the final clustering_postprocess() call.
#' NOTE(review): this was previously documented as NULL -- confirm.
#' @export conipher_clustering

conipher_clustering <- function(case_id,
                                out_dir,
                                input_tsv_loc,
                                input_seg_tsv_loc = NULL,
                                subclonal_copy_correction = TRUE,
                                only_truncal_subclonal_copy_correction = TRUE,
                                pyclone_yaml_loc = NULL,
                                min_cluster_size = 5,
                                multiple_test_correction = TRUE,
                                clean_clusters = TRUE,
                                clonal_cutOff = 0.9,
                                propClonal_threshold = 0.25,
                                fix_absentCCFs = TRUE,
                                driver_filter = "1A,1,2A",
                                burn_in = 1000,
                                seed = 1024,
                                nProcs = 1,
                                ...) {
  patient <- case_id
  new.dir <- paste0(out_dir, "/")
  driver_cat <- unlist(strsplit(driver_filter, split = ","))

  # Resolve the PyClone template: the bundled default unless a path is given.
  if (is.null(pyclone_yaml_loc)) {
    template.config.yaml <- system.file("extdata", "template.config.yaml", package = "CONIPHER", mustWork = TRUE)
  } else if (file.exists(pyclone_yaml_loc)) {
    template.config.yaml <- pyclone_yaml_loc
  } else {
    stop("PyClone template yaml file does not exist. \nPlease specify full path to file or set parameter to NULL to use default.\n")
  }

  cat("\nCONIPHER clustering analysis of the following tumour case:\n")
  print(patient)
  cat("\n")

  # Test and create the directory through out_dir rather than new.dir:
  # file.exists() is documented not to accept a trailing separator for
  # directories on Windows, which made an existing output directory look
  # missing and then fail in dir.create().
  if (!dir.exists(out_dir)) {
    if (!dir.create(out_dir, recursive = TRUE)) {
      stop("Unable to create root directory.\n")
    }
  }

  if (!file.exists(input_tsv_loc)) {
    stop("Unable to find input_tsv.\n")
  }
  input_tsv <- read.delim(input_tsv_loc, sep = "\t", stringsAsFactors = FALSE, header = TRUE, fill = TRUE, quote = "")
  if (nrow(input_tsv) == 0) {
    stop('No mutations passed filtering, stopping PyClone phylo clustering')
  }

  ### fix issue with sample names including '-'
  input_tsv$SAMPLE <- gsub("-", "\\.", input_tsv$SAMPLE)

  input_list <- clustering_preprocess(input_tsv,
                                      new.dir = new.dir,
                                      subclonal_copy_correction = subclonal_copy_correction,
                                      multiple_test_correction = multiple_test_correction,
                                      only_truncal_subclonal_copy_correction = only_truncal_subclonal_copy_correction,
                                      fix_absentCCFs = fix_absentCCFs)
  sample.results <- clustering_run(input_list,
                                   nProcs = nProcs,
                                   new.dir = new.dir,
                                   burn_in = burn_in,
                                   pyclone_seed = seed,
                                   template.config.yaml = template.config.yaml)
  clustering_postprocess(input_list,
                         sample.results,
                         new.dir = new.dir,
                         input_tsv = input_tsv,
                         input_seg_tsv_loc = input_seg_tsv_loc,
                         min_cluster_size = min_cluster_size,
                         driver_cat = driver_cat,
                         clean_clusters = clean_clusters,
                         min_ccf_present = 0.1,
                         clonal_cutOff = clonal_cutOff,
                         propClonal_threshold = propClonal_threshold)
}


#' Input data preprocessing function
#'
#' This function takes the input tsv and formats the data to be compatible with
#' the main CONIPHER clustering function.
#' @param input_table A data frame of the input mutation table in the correct
#' format. For more information on the input table format, please see our
#' tree building protocol.
#' @param new.dir A character specifying the directory where the pyclone
#' output should be saved.
#' @param subclonal_copy_correction A logical value that specifies whether subclonal
#' copy number correction should be performed.
#' Default is set to TRUE
#' @param multiple_test_correction A logical value that specifies whether multiple
#' testing correction should be applied for the copy number correcting mutations.
#' Default is set to TRUE
#' @param only_truncal_subclonal_copy_correction A logical value that specifies
#' whether only truncal subclonal copy number correction should be used.
#' Default is set to TRUE
#' @param fix_absentCCFs A logical value that specifies whether CCF
#' of absent mutations should be set to zero.
87 | #' Default is set to TRUE 88 | #' @returns list including patient, regions.to.use, mut.table, seg.mat.copy 89 | #' seg.mat.phylo, phylo.region.list, simpleClusterList 90 | #' @importFrom dplyr "%>%" 91 | #' @export clustering_preprocess 92 | 93 | clustering_preprocess <- function(input_table, new.dir, subclonal_copy_correction = TRUE, multiple_test_correction = TRUE, only_truncal_subclonal_copy_correction = TRUE, fix_absentCCFs = TRUE) { 94 | gender <- "male" 95 | patient <- unique(input_table$CASE_ID) 96 | regions.to.use <- unique(input_table$SAMPLE) 97 | input_table[, "key"] <- paste(paste0("chr", input_table[, "CHR"]), 98 | input_table[, "POS"], 99 | input_table[, "REF"], 100 | input_table[, "ALT"], 101 | sep = ":") 102 | 103 | if (sum(grepl("MUT_TYPE", colnames(input_table))) == 1) { 104 | mut.table <- data.frame(key = input_table[, "key"], 105 | chr = input_table[, "CHR"], 106 | start = input_table[, "POS"], 107 | stop = input_table[, "POS"], 108 | ref = input_table[, "REF"], 109 | var = input_table[, "ALT"], 110 | is_SNV = TRUE, 111 | Use.For.Plots = (input_table[, "MUT_TYPE"] == "SNV"), 112 | Use.For.Plots.Indel = (input_table[, "MUT_TYPE"] == "INDEL"), 113 | stringsAsFactors = FALSE) 114 | 115 | mut.table <- mut.table %>% 116 | dplyr::full_join(input_table %>% 117 | dplyr::select(key, SAMPLE, REF_COUNT, VAR_COUNT, DEPTH) %>% 118 | dplyr::rename(cov = DEPTH, ref_count = REF_COUNT, var_count = VAR_COUNT) %>% 119 | dplyr::mutate(VAF = var_count / cov * 100) %>% 120 | tidyr::pivot_wider(names_from = SAMPLE, values_from = c(cov, ref_count, var_count, VAF), names_glue = "{SAMPLE}.{.value}"), 121 | by = "key") %>% 122 | dplyr::rowwise() %>% 123 | dplyr::mutate(is_blacklist = FALSE, 124 | max.var_count = max(dplyr::c_across(ends_with(".var_count"))), 125 | max.VAF = max(dplyr::c_across(ends_with(".VAF")))) 126 | 127 | mut.table <- mut.table %>% 128 | dplyr::mutate(Gene.refGene = "") %>% 129 | dplyr::mutate(driverCategory = NA) 130 | 131 | mut.table <- 
data.frame(mut.table, stringsAsFactors = FALSE) 132 | } else { 133 | mut.table <- data.frame(key = input_table[, "key"], 134 | chr = input_table[, "CHR"], 135 | start = input_table[, "POS"], 136 | stop = input_table[, "POS"], 137 | ref = input_table[, "REF"], 138 | var = input_table[, "ALT"], 139 | is_SNV = TRUE, 140 | stringsAsFactors = FALSE) 141 | 142 | mut.table <- mut.table %>% 143 | dplyr::full_join(input_table %>% 144 | dplyr::select(key, SAMPLE, REF_COUNT, VAR_COUNT, DEPTH) %>% 145 | dplyr::rename(cov = DEPTH, ref_count = REF_COUNT, var_count = VAR_COUNT) %>% 146 | dplyr::mutate(VAF = var_count / cov * 100) %>% 147 | tidyr::pivot_wider(names_from = SAMPLE, values_from = c(cov, ref_count, var_count, VAF), names_glue = "{SAMPLE}.{.value}"), 148 | by = "key") %>% 149 | dplyr::rowwise() %>% 150 | dplyr::mutate(is_blacklist = FALSE, 151 | max.var_count = max(dplyr::c_across(ends_with(".var_count"))), 152 | max.VAF = max(dplyr::c_across(ends_with(".VAF")))) 153 | 154 | mut.table <- mut.table %>% 155 | dplyr::mutate(Gene.refGene = "") %>% 156 | dplyr::mutate(Use.For.Plots = TRUE, Use.For.Plots.Indel = FALSE) %>% 157 | dplyr::mutate(driverCategory = NA) 158 | 159 | mut.table <- data.frame(mut.table, stringsAsFactors = FALSE) 160 | } 161 | 162 | seg.mat.copy <- data.frame(SampleID = input_table[,"SAMPLE"], 163 | chr = input_table[, "CHR"], 164 | startpos = input_table[, "POS"], 165 | endpos = input_table[, "POS"], 166 | n.het = 1, 167 | cnTotal = round(input_table[, "COPY_NUMBER_A"] + input_table[, "COPY_NUMBER_B"]), 168 | nMajor = round(input_table[, "COPY_NUMBER_A"]), 169 | nMinor = round(input_table[, "COPY_NUMBER_B"]), 170 | Ploidy = input_table[, "PLOIDY"], 171 | ACF = input_table[, "ACF"], 172 | COPY_NUMBER_A = input_table[, "COPY_NUMBER_A"], 173 | COPY_NUMBER_B = input_table[, "COPY_NUMBER_B"], 174 | stringsAsFactors = FALSE) 175 | 176 | mut.table <- mut.table[mut.table$chr %in% 1:22,, drop = FALSE] 177 | seg.mat.copy <- seg.mat.copy[seg.mat.copy$chr %in% 
1:22,, drop = FALSE] 178 | 179 | mut.table$mutation_id <- paste(patient, mut.table$chr, mut.table$start, mut.table$ref, sep = ":") 180 | mut.table <- mut.table[order(mut.table$max.VAF, decreasing = TRUE),] 181 | mut.table <- mut.table[!duplicated(mut.table$mutation_id),, drop = FALSE] 182 | rownames(mut.table) <- mut.table$mutation_id 183 | 184 | max.vaf <- c() 185 | max.var.count <- c() 186 | 187 | for (region in regions.to.use) { 188 | if (paste(region, ".VAF", sep = "") %in% colnames(mut.table)) { 189 | max.vaf <- cbind(max.vaf, mut.table[, paste(region, ".VAF", sep = "")]) 190 | max.var.count <- cbind(max.var.count, mut.table[, paste(region, ".var_count", sep = "")]) 191 | } 192 | } 193 | 194 | mut.table$max.VAF <- apply(max.vaf, 1, max) 195 | mut.table$max.var_count <- apply(max.var.count, 1, max) 196 | 197 | mut.table <- mut.table[!is.na(mut.table$max.VAF),, drop = FALSE] 198 | 199 | mut.table <- mut.table[mut.table$Use.For.Plots | mut.table$Use.For.Plots.Indel, ] 200 | # mut.table <- mut.table[!((mut.table$Use.For.Plots & mut.table$max.var_count < 10) | is.na(mut.table$max.var_count)), ] 201 | 202 | 203 | 204 | seg.mat.phylo <- create.subclonal.copy.number(seg.mat.copy = seg.mat.copy,min.subclonal = 0.01) 205 | 206 | if (subclonal_copy_correction %in% "FALSE") { 207 | cat('\nRunning without subclonal copy number mode') 208 | seg.mat.phylo$COPY_NUMBER_A <- seg.mat.phylo$nMajor 209 | seg.mat.phylo$COPY_NUMBER_B <- seg.mat.phylo$nMinor 210 | seg.mat.phylo$fracA <- 1 211 | seg.mat.phylo$fracB <- 0 212 | seg.mat.phylo$fracC <- NA 213 | seg.mat.phylo$fracD <- NA 214 | seg.mat.phylo$nMaj_A <- seg.mat.phylo$nMajor 215 | seg.mat.phylo$nMin_A <- seg.mat.phylo$nMinor 216 | seg.mat.phylo$nMaj_B <- seg.mat.phylo$nMajor 217 | seg.mat.phylo$nMin_B <- seg.mat.phylo$nMinor 218 | seg.mat.phylo$nMaj_C <- NA 219 | seg.mat.phylo$nMin_C <- NA 220 | seg.mat.phylo$nMaj_D <- NA 221 | seg.mat.phylo$nMin_D <- NA 222 | 223 | seg.mat.copy$COPY_NUMBER_A <- seg.mat.phylo$nMajor 224 | 
seg.mat.copy$COPY_NUMBER_B <- seg.mat.phylo$nMinor 225 | } 226 | 227 | patient.list <- list() 228 | phylo.region.list <- list() 229 | cellularity <- rep(NA, length(regions.to.use)) 230 | names(cellularity) <- regions.to.use 231 | 232 | # determine the indelCorrectionFactor 233 | if (length(regions.to.use) > 1) { 234 | indelCorrectionFactor <- determineIndelCorrectionFactor(patient = patient, mut.table = mut.table, regions.to.use = regions.to.use, seg.mat.phylo = seg.mat.phylo, seg.mat.copy = seg.mat.copy) 235 | indelMuts <- rownames(mut.table[mut.table$Use.For.Plots.Indel %in% TRUE,, drop = FALSE]) 236 | } 237 | 238 | for (region in regions.to.use) { 239 | region.mut.table <- mut.table 240 | region.seg.copy <- seg.mat.copy[seg.mat.copy$SampleID %in% region,, drop = FALSE] 241 | region.seg.phylo <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,, drop = FALSE] 242 | pyclone.table <- data.frame(t(sapply(1:nrow(region.mut.table),identify.subclonal.mut.copy.number.ascat,region.mut.table,region.seg.phylo,region,patient)), stringsAsFactors = FALSE) 243 | pyclone.table <- pyclone.table[!is.na(pyclone.table$minor_cn),] 244 | pyclone.table <- pyclone.table[!is.na(pyclone.table$ref_counts),] 245 | pyclone.table <- pyclone.table[!duplicated(pyclone.table$mutation_id),] 246 | 247 | sample.purity <- region.seg.copy$ACF[1] 248 | 249 | pyclone.table <- pyclone.table[(as.numeric(pyclone.table$ref_counts) + as.numeric(pyclone.table$var_counts)) >= 1,, drop = FALSE] 250 | region.earlyLate <- earlyORlateGender(region = region, complete.mutation.table = pyclone.table, purity = sample.purity, gender = gender) 251 | if (multiple_test_correction %in% FALSE) { 252 | region.phyloCCF <- calculate_phylo_ccf(region = region, complete.mutation.table = pyclone.table, purity = sample.purity, order.by.pos = TRUE, gender = gender) 253 | } 254 | 255 | if (multiple_test_correction %in% TRUE) { 256 | region.phyloCCF <- calculate_phylo_ccf_withBH(region = region, complete.mutation.table = 
pyclone.table, purity = sample.purity, order.by.pos = TRUE, gender = gender) 257 | } 258 | 259 | if (length(regions.to.use) > 1) { 260 | tmp <- intersect(rownames(region.phyloCCF), indelMuts) 261 | # let's look at indels specifically 262 | region.phyloCCF[tmp, 'phyloCCF'] <- region.phyloCCF[tmp, 'phyloCCF'] * as.numeric(indelCorrectionFactor[region]) 263 | region.phyloCCF[tmp, 'mutCopyNum'] <- region.phyloCCF[tmp, 'mutCopyNum'] * as.numeric(indelCorrectionFactor[region]) 264 | } 265 | 266 | phylo.region.list[[region]] <- region.phyloCCF 267 | cellularity[region] <- sample.purity 268 | } 269 | 270 | # perform additional copy number correction where needed 271 | muts_to_consider <- unlist(phylo.region.list[[1]]$mutation_id) 272 | for (mut in muts_to_consider) { 273 | mut_table <- c() 274 | for (region in names(phylo.region.list)) { 275 | mut_table <- rbind(mut_table, phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut,, drop = FALSE]) 276 | } 277 | 278 | if (max(mut_table$phyloCCF) > 1.5) { 279 | #all mutations are greater than 1.5 --> re-centre on 1 or #one or more (but not all) mutation is greater than 1.5 (but all are clonal) --> re-centre on 1 280 | if (length(which(mut_table$phyloCCF > 1.5)) == nrow(mut_table) | length(which(mut_table$phyloCCF.0.05 >= 1)) == nrow(mut_table)) { 281 | small_mut_table <- mut_table[mut_table$phyloCCF.0.05 > 1,, drop = FALSE] 282 | if (nrow(small_mut_table) > 0) { 283 | for (i in 1:nrow(small_mut_table)) { 284 | small_row <- small_mut_table[i,, drop = FALSE] 285 | region <- unlist(small_row$region) 286 | region.copy <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,] 287 | 288 | phyloCCF <- small_row$absolute.ccf 289 | phyloCCF_0.05 <- min(small_row$absolute.ccf.0.05, small_row$absolute.ccf.0.05 - abs(small_row$phyloCCF - small_row$phyloCCF.0.05)) 290 | phyloCCF_0.95 <- max(small_row$absolute.ccf.0.95, small_row$absolute.ccf.0.95 + abs(small_row$phyloCCF - small_row$phyloCCF.0.05)) 291 | 292 | 
phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.05 <- phyloCCF_0.05 293 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.95 <- phyloCCF_0.95 294 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF <- phyloCCF 295 | } 296 | } 297 | } 298 | 299 | #at least one mutation is subclonal --> properly adjust no.chrs.bearing.mut 300 | if (length(which(mut_table$phyloCCF.0.95 * mut_table$no.chrs.bearing.mut < 1)) >= 1) { 301 | small_mut_table <- mut_table[mut_table$phyloCCF.0.05 > 1,, drop = FALSE] 302 | if (nrow(small_mut_table) > 0) { 303 | for (i in 1:nrow(small_mut_table)) { 304 | small_row <- small_mut_table[i,, drop = FALSE] 305 | region <- unlist(small_row$region) 306 | region.copy <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,] 307 | 308 | if (small_row$phyloCCF != small_row$mutCopyNum) { 309 | phyloCCF <- small_row$phyloCCF / small_row$no.chrs.bearing.mut 310 | phyloCCF_0.05 <- small_row$phyloCCF.0.05 / small_row$no.chrs.bearing.mut 311 | phyloCCF_0.95 <- small_row$phyloCCF.0.95 / small_row$no.chrs.bearing.mut 312 | } else { 313 | phyloCCF <- small_row$phyloCCF 314 | phyloCCF_0.05 <- small_row$phyloCCF.0.05 315 | phyloCCF_0.95 <- small_row$phyloCCF.0.95 316 | } 317 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.05 <- phyloCCF_0.05 318 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.95 <- phyloCCF_0.95 319 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF <- phyloCCF 320 | } 321 | } 322 | } 323 | } 324 | 325 | if (TRUE %in% c(mut_table$phyloCCF > mut_table$mutCopyNum)) { 326 | # are all the mutations now truncal? 
327 | if (length(which(mut_table$phyloCCF.0.95 < 1)) == 0) next 328 | if (only_truncal_subclonal_copy_correction %in% TRUE) { 329 | if (length(which(mut_table$phyloCCF.0.95 < 1)) >= 1) { 330 | # so we've performed copy number correction, but that didn't make the mutation clonal, so let's revert back to non-copy number corrected 331 | muts_to_revert <- mut_table[mut_table$no.chrs.bearing.mut < 1,, drop = FALSE] 332 | 333 | for (region in unlist(muts_to_revert$region)) { 334 | mut_to_revert <- muts_to_revert[unlist(muts_to_revert$region) %in% region,, drop = FALSE] 335 | region.copy <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,] 336 | expVAF <- min(1 - 1e-6, c((region.copy$ACF[1]*1) / (2*(1-region.copy$ACF[1]) + region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw))))) 337 | VAF_ci <- prop.test(x = as.numeric(mut_to_revert$var_counts),n = as.numeric(mut_to_revert$ref_counts)+as.numeric(mut_to_revert$var_counts),p = expVAF) 338 | phyloCCF <- (VAF_ci$estimate *1/region.copy$ACF[1])*((region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw)))+2*(1-region.copy$ACF[1])) 339 | phyloCCF_0.05 <- (VAF_ci$conf.int[1] *1/region.copy$ACF[1])*((region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw)))+2*(1-region.copy$ACF[1])) 340 | phyloCCF_0.95 <- (VAF_ci$conf.int[2] *1/region.copy$ACF[1])*((region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw)))+2*(1-region.copy$ACF[1])) 341 | 342 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$no.chrs.bearing.mut <- 1 343 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$phyloCCF.0.05 <- phyloCCF_0.05 344 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$phyloCCF.0.95 <- phyloCCF_0.95 345 | 
phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$phyloCCF <- phyloCCF 346 | phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$expected.VAF <- expVAF 347 | } 348 | } 349 | } 350 | if (only_truncal_subclonal_copy_correction %in% FALSE) next 351 | } 352 | } 353 | 354 | input_list <- list(patient = patient, new.dir = new.dir, mut.table = mut.table, seg.mat.phylo = seg.mat.phylo, phylo.region.list = phylo.region.list) 355 | simpleClusterList <- findSimpleClusters(input_list) 356 | ### fail safe to remove clusters of mutations not sufficiently present in any region 357 | simpleClusterList <- simpleClusterList[!sapply(strsplit(sapply(simpleClusterList, function(x) x$clusterBinary), split = ":"), function(y) all(y == "0"))] 358 | ### changing CCFs to 0 if cluster is absent 359 | if (fix_absentCCFs) { 360 | clusterPresence.df <- Reduce(rbind, lapply(names(simpleClusterList), function(x) data.frame(clusterID = x, regions = simpleClusterList[[x]]$RegionsInCluster, stringsAsFactors = FALSE))) 361 | for (region in regions.to.use) { 362 | tmp.clusterPresence <- clusterPresence.df %>% dplyr::filter(regions %in% region) %>% dplyr::pull(clusterID) 363 | tmp.clusterMutations <- as.character(unlist(lapply(simpleClusterList[as.character(tmp.clusterPresence)], function(x) x$MutationsWithCluster))) 364 | phylo.region.list[[region]][!as.character(unlist(phylo.region.list[[region]]$mutation_id)) %in% tmp.clusterMutations, "phyloCCF"] <- 0 365 | phylo.region.list[[region]][!as.character(unlist(phylo.region.list[[region]]$mutation_id)) %in% tmp.clusterMutations, "phyloCCF.0.05"] <- 0 366 | phylo.region.list[[region]][!as.character(unlist(phylo.region.list[[region]]$mutation_id)) %in% tmp.clusterMutations, "phyloCCF.0.95"] <- 0 367 | } 368 | } 369 | 370 | return(list(patient = patient, regions.to.use = regions.to.use, mut.table = mut.table, seg.mat.copy = seg.mat.copy, seg.mat.phylo = 
seg.mat.phylo, phylo.region.list = phylo.region.list, simpleClusterList = simpleClusterList)) 371 | } 372 | 373 |

#' Main clustering function
#'
#' This function takes the input list created in the preprocessing along with
#' the number of cores and output directory to run the main clustering.
#'
#' Every entry of `input_list$simpleClusterList` is processed in parallel:
#' entries with fewer than 5 mutations are handed to
#' `CreateOutputNoPyCloneRun()`, all others to `RunPyCloneWithSimpleClusters()`.
#' The per-cluster `<patient>.results.tsv` files found in the
#' `<patient>_cluster<ID>` directories under `new.dir` are then read, reduced to
#' the columns `X` (mutation id) and `cluster_id`, row-bound and written to
#' `<new.dir>/<patient>.results.tsv`.
#'
#' @param input_list A list created by the clustering preprocess function
#' including patient id, regions to use, phylo region list and others.
#' Only `patient` and `simpleClusterList` are read here.
#' @param nProcs A value referring to how many parallel processes
#' of pyclone should be run.
#' @param new.dir A character specifying the directory where the pyclone
#' output should be saved.
#' @param burn_in Burn-in for DP clustering.
#' @param pyclone_seed Seed for PyClone run.
#' @param template.config.yaml Location of the template yaml file used to run PyClone.
#' @param pyclone_module Value forwarded as `pyclone.module` to
#' `RunPyCloneWithSimpleClusters()`. Defaults to the module string that was
#' previously hard-coded.
#' @returns sample.results which is the location of the pyclone output table.
#' @importFrom parallel mclapply
#' @export clustering_run

clustering_run <- function(input_list, nProcs, new.dir, burn_in, pyclone_seed, template.config.yaml,
                           pyclone_module = "PyClone/0.12.3-foss-2016b-Python-2.7.12-tkinter") {
    patient <- input_list$patient
    simpleClusterList <- input_list$simpleClusterList

    print(paste0("Number of cores that are available: ", nProcs))

    ### always run pyclone
    run.status <- parallel::mclapply(simpleClusterList, function(x) {
        if (length(x$MutationsWithCluster) < 5) {
            CreateOutputNoPyCloneRun(clusterName = x$clusterID,
                                     patientID = patient,
                                     SmallClusters = simpleClusterList,
                                     patientDirToUse = new.dir)
        } else {
            RunPyCloneWithSimpleClusters(clusterName = x$clusterID,
                                         patientID = patient,
                                         SmallClusters = simpleClusterList,
                                         patientDirToUse = new.dir,
                                         yamlConfigLoc = template.config.yaml,
                                         pyclone.burnin = burn_in,
                                         pyclone.seed = pyclone_seed,
                                         run.pyclone = TRUE,
                                         pyclone.module = pyclone_module)
        }
    }, mc.cores = nProcs)

    # mclapply() hands back a "try-error" object for every element whose job
    # raised an error; name the affected clusters instead of discarding them.
    # Only a warning is raised so that a job which failed after writing its
    # results file does not abort a run that previously went through.
    failed <- vapply(run.status, function(res) inherits(res, "try-error"), logical(1))
    if (any(failed)) {
        warning("PyClone step failed for cluster(s): ",
                paste(names(simpleClusterList)[failed], collapse = ", "),
                call. = FALSE)
    }

    cluster.dirs <- list.files(new.dir, pattern = paste0(patient, "_cluster"), full.names = TRUE)
    results.file.name <- paste0(patient, ".results.tsv")

    pyclone.results.list <- lapply(names(simpleClusterList), function(clusterID) {
        # Match on the directory's own name only, so that a parent directory
        # that happens to contain "cluster<ID>/" cannot make the lookup ambiguous.
        is.cluster.dir <- endsWith(basename(cluster.dirs), paste0("cluster", clusterID))
        # file.path() yields character(0) when no directory matched.
        cluster.results.file <- file.path(cluster.dirs[is.cluster.dir], results.file.name)
        cluster.results.file <- cluster.results.file[file.exists(cluster.results.file)]
        if (length(cluster.results.file) != 1) {
            stop("Expected exactly one '", results.file.name, "' for cluster ", clusterID,
                 " in '", new.dir, "', found ", length(cluster.results.file),
                 call. = FALSE)
        }

        pyclone.results <- read.table(cluster.results.file, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
        colnames(pyclone.results) <- gsub('mutation_id', 'X', colnames(pyclone.results))
        # Make cluster ids unique across simple clusters.
        # NOTE(review): ids collide if a single run reports cluster_id >= 100 - confirm this cannot happen.
        pyclone.results$cluster_id <- as.numeric(clusterID) * 100 + as.numeric(pyclone.results$cluster_id)
        # Fall back to the row names when the table has no mutation id column.
        if (length(grep("^X$", colnames(pyclone.results))) == 0) {
            pyclone.results$X <- rownames(pyclone.results)
        }
        return(pyclone.results[, c("X", "cluster_id")])
    })

    pyclone.results <- do.call(rbind, pyclone.results.list)
    sample.results <- paste0(new.dir, "/", patient, ".results.tsv")
    write.table(pyclone.results, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE, file = sample.results)
    return(sample.results)
}

424 | 425 | 426 | #' Postprocessing of clustering function 427 | #' 428 | #' This function takes the input tsv and formats the data to be compatible with 429 | #' the main CONIPHER clustering function. 430 | #' @param input_list A list created by the clustering preprocess function 431 | #' including patient id, regions to use, phylo region list and others. 432 | #' @param sample.results which is the location of the pyclone output table. 433 | #' @param new.dir A character specifying the directory where the pyclone 434 | #' output should be saved. 435 | #' @param input_tsv the input mutation tsv. 436 | #' @param input_seg_tsv_loc path to a copy number segment tsv file that is used for 437 | #' across genome copy number plotting.
438 | #' Default NULL 439 | #' @param min_cluster_size Minimum number of mutations needed for a cluster to be considered. 440 | #' Default 5 441 | #' @param driver_cat Which categories to use as driver mutations 442 | #' Default "1" 443 | #' @param clean_clusters should clusters be cleaned and merged? 444 | #' Default TRUE 445 | #' @param min_ccf_present minimum CCF to consider a mutation as present. 446 | #' Default 0.1 447 | #' @param clonal_cutOff lower threshold CCF to consider mutations as clonal. 448 | #' Default 0.9 449 | #' @param propClonal_threshold Proportion of mutations in cluster which needs to be 450 | #' considered clonal to merge. 451 | #' Default 0.25 452 | #' @returns NULL 453 | #' @importFrom dplyr "%>%" 454 | #' @export clustering_postprocess 455 | 456 | clustering_postprocess <- function(input_list, sample.results, new.dir, input_tsv, input_seg_tsv_loc = NULL, min_cluster_size = 5, driver_cat = "1", clean_clusters = TRUE, min_ccf_present = 0.1, clonal_cutOff = 0.9, propClonal_threshold = 0.25) { 457 | phylo.region.list <- input_list$phylo.region.list 458 | mut.table <- input_list$mut.table 459 | seg.mat.phylo <- input_list$seg.mat.phylo 460 | seg.mat.copy <- input_list$seg.mat.copy 461 | regions.to.use <- input_list$regions.to.use 462 | simpleClusterList <- input_list$simpleClusterList 463 | patient <- input_list$patient 464 | 465 | ITH1clust <- names(which(sapply(simpleClusterList, function(x) length(x$RegionsInCluster)) == length(phylo.region.list))) 466 | ITH1muts <- simpleClusterList[[as.character(ITH1clust)]]$MutationsWithCluster 467 | 468 | pyclone.results <- read.table(sample.results, sep = "\t", header = TRUE, stringsAsFactors = FALSE) 469 | 470 | # let's make sure the same mutations are being used. 
471 | colnames(pyclone.results) <- gsub('mutation_id', 'X', colnames(pyclone.results)) 472 | rownames(pyclone.results) <- pyclone.results$X 473 | pyclone.results <- pyclone.results[rownames(pyclone.results) %in% mut.table$mutation_id,, drop = FALSE] 474 | 475 | most.likely.cluster <- pyclone.results$cluster_id 476 | names(most.likely.cluster) <- pyclone.results$X 477 | muts.to.remove <- c() 478 | # let's have a look at whether any of the clusters are explained by copy number events 479 | 480 | 481 | mut.pvals <- c() 482 | cluster.prop.aber <- c() 483 | # require(coin) 484 | 485 | for (cluster in unique(most.likely.cluster)) { 486 | 487 | tmp <- copy.driven.clusterNEW(cluster, seg.mat.copy = seg.mat.copy, most.likely.cluster = most.likely.cluster, region.earlyLate.list = phylo.region.list, min.prop.cens = 0.1, loss.thresh = 0.25, diff.thresh = 0.55) 488 | 489 | if(TRUE %in% is.na(tmp[, 1])) stop 490 | 491 | cluster.prop.aber <- c(cluster.prop.aber, length(which(tmp[, 1] < 0.05 & tmp[, 2] <= 0.5)) / nrow(tmp)) 492 | mut.pvals <- rbind(mut.pvals,tmp) 493 | } 494 | 495 | names(cluster.prop.aber) <- unique(most.likely.cluster) 496 | # Let's also check what the minimum copy number is for each mutation that is classified as lost 497 | 498 | out <- c() 499 | for (mutation_id in rownames(mut.pvals)) { 500 | out <- rbind(out, getMinCPN(mutation_id, phylo.region.list)) 501 | } 502 | 503 | mut.pvals <- cbind(mut.pvals, out[, 2]) 504 | sig.pvals <- mut.pvals[which(as.numeric(mut.pvals[, 1]) < 0.05 & as.numeric(mut.pvals[, 2]) <= 0.75 & as.numeric(mut.pvals[, 3]) == 0), 1] 505 | sig.table <- mut.pvals[which(as.numeric(mut.pvals[, 1]) < 0.05 & as.numeric(mut.pvals[, 2]) <= 0.75 & as.numeric(mut.pvals[, 3]) == 0),, drop = FALSE] 506 | 507 | print(length(sig.pvals)) 508 | # let's see what happens if we set any cluster with over X% sig pvals as the entire cluster missing 509 | clusters.to.remove <- names(which(as.numeric(cluster.prop.aber) >= 0.85)) 510 | muts.to.remove <- 
names(most.likely.cluster)[which(most.likely.cluster %in% clusters.to.remove)] 511 | muts.to.remove <- unique(c(names(sig.pvals),muts.to.remove)) 512 | muts.to.remove <- muts.to.remove[!is.na(muts.to.remove)] 513 | names(muts.to.remove) <- muts.to.remove 514 | 515 | # let's be clever about this and give the clusters we're removing the highest names 516 | cluster.size.remove <- names(most.likely.cluster[most.likely.cluster %in% names(which(table(most.likely.cluster) < min_cluster_size))]) 517 | small.clusters <- most.likely.cluster[cluster.size.remove] 518 | 519 | # are any of the small clusters private 520 | for (small.cluster in unique(small.clusters)) { 521 | # create a presence absence heatmap 522 | binary.heatmap <- pyclone.results[, 2:(ncol(pyclone.results) - 1), drop = FALSE] 523 | binary.heatmap <- ifelse(binary.heatmap < min_ccf_present, 0, 1) 524 | rownames(binary.heatmap) <- pyclone.results$X 525 | row.names <- pyclone.results[pyclone.results$cluster_id %in% small.cluster, 'X'] 526 | 527 | if (median(rowSums(binary.heatmap[row.names,, drop = FALSE])) == 1) { 528 | small.clusters <- small.clusters[!small.clusters %in% small.cluster] 529 | } 530 | } 531 | 532 | cluster.size.remove <- cluster.size.remove[cluster.size.remove %in% names(small.clusters)] 533 | 534 | 535 | old.most.likely.cluster <- most.likely.cluster 536 | 537 | most.likely.cluster <- most.likely.cluster[!names(most.likely.cluster) %in% unique(c(muts.to.remove,cluster.size.remove))] 538 | 539 | if (length(most.likely.cluster) == 0) { 540 | stop("ClusterSize\nYou don't have any clean clusters greater than min.cluster.size") 541 | } 542 | 543 | # Let's rename the most likely cluster 544 | tmp <- most.likely.cluster 545 | most.likely.cluster <- as.numeric(factor(most.likely.cluster)) 546 | names(most.likely.cluster) <- names(tmp) 547 | if (length(most.likely.cluster) > 1) { 548 | tmp <- as.character(1:length(unique(most.likely.cluster))) 549 | names(tmp) <- 
names(table(most.likely.cluster)[order(table(most.likely.cluster), decreasing = TRUE)]) 550 | tmp2 <- most.likely.cluster 551 | most.likely.cluster <- tmp[as.character(match(most.likely.cluster, 1:length(most.likely.cluster)))] 552 | names(most.likely.cluster) <- names(tmp2) 553 | solid.cluster.end <- max(as.numeric(most.likely.cluster)) 554 | new.cluster.start <- max(as.numeric(most.likely.cluster)) + 1 555 | removed.muts <- old.most.likely.cluster[which(!names(old.most.likely.cluster) %in% names(most.likely.cluster))] 556 | removed.clusters <- c() 557 | for (cluster in names(table(removed.muts)[order(table(removed.muts), decreasing = TRUE)])) { 558 | new.cluster <- removed.muts[removed.muts %in% cluster] 559 | new.cluster[new.cluster %in% cluster] <- new.cluster.start 560 | removed.clusters <- c(removed.clusters,new.cluster) 561 | new.cluster.start <- new.cluster.start + 1 562 | } 563 | 564 | # let's add back the ones to remove, but make these higher clusters (i.e. these will have higher number) 565 | # this will make their removal seem easier later hopefully 566 | 567 | 568 | most.likely.cluster <- c(most.likely.cluster,removed.clusters) 569 | most.likely.cluster <- most.likely.cluster[order(as.numeric(most.likely.cluster))] 570 | names.muts <- names(most.likely.cluster) 571 | most.likely.cluster <- as.numeric(most.likely.cluster) 572 | names(most.likely.cluster) <- names.muts 573 | } 574 | 575 | if (length(most.likely.cluster) == 0) { 576 | solid.cluster.end <- 0 577 | new.cluster.start <- 1 578 | 579 | removed.clusters <- c() 580 | for (cluster in names(table(removed.muts)[order(table(removed.muts), decreasing = TRUE)])) { 581 | new.cluster <- removed.muts[removed.muts %in% cluster] 582 | new.cluster[new.cluster %in% cluster] <- new.cluster.start 583 | removed.clusters <- c(removed.clusters,new.cluster) 584 | new.cluster.start <- new.cluster.start + 1 585 | } 586 | most.likely.cluster <- removed.clusters 587 | } 588 | 589 | 590 | 591 | #Let's add the muts to 
remove back into the table 592 | v.pvals <- rep(NA, nrow(mut.table)) 593 | names(v.pvals) <- mut.table$mutation_id 594 | v.pvals[intersect(names(v.pvals), rownames(mut.pvals))] <- mut.pvals[intersect(names(v.pvals), rownames(mut.pvals)), 1] 595 | v.remove <- names(v.pvals) %in% muts.to.remove 596 | names(v.remove) <- names(v.pvals) 597 | v.cluster <- rep(NA, nrow(mut.table)) 598 | names(v.cluster) <- names(v.pvals) 599 | v.cluster[names(most.likely.cluster)] <- most.likely.cluster 600 | v.size.remove <- names(v.pvals) %in% cluster.size.remove 601 | names(v.size.remove) <- names(v.pvals) 602 | v.minCPN <- rep(NA, nrow(mut.table)) 603 | names(v.minCPN) <- names(v.pvals) 604 | v.minCPN[intersect(names(v.minCPN), rownames(mut.pvals))] <- mut.pvals[intersect(names(v.minCPN), rownames(mut.pvals)), 3] 605 | v.regionLoss <- rep(NA, nrow(mut.table)) 606 | names(v.regionLoss) <- names(v.pvals) 607 | v.regionLoss[intersect(names(v.regionLoss), rownames(mut.pvals))]<- mut.pvals[intersect(names(v.regionLoss), rownames(mut.pvals)), 4] 608 | 609 | 610 | 611 | 612 | mut.table$cpn.remove.pval <- v.pvals 613 | mut.table$cpn.remove <- v.remove 614 | mut.table$cluster <- v.cluster 615 | mut.table$cluster.remove <- v.size.remove 616 | mut.table$minCPN <- v.minCPN 617 | mut.table$regionLoss <- v.regionLoss 618 | 619 | mut.table.save.name <- paste(new.dir, patient, '.all.SNV.cpn.xls', sep = "") 620 | write.table(mut.table, file = mut.table.save.name, sep = "\t", col.names = NA) 621 | 622 | # let's plot these mutations #### 623 | if (length(muts.to.remove) > 1) { 624 | pdf(paste(new.dir, patient, ".removedCPN.muts.pdf", sep = ""), width = 8, height = 8) 625 | clusters.to.plot <- most.likely.cluster[mut.table[mut.table$cpn.remove %in% TRUE, 'mutation_id']] 626 | 627 | { 628 | # let's only plot a cluster if it has removed musted 629 | print(clusters.to.plot) 630 | 631 | lyout <- c() 632 | for (i in seq(1, length(regions.to.use) * 2, by = 2)) { 633 | lyout <- rbind(lyout, rbind(c(rep(i, 9), 
i + 1))) 634 | } 635 | 636 | layout(lyout) 637 | 638 | for (region in regions.to.use) { 639 | 640 | region.earlyLate <- phylo.region.list[[region]] 641 | region.earlyLate <- region.earlyLate[!is.na(region.earlyLate$phyloCCF),] 642 | region.earlyLate <- region.earlyLate[region.earlyLate$mutation_id %in% muts.to.remove,, drop = FALSE] 643 | 644 | # Using seg file if exists for plotting 645 | region.seg.copy <- seg.mat.copy[seg.mat.copy$SampleID %in% region,, drop = FALSE] 646 | 647 | if (!is.null(input_seg_tsv_loc)) { 648 | print("Using specified seg file for plotting") 649 | region.seg.copy <- read.delim2(input_seg_tsv_loc, stringsAsFactors = FALSE) 650 | region.seg.copy$SAMPLE <- gsub("-", "\\.", region.seg.copy$SAMPLE) 651 | region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A) 652 | region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B) 653 | region.seg.copy$CHR <- as.numeric(region.seg.copy$CHR) 654 | region.seg.copy$STARTPOS <- as.numeric(region.seg.copy$STARTPOS) 655 | 656 | #Sort by start position within chromosome 657 | region.seg.copy <- region.seg.copy[order(region.seg.copy$CHR, 658 | region.seg.copy$STARTPOS), ] 659 | # If providing seg file, ensure the sample names match the sample names in input tsv 660 | if (!any(unique(region.seg.copy$SAMPLE) %in% unique(seg.mat.copy[,1]))) { 661 | stop('Sample IDs do not match between input_tsv and input_seg_tsv') 662 | } 663 | region.seg.copy <- region.seg.copy %>% dplyr::rename(SampleID = SAMPLE) 664 | } else { 665 | print("Using tsv data for plotting") 666 | region.seg.copy <- seg.mat.copy 667 | } 668 | 669 | region.seg.copy <- region.seg.copy %>% dplyr::filter(SampleID %in% region) 670 | # ensure raw copy number columns are numeric: 671 | region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A) 672 | region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B) 673 | 674 | # Rename columns 675 | sub.mat.copy <- region.seg.copy 676 | 677 | 
colnames(sub.mat.copy)[2] <- 'Chromosome' 678 | colnames(sub.mat.copy)[3] <- 'StartPosition' 679 | colnames(sub.mat.copy)[4] <- 'EndPosition' 680 | 681 | #pdf(early.late.pdf) 682 | par(mar = c(0.5, 5, 0.5, 0.2)) 683 | par(lend = 1) 684 | 685 | plot.simpleClusters.raw(seg.mat.patient = sub.mat.copy, most.likely.cluster = clusters.to.plot, TCGA.earlyLate = region.earlyLate, sub.clonal = 1) 686 | 687 | mtexti(region, side = 2, off = 0.5) 688 | 689 | ds <- density(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, as.numeric(region.earlyLate$mutCopyNum))) 690 | ds1 <- ds 691 | ds1$x <- ds$y 692 | ds1$y <- ds$x 693 | par(mar = c(0.5, 0, 0.5, 4)) 694 | A <- hist(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, as.numeric(region.earlyLate$mutCopyNum)), breaks = seq(-0.25, 6, by = 0.1), plot = FALSE) 695 | plot(NULL, type = "n", xlim = c(0, max(A$density)), ylim = c(-0.25, 6), bty = 'n', xaxs = 'i', xaxt = 'n', yaxt = 'n', yaxs = 'i', xlab = "", main = "", ylab = "") 696 | rect(0, A$breaks[1:(length(A$breaks) - 1)], A$density, A$breaks[2:length(A$breaks)], border = TRUE, col = "#CC6666") 697 | lines(ds1) 698 | } 699 | } 700 | 701 | for (cluster in unique(clusters.to.plot)) { 702 | # let's only plot a cluster if it has removed musted 703 | 704 | print(cluster) 705 | 706 | lyout <- c() 707 | for (i in seq(1, length(regions.to.use) * 2, by = 2)) { 708 | lyout <- rbind(lyout, rbind(c(rep(i, 9), i + 1))) 709 | } 710 | 711 | layout(lyout) 712 | 713 | for (region in regions.to.use) { 714 | region.earlyLate <- phylo.region.list[[region]] 715 | region.earlyLate <- region.earlyLate[!is.na(region.earlyLate$phyloCCF),] 716 | region.earlyLate <- region.earlyLate[region.earlyLate$mutation_id %in% muts.to.remove,, drop = FALSE] 717 | 718 | # Using seg file if exists for plotting 719 | region.seg.copy <- seg.mat.copy[seg.mat.copy$SampleID %in% region,, drop = FALSE] 720 | 721 | if (!is.null(input_seg_tsv_loc)) { 722 | print("Using specified seg file for plotting") 723 | 
region.seg.copy <- read.delim2(input_seg_tsv_loc, stringsAsFactors = FALSE) 724 | region.seg.copy$SAMPLE <- gsub("-", "\\.", region.seg.copy$SAMPLE) 725 | region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A) 726 | region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B) 727 | region.seg.copy$CHR <- as.numeric(region.seg.copy$CHR) 728 | region.seg.copy$STARTPOS <- as.numeric(region.seg.copy$STARTPOS) 729 | 730 | #Sort by start position within chromosome 731 | region.seg.copy <- region.seg.copy[order(region.seg.copy$CHR, 732 | region.seg.copy$STARTPOS), ] 733 | # If providing seg file, ensure the sample names match the sample names in input tsv 734 | if (!any(unique(region.seg.copy$SAMPLE) %in% unique(seg.mat.copy[, 1]))) { 735 | stop('Sample IDs do not match between input_tsv and input_seg_tsv') 736 | } 737 | region.seg.copy <- region.seg.copy %>% dplyr::rename(SampleID = SAMPLE) 738 | } else { 739 | print("Using tsv data for plotting") 740 | region.seg.copy <- seg.mat.copy 741 | } 742 | 743 | region.seg.copy <- region.seg.copy %>% dplyr::filter(SampleID %in% region) 744 | # ensure raw copy number columns are numeric: 745 | region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A) 746 | region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B) 747 | 748 | # Rename columns: 749 | sub.mat.copy <- region.seg.copy 750 | colnames(sub.mat.copy)[2] <- 'Chromosome' 751 | colnames(sub.mat.copy)[3] <- 'StartPosition' 752 | colnames(sub.mat.copy)[4] <- 'EndPosition' 753 | 754 | 755 | #pdf(early.late.pdf) 756 | par(mar = c(0.5, 5, 0.5, 0.2)) 757 | par(lend = 1) 758 | 759 | plot.simpleClusters.raw(seg.mat.patient = sub.mat.copy, most.likely.cluster = most.likely.cluster, cluster = cluster, TCGA.earlyLate = region.earlyLate, sub.clonal = 1) 760 | 761 | mtexti(region, side = 2, off = 0.5) 762 | 763 | ds <- density(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, 
as.numeric(region.earlyLate$mutCopyNum))) 764 | ds1 <- ds 765 | ds1$x <- ds$y 766 | ds1$y <- ds$x 767 | par(mar = c(0.5, 0, 0.5, 4)) 768 | A <- hist(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, as.numeric(region.earlyLate$mutCopyNum)), breaks = seq(-0.25, 6, by = 0.1), plot = FALSE) 769 | plot(NULL, type = "n", xlim = c(0, max(A$density)), ylim = c(-0.25, 6), bty = 'n', xaxs = 'i', xaxt = 'n', yaxt = 'n', yaxs = 'i', xlab = "", main = "", ylab = "") 770 | rect(0, A$breaks[1:(length(A$breaks) - 1)], A$density, A$breaks[2:length(A$breaks)], border = TRUE, col = "#CC6666") 771 | 772 | lines(ds1) 773 | } 774 | dev.off() 775 | } 776 | 777 | 778 | 779 | # let's also write these to a table 780 | mut.table.remove <- mut.table[mut.table$mutation_id %in% muts.to.remove,, drop = FALSE] 781 | write.table(mut.table.remove, file = paste(new.dir, patient, ".removed.muts.txt", sep = ""), sep = "\t", quote = FALSE, col.names = NA) 782 | if (TRUE %in% c(mut.table.remove$driverCategory %in% driver_cat)) { 783 | cat("You're removing driver muts!!") 784 | removed.drivers <- mut.table.remove[mut.table.remove$driverCategory %in% driver_cat,, drop = FALSE] 785 | write.table(removed.drivers, file = paste(new.dir, patient, ".removed.drivers.txt", sep = ""), quote = FALSE, sep = "\t", col.names = NA) 786 | cat('\n') 787 | cat(removed.drivers$Gene.refGene) 788 | } 789 | 790 | } 791 | 792 | no.optima = length(unique(most.likely.cluster)) 793 | max.cols = 12 794 | # require(RColorBrewer) 795 | cols = paste(RColorBrewer::brewer.pal(min(max.cols,no.optima),name = "Paired"), sep = "") 796 | cols = rep(cols, ceiling(no.optima / max.cols))[1:no.optima] 797 | cols.opac = paste(cols, '99', sep = "") 798 | 799 | clean.most.likely.clusters <- most.likely.cluster[most.likely.cluster %in% c(1:solid.cluster.end)] 800 | 801 | 802 | # let's get confidence intervals for each of the mutations 803 | # (and also plot the confidence intervals of the driver mutations) 804 | 805 | # # first, let's import 
the trace files 806 | region.trace <- list() 807 | region.preClustPosterior <- list() 808 | region.postClustPosterior <- list() 809 | 810 | for (region in regions.to.use) { 811 | clusters.to.use <- sapply(simpleClusterList, function(x) region %in% x$RegionsInCluster) 812 | clusters.to.use <- names(clusters.to.use)[clusters.to.use] 813 | 814 | for (clust in clusters.to.use) { 815 | pyclone.tsv <- read.table(paste0(new.dir, patient, "_cluster", clust, "/", patient, '.results.tsv'), stringsAsFactors = FALSE, header = TRUE) 816 | 817 | mutation_ids <- unlist(phylo.region.list[[region]]$mutation_id) 818 | if (length(grep("mutation_id", colnames(pyclone.tsv))) > 0) { 819 | tmp <- intersect(mutation_ids, pyclone.tsv$mutation_id) 820 | } else { 821 | tmp <- intersect(mutation_ids, rownames(pyclone.tsv)) 822 | } 823 | phylo.region.list[[region]]$phyloCCF_PyClone.cluster[mutation_ids %in% tmp] <- most.likely.cluster[tmp] 824 | } 825 | } 826 | 827 | 828 | # Finally, let's put this into a megatable, and write this to an appropriate place 829 | save(phylo.region.list, file = paste(new.dir, patient, '.PhyloRegionList.RData', sep = "")) 830 | save.image(file = paste(new.dir, patient, ".PyClone.RData", sep = "")) 831 | 832 | 833 | print("Creating human readable output") 834 | ### creating human readable output 835 | print("Running non-original output") 836 | tmp.phylo.region.list <- lapply(phylo.region.list, function(x) { 837 | tmp <- data.frame(x, stringsAsFactors = FALSE) 838 | rownames(tmp) <- NULL 839 | tmp <- tmp %>% 840 | dplyr::select(mutation_id, region, Reference_Base, Alternate_Base, ref_counts, var_counts, phyloCCF, phyloCCF.0.05, phyloCCF.0.95, absolute.ccf, mutCopyNum, major_cn, minor_cn) %>% 841 | dplyr::mutate(mutation_id = unlist(mutation_id), ref_counts = unlist(ref_counts), var_counts = unlist(var_counts), minor_cn = unlist(minor_cn), major_cn = unlist(major_cn), region = unlist(region), Reference_Base = unlist(Reference_Base), Alternate_Base = 
unlist(Alternate_Base)) %>% 842 | dplyr::rename(SAMPLE = region, REF = Reference_Base, ALT = Alternate_Base, REF_COUNT = ref_counts, VAR_COUNT = var_counts, CCF_PHYLO = phyloCCF, CCF_OBS = absolute.ccf, MUT_COPY = mutCopyNum, COPY_NUMBER_A = major_cn, COPY_NUMBER_B = minor_cn) 843 | return(tmp) 844 | }) 845 | output_tsv <- dplyr::bind_rows(tmp.phylo.region.list) 846 | 847 | output_tsv <- output_tsv %>% 848 | dplyr::mutate(CLUSTER = most.likely.cluster[output_tsv$mutation_id], 849 | CLEAN = ifelse(output_tsv$mutation_id %in% names(clean.most.likely.clusters), TRUE, FALSE)) 850 | 851 | output_tsv <- output_tsv %>% 852 | dplyr::mutate(CHR = as.numeric(sapply(strsplit(unlist(output_tsv$mutation_id), split = ":"), function(x) x[2])), 853 | POS = as.numeric(sapply(strsplit(unlist(output_tsv$mutation_id), split = ":"), function(x) x[3])), 854 | key = paste(paste0("chr", CHR), POS, REF, ALT, sep = ":")) %>% 855 | dplyr::left_join(input_tsv %>% dplyr::select(CASE_ID, SAMPLE, CHR, POS, REF, ALT, DEPTH, ACF, PLOIDY) %>% mutate(CHR = as.numeric(CHR), POS = as.numeric(POS)), by = c("CHR", "POS", "REF", "ALT", "SAMPLE")) %>% 856 | dplyr::select(CASE_ID, SAMPLE, CHR, POS, REF, ALT, REF_COUNT, VAR_COUNT, DEPTH, CLUSTER, CCF_PHYLO, CCF_OBS, MUT_COPY, COPY_NUMBER_A, COPY_NUMBER_B, ACF, PLOIDY, CLEAN, phyloCCF.0.05, phyloCCF.0.95) 857 | 858 | write.table(output_tsv %>% dplyr::select(-phyloCCF.0.05, -phyloCCF.0.95), file = paste0(new.dir, patient, ".SCoutput.FULL.tsv"), row.names = FALSE, quote = FALSE, sep = "\t") 859 | 860 | output_tsv_clean <- output_tsv %>% dplyr::filter(CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95) 861 | output_tsv_dirty <- output_tsv %>% dplyr::filter(!CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95) 862 | 863 | write.table(output_tsv_clean, file = paste0(new.dir, patient, ".SCoutput.CLEAN.tsv"), row.names = FALSE, quote = FALSE, sep = "\t") 864 | write.table(output_tsv_dirty, file = paste0(new.dir, patient, 
".SCoutput.DIRTY.tsv"), row.names = FALSE, quote = FALSE, sep = "\t") 865 | 866 | 867 | #finally, save a version of the table that is cleaned 868 | 869 | if(clean_clusters %in% TRUE) { 870 | print("Cleaning clusters") 871 | output_tsv <- correct.clusters.from.table(output_tsv) #kg: merging clusters if sc cn correction created a new cluster 872 | dirty.clusters.remove <- output_tsv %>% 873 | dplyr::group_by(CLUSTER) %>% 874 | dplyr::filter(all(!CLEAN)) %>% 875 | dplyr::pull(CLUSTER) %>% 876 | unique() 877 | output_tsv <- output_tsv %>% 878 | dplyr::mutate(CLEAN = ifelse(!CLUSTER %in% dirty.clusters.remove, TRUE, FALSE)) 879 | 880 | ### merge clusters present in all regions at given thresholds 881 | print("Final merging of ubiqquitous clusters") 882 | ### select ubiquitous mutations in clean clusters 883 | issue_mutations <- output_tsv %>% 884 | dplyr::mutate(mutation_id = paste(CASE_ID, CHR, POS, REF, sep = ":")) %>% 885 | dplyr::filter(CLEAN, mutation_id %in% ITH1muts) 886 | if (nrow(issue_mutations) == 0) { 887 | print("No additional clusters corrected") 888 | } else { 889 | clusters_to_consider <- unique(issue_mutations$CLUSTER) 890 | ### calculate proportion of mutations that are greater or equal to the clonal threshold in each region 891 | clonalProportion.df <- issue_mutations %>% 892 | dplyr::group_by(CLUSTER, SAMPLE) %>% 893 | dplyr::mutate(nMuts = dplyr::n(), 894 | propClonal = sum(phyloCCF.0.95 >= clonal_cutOff) / nMuts) %>% 895 | dplyr::select(CLUSTER, SAMPLE, propClonal) %>% 896 | unique() %>% 897 | dplyr::ungroup() 898 | 899 | ### extract the lowest proportion and filter for clusters where this is greater than the threshold 900 | clusters_to_change <- as.character(clonalProportion.df %>% 901 | dplyr::group_by(CLUSTER) %>% 902 | dplyr::mutate(minPropClonal = min(propClonal)) %>% 903 | dplyr::select(CLUSTER, minPropClonal) %>% 904 | unique() %>% 905 | dplyr::filter(minPropClonal > propClonal_threshold) %>% 906 | dplyr::pull(CLUSTER)) 907 | 908 | ### if 
two or more clusters are above this threshold merge all clusters into the lowest cluster ID 909 | if (length(clusters_to_change) == 0) { 910 | print("No clusters above specified thresholds") 911 | } else if (length(clusters_to_change) == 1) { 912 | print("Only single cluster above thresholds identified. Nothing to merge") 913 | } else { 914 | output_tsv <- output_tsv %>% 915 | dplyr::mutate(CLUSTER = ifelse(CLUSTER %in% clusters_to_change, min(clusters_to_change), CLUSTER)) 916 | } 917 | } 918 | 919 | write.table(output_tsv %>% dplyr::select(-phyloCCF.0.05, -phyloCCF.0.95), file = paste0(new.dir, patient, ".SCoutput.FULL.tsv"), row.names = FALSE, quote = FALSE, sep = "\t") 920 | output_tsv_clean <- output_tsv %>% dplyr::filter(CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95) 921 | output_tsv_dirty <- output_tsv %>% dplyr::filter(!CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95) 922 | write.table(output_tsv_clean, file = paste0(new.dir, patient, ".SCoutput.CLEAN.tsv"), row.names = FALSE, quote = FALSE, sep = "\t") 923 | write.table(output_tsv_dirty, file = paste0(new.dir, patient, ".SCoutput.DIRTY.tsv"), row.names = FALSE, quote = FALSE, sep = "\t") 924 | 925 | } 926 | 927 | if(length(regions.to.use) > 1) { 928 | ### plot heatmap 929 | ### plot the clusters 930 | 931 | 932 | pdf(paste(new.dir, "/", patient, "_pyclone_cluster_assignment_ccf_dirty",".pdf",sep=""),height=4,width=4) 933 | plot.pyclone.clusters(patient = patient 934 | ,regionList=phylo.region.list 935 | ,mut.table=mut.table 936 | ,regions.to.use = regions.to.use 937 | ,mostLikelyClusters = most.likely.cluster 938 | ,driverCat = driver_cat 939 | ,ccf='absolute' 940 | ) 941 | dev.off() 942 | 943 | pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_phylo_dirty",".pdf",sep=""),height=4,width=4) 944 | plot.pyclone.clusters(patient=patient 945 | ,regionList=phylo.region.list 946 | ,mut.table=mut.table 947 | ,regions.to.use = regions.to.use 948 | ,mostLikelyClusters = 
most.likely.cluster 949 | ,driverCat = driver_cat 950 | ,ccf='phylo' 951 | ) 952 | dev.off() 953 | 954 | pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_mutCpn_dirty",".pdf",sep=""),height=8,width=8) 955 | plot.pycloneMutCpn.clusters(patient=patient 956 | ,regionList=phylo.region.list 957 | ,mut.table=mut.table 958 | ,regions.to.use = regions.to.use 959 | ,mostLikelyClusters = most.likely.cluster 960 | ,driverCat = driver_cat 961 | ,ccf='phylo' 962 | ) 963 | dev.off() 964 | 965 | 966 | pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_mutCpn_clean",".pdf",sep=""),height=8,width=8) 967 | plot.pycloneMutCpn.clusters(patient=patient 968 | ,regionList=phylo.region.list 969 | ,mut.table=mut.table 970 | ,regions.to.use = regions.to.use 971 | ,mostLikelyClusters = clean.most.likely.clusters 972 | ,driverCat = driver_cat 973 | ,ccf='phylo' 974 | ) 975 | dev.off() 976 | 977 | pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_phylo_clean",".pdf",sep=""),height=4,width=4) 978 | plot.pyclone.clusters(patient=patient 979 | ,regionList=phylo.region.list 980 | ,mut.table=mut.table 981 | ,regions.to.use = regions.to.use 982 | ,mostLikelyClusters = clean.most.likely.clusters 983 | ,driverCat = driver_cat 984 | ,ccf='phylo' 985 | ) 986 | dev.off() 987 | 988 | pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_ccf_clean",".pdf",sep=""),height=4,width=4) 989 | plot.pyclone.clusters(patient=patient 990 | ,regionList=phylo.region.list 991 | ,mut.table=mut.table 992 | ,regions.to.use = regions.to.use 993 | ,mostLikelyClusters = clean.most.likely.clusters 994 | ,driverCat = driver_cat 995 | ,ccf='absolute' 996 | ) 997 | dev.off() 998 | } 999 | 1000 | # let's plot the copy number clusters 1001 | # Using seg file if exists for plotting 1002 | 1003 | if (!is.null(input_seg_tsv_loc)) { 1004 | print("Using specified seg file for plotting") 1005 | seg.mat.copy.plot <- read.delim2(input_seg_tsv_loc, stringsAsFactors = F) 1006 | seg.mat.copy.plot$SAMPLE <- 
gsub("-", "\\.", seg.mat.copy.plot$SAMPLE) 1007 | seg.mat.copy.plot$COPY_NUMBER_A <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_A) 1008 | seg.mat.copy.plot$COPY_NUMBER_B <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_B) 1009 | seg.mat.copy.plot$CHR <- as.numeric(seg.mat.copy.plot$CHR) 1010 | seg.mat.copy.plot$STARTPOS <- as.numeric(seg.mat.copy.plot$STARTPOS) 1011 | 1012 | #Sort by start position within chromosome 1013 | seg.mat.copy.plot <- seg.mat.copy.plot[order(seg.mat.copy.plot$CHR, 1014 | seg.mat.copy.plot$STARTPOS), ] 1015 | # If providing seg file, ensure the sample names match the sample names in input tsv 1016 | if (!any(unique(seg.mat.copy.plot$SAMPLE) %in% unique(seg.mat.copy[,1]))) { 1017 | stop('Sample IDs do not match between input_tsv and input_seg_tsv') 1018 | } 1019 | } else { 1020 | print("Using tsv data for plotting") 1021 | seg.mat.copy.plot <- seg.mat.copy 1022 | } 1023 | 1024 | # ensure COPY_NUMBER_A and COPY_NUMBER_B are numeric 1025 | seg.mat.copy.plot$COPY_NUMBER_A <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_A) 1026 | seg.mat.copy.plot$COPY_NUMBER_B <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_B) 1027 | 1028 | pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_copynumber_dirty",".pdf",sep=""), height = ifelse(length(regions.to.use) == 1, 5, length(regions.to.use))) 1029 | plot.region.mutCopyNum(phylo.region.list = phylo.region.list,seg.mat.copy = seg.mat.copy.plot,mostLikelyClusters = most.likely.cluster,plot.separate.clusters = TRUE) 1030 | dev.off() 1031 | 1032 | pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_copynumber_clean",".pdf",sep=""), height = ifelse(length(regions.to.use) == 1, 5, length(regions.to.use))) 1033 | plot.region.mutCopyNum(phylo.region.list = phylo.region.list,seg.mat.copy = seg.mat.copy.plot,mostLikelyClusters = clean.most.likely.clusters,plot.separate.clusters = TRUE) 1034 | dev.off() 1035 | } 1036 | 1037 | 1038 | --------------------------------------------------------------------------------