├── AUTHORS
├── data
    ├── input_table.rda
    └── results
    │   ├── CRUKTOY001.tree.RDS
    │   ├── pytree_and_bar.pdf
    │   └── pytree_multipletrees.pdf
├── man
    ├── unfold_tree.Rd
    ├── prune.tree.Rd
    ├── conipher_run.Rd
    ├── color.tree.Rd
    ├── extract_consensus_relationships.Rd
    ├── get_tree_level.Rd
    ├── is.there.ccf.issue.Rd
    ├── remove_clustered_clones.Rd
    ├── treebuilding_plot.Rd
    ├── clusterDistributionAcrossGenome.Rd
    ├── get_terminal_clusters.Rd
    ├── correct.clonality.nesting.Rd
    ├── createAllPathsList.Rd
    ├── process_mean_cluster_ccfs.Rd
    ├── extract_daughters.Rd
    ├── input_table.Rd
    ├── conipher_treebuilding.Rd
    ├── permute.clusters.to.remove.Rd
    ├── compute_tree_edge_probability.Rd
    ├── compute_sum_condition_error.Rd
    ├── treebuilding_preprocess.Rd
    ├── calc.pyclone.ci.Rd
    ├── conipher_clustering.Rd
    ├── test.distributions.Rd
    ├── grow.multi.trees.Rd
    ├── clustering_run.Rd
    ├── grow.trees.Rd
    ├── compute_subclone_proportions.Rd
    ├── clustering_preprocess.Rd
    ├── clonality.function.Rd
    ├── clustering_postprocess.Rd
    ├── determine.cluster.nesting.Rd
    ├── compute_subclonal_expansion_score.Rd
    └── treebuilding_run.Rd
├── R
    ├── example.R
    ├── main_conipher_run.R
    ├── sequenza_functions.R
    ├── functionsForSimpleClustering.v13.R
    ├── main_treebuilding_functions.R
    └── main_clustering_functions.R
├── inst
    └── extdata
    │   └── template.config.yaml
├── DESCRIPTION
├── NAMESPACE
├── LICENSE
└── README.md


/AUTHORS:
--------------------------------------------------------------------------------
1 | Nicholas McGranahan,
2 | Ariana Huebner,
3 | Kristiana Grigoriadis


--------------------------------------------------------------------------------
/data/input_table.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/input_table.rda


--------------------------------------------------------------------------------
/data/results/CRUKTOY001.tree.RDS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/results/CRUKTOY001.tree.RDS


--------------------------------------------------------------------------------
/data/results/pytree_and_bar.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/results/pytree_and_bar.pdf


--------------------------------------------------------------------------------
/data/results/pytree_multipletrees.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGranahanLab/CONIPHER/HEAD/data/results/pytree_multipletrees.pdf


--------------------------------------------------------------------------------
/man/unfold_tree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{unfold_tree}
 4 | \alias{unfold_tree}
 5 | \title{Function to unfold tree}
 6 | \usage{
 7 | unfold_tree(edgelist, lower, trunk)
 8 | }
 9 | \description{
10 | Function to unfold tree
11 | }
12 | 


--------------------------------------------------------------------------------
/man/prune.tree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{prune.tree}
 4 | \alias{prune.tree}
 5 | \title{Function to prune the ancestral graph}
 6 | \usage{
 7 | prune.tree(edgelist, nestedclust)
 8 | }
 9 | \description{
10 | Function to prune the ancestral graph
11 | }
12 | 


--------------------------------------------------------------------------------
/man/conipher_run.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_conipher_run.R
 3 | \name{conipher_run}
 4 | \alias{conipher_run}
 5 | \title{Full CONIPHER run}
 6 | \usage{
 7 | conipher_run(case_id, prefix, out_dir, input_tsv_loc, ...)
 8 | }
 9 | \arguments{
10 | \item{opt}{a list of options}
11 | }
12 | \description{
13 | This function takes all the input options and runs the three main steps:
14 | preprocess, tree building run and postprocess
15 | }
16 | 


--------------------------------------------------------------------------------
/man/color.tree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{color.tree}
 4 | \alias{color.tree}
 5 | \title{Plotting function to colour the nodes on the phylogenetic tree}
 6 | \usage{
 7 | color.tree(edgelength, opacity = 255)
 8 | }
 9 | \arguments{
10 | \item{edgelength}{A named vector containing number of mutations of each cluster}
11 | }
12 | \description{
13 | Plotting function to colour the nodes on the phylogenetic tree
14 | }
15 | 


--------------------------------------------------------------------------------
/man/extract_consensus_relationships.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{extract_consensus_relationships}
 4 | \alias{extract_consensus_relationships}
 5 | \title{Function to take a list of trees and identify the consensus relationships}
 6 | \usage{
 7 | extract_consensus_relationships(tree_list, output_as_table = FALSE)
 8 | }
 9 | \description{
10 | Function to take a list of trees and identify the consensus relationships
11 | }
12 | 


--------------------------------------------------------------------------------
/man/get_tree_level.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{get_tree_level}
 4 | \alias{get_tree_level}
 5 | \title{Function to compute the tree level of a cluster}
 6 | \usage{
 7 | get_tree_level(tree_graph, cluster)
 8 | }
 9 | \arguments{
10 | \item{tree_graph}{A matrix of a tree structure}
11 | 
12 | \item{cluster}{Name of a cluster for which you want to get the tree level}
13 | }
14 | \description{
15 | Function to compute the tree level of a cluster
16 | }
17 | 


--------------------------------------------------------------------------------
/man/is.there.ccf.issue.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{is.there.ccf.issue}
 4 | \alias{is.there.ccf.issue}
 5 | \title{Function to check whether there is a CCF level issue}
 6 | \usage{
 7 | is.there.ccf.issue(
 8 |   nestedclust,
 9 |   directed_input_graph,
10 |   ccf_ci_lower,
11 |   trunk_cluster,
12 |   clusters_to_remove,
13 |   clusters_to_use,
14 |   max_per_level = 115
15 | )
16 | }
17 | \description{
18 | Function to check whether there is a CCF level issue
19 | }
20 | 


--------------------------------------------------------------------------------
/man/remove_clustered_clones.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{remove_clustered_clones}
 4 | \alias{remove_clustered_clones}
 5 | \title{Function to remove mutation clusters with genomically clustered mutations}
 6 | \usage{
 7 | remove_clustered_clones(
 8 |   test.pyclone,
 9 |   clonal_cluster,
10 |   p_value_cut = 0.01,
11 |   clustering_estimate_cut = 2
12 | )
13 | }
14 | \description{
15 | Function to remove mutation clusters with genomically clustered mutations
16 | }
17 | 


--------------------------------------------------------------------------------
/man/treebuilding_plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_treebuilding_functions.R
 3 | \name{treebuilding_plot}
 4 | \alias{treebuilding_plot}
 5 | \title{TRACERx tree plotting function}
 6 | \usage{
 7 | treebuilding_plot(sample_pyclone_tree)
 8 | }
 9 | \arguments{
10 | \item{sample_pyclone_tree}{A list containing all information about the
11 | tree inferred using function tracerx.tree.building()}
12 | }
13 | \description{
14 | This function is the CONIPHER function to plot the inferred phylogenetic tree.
15 | }
16 | 


--------------------------------------------------------------------------------
/man/clusterDistributionAcrossGenome.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{clusterDistributionAcrossGenome}
 4 | \alias{clusterDistributionAcrossGenome}
 5 | \title{Function to determine the distribution of location of mutations within a cluster}
 6 | \usage{
 7 | clusterDistributionAcrossGenome(
 8 |   cluster,
 9 |   clonal_cluster,
10 |   test.pyclone,
11 |   iterations = 10000
12 | )
13 | }
14 | \description{
15 | Function to determine the distribution of location of mutations within a cluster
16 | }
17 | 


--------------------------------------------------------------------------------
/man/get_terminal_clusters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{get_terminal_clusters}
 4 | \alias{get_terminal_clusters}
 5 | \title{Function to extract the terminal nodes of a phylogenetic tree}
 6 | \usage{
 7 | get_terminal_clusters(tree_structure)
 8 | }
 9 | \arguments{
10 | \item{tree_structure}{A matrix of a tree structure (edge matrix)}
11 | }
12 | \value{
13 | A vector of the terminal nodes in the tree
14 | }
15 | \description{
16 | Function to extract the terminal nodes of a phylogenetic tree
17 | }
18 | 


--------------------------------------------------------------------------------
/man/correct.clonality.nesting.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{correct.clonality.nesting}
 4 | \alias{correct.clonality.nesting}
 5 | \title{Function to correct nesting based on cluster clonality}
 6 | \usage{
 7 | correct.clonality.nesting(
 8 |   nestedlist,
 9 |   pyclone,
10 |   clonality_table,
11 |   pval_cutoff = 0.01,
12 |   min_cluster_size = 5,
13 |   use_boot = TRUE,
14 |   min_ccf = 0.05,
15 |   prefix = prefix
16 | )
17 | }
18 | \description{
19 | Function to correct nesting based on cluster clonality
20 | }
21 | 


--------------------------------------------------------------------------------
/man/createAllPathsList.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{createAllPathsList}
 4 | \alias{createAllPathsList}
 5 | \title{Function to create a list of all tree paths}
 6 | \usage{
 7 | createAllPathsList(tree.structure, trunk)
 8 | }
 9 | \arguments{
10 | \item{tree.structure}{An R matrix describing the tree structure with two columns
11 | specifying 'parent' (column 1) and child (column 2)}
12 | }
13 | \value{
14 | An R list of all tree paths from trunk to leaves
15 | }
16 | \description{
17 | Function to create a list of all tree paths
18 | }
19 | 


--------------------------------------------------------------------------------
/R/example.R:
--------------------------------------------------------------------------------
 1 | #' Example input table
 2 | #'
 3 | #' Example input table to input to CONIPHER tree building. The input table should
 4 | #' have the following columns: CASE_ID, SAMPLE, CHR, POS, REF,
 5 | #' ALT, REF_COUNT, VAR_COUNT, DEPTH, CLUSTER, CCF_PHYLO, CCF_OBS,
 6 | #' MUT_COPY, COPY_NUMBER_A, COPY_NUMBER_B, ACF, and PLOIDY. Full description of
 7 | #' the input format can be found in our companion manuscript.
 8 | #'
 9 | #' @docType data
10 | #'
11 | #' @usage data(input_table)
12 | #'
13 | #' @format An object of class \code{"data.frame"}
14 | #'
15 | #' @keywords datasets
16 | #'
17 | #' @examples
18 | #' data(input_table)
19 | #' head(input_table)
20 | "input_table"
21 | 


--------------------------------------------------------------------------------
/inst/extdata/template.config.yaml:
--------------------------------------------------------------------------------
 1 | num_iters: 10000
 2 | 
 3 | base_measure_params:
 4 |   alpha: 1
 5 |   beta: 1
 6 | 
 7 | concentration:
 8 |   value: 1.0
 9 |   
10 |   prior:
11 |     shape: 1.0
12 |     rate: 0.001
13 | 
14 | density: pyclone_beta_binomial
15 | 
16 | beta_binomial_precision_params:
17 |   value: 1000
18 | 
19 |   prior:
20 |     shape: 1.0
21 |     rate: 0.0001
22 | 
23 |   proposal:
24 |     precision: 0.01
25 | 
26 | working_dir: working.directory.location
27 | 
28 | trace_dir: trace
29 | 
30 | samples:
31 |   TCGA.barcode:
32 |     mutations_file: mutations.yaml
33 | 
34 |     tumour_content:
35 |       value: 1.0
36 | 
37 |     error_rate: 0.001
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/man/process_mean_cluster_ccfs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{process_mean_cluster_ccfs}
 4 | \alias{process_mean_cluster_ccfs}
 5 | \title{Function to process mean cluster CCFs}
 6 | \usage{
 7 | process_mean_cluster_ccfs(ccf_table_pyclone_clean)
 8 | }
 9 | \arguments{
10 | \item{ccf_table_pyclone_clean}{A mutation table with PhyloCCF values. This
11 | table is an item in the R list object list sample_pyclone_tree, which is the
12 | output of the CONIPHER treebuilding_run() function.}
13 | }
14 | \description{
15 | Function to process mean cluster CCFs
16 | }
17 | 


--------------------------------------------------------------------------------
/man/extract_daughters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{extract_daughters}
 4 | \alias{extract_daughters}
 5 | \title{Function to extract all daughter clones from a parent using a phylogenetic tree}
 6 | \usage{
 7 | extract_daughters(tree, parent.clones)
 8 | }
 9 | \arguments{
10 | \item{tree}{A phylogenetic tree matrix with two columns specifying
11 | 'parent' (column 1) and child (column 2)}
12 | 
13 | \item{parent.clones}{The name of the parent clone(s) for which you wish to find all daughters}
14 | }
15 | \description{
16 | Function to extract all daughter clones from a parent using a phylogenetic tree
17 | }
18 | 


--------------------------------------------------------------------------------
/man/input_table.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/example.R
 3 | \docType{data}
 4 | \name{input_table}
 5 | \alias{input_table}
 6 | \title{Example input table}
 7 | \format{
 8 | An object of class \code{"data.frame"}
 9 | }
10 | \usage{
11 | data(input_table)
12 | }
13 | \description{
14 | Example input table to input to CONIPHER tree building. The input table should
15 | have the following columns: CASE_ID, SAMPLE, CHR, POS, REF,
16 | ALT, REF_COUNT, VAR_COUNT, DEPTH, CLUSTER, CCF_PHYLO, CCF_OBS,
17 | MUT_COPY, COPY_NUMBER_A, COPY_NUMBER_B, ACF, and PLOIDY. Full description of
18 | the input format can be found in our companion manuscript.
19 | }
20 | \examples{
21 | data(input_table)
22 | head(input_table)
23 | }
24 | \keyword{datasets}
25 | 


--------------------------------------------------------------------------------
/man/conipher_treebuilding.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_treebuilding_functions.R
 3 | \name{conipher_treebuilding}
 4 | \alias{conipher_treebuilding}
 5 | \title{Full tree building run function}
 6 | \usage{
 7 | conipher_treebuilding(
 8 |   input_tsv_loc,
 9 |   out_dir,
10 |   prefix,
11 |   ccf_buffer = 10,
12 |   pval_cutoff = 0.01,
13 |   use_boot = TRUE,
14 |   merge_clusters = TRUE,
15 |   correct_cpn_clusters = TRUE,
16 |   adjust_noisy_clusters = FALSE,
17 |   adjust_noisy_clusters_prop = 0.05,
18 |   min_ccf = 0.01,
19 |   min_cluster_size = 5,
20 |   multi_trees = TRUE,
21 |   ...
22 | )
23 | }
24 | \arguments{
25 | \item{opt}{a list of options}
26 | }
27 | \description{
28 | This function takes all the input options and runs the three main steps:
29 | preprocess, tree building run and postprocess
30 | }
31 | 


--------------------------------------------------------------------------------
/man/permute.clusters.to.remove.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{permute.clusters.to.remove}
 4 | \alias{permute.clusters.to.remove}
 5 | \title{Function to test which clusters are best to remove from tree
 6 | This function permutes through clusters and checks which are the best to remove
 7 | from the tree - takes into account number of mutations}
 8 | \usage{
 9 | permute.clusters.to.remove(
10 |   test_out,
11 |   nestedclust,
12 |   max_per_level,
13 |   tlevels,
14 |   trunk_cluster,
15 |   cluster_qc,
16 |   ccf_ci_lower,
17 |   nclusters,
18 |   speed_cluster = 15
19 | )
20 | }
21 | \description{
22 | Function to test which clusters are best to remove from tree
23 | This function permutes through clusters and checks which are the best to remove
24 | from the tree - takes into account number of mutations
25 | }
26 | 


--------------------------------------------------------------------------------
/man/compute_tree_edge_probability.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{compute_tree_edge_probability}
 4 | \alias{compute_tree_edge_probability}
 5 | \title{Function to compute the edge probability score for a list of trees}
 6 | \usage{
 7 | compute_tree_edge_probability(tree_list, edgelength, trunk)
 8 | }
 9 | \arguments{
10 | \item{tree_list}{A list of tree matrices}
11 | 
12 | \item{edgelength}{A named vector containing number of mutations of each cluster}
13 | 
14 | \item{trunk}{The name of the truncal cluster}
15 | }
16 | \value{
17 | The edge probability score for each
18 | tree structure in the input tree list
19 | }
20 | \description{
21 | This function takes a list of phylogenetic tree structures and the number of mutations in
22 | each cluster and computes the edge probability score for each tree.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/compute_sum_condition_error.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{compute_sum_condition_error}
 4 | \alias{compute_sum_condition_error}
 5 | \title{Function to compute the Sum Condition Error for a list of trees}
 6 | \usage{
 7 | compute_sum_condition_error(tree_list, ccf_cluster_table, trunk)
 8 | }
 9 | \arguments{
10 | \item{tree_list}{A list of tree matrices}
11 | 
12 | \item{ccf_cluster_table}{A matrix of mean PhyloCCF of each cluster in
13 | each tumour region}
14 | 
15 | \item{trunk}{The name of the truncal cluster}
16 | }
17 | \value{
18 | sce_vec, A named vector of the sum condition error (SCE) for each
19 | tree structure in the input tree list
20 | }
21 | \description{
22 | This function takes a list of phylogenetic tree structures and the PhyloCCF cluster table and
23 | computes the sum condition error for each tree.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/treebuilding_preprocess.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_treebuilding_functions.R
 3 | \name{treebuilding_preprocess}
 4 | \alias{treebuilding_preprocess}
 5 | \title{Input data preprocessing function}
 6 | \usage{
 7 | treebuilding_preprocess(input_table, prefix, out_dir)
 8 | }
 9 | \arguments{
10 | \item{input_table}{A dataframe of the input mutation table in the correct
11 | format. For more information on the input table format, please see our
12 | tree building protocol.}
13 | 
14 | \item{prefix}{A tumour case and sample prefix, e.g. 'CRUK'.}
15 | 
16 | \item{out_dir}{A file path to the desired output directory}
17 | }
18 | \description{
19 | This function takes the input tsv and formats the data to be compatible with
20 | the main CONIPHER tree building function. NOTE: it is assumed that
21 | clustering has been carried out prior to running tree building.
22 | }
23 | 


--------------------------------------------------------------------------------
/R/main_conipher_run.R:
--------------------------------------------------------------------------------
 1 | #' Full CONIPHER run
 2 | #'
 3 | #' Runs mutation clustering via conipher_clustering() and then tree building
 4 | #' via conipher_treebuilding(), both underneath \code{out_dir}.
 5 | #' @param case_id case identifier given to conipher_clustering(); also the stem
 6 | #'   of the \code{<case_id>.SCoutput.CLEAN.tsv} file handed to tree building
 7 | #' @param prefix passed through to conipher_treebuilding()
 8 | #' @param out_dir base output directory; "/Clustering/" and "/Trees/" are appended
 9 | #' @param input_tsv_loc passed through to conipher_clustering()
10 | #' @param ... forwarded unchanged to both conipher_clustering() and conipher_treebuilding()
11 | #' @returns the value returned by conipher_treebuilding()
12 | #' @export conipher_run
13 | conipher_run <- function(case_id, prefix, out_dir, input_tsv_loc, ...) {
14 |     clustering_dir <- paste0(out_dir, "/Clustering/")
15 |     conipher_clustering(case_id = case_id, out_dir = clustering_dir,
16 |                         input_tsv_loc = input_tsv_loc, ...)
17 |     # Tree building starts from the cleaned table expected in the clustering directory.
18 |     clean_tsv <- paste0(clustering_dir, case_id, ".SCoutput.CLEAN.tsv")
19 |     conipher_treebuilding(input_tsv_loc = clean_tsv, out_dir = paste0(out_dir, "/Trees/"),
20 |                           prefix = prefix, ...)
21 | }
22 | 


--------------------------------------------------------------------------------
/man/calc.pyclone.ci.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{calc.pyclone.ci}
 4 | \alias{calc.pyclone.ci}
 5 | \title{Function to compute confidence intervals of each cluster in each region}
 6 | \usage{
 7 | calc.pyclone.ci(pyclone, pyclust, nclusters, prefix = "LTX", lower_min = 0)
 8 | }
 9 | \arguments{
10 | \item{pyclone}{An R list object containing information about the PhyloCCF
11 | of each mutation in each tumour region}
12 | }
13 | \value{
14 | An R list containing elements: 'ccf_cluster_table', 'mean_phylo_ccf',
15 | 'median_pyclone_ccf', 'median_phylo_ccf', 'ccf_ci_upper', 'ccf_ci_lower',
16 | 'ccf_ci_boot_upper', 'ccf_ci_boot_lower'
17 | }
18 | \description{
19 | This function takes in an R list containing mutation PhyloCCF in each region,
20 | and mutation assignments to a cluster and computes bootstrapped
21 | confidence intervals.
22 | }
23 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: CONIPHER
 2 | Type: Package
 3 | Title: R package for tumour phylogenetic tree reconstruction
 4 | Version: 2.1.0
 5 | Author: Nicholas McGranahan
 6 | Maintainers: Kristiana Grigoriadis <kristiana.grigoriadis.19@ucl.ac.uk>; Ariana Huebner <ariana.huebner.16@ucl.ac.uk>
 7 | Description: CONIPHER is an R package for clustering mutation data and reconstruction of tumour phylogenetic 
 8 |     trees from DNA sequencing. 
 9 |     A full description of CONIPHER can be found in our pre-print XXX.
10 | Depends:
11 |     R (>= 3.6.1)
12 | Imports:
13 |     stats,
14 |     utils,
15 |     graphics,
16 |     grDevices,
17 |     plyr,
18 |     dplyr,
19 |     tidyr,
20 |     parallel,
21 |     boot,
22 |     coin,
23 |     RColorBrewer,
24 |     wordcloud,
25 |     data.table,
26 |     beeswarm,
27 |     mapplots,
28 |     igraph,
29 |     gplots
30 | Suggests:
31 |     devtools
32 | License: BSD_3_clause + file LICENSE
33 | Encoding: UTF-8
34 | LazyData: true
35 | RoxygenNote: 7.2.3.9000
36 | 


--------------------------------------------------------------------------------
/man/conipher_clustering.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_clustering_functions.R
 3 | \name{conipher_clustering}
 4 | \alias{conipher_clustering}
 5 | \title{Full clustering run function}
 6 | \usage{
 7 | conipher_clustering(
 8 |   case_id,
 9 |   out_dir,
10 |   input_tsv_loc,
11 |   input_seg_tsv_loc = NULL,
12 |   subclonal_copy_correction = TRUE,
13 |   only_truncal_subclonal_copy_correction = TRUE,
14 |   pyclone_yaml_loc = NULL,
15 |   min_cluster_size = 5,
16 |   multiple_test_correction = TRUE,
17 |   clean_clusters = TRUE,
18 |   clonal_cutOff = 0.9,
19 |   propClonal_threshold = 0.25,
20 |   fix_absentCCFs = TRUE,
21 |   driver_filter = "1A,1,2A",
22 |   burn_in = 1000,
23 |   seed = 1024,
24 |   nProcs = 1,
25 |   ...
26 | )
27 | }
28 | \arguments{
29 | \item{opt}{a list of options}
30 | }
31 | \description{
32 | This function takes all the input options and runs the three main steps:
33 | preprocess, clustering run and postprocess
34 | }
35 | 


--------------------------------------------------------------------------------
/R/sequenza_functions.R:
--------------------------------------------------------------------------------
 1 | ### functions from sequenza version 2.1.2
 2 | ### copied over by Ariana Huebner
 3 | 
 4 | types.matrix <- function (CNt.min, CNt.max, CNn = 2) {
 5 |     # Enumerate every (CNt, Mt) pair: for each tumour copy number, Mt runs from 0 up to CNt.
 6 |     tumour.cn <- seq(from = CNt.min / CNn, to = CNt.max / CNn, by = 1 / CNn) * CNn
 7 |     mut.per.cn <- lapply(tumour.cn, FUN = function(cn) seq(from = 0, to = cn))
 8 |     n.per.cn <- sapply(mut.per.cn, length)
 9 |     data.frame(CNn = CNn, CNt = rep(tumour.cn, times = n.per.cn), Mt = unlist(mut.per.cn))
10 | }
11 | 
12 | theoretical.mufreq <- function (Mt, CNt, CNn = 2, cellularity) {
13 |     # 1 - (non-mutated / all alleles); tumour copies weigh cellularity, normal 1 - cellularity.
14 |     from.normal <- CNn * (1 - cellularity)
15 |     1 - (((CNt - Mt) * cellularity + from.normal) / ((CNt * cellularity) + from.normal))
16 | }
17 | 
18 | mufreq.dpois <- function (mufreq, mufreq.model, depth.t, seq.errors = 0.01, ...) {
19 |     # Model frequencies of exactly 0 are replaced by seq.errors before forming the Poisson rate.
20 |     expected.freq <- replace(mufreq.model, mufreq.model == 0, seq.errors)
21 |     dpois(x = round(mufreq * depth.t, 0), lambda = expected.freq * depth.t, ...)
22 | }


--------------------------------------------------------------------------------
/man/test.distributions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{test.distributions}
 4 | \alias{test.distributions}
 5 | \title{Test difference in PhyloCCF distributions of each pair of mutation clusters}
 6 | \usage{
 7 | test.distributions(pyclone, nclusters, pval_cutoff = 0.05)
 8 | }
 9 | \arguments{
10 | \item{pyclone}{An R list object containing information about the PhyloCCF
11 | of each mutation in each tumour region.}
12 | 
13 | \item{nclusters}{Number of clusters}
14 | 
15 | \item{pval_cutoff}{A p-value significance threshold for testing whether
16 | clusters can be nested. (i.e. a p-value < pval_cutoff is significant)}
17 | }
18 | \value{
19 | This function returns list of nesting matrices. Each element of the
20 | list is a nesting matrix for one tumour region, that describes whether a
21 | cluster A (row) can be nested within a cluster B (column).
22 | }
23 | \description{
24 | This function compares the distributions of the PhyloCCF of each pair of
25 | mutation clusters in the dataset and outputs a nesting matrix.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/grow.multi.trees.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{grow.multi.trees}
 4 | \alias{grow.multi.trees}
 5 | \title{Function to determine all possible alternative phylogenies}
 6 | \usage{
 7 | grow.multi.trees(
 8 |   nestedlist,
 9 |   graph_pyclone,
10 |   pyclone,
11 |   ccf_buffer = 10,
12 |   n_clusters_to_move = 5
13 | )
14 | }
15 | \arguments{
16 | \item{nestedlist}{An R list containing information about the nesting
17 | structure of mutation clusters in each region.}
18 | 
19 | \item{graph_pyclone}{An R list containing information about the tree structure}
20 | 
21 | \item{pyclone}{An R list containing information about mutation PhyloCCF}
22 | 
23 | \item{ccf_buffer}{PhyloCCF buffer permitted when checking tree level issue}
24 | 
25 | \item{n_clusters_to_move}{Maximum number of clusters to move simultaneously}
26 | }
27 | \value{
28 | An R list containing all possible alternative tree structures and
29 | information about which branches are consensus across multiple trees
30 | }
31 | \description{
32 | Function to determine all possible alternative phylogenies
33 | }
34 | 


--------------------------------------------------------------------------------
/man/clustering_run.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_clustering_functions.R
 3 | \name{clustering_run}
 4 | \alias{clustering_run}
 5 | \title{Main clustering function}
 6 | \usage{
 7 | clustering_run(
 8 |   input_list,
 9 |   nProcs,
10 |   new.dir,
11 |   burn_in,
12 |   pyclone_seed,
13 |   template.config.yaml
14 | )
15 | }
16 | \arguments{
17 | \item{input_list}{A list created by the clustering preprocess function
18 | including patient id, regions to use, phylo region list and others.}
19 | 
20 | \item{nProcs}{A value referring to how many parallel processes 
21 | of pyclone should be run.}
22 | 
23 | \item{new.dir}{A character specifying the directory where the pyclone
24 | output should be saved.}
25 | 
26 | \item{burn_in}{Burn-in for DP clustering.}
27 | 
28 | \item{pyclone_seed}{Seed for PyClone run.}
29 | 
30 | \item{template.config.yaml}{Location of the template yaml file used to run PyClone.}
31 | }
32 | \value{
33 | sample.results which is the location of the pyclone output table.
34 | }
35 | \description{
36 | This function takes the input list created in the preprocessing along with
37 | the number of cores and output directory to run the main clustering.
38 | }
39 | 


--------------------------------------------------------------------------------
/man/grow.trees.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{grow.trees}
 4 | \alias{grow.trees}
 5 | \title{Function to determine default tree structure}
 6 | \usage{
 7 | grow.trees(
 8 |   nestedlist,
 9 |   pyclone,
10 |   min_cluster_size = 5,
11 |   ccf_buffer = 10,
12 |   force_trunk = TRUE,
13 |   skip_size = 20
14 | )
15 | }
16 | \arguments{
17 | \item{nestedlist}{An R list containing information about the nesting
18 | structure of mutation clusters in each region.}
19 | 
20 | \item{pyclone}{An R list object containing information about the PhyloCCF
21 | of each mutation in each tumour region.}
22 | 
23 | \item{min_cluster_size}{Threshold for minimum number of mutations required in
24 | a mutation cluster}
25 | 
26 | \item{ccf_buffer}{PhyloCCF buffer permitted when checking tree level issue}
27 | 
28 | \item{prefix}{A character string indicating the sample and tumour case prefix}
29 | }
30 | \description{
31 | This function takes as input an R list containing information about
32 | PhyloCCF of each mutation (pyclone) and an R list containing a cluster ccf table
33 | and nesting matrix (nestedlist), and returns an R list containing the default tree
34 | structure.
35 | }
36 | 


--------------------------------------------------------------------------------
/man/compute_subclone_proportions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{compute_subclone_proportions}
 4 | \alias{compute_subclone_proportions}
 5 | \title{Function to compute clone proportions on a selected alternative tree structure}
 6 | \usage{
 7 | compute_subclone_proportions(
 8 |   tree_list,
 9 |   ccf_cluster_table,
10 |   clonality_table,
11 |   trunk,
12 |   force_clonal_100 = TRUE,
13 |   tree_id = 1
14 | )
15 | }
16 | \arguments{
17 | \item{tree_list}{A list of tree matrices}
18 | 
19 | \item{ccf_cluster_table}{A matrix of mean PhyloCCF of each cluster in
20 | each tumour region}
21 | 
22 | \item{clonality_table}{A matrix of clonality calls for each cluster in
23 | each tumour region}
24 | 
25 | \item{trunk}{The name of the truncal cluster}
26 | 
27 | \item{force_clonal_100}{A logical indicating whether to force clusters that are
28 | 'clonal' in a region to have CCF==100}
29 | 
30 | \item{tree_id}{The tree index of the selected alternative tree for which you
31 | want to compute the clone proportions}
32 | }
33 | \value{
34 | clone_proportion_table, a matrix containing the clone proportions of
35 | each clone (rows) in each tumour sample (columns)
36 | }
37 | \description{
38 | Function to compute clone proportions on a selected alternative tree structure
39 | }
40 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(clustering_postprocess)
 4 | export(clustering_preprocess)
 5 | export(clustering_run)
 6 | export(compute_subclonal_expansion_score)
 7 | export(compute_subclone_proportions)
 8 | export(conipher_clustering)
 9 | export(conipher_run)
10 | export(conipher_treebuilding)
11 | export(get_terminal_clusters)
12 | export(treebuilding_plot)
13 | export(treebuilding_preprocess)
14 | export(treebuilding_run)
15 | import(plyr)
16 | importFrom(RColorBrewer,"brewer.pal")
17 | importFrom(boot,"boot")
18 | importFrom(boot,"boot.ci")
19 | importFrom(data.table,":=")
20 | importFrom(dplyr,"%>%")
21 | importFrom(grDevices,"colorRampPalette")
22 | importFrom(grDevices,"dev.off")
23 | importFrom(grDevices,"pdf")
24 | importFrom(graphics,"abline")
25 | importFrom(graphics,"axis")
26 | importFrom(graphics,"barplot")
27 | importFrom(graphics,"layout")
28 | importFrom(graphics,"legend")
29 | importFrom(graphics,"par")
30 | importFrom(graphics,"plot.new")
31 | importFrom(graphics,"segments")
32 | importFrom(graphics,"text")
33 | importFrom(graphics,"title")
34 | importFrom(igraph,"get.edgelist")
35 | importFrom(igraph,"shortest.paths")
36 | importFrom(parallel,mclapply)
37 | importFrom(stats,"median")
38 | importFrom(stats,"qnorm")
39 | importFrom(stats,"sd")
40 | importFrom(stats,"setNames")
41 | importFrom(stats,"wilcox.test")
42 | importFrom(utils,"combn")
43 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2023, the respective contributors, as shown by the AUTHORS file.
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/man/clustering_preprocess.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_clustering_functions.R
 3 | \name{clustering_preprocess}
 4 | \alias{clustering_preprocess}
 5 | \title{Input data preprocessing function}
 6 | \usage{
 7 | clustering_preprocess(
 8 |   input_table,
 9 |   new.dir,
10 |   subclonal_copy_correction = TRUE,
11 |   multiple_test_correction = TRUE,
12 |   only_truncal_subclonal_copy_correction = TRUE,
13 |   fix_absentCCFs = TRUE
14 | )
15 | }
16 | \arguments{
17 | \item{input_table}{A data frame of the input mutation table in the correct
18 | format. For more information on the input table format, please see our
19 | tree building protocol.}
20 | 
21 | \item{new.dir}{A character specifying the directory where the pyclone
22 | output should be saved.}
23 | 
24 | \item{subclonal_copy_correction}{A logical value that specifies whether subclonal
25 | copy number correction should be performed. 
26 | Default is set to TRUE}
27 | 
28 | \item{multiple_test_correction}{A logical value that specifies whether multiple 
29 | testing correction should be applied for the copy number correcting mutations.
30 | Default is set to TRUE}
31 | 
32 | \item{only_truncal_subclonal_copy_correction}{A logical value that specifies 
33 | whether only truncal subclonal copy number correction should be used.
34 | Default is set to TRUE}
35 | 
36 | \item{fix_absentCCFs}{A logical value that specifies whether CCF 
37 | of absent mutations should be set to zero.
38 | Default is set to TRUE}
39 | }
40 | \value{
41 | list including patient, regions.to.use, mut.table, seg.mat.copy,
42 | seg.mat.phylo, phylo.region.list, simpleClusterList
43 | }
44 | \description{
45 | This function takes the input tsv and formats the data to be compatible with
46 | the main CONIPHER clustering function.
47 | }
48 | 


--------------------------------------------------------------------------------
/man/clonality.function.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{clonality.function}
 4 | \alias{clonality.function}
 5 | \title{Function to determine cluster clonality in each tumour region}
 6 | \usage{
 7 | clonality.function(
 8 |   pyclone,
 9 |   trunk,
10 |   ccf_buffer = 10,
11 |   prefix = "LTX",
12 |   min_cluster_size = 5,
13 |   pval_cutoff = 0.01,
14 |   min_ccf = 0.05,
15 |   use_boot = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{pyclone}{An R list object containing information about the PhyloCCF
20 | of each mutation in each tumour region.}
21 | 
22 | \item{trunk}{truncal cluster name}
23 | 
24 | \item{ccf_buffer}{PhyloCCF buffer permitted when checking tree level issue}
25 | 
26 | \item{min_cluster_size}{Threshold for minimum number of mutations required in
27 | a mutation cluster}
28 | 
29 | \item{pval_cutoff}{A p-value significance threshold for testing whether
30 | clusters can be nested. (i.e. a p-value < pval_cutoff is significant)}
31 | 
32 | \item{min_ccf}{Minimum threshold for cluster PhyloCCF allowed to be classified
33 | as present}
34 | 
35 | \item{use_boot}{Whether to use bootstrapping to determine confidence intervals
36 | for each mutation cluster}
37 | }
38 | \value{
39 | a matrix of dimensions (n_clusters x n_regions) classifying each
40 | cluster as 'clonal', 'subclonal', or 'absent' in each tumour region.
41 | }
42 | \description{
43 | This function takes as input an R list containing information about PhyloCCF
44 | of each mutation (pyclone) and computes confidence intervals for each mutation
45 | cluster, of the PhyloCCF distributions of the mutations in that cluster. If
46 | use_boot==TRUE, then confidence intervals are computed using bootstrapping.
47 | The function then performs a statistical test (Wilcoxon) for every pair of
48 | clusters to determine whether one cluster can be nested within another.
49 | }
50 | 


--------------------------------------------------------------------------------
/man/clustering_postprocess.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_clustering_functions.R
 3 | \name{clustering_postprocess}
 4 | \alias{clustering_postprocess}
 5 | \title{Postprocessing of clustering function}
 6 | \usage{
 7 | clustering_postprocess(
 8 |   input_list,
 9 |   sample.results,
10 |   new.dir,
11 |   input_tsv,
12 |   input_seg_tsv_loc = NULL,
13 |   min_cluster_size = 5,
14 |   driver_cat = "1",
15 |   clean_clusters = TRUE,
16 |   min_ccf_present = 0.1,
17 |   clonal_cutOff = 0.9,
18 |   propClonal_threshold = 0.25
19 | )
20 | }
21 | \arguments{
22 | \item{input_list}{A list created by the clustering preprocess function
23 | including patient id, regions to use, phylo region list and others.}
24 | 
25 | \item{sample.results}{which is the location of the pyclone output table.}
26 | 
27 | \item{new.dir}{A character specifying the directory where the pyclone
28 | output should be saved.}
29 | 
30 | \item{input_tsv}{the input mutation tsv.}
31 | 
32 | \item{input_seg_tsv_loc}{path to a copy number segment tsv file that is used for
33 | across genome copy number plotting.
34 | Default NULL}
35 | 
36 | \item{min_cluster_size}{Minimum number of mutations needed for a cluster to be considered.
37 | Default 5}
38 | 
39 | \item{driver_cat}{Which categories to use as driver mutations
40 | Default "1"}
41 | 
42 | \item{clean_clusters}{should clusters be cleaned and merged?
43 | Default TRUE}
44 | 
45 | \item{min_ccf_present}{minimum CCF to consider a mutation as present. 
46 | Default 0.1}
47 | 
48 | \item{clonal_cutOff}{lower threshold CCF to consider mutations as clonal.
49 | Default 0.9}
50 | 
51 | \item{propClonal_threshold}{Proportion of mutations in cluster which needs to be
52 | considered clonal to merge.
53 | Default 0.25}
54 | }
55 | \description{
56 | This function takes the preprocessed input list and the location of the
57 | PyClone output table and postprocesses the clustering results.
58 | }
59 | 


--------------------------------------------------------------------------------
/man/determine.cluster.nesting.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{determine.cluster.nesting}
 4 | \alias{determine.cluster.nesting}
 5 | \title{Function to determine cluster nesting structure.}
 6 | \usage{
 7 | determine.cluster.nesting(
 8 |   pyclone,
 9 |   prefix = "LTX",
10 |   min_cluster_size = 5,
11 |   pval_cutoff = 0.01,
12 |   min_ccf = 0.01,
13 |   use_boot = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{pyclone}{An R list object containing information about the PhyloCCF
18 | of each mutation in each tumour region.}
19 | 
20 | \item{prefix}{A character string indicating the sample and tumour case prefix}
21 | 
22 | \item{min_cluster_size}{Threshold for minimum number of mutations required in
23 | a mutation cluster}
24 | 
25 | \item{pval_cutoff}{A p-value significance threshold for testing whether
26 | clusters can be nested. (i.e. a p-value < pval_cutoff is significant)}
27 | 
28 | \item{min_ccf}{Minimum threshold for cluster PhyloCCF allowed to be classified
29 | as present}
30 | 
31 | \item{use_boot}{Whether to use bootstrapping to determine confidence intervals
32 | for each mutation cluster}
33 | }
34 | \value{
35 | 'nestedlist', an R list containing information about the nesting
36 | structure of mutation clusters in each region. Elements of the list include:
37 | 'nestedclust',  'ccf_ci_lower', 'ccf_ci_upper', 'ccf_cluster_table', 'cluster_qc'.
38 | }
39 | \description{
40 | This function takes as input an R list containing information about PhyloCCF
41 | of each mutation (pyclone) and computes confidence intervals for each mutation
42 | cluster, of the PhyloCCF distributions of the mutations in that cluster. If
43 | use_boot==TRUE, then confidence intervals are computed using bootstrapping.
44 | The function then performs a statistical test (Wilcoxon) for every pair of
45 | clusters to determine whether one cluster can be nested within another.
46 | }
47 | 


--------------------------------------------------------------------------------
/man/compute_subclonal_expansion_score.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treebuilding_helper_functions.R
 3 | \name{compute_subclonal_expansion_score}
 4 | \alias{compute_subclonal_expansion_score}
 5 | \title{Function to compute subclonal expansion score on a selected alternative tree,
 6 | on a tumour sample and whole tumour level.
 7 | The subclonal expansion score for each tumour sample is computed as the
 8 | maximum CCF of any of the terminal (leaf) nodes present in that tumour sample.
 9 | Note, for multi-sample cases, there may exist a sample with no terminal nodes
10 | present, in which case the subclonal expansion score for this sample is set
11 | to 0. The tumour level subclonal expansion score is taken as the maximum
12 | subclonal expansion score across tumour samples.}
13 | \usage{
14 | compute_subclonal_expansion_score(tree_list, tree_id, ccf_table_pyclone_clean)
15 | }
16 | \arguments{
17 | \item{tree_list}{A list of tree matrices}
18 | 
19 | \item{tree_id}{The tree index of the selected alternative tree for which you
20 | want to compute the subclonal expansion score}
21 | 
22 | \item{ccf_table_pyclone_clean}{The output mutation PhyloCCF data frame that is
23 | computed as part of CONIPHER tree building}
24 | }
25 | \value{
26 | subclonal_exp_score_df, a data frame with the subclonal expansion
27 | score computed for each tumour sample (column subclonal_expansion_score),
28 | and across the whole tumour (column
29 | subclonal_expansion_score_tumour).
30 | }
31 | \description{
32 | Function to compute subclonal expansion score on a selected alternative tree,
33 | on a tumour sample and whole tumour level.
34 | The subclonal expansion score for each tumour sample is computed as the
35 | maximum CCF of any of the terminal (leaf) nodes present in that tumour sample.
36 | Note, for multi-sample cases, there may exist a sample with no terminal nodes
37 | present, in which case the subclonal expansion score for this sample is set
38 | to 0. The tumour level subclonal expansion score is taken as the maximum
39 | subclonal expansion score across tumour samples.
40 | }
41 | 


--------------------------------------------------------------------------------
/man/treebuilding_run.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/main_treebuilding_functions.R
 3 | \name{treebuilding_run}
 4 | \alias{treebuilding_run}
 5 | \title{TRACERx tree building function}
 6 | \usage{
 7 | treebuilding_run(
 8 |   sample_input_list,
 9 |   ccf_buffer = 10,
10 |   pval_cutoff = 0.01,
11 |   use_boot = TRUE,
12 |   merge_clusters = TRUE,
13 |   correct_cpn_clusters = TRUE,
14 |   adjust_noisy_clusters = FALSE,
15 |   adjust_noisy_clusters_prop = 0.05,
16 |   min_ccf = 0.01,
17 |   min_cluster_size = 5,
18 |   run.multi.trees = TRUE,
19 |   n_clusters_to_move = 5
20 | )
21 | }
22 | \arguments{
23 | \item{sample_input_list}{An R list object. This object contains information
24 | about each mutation in each tumour region sampled, including which cluster
25 | each mutation was assigned to in mutation clustering.
26 | This object can be generated by running the data preprocessing function}
27 | 
28 | \item{ccf_buffer}{PhyloCCF buffer allowance for testing tree level issue
29 | (default=10)}
30 | 
31 | \item{pval_cutoff}{P-value cut off for testing cluster nesting (default=0.01)}
32 | 
33 | \item{use_boot}{Should bootstrapping be used to compute confidence interval?
34 | (default=TRUE)}
35 | 
36 | \item{merge_clusters}{Should similar clusters be merged if possible?
37 | (default=TRUE)}
38 | 
39 | \item{correct_cpn_clusters}{Should clusters driven by copy number errors be
40 | removed? (default=TRUE)}
41 | 
42 | \item{adjust_noisy_clusters}{Should noisy clusters be adjusted? (default=FALSE)}
43 | 
44 | \item{adjust_noisy_clusters_prop}{What is the minimum proportion of mutations
45 | required to be present in a region to avoid cluster adjustment? (default=0.05)}
46 | 
47 | \item{min_ccf}{What is the minimum CCF threshold to consider a mutation as
48 | present? (default=0.01)}
49 | 
50 | \item{min_cluster_size}{What is the minimum number of mutations required in a
51 | cluster to be included in analysis? (default=5)}
52 | 
53 | \item{run.multi.trees}{Should alternative tumour phylogenies be explored?
54 | (default=TRUE)}
55 | 
56 | \item{n_clusters_to_move}{When running multiple trees specify the maximum
57 | number of clusters to attempt moving. (default=5)}
58 | }
59 | \value{
60 | sample_pyclone_tree, an R list object containing output information
61 | from CONIPHER tree building
62 | }
63 | \description{
64 | This function is the main CONIPHER wrapper function to run phylogenetic
65 | tree building from mutation clustering output. NOTE: it is assumed that
66 | clustering has been carried out prior to running tree building.
67 | }
68 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CONIPHER
  2 | 
  3 | ## CONIPHER mutation clustering and phylogenetic tree building R package
  4 | 
  5 | This is the official GitHub repository for the R package to perform mutation clustering and phylogenetic tree building using CONIPHER. For details on how to run mutation clustering and phylogenetic tree building consecutively with one wrapper script from the command line, please refer to the GitHub repository [CONIPHER-wrapper](https://github.com/McGranahanLab/CONIPHER-wrapper). For full details of all the inputs into CONIPHER clustering and tree building, refer to our protocol (https://doi.org/10.1038/s41596-023-00913-9).
  6 | 
  7 | ### Software
  8 | The current implementation of CONIPHER is written in `R>=3.6.1` and is distributed as an R package.
  9 | 
 10 | ---
 11 | ## CONIPHER installation 
 12 | 
 13 | 
 14 | CONIPHER can be installed and run in multiple modes.
 15 | 
 16 | 1) To run mutation clustering + phylogenetic tree building, install the `conipher` conda environment from bioconda using the command below. This environment contains R version 4.1.3, the CONIPHER R package, PyClone v0.13.1 (which is required for mutation clustering), and all other dependencies.
 17 | 
 18 | ```
 19 | conda create -n conipher -c conda-forge -c bioconda conipher
 20 | ```
 21 | 
 22 | 2) To run phylogenetic tree building only, the conda environment can also be used but is not strictly necessary. Alternatively, just the CONIPHER R package can be installed, provided the R package `devtools >= 2.4.1` is installed.
 23 | 
 24 | To install the CONIPHER R package from an R console, run the following command:
 25 | 
 26 | ```
 27 | library(devtools)
 28 | devtools::install_github("McGranahanLab/CONIPHER")
 29 | ```
 30 | 
 31 | ---
 32 | ## Quick start
 33 | 
 34 | ### Running clustering + tree building end-to-end 
 35 | To get started quickly, you can install CONIPHER and perform mutation clustering and phylogenetic tree reconstruction on the example data provided using the following instructions.
 36 | 
 37 | 
 38 | **Step 1.** 
 39 | Install the `conipher` conda environment using the instructions above.
 40 | 
 41 | **Step 2.**
 42 | Start R and load the 'CONIPHER' and 'tidyverse' R packages using the following command:
 43 | ```
 44 | library(CONIPHER)
 45 | library(tidyverse)
 46 | ```
 47 | 
 48 | **Step 3.**
 49 | Specify a parent output directory where the clustering and tree building results will be saved in individual subfolders, for example using the following command:
 50 | ```
 51 | out_dir <- "conipher_results/"
 52 | ```
 53 | 
 54 | **Step 4.**
 55 | Specify the location of the input table .tsv file. For example, the file path of the toy input table provided in this package is specified using the following command:
 56 | ```
 57 | input_tsv_loc <- system.file("extdata", "input_table.tsv", package = "CONIPHER", mustWork = TRUE)
 58 | ```
 59 | 
 60 | **Step 5.**
 61 | Run clustering + tree building end-to-end (interactively) using the following command:
 62 | ```
 63 | conipher_run(case_id = "CRUKTOY001",
 64 |              prefix = "CRUK",
 65 |              out_dir = out_dir,
 66 |              input_tsv_loc = input_tsv_loc)
 67 | ```
 68 | 
 69 | ### Running clustering only
 70 | Run steps 1 - 4 as described in "Running clustering + tree building end-to-end" above. 
 71 | 
 72 | **Step 5a.**
 73 | Run clustering (interactively) using the following command:
 74 | 
 75 | ```
 76 | conipher_clustering(case_id = "CRUKTOY001", 
 77 |                     out_dir = out_dir, 
 78 |                     input_tsv_loc = input_tsv_loc)
 79 | ```
 80 | 
 81 | ### Running tree building only
 82 | Run steps 1 - 4 as described in "Running clustering + tree building end-to-end" above. 
 83 | 
 84 | **Step 1b.** 
 85 | Alternatively, instead of installing the `conipher` conda environment, install the CONIPHER R package only using the instructions described above.
 86 | 
 87 | 
 88 | **Step 5b.**
 89 | Run tree building (interactively) using the following command:
 90 | 
 91 | ```
 92 | conipher_treebuilding(prefix = "CRUK",
 93 |                       out_dir = out_dir,
 94 |                       input_tsv_loc = tree_input_tsv_loc)
 95 | ```
 96 | 
 97 | 
 98 | ---
 99 | ### Anticipated results
100 | The clustering output will include the following output files (examples are in "conipher_results/Clustering"):
101 | 
102 | 
103 | The tree building output will include 3 output files (examples are in "conipher_results/Trees"):
104 | - `<CASE_ID>.tree.RDS`: an R list object containing tree building output information
105 | - pytree_and_bar.pdf: a plot of the default reconstructed tree and barplot
106 | - pytree_multipletrees.pdf: a plot showing all possible alternative phylogenetic trees found by CONIPHER
107 | 
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/R/functionsForSimpleClustering.v13.R:
--------------------------------------------------------------------------------
  1 | findSimpleClusters <- function(input_list, runType = "WES") {
  2 |     ### Group the mutations shared by all regions into "simple" clusters defined by their presence/absence
  3 |     ### pattern across regions, and build one PyClone input table per cluster and per region of that cluster.
  4 |     ### input_list - list with elements new.dir, phylo.region.list (per-region tables with mutation_id, var_counts, ref_counts, phyloCCF), mut.table, seg.mat.phylo (with SampleID) and patient
  5 |     ### runType    - if "sim", mutations without copy number get default copy number values instead of being dropped
  6 |     ### returns    - list with one element per cluster: clusterID, clusterBinary, MutationsWithCluster, RegionsInCluster and PyCloneTables (one pyclone.table per region in the cluster)
  7 |     new.dir           <- input_list$new.dir
  8 |     phylo.region.list <- input_list$phylo.region.list
  9 |     mut.table         <- input_list$mut.table
 10 |     seg.mat.phylo     <- input_list$seg.mat.phylo
 11 |     patient           <- input_list$patient
 12 | 
 13 |     ### define the output
 14 |     SmallClusters    <- list()
 15 | 
 16 |     regionsToUse     <- names(phylo.region.list)
 17 |     CCFtable         <- c()
 18 |     ### only mutations reported in every region are clustered
 19 |     sharedMutations  <- unlist(phylo.region.list[[1]]$mutation_id)
 20 |     for (region in regionsToUse) {
 21 |         sharedMutations <- intersect(sharedMutations, unlist(phylo.region.list[[region]]$mutation_id))
 22 |     }
 23 |     new.phylo.region.list <- lapply(phylo.region.list, function(x) {
 24 |         x$cov <- (unlist(x$var_counts) + unlist(x$ref_counts))
 25 |         x$var_count <- unlist(x$var_counts)
 26 |         x$VAF <- x$var_count / x$cov * 100
 27 |         x$presence <- ifelse(x$var_count >= 1 & x$VAF > 1, TRUE, FALSE)  # present = at least one variant read and VAF above 1%
 28 |         return(x)
 29 |     })
 30 |     for (region in regionsToUse) {
 31 |         CCFtable <- cbind(CCFtable, unlist(new.phylo.region.list[[region]][sharedMutations, "presence"]))  # NOTE(review): assumes the region tables are row-named by mutation_id - confirm
 32 |     }
 33 |     rownames(CCFtable) <- sharedMutations
 34 |     colnames(CCFtable) <- regionsToUse
 35 |     BinaryTable <- ifelse(CCFtable, 1, 0)
 36 |     MutClusters <- apply(BinaryTable, 1, PasteVector, sep = ":")
 37 | 
 38 |     ### each distinct presence/absence pattern defines one cluster
 39 |     UniqCluster <- unique(MutClusters)
 40 | 
 41 |     ### set Names for the UniqClusters
 42 |     names(UniqCluster) <- as.character(1:length(UniqCluster))
 43 |     ### reverse the names as well
 44 |     UniqClusterNumber  <- names(UniqCluster)
 45 |     names(UniqClusterNumber) <- UniqCluster
 46 |     MutClusterNum <- UniqClusterNumber[MutClusters]
 47 |     names(MutClusterNum) <- names(MutClusters)
 48 | 
 49 |     ### check whether evidence for multi-modality
 50 |     ClusterEvidenceForMultiModal <- rep(0, length(UniqCluster))
 51 |     names(ClusterEvidenceForMultiModal) <- names(UniqCluster)
 52 | 
 53 |     for (cluster in names(UniqCluster)) {
 54 |         SmallClusters[[cluster]]$clusterID            <- cluster
 55 |         SmallClusters[[cluster]]$clusterBinary        <- UniqCluster[cluster]
 56 |         SmallClusters[[cluster]]$MutationsWithCluster <- names(MutClusterNum[MutClusterNum %in% cluster])
 57 | 
 58 |         ### next, make pyclone tables so we can also run pyclone on these samples. 
 59 |         ClusterMutationsIDs <- names(MutClusterNum[MutClusterNum %in% cluster])
 60 |         specClusterTable    <- BinaryTable[ClusterMutationsIDs, , drop = FALSE]
 61 |         RegionsInCluster    <- colnames(specClusterTable)[which(specClusterTable[1, ] == 1)]  # all mutations of a cluster share one pattern, so the first row is enough
 62 |         ClusterTable        <- mut.table[mut.table$mutation_id %in% ClusterMutationsIDs, , drop = FALSE]
 63 |         SmallClusters[[cluster]]$RegionsInCluster     <- RegionsInCluster
 64 | 
 65 |         ### check which regions are in the cluster 
 66 |         for (region in RegionsInCluster) {
 67 | 
 68 |             region.mut.table <- ClusterTable
 69 |             region.seg.copy  <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region, , drop = FALSE]
 70 |             pyclone.table    <- data.frame(t(sapply(1:nrow(region.mut.table), identify.subclonal.mut.copy.number.ascat, region.mut.table, region.seg.copy, region, patient)), stringsAsFactors = FALSE)
 71 | 
 72 |             na.mutations     <- pyclone.table[is.na(pyclone.table$minor_cn), , drop = FALSE]
 73 |             loss.mutations   <- pyclone.table[which(as.numeric(pyclone.table$major_cn) == 0 | (as.numeric(pyclone.table$var_counts) + as.numeric(pyclone.table$ref_counts) == 0)), , drop = FALSE]
 74 |             ### IDs of mutations without copy number or with no copies/coverage; unlist() flattens a single object, so the two ID vectors are combined with c()
 75 |             error.muts       <- c(unlist(na.mutations$mutation_id), unlist(loss.mutations$mutation_id))
 76 |             error.muts.table <- paste0(new.dir, "/", region, ".error.muts.tsv")
 77 |             ### NOTE: error.muts and error.muts.table are not written out or returned by this function
 78 |             if (runType == "sim") {
 79 |                 #CMR for sim data, modify NA CN 
 80 |                 print("Running CN updates for simulation data.")
 81 |                 pyclone.table$major_cn[is.na(pyclone.table$minor_cn)] <- 1
 82 |                 pyclone.table$major_raw[is.na(pyclone.table$minor_cn)] <- 1
 83 |                 pyclone.table$minor_raw[is.na(pyclone.table$minor_cn)] <- 1
 84 |                 pyclone.table$fracA[is.na(pyclone.table$minor_cn)] <- 1
 85 |                 pyclone.table$nMaj_A[is.na(pyclone.table$minor_cn)] <- 1
 86 |                 pyclone.table$nMin_A[is.na(pyclone.table$minor_cn)] <- 1
 87 |                 pyclone.table$fracB[is.na(pyclone.table$minor_cn)] <- 0
 88 |                 pyclone.table$nMaj_B[is.na(pyclone.table$minor_cn)] <- 1
 89 |                 pyclone.table$nMin_B[is.na(pyclone.table$minor_cn)] <- 1
 90 |                 pyclone.table$minor_cn[is.na(pyclone.table$minor_cn)] <- 1  # must stay last: the lines above index on minor_cn still being NA
 91 |             }
 92 | 
 93 |             ### a few sanity checks
 94 |             pyclone.table    <- pyclone.table[!is.na(pyclone.table$minor_cn), ]
 95 |             pyclone.table    <- pyclone.table[!is.na(pyclone.table$ref_counts), ]
 96 |             pyclone.table    <- pyclone.table[!duplicated(pyclone.table$mutation_id), ]
 97 |             pyclone.table    <- pyclone.table[as.numeric(pyclone.table$major_cn) >= 1, ]
 98 |             pyclone.table    <- pyclone.table[!is.na(pyclone.table$minor_cn), ]  # repeated on purpose: drops the all-NA rows an NA comparison on the previous line introduces
 99 | 
100 |             ### now, let's check what the cancer cell fraction estimates are for this region
101 |             region.ccf               <- phylo.region.list[[region]]
102 |             region.ccf               <- data.frame(region.ccf, stringsAsFactors = FALSE)
103 |             rownames(region.ccf)     <- region.ccf$mutation_id
104 |             tmp                      <- intersect(unlist(pyclone.table$mutation_id), unlist(region.ccf$mutation_id))
105 |             rownames(pyclone.table)  <- pyclone.table$mutation_id
106 |             pyclone.table            <- pyclone.table[tmp, , drop = FALSE]
107 |             region.ccf               <- region.ccf[tmp, , drop = FALSE]
108 |             ### rewrite ref_counts so that var/(var + ref) equals phyloCCF/2, and set the copy number to major 2 / minor 0
109 |             if (nrow(pyclone.table) > 0) {
110 |                 tmp                      <- round(((unlist(pyclone.table$var_counts)) / (unlist(region.ccf$phyloCCF) / 2)) - unlist(pyclone.table$var_counts))
111 |                 tmp[is.na(tmp)]          <- unlist(pyclone.table$ref_counts[(is.na(tmp))])  # keep the original ref_counts where the rescaled value is NA
112 |                 pyclone.table$ref_counts <- tmp
113 |                 pyclone.table$minor_cn   <- 0
114 |                 pyclone.table$major_cn   <- 2
115 |                 pyclone.table$ref_counts <- apply(cbind(pyclone.table$ref_counts, 2), 1, max)  # ref_counts is at least 2
116 |             }
117 |             
118 |             SmallClusters[[cluster]]$PyCloneTables[[region]]$pyclone.table <- pyclone.table
119 |         }
120 |     }
121 |     return(SmallClusters)
122 | }
123 | 
### Run PyClone on the mutations of one simple cluster.
###
### For every region listed in SmallClusters[[clusterName]]$RegionsInCluster the
### region's pyclone.table is written as <region>.tsv into a per-cluster
### directory (<patientDirToUse>/<patientID>_cluster<clusterName>/). A run
### specific config is then derived from the template at yamlConfigLoc.
### When run.pyclone is TRUE the three PyClone commands (build_mutations_file,
### run_analysis, build_table) are executed through system(); when FALSE only
### the tsv files and the header part of the config are written.
###
### Arguments:
###   clusterName     name (key) of the cluster inside SmallClusters
###   patientID       identifier used to name the run directory and results file
###   SmallClusters   list holding, per cluster, RegionsInCluster,
###                   MutationsWithCluster and PyCloneTables[[region]]$pyclone.table
###   patientDirToUse parent directory for the per-cluster directory
###   yamlConfigLoc   path of the template config that is read with readLines()
###   pyclone.burnin  value passed to `build_table --burnin`
###   pyclone.seed    value passed to `run_analysis --seed`
###   run.pyclone     if FALSE, no external command is executed
###   pyclone.module  not used inside this function (kept for compatibility)
RunPyCloneWithSimpleClusters <- function(clusterName, patientID, SmallClusters, patientDirToUse = new.dir, yamlConfigLoc = template.config.yaml, pyclone.burnin = 1000, pyclone.seed = 1024, run.pyclone = TRUE, pyclone.module = "PyClone/0.12.3-foss-2016b-Python-2.7.12-tkinter") {
    PyClone <- "PyClone"
    ### give a name to the sample
    PyCloneRunName <- paste0(patientID, "_cluster", clusterName)
    ### create a specific subDirectory for this analysis
    ClusterDir <- paste0(patientDirToUse, "/", PyCloneRunName, "/")
    if (!dir.exists(ClusterDir)) {
        dir.create(ClusterDir, recursive = TRUE)
    }

    ### make the mutation files for each region
    RegionsInClustering <- SmallClusters[[clusterName]]$RegionsInCluster
    for (region in RegionsInClustering) {
        pyclone.tsv   <- paste0(ClusterDir, "/", region, ".tsv")
        pyclone.table <- SmallClusters[[clusterName]]$PyCloneTables[[region]]$pyclone.table
        if (nrow(pyclone.table) == 1) {
            ### apply() drops to a plain vector for a single row, so rebuild the 1-row matrix
            pyclone.out <- matrix(apply(pyclone.table, 2, as.character), nrow = 1)
            colnames(pyclone.out) <- colnames(pyclone.table)
        } else {
            pyclone.out   <- apply(pyclone.table, 2, as.character)
        }
        write.table(pyclone.out, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE, file = pyclone.tsv)

        ### Run PyClone build_mutations_file TSV_FILE where TSV_FILE is the input file you have created.
        pyclone.yaml <- paste0(ClusterDir, "/", region, ".yaml")

        cmd <- paste0(PyClone
                    , " build_mutations_file"
                    , " --in_file ", pyclone.tsv
                    , " --out_file ", pyclone.yaml)
        cat('\n')

        if (run.pyclone) {
            cat(cmd)
            system(cmd)

            ### AH edit change so it works with states separated over multiple lines as well
            ### exclude state including g_v=AB from yaml file
            yaml    <- readLines(pyclone.yaml)
            rm.indx <- grep("AB", yaml)
            ### Only filter when there is something to remove: in R, x[-integer(0)]
            ### is x[integer(0)], i.e. it would silently empty the whole file.
            if (length(rm.indx) > 0) {
                if (any(grepl("prior_weight", yaml[rm.indx]))) {
                    ### state is written on a single line together with its prior_weight
                    drop.indx <- rm.indx
                } else {
                    ### state is spread over several lines: drop the two lines before
                    ### and the one line after each match as well
                    drop.indx <- unique(c(rm.indx - 2, rm.indx - 1, rm.indx, rm.indx + 1))
                    ### keep only valid positions so the negative index is never mixed-sign
                    drop.indx <- drop.indx[drop.indx >= 1 & drop.indx <= length(yaml)]
                }
                yaml <- yaml[-drop.indx]
            }
            write.table(yaml, file = pyclone.yaml, col.names = FALSE, row.names = FALSE, quote = FALSE)
        }
    }


    ### build the run specific config from the template: everything after the
    ### line matching "samples" is kept aside as the per-sample template
    pyclone.config.yaml <- paste0(ClusterDir, "/", PyCloneRunName, ".config.yaml")
    pyclone.config      <- readLines(yamlConfigLoc)
    start.samples       <- (grep("samples", pyclone.config) + 1)
    end.samples         <- length(pyclone.config)

    sample.lines        <- pyclone.config[start.samples:end.samples]
    pyclone.config      <- pyclone.config[-c(start.samples:end.samples)]
    pyclone.config      <- c(pyclone.config, "init_method: connected", "", "samples:")
    pyclone.config      <- gsub("working.directory.location", ClusterDir, pyclone.config)

    write.table(pyclone.config, file = pyclone.config.yaml, col.names = FALSE, row.names = FALSE, quote = FALSE)

    ### append one sample entry per region (only when PyClone is actually run)
    RegionsInClustering <- SmallClusters[[clusterName]]$RegionsInCluster
    for (region in RegionsInClustering) {
        sample.config <- gsub("TCGA.barcode", region, sample.lines)
        # pyclone.yaml  <- paste0(new.dir, "/", region, ".yaml")
        pyclone.yaml  <- paste0(region, ".yaml")
        sample.config <- gsub("mutations.yaml", pyclone.yaml, sample.config)
        ### purity is fixed at 0.5 for every region in this function
        region.purity <- 0.5

        sample.config <- gsub("value: 1.0", paste0("value: ", signif(region.purity, 3)), sample.config)
        ### only the first 8 lines of the sample template are used
        sample.config <- sample.config[1:8]

        if (run.pyclone) {
            write.table(sample.config, file = pyclone.config.yaml, append = TRUE, quote = FALSE, col.names = FALSE, row.names = FALSE)
        }
    }

    ### next, run pyclone
    cmd <- paste0(PyClone
                , " run_analysis --config_file "
                , pyclone.config.yaml
                , " --seed "
                , pyclone.seed)
    cat('\n')

    if (run.pyclone) {
        cat(cmd)
        system(cmd)
    }
    cat('\n')

    ### finally build the results table; max_clusters is one per 5 mutations,
    ### bounded to the range [1, 10]
    sample.results <- paste0(ClusterDir, "/", patientID, '.results.tsv')
    cmd <- paste0(PyClone
               , " build_table --config_file "
               , pyclone.config.yaml
               , " --table_type old_style --out_file "
               , sample.results
               , " --max_clusters ", min(max(1, floor(length(SmallClusters[[clusterName]]$MutationsWithCluster) / 5)), 10)
               , " --burnin "
               , pyclone.burnin)
    cat('\n')

    if (run.pyclone) {
        cat(cmd)
        system(cmd)
    }
    cat('\n')
}
233 | 
### Write a placeholder results table for a cluster on which PyClone is not run.
### The table has one row per mutation of the cluster, one all-zero column per
### region of the cluster, and a cluster_id column holding clusterName. It is
### saved as <patientDirToUse>/<patientID>_cluster<clusterName>//<patientID>.results.tsv.
CreateOutputNoPyCloneRun <- function(clusterName, patientID, SmallClusters, patientDirToUse = new.dir) {

    ### per-cluster output directory (created on demand)
    runLabel  <- paste0(patientID, "_cluster", clusterName)
    outFolder <- paste0(patientDirToUse, "/", runLabel, "/")
    if (!dir.exists(outFolder)) {
        dir.create(outFolder, recursive = TRUE)
    }
    resultsPath <- paste0(outFolder, "/", patientID, '.results.tsv')

    ### mutations x regions matrix filled with zeros
    clusterMutations <- SmallClusters[[clusterName]]$MutationsWithCluster
    clusterRegions   <- SmallClusters[[clusterName]]$RegionsInCluster
    zeroTable <- matrix(0, nrow = length(clusterMutations), ncol = length(clusterRegions))
    rownames(zeroTable) <- clusterMutations
    colnames(zeroTable) <- clusterRegions

    zeroTable <- data.frame(zeroTable, cluster_id = clusterName)
    write.table(zeroTable, file = resultsPath, sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
}
251 | 
252 | ####################################################################################################
253 | ######################################### Helper functions #########################################
254 | ####################################################################################################
255 | 
### get modified VAFs (extracted from findSimpleClustersWithSciClone)
### Builds a 1 x 5 character matrix (chr, pos, ref_count, var_count, vaf) for
### row `rowNumber` of `regionClusterTable`. The depth is the row's ref_counts
### when there are no variant reads, otherwise var_counts / (phyloCCF / 2)
### rounded. If the resulting VAF exceeds 0.75 the row is replaced by one with
### ref_count = var_count and vaf = 0.5. chr and pos are the 2nd and 3rd
### ":"-separated fields of mutation_id. The row number is echoed with cat().
getModifiedVAF <- function(rowNumber, regionClusterTable) {
    cat(rowNumber)
    mutRow <- regionClusterTable[rowNumber, ]
    nVar   <- unlist(mutRow$var_counts)

    if (nVar == 0) {
        nDepth <- as.numeric(mutRow$ref_counts)
    } else {
        nDepth <- round(nVar / unlist(mutRow$phyloCCF / 2))
    }
    nRef <- nDepth - nVar

    idFields <- unlist(strsplit(unlist(mutRow$mutation_id), split = ":"))

    ### assemble one output row; cbind() of text and numbers yields a character matrix
    assembleRow <- function(refValue, vafValue) {
        cbind(idFields[2], as.numeric(idFields[3]), as.numeric(refValue), as.numeric(nVar), as.numeric(vafValue))
    }

    observedVaf <- as.numeric(nVar / nDepth)
    outRow <- assembleRow(nRef, observedVaf)
    if (observedVaf > 0.75) {
        outRow <- assembleRow(nVar, 0.5)
    }

    colnames(outRow) <- c('chr', 'pos', 'ref_count', 'var_count', 'vaf')
    return(outRow)
}
275 | 
276 | 
277 | 
278 | 


--------------------------------------------------------------------------------
/R/main_treebuilding_functions.R:
--------------------------------------------------------------------------------
   1 | ################################################################################
   2 | ##################               MAIN FUNCTIONS               ##################
   3 | ################################################################################
   4 | 
   5 | #' Full tree building run function
   6 | #'
   7 | #' This function takes all the input options and runs the three main steps:
   8 | #' preprocess, tree building run and postprocess
   9 | #' @param opt a list of options
  10 | #' @returns NULL
  11 | #' @export conipher_treebuilding
  12 | 
  13 | conipher_treebuilding <- function(input_tsv_loc,
  14 |                                   out_dir,
  15 |                                   prefix,
  16 |                                   ccf_buffer = 10,
  17 |                                   pval_cutoff = 0.01,
  18 |                                   use_boot = TRUE,
  19 |                                   merge_clusters = TRUE,
  20 |                                   correct_cpn_clusters = TRUE,
  21 |                                   adjust_noisy_clusters = FALSE,
  22 |                                   adjust_noisy_clusters_prop = 0.05,
  23 |                                   min_ccf = 0.01,
  24 |                                   min_cluster_size = 5,
  25 |                                   multi_trees = TRUE,
  26 |                                   ...) {
  27 |     out_dir         <- paste0(out_dir, "/")
  28 | 
  29 |     # cat("\nCONIPHER tree building analysis of the following tumour case:\n")
  30 |     # print(patient)
  31 |     # cat("\n")
  32 | 
  33 |     if (!file.exists(out_dir)) {
  34 |         if (!dir.create(out_dir, recursive = TRUE)) {
  35 |             stop("Unable to create root directory.\n")
  36 |         }
  37 |     }
  38 | 
  39 |     if(!file.exists(input_tsv_loc)) {
  40 |         stop("Unable to find input_tsv.\n")
  41 |     }
  42 |     input_tsv     <- read.delim(input_tsv_loc, sep = "\t", stringsAsFactors = FALSE, header = TRUE, fill = TRUE, quote = "")
  43 |     if (nrow(input_tsv) == 0) {
  44 |         stop('No mutations passed filtering, stopping PyClone phylo clustering')
  45 |     }
  46 |     #### =========== PREOCESS INPUT DATA ========= ####
  47 | 
  48 |     # preprocess input data into correct form for tree building
  49 |     input_list <- treebuilding_preprocess(input_tsv, prefix, out_dir)
  50 | 
  51 |     #### =========== RUN TREE BUILDING ========= ####
  52 | 
  53 |     # run main CONIPHER tree building function
  54 |     sample_pyclone_tree <-      treebuilding_run(sample_input_list = input_list
  55 |                                                       , ccf_buffer = ccf_buffer
  56 |                                                       , pval_cutoff = pval_cutoff
  57 |                                                       , use_boot = use_boot
  58 |                                                       , merge_clusters = merge_clusters
  59 |                                                       , correct_cpn_clusters = correct_cpn_clusters
  60 |                                                       , adjust_noisy_clusters = adjust_noisy_clusters
  61 |                                                       , adjust_noisy_clusters_prop = adjust_noisy_clusters_prop
  62 |                                                       , min_ccf = min_ccf
  63 |                                                       , min_cluster_size = min_cluster_size
  64 |                                                       , run.multi.trees = multi_trees
  65 |     )
  66 | 
  67 |     #### =========== SAVE OUTPUT ========= ####
  68 | 
  69 |     # Save all tree building output
  70 |     if(!is.na(sample_pyclone_tree$graph_pyclone[1]))
  71 |       cat('\nSaving all treebuilding output\n')
  72 |     {
  73 |       ### Plotting tree
  74 |       treebuilding_plot(sample_pyclone_tree)
  75 | 
  76 |       ### Creating human readable format
  77 |       ### writing all trees
  78 |       treeFile <- paste0(sample_pyclone_tree$parameters$generalSave, "allTrees.txt")
  79 |       if ("alt_trees" %in% names(sample_pyclone_tree$graph_pyclone)) {
  80 |         write.table(paste0("### ", length(sample_pyclone_tree$graph_pyclone$alt_trees), " trees"), file = treeFile, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  81 |         tmp <- sapply(seq(1, length(sample_pyclone_tree$graph_pyclone$alt_trees)), function(x) {
  82 |             write.table(paste0("# tree ", x), file = treeFile, append = TRUE, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  83 |             write.table(sample_pyclone_tree$graph_pyclone$alt_trees[[x]], file = treeFile, append = TRUE, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  84 |         })
  85 |       } else {
  86 |         write.table(paste0("### ", 1, " trees"), file = treeFile, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  87 |         write.table(paste0("# tree ", 1), file = treeFile, append = TRUE, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  88 |         write.table(sample_pyclone_tree$graph_pyclone$default_tree, file = treeFile, append = TRUE, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  89 |       }
  90 | 
  91 |       ### writing consensus branches
  92 |       consensusBranchesFile <- paste0(sample_pyclone_tree$parameters$generalSave, "consensusBranches.txt")
  93 |       write.table(Reduce(rbind, strsplit(sample_pyclone_tree$graph_pyclone$consensus_branches, split = ":")), file = consensusBranchesFile, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  94 | 
  95 |       ### writing consensus relationships
  96 |       consensusRelationshipsFile <- paste0(sample_pyclone_tree$parameters$generalSave, "consensusRelationships.txt")
  97 |       write.table(Reduce(rbind, strsplit(sample_pyclone_tree$graph_pyclone$consensus_relationships, split = ":")), file = consensusRelationshipsFile, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
  98 | 
  99 |       ### writing cluster information
 100 |       clusterInfoFile <- paste0(sample_pyclone_tree$parameters$generalSave, "clusterInfo.txt")
 101 | 
 102 |       clusterInfoDF <- data.frame(clusterID = names(sample_pyclone_tree$graph_pyclone$edgelength), stringsAsFactors = FALSE)
 103 |       clusterInfoDF$truncal <- ifelse(clusterInfoDF$clusterID %in% sample_pyclone_tree$graph_pyclone$trunk, TRUE, FALSE)
 104 |       clusterInfoDF$treeClust <- ifelse(clusterInfoDF$clusterID %in% unique(c(sample_pyclone_tree$graph_pyclone$default_tree)), TRUE, FALSE)
 105 |       clusterInfoDF$cpnRemClust <- ifelse(clusterInfoDF$clusterID %in% sample_pyclone_tree$cpn_removed_clusters, TRUE, FALSE)
 106 |       clusterInfoDF$nMuts <- as.numeric(sample_pyclone_tree$graph_pyclone$edgelength)
 107 | 
 108 |       clusterInfoDF <- clusterInfoDF %>% dplyr::full_join(data.frame(sample_pyclone_tree$nested_pyclone$ccf_cluster_table, stringsAsFactors = FALSE) %>% dplyr::mutate(clusterID = rownames(.)) %>% tidyr::pivot_longer(!clusterID, names_to = "Region", values_to = "meanCCF"), by = c("clusterID"))
 109 |       clusterInfoDF <- clusterInfoDF %>% dplyr::full_join(data.frame(sample_pyclone_tree$nested_pyclone$ccf_ci_lower, stringsAsFactors = FALSE) %>% dplyr::mutate(clusterID = rownames(.)) %>% tidyr::pivot_longer(!clusterID, names_to = "Region", values_to = "CCF_CI_low"), by = c("clusterID", "Region"))
 110 |       clusterInfoDF <- clusterInfoDF %>% dplyr::full_join(data.frame(sample_pyclone_tree$nested_pyclone$ccf_ci_upper, stringsAsFactors = FALSE) %>% dplyr::mutate(clusterID = rownames(.)) %>% tidyr::pivot_longer(!clusterID, names_to = "Region", values_to = "CCF_CI_high"), by = c("clusterID", "Region"))
 111 |       clusterInfoDF <- clusterInfoDF %>% dplyr::full_join(data.frame(sample_pyclone_tree$clonality_out$clonality_table_corrected, stringsAsFactors = FALSE) %>% dplyr::mutate(clusterID = rownames(.)) %>% tidyr::pivot_longer(!clusterID, names_to = "Region", values_to = "clonality"), by = c("clusterID", "Region"))
 112 |       clusterInfoDF <- clusterInfoDF %>% dplyr::full_join(data.frame(sample_pyclone_tree$clone_proportion_out$clone_proportion_table, stringsAsFactors = FALSE) %>% dplyr::mutate(clusterID = rownames(.)) %>% tidyr::pivot_longer(!clusterID, names_to = "Region", values_to = "clone_proportions_default"), by = c("clusterID", "Region"))
 113 | 
 114 |       clusterInfoDF <- clusterInfoDF %>% dplyr::rename(SAMPLE = Region)
 115 |       write.table(clusterInfoDF, file = clusterInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")
 116 | 
 117 |       ### writing clone proportion information
 118 |       cloneproportionInfoFile <- paste0(sample_pyclone_tree$parameters$generalSave, "cloneProportionsMinErrorTrees.txt")
 119 | 
 120 |       cp_min_sce_trees <- sample_pyclone_tree$clone_proportion_out$clone_proportions_min_sce_trees
 121 |       cloneproportionInfoList <- lapply(seq(cp_min_sce_trees), function(i){
 122 |         tree_id <- names(cp_min_sce_trees)[i]
 123 |         cp_table <- data.frame(cp_min_sce_trees[[i]], stringsAsFactors = FALSE)
 124 |         cp_table$clusterID <- rownames(cp_table)
 125 |         cp_table$treeID <- tree_id
 126 |         return(cp_table)
 127 |       })
 128 |       cloneproportionInfoDF <- do.call(rbind, cloneproportionInfoList)
 129 |       write.table(cloneproportionInfoDF, file = cloneproportionInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")
 130 | 
 131 |       ### writing subclonal expansion score data
 132 |       subcloneExpansionInfoFile <- paste0(sample_pyclone_tree$parameters$generalSave, "subclonalExpansionScoreMinErrorTrees.txt")
 133 | 
 134 |       ses_min_sce_trees <- sample_pyclone_tree$subclonal_expansion_score_out$subclonal_exp_score_min_sce_trees
 135 |       subcloneExpansionInfoList <- lapply(seq(ses_min_sce_trees), function(i){
 136 |         tree_id <- names(ses_min_sce_trees)[i]
 137 |         ses_table <- data.frame(ses_min_sce_trees[[i]], stringsAsFactors = FALSE)
 138 |         ses_table$treeID <- tree_id
 139 |         return(ses_table)
 140 |       })
 141 |       subcloneExpansionInfoDF <- do.call(rbind, subcloneExpansionInfoList)
 142 |       write.table(subcloneExpansionInfoDF, file = subcloneExpansionInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")
 143 | 
 144 |       ### writing output muttable - similar to input
 145 |       input_tsv <- input_tsv %>% dplyr::rename(originalCLUSTER = CLUSTER)
 146 |       if (is.null(nrow(sample_pyclone_tree$merged_clusters))) {
 147 |         input_tsv <- input_tsv %>% dplyr::mutate(treeCLUSTER = originalCLUSTER)
 148 |       } else {
 149 |         input_tsv <- input_tsv %>% dplyr::mutate(treeCLUSTER = originalCLUSTER)
 150 |         for (i in 1:nrow(sample_pyclone_tree$merged_clusters)) {
 151 |           input_tsv$treeCLUSTER <- gsub(sample_pyclone_tree$merged_clusters[i, 1], sample_pyclone_tree$merged_clusters[i, 3], input_tsv$treeCLUSTER)
 152 |         }
 153 |       }
 154 |       write.table(input_tsv, file = paste0(sample_pyclone_tree$parameters$generalSave, "treeTable.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
 155 | 
 156 |       ### writing alternative trees summary metrics
 157 |       altTreeInfoFile <- paste0(sample_pyclone_tree$parameters$generalSave, "alternativeTreeMetrics.txt")
 158 | 
 159 |       altTreeInfoDF <- data.frame(treeID = seq(sample_pyclone_tree$graph_pyclone$alt_trees), stringsAsFactors = FALSE)
 160 | 
 161 |       altTreeInfoDF$sum_condition_error <- sapply(altTreeInfoDF$treeID, function(i) sample_pyclone_tree$graph_pyclone$alt_trees_sum_condition_error[i])
 162 |       altTreeInfoDF$SCE_ranking <- match(altTreeInfoDF$sum_condition_error, sort(unique(altTreeInfoDF$sum_condition_error)))
 163 |       altTreeInfoDF$lowest_SCE <- ifelse(altTreeInfoDF$sum_condition_error == min(altTreeInfoDF$sum_condition_error), 'Lowest SCE tree', 'Alternative tree')
 164 | 
 165 |       altTreeInfoDF$edge_probability_score <- sapply(altTreeInfoDF$treeID, function(i) sample_pyclone_tree$graph_pyclone$alt_trees_edge_probability[i])
 166 |       altTreeInfoDF$edge_probability_ranking <- match(altTreeInfoDF$edge_probability_score, rev(sort(unique(altTreeInfoDF$edge_probability_score))))
 167 |       altTreeInfoDF$highest_edge_probability <- ifelse(altTreeInfoDF$edge_probability_score == max(altTreeInfoDF$edge_probability_score), 'Highest edge probability tree', 'Alternative tree')
 168 |       write.table(altTreeInfoDF, file = altTreeInfoFile, row.names = FALSE, quote = FALSE, sep = "\t")
 169 | 
 170 |     }
 171 | }
 172 | 
 173 | 
 174 | #' Input data preprocessing function
 175 | #'
 176 | #' This function takes the input tsv and formats the data to be compatible with
 177 | #' the main CONIPHER tree building function. NOTE: it is assumed that
 178 | #' clustering has been carried out prior to running tree building.
 179 | #' @param input_table An dataframe of the input mutation table in the correct
 180 | #' format. For more information on the input table format, please see our
 181 | #' tree building protocol.
 182 | #' @param prefix A tumour case and sample prefix, e.g. 'CRUK'.
 183 | #' @param out_dir A file path to the desired output directory
 184 | #' @export treebuilding_preprocess
 185 | 
 186 | treebuilding_preprocess <- function(input_table, prefix, out_dir) {
 187 |   cat('\n Preprocessing input data \n')
 188 |   # check if the correct columns are included
 189 |   required_cols <- c("CASE_ID", "SAMPLE", "CHR", "POS", "REF", "ALT", "CLUSTER", "CCF_PHYLO", "CCF_OBS", "MUT_COPY", "COPY_NUMBER_A", "COPY_NUMBER_B")
 190 |   if (FALSE%in% (required_cols %in% colnames(input_table)))
 191 |   {
 192 |     print('\nThe following columns are required in input_tsv:\n')
 193 |     cat(required_cols)
 194 |     stop()
 195 |   }
 196 | 
 197 |   # add mutation id column
 198 |   input_table$mutation_id <- paste(input_table$CASE_ID,
 199 |                                    input_table$CHR,
 200 |                                    input_table$POS,
 201 |                                    input_table$REF,
 202 |                                    input_table$ALT,
 203 |                                    sep=":")
 204 | 
 205 |   nr_unique_muts           <- length(unique(input_table$mutation_id))
 206 |   nr_regions               <- length(unique(input_table$SAMPLE))
 207 |   regions                  <- unique(input_table$SAMPLE)
 208 | 
 209 |   # Raise an error if prefix is not specified, or incorrectly specified
 210 |   if (is.null(prefix)){
 211 |     stop("No prefix specified. Please indicate a prefix for the current tumour case.")
 212 |   } else if (!grepl(prefix, input_table$CASE_ID[1])){
 213 |     stop("Incorrect prefix specified. Please input the correct prefix for the current tumour case.")
 214 |   }
 215 | 
 216 |   # make sure all columns are the correct class
 217 |   input_table$POS <- as.numeric(input_table$POS)
 218 |   input_table$CCF_PHYLO <- as.numeric(input_table$CCF_PHYLO)
 219 |   input_table$CCF_OBS <- as.numeric(input_table$CCF_OBS)
 220 |   input_table$MUT_COPY <- as.numeric(input_table$MUT_COPY)
 221 |   input_table$COPY_NUMBER_A <- as.numeric(input_table$COPY_NUMBER_A)
 222 |   input_table$COPY_NUMBER_B <- as.numeric(input_table$COPY_NUMBER_B)
 223 |   input_table$REF_COUNT <- as.numeric(input_table$REF_COUNT)
 224 |   input_table$VAR_COUNT <- as.numeric(input_table$VAR_COUNT)
 225 |   input_table$DEPTH <- as.numeric(input_table$DEPTH)
 226 |   input_table$ACF <- as.numeric(input_table$ACF)
 227 |   input_table$PLOIDY <- as.numeric(input_table$PLOIDY)
 228 | 
 229 | 
 230 |   # make sure all mutations have a cluster assigned
 231 |   tmp <- input_table[!is.na(input_table$CLUSTER),]
 232 |   removed_mutations <- c()
 233 |   for (mutation_id in unique(input_table$mutation_id))
 234 |   {
 235 |     if(length(unique(tmp[tmp$mutation_id%in%mutation_id,]$CLUSTER))==0)
 236 |     {
 237 |       # add warning
 238 |       removed_mutations <- c(removed_mutations,mutation_id)
 239 |       cat('\nwarning:')
 240 |       cat('', paste(mutation_id),'does not have a CLUSTER assigned, will remove')
 241 |       next;
 242 |     }
 243 | 
 244 |     input_table[input_table$mutation_id%in%mutation_id,]$CLUSTER <- unique(tmp[tmp$mutation_id%in%mutation_id,]$CLUSTER)
 245 |   }
 246 | 
 247 |   if(length(removed_mutations)>=1)
 248 |   {
 249 |     cat(paste('\nwarning: ',length(removed_mutations), ' mutations removed due to lack of cluster assignment',sep=""))
 250 |   }
 251 | 
 252 |   input_table <- input_table[!is.na(input_table$CLUSTER),,drop=FALSE]
 253 | 
 254 |   # check again:
 255 |   nr_unique_muts           <- length(unique(input_table$mutation_id))
 256 |   nr_regions               <- length(unique(input_table$SAMPLE))
 257 |   regions                  <- unique(input_table$SAMPLE)
 258 | 
 259 | 
 260 |   # Next convert the input_table into a sample_input_list
 261 |   input_list <- list()
 262 |   names_input_list <- c("pyclone",
 263 |                         "pyclone_absolute",
 264 |                         "sampleID",
 265 |                         "prefix",
 266 |                         "generalSave",
 267 |                         "merged_clusters")
 268 | 
 269 |   #create the pyclone table
 270 |   input_format           <- data.frame(matrix(data = NA,
 271 |                                               nrow = nr_unique_muts,
 272 |                                               ncol = 11*nr_regions+2),
 273 |                                        stringsAsFactors = FALSE)
 274 |   colnames(input_format)  <- c(paste(regions, "_cov", sep = "")
 275 |                                ,paste(regions, "_var_count", sep = "")
 276 |                                ,paste(regions, "_VAF", sep = "")
 277 |                                ,paste(regions,"_PhyloCCF",sep="")
 278 |                                ,paste(regions,"_PycloneCCF",sep="")
 279 |                                ,paste(regions,"_Pyclone_0.05",sep="")
 280 |                                ,paste(regions,"_Pyclone_0.95",sep="")
 281 |                                ,paste(regions,"_cpn.copies",sep="")
 282 |                                ,paste(regions,"_mut.cpn.num",sep="")
 283 |                                ,paste(regions,"_nAraw",sep="")
 284 |                                ,paste(regions,"_nBraw",sep="")
 285 |                                ,"PycloneCluster"
 286 |                                ,"CleanCluster")
 287 |   rownames(input_format) <- unique(input_table$mutation_id)
 288 |   # next populate the table
 289 |   for (mutation_id in rownames(input_format))
 290 |   {
 291 |     spec_mut_table <- input_table[input_table$mutation_id%in%mutation_id,,drop=FALSE]
 292 |     for (i in 1:nrow(spec_mut_table))
 293 |     {
 294 |       region_spec_mut <- spec_mut_table[i,,drop=FALSE]
 295 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_cov',sep="")] <- region_spec_mut$DEPTH
 296 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_var_count',sep="")] <- region_spec_mut$VAR_COUNT
 297 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_VAF',sep="")] <- region_spec_mut$VAR_COUNT / region_spec_mut$DEPTH
 298 | 
 299 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_PhyloCCF',sep="")] <- region_spec_mut$CCF_PHYLO
 300 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_PycloneCCF',sep="")] <- region_spec_mut$CCF_PHYLO
 301 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_Pyclone_0.05',sep="")] <- region_spec_mut$CCF_PHYLO
 302 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_Pyclone_0.95',sep="")] <- region_spec_mut$CCF_PHYLO
 303 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_cpn.copies',sep="")] <- 1
 304 | 
 305 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_mut.cpn.num',sep="")] <- region_spec_mut$MUT_COPY
 306 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_nAraw',sep="")] <- region_spec_mut$COPY_NUMBER_A
 307 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_nBraw',sep="")] <- region_spec_mut$COPY_NUMBER_B
 308 |       input_format[mutation_id,"PycloneCluster"] <- region_spec_mut$CLUSTER
 309 |       input_format[mutation_id,"CleanCluster"]  <- 1
 310 |     }
 311 |   }
 312 | 
 313 |   # Do the same thing for pyclone_absolute (non-subclonal-copy-number-corrected version)
 314 |   #create the pyclone table
 315 |   input_format_absolute           <- data.frame(matrix(data = NA,
 316 |                                                        nrow = nr_unique_muts,
 317 |                                                        ncol = 11*nr_regions+2),
 318 |                                                 stringsAsFactors = FALSE)
 319 |   colnames(input_format_absolute)  <- c(paste(regions, "_cov", sep = "")
 320 |                                         ,paste(regions, "_var_count", sep = "")
 321 |                                         ,paste(regions, "_VAF", sep = "")
 322 |                                         ,paste(regions,"_PhyloCCF",sep="")
 323 |                                         ,paste(regions,"_PycloneCCF",sep="")
 324 |                                         ,paste(regions,"_Pyclone_0.05",sep="")
 325 |                                         ,paste(regions,"_Pyclone_0.95",sep="")
 326 |                                         ,paste(regions,"_cpn.copies",sep="")
 327 |                                         ,paste(regions,"_mut.cpn.num",sep="")
 328 |                                         ,paste(regions,"_nAraw",sep="")
 329 |                                         ,paste(regions,"_nBraw",sep="")
 330 |                                         ,"PycloneCluster"
 331 |                                         ,"CleanCluster")
 332 |   rownames(input_format_absolute) <- unique(input_table$mutation_id)
 333 |   # next populate the table
 334 |   for (mutation_id in rownames(input_format_absolute))
 335 |   {
 336 |     spec_mut_table <- input_table[input_table$mutation_id%in%mutation_id,,drop=FALSE]
 337 | 
 338 |     for (i in 1:nrow(spec_mut_table))
 339 |     {
 340 |       region_spec_mut <- spec_mut_table[i,,drop=FALSE]
 341 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_cov',sep="")] <- region_spec_mut$DEPTH
 342 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_var_count',sep="")] <- region_spec_mut$VAR_COUNT
 343 |       input_format[mutation_id,paste(region_spec_mut$SAMPLE,'_VAF',sep="")] <- region_spec_mut$VAR_COUNT / region_spec_mut$DEPTH
 344 | 
 345 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_PhyloCCF',sep="")] <- region_spec_mut$CCF_OBS
 346 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_PycloneCCF',sep="")] <- region_spec_mut$CCF_OBS
 347 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_Pyclone_0.05',sep="")] <- region_spec_mut$CCF_OBS
 348 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_Pyclone_0.95',sep="")] <- region_spec_mut$CCF_OBS
 349 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_cpn.copies',sep="")] <- 1
 350 | 
 351 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_mut.cpn.num',sep="")] <- region_spec_mut$MUT_COPY
 352 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_nAraw',sep="")] <- region_spec_mut$COPY_NUMBER_A
 353 |       input_format_absolute[mutation_id,paste(region_spec_mut$SAMPLE,'_nBraw',sep="")] <- region_spec_mut$COPY_NUMBER_B
 354 |       input_format_absolute[mutation_id,"PycloneCluster"] <- region_spec_mut$CLUSTER
 355 |       input_format_absolute[mutation_id,"CleanCluster"]  <- 1
 356 | 
 357 |     }
 358 | 
 359 |   }
 360 | 
 361 |   # Now create list object for input to treebuilding
 362 |   input_list$pyclone          <- input_format
 363 |   input_list$pyclone_absolute <- input_format_absolute
 364 |   input_list$sampleID         <- input_table$CASE_ID[1]
 365 |   input_list$prefix           <- prefix
 366 |   input_list$generalSave      <- out_dir
 367 |   input_list$merged_clusters   <- NA
 368 | 
 369 |   if(!file.exists(input_list$generalSave))
 370 |   {
 371 |     dir.create(input_list$generalSave,showWarnings = TRUE, recursive = TRUE, mode = "0775")
 372 |   }
 373 | 
 374 |   return(input_list)
 375 | }
 376 | 
 377 | 
 378 | #' TRACERx tree building function
 379 | #'
 380 | #' This function is the main CONIPHER wrapper function to run phylogenetic
 381 | #' tree building from mutation clustering output. NOTE: it is assumed that
 382 | #' clustering has been carried out prior to running tree building.
 383 | #' @param sample_input_list An R list object. This object contains information
 384 | #' about each mutation in each tumour region sampled, including which cluster
 385 | #' each mutation was assigned to in mutation clustering.
 386 | #' This object can be generated by running the data preprocessing function
 387 | #' @param ccf_buffer PhyloCCF buffer allowance for testing tree level issue
 388 | #' (default=10)
 389 | #' @param pval_cutoff P-value cut off for testing cluster nesting (default=0.01)
 390 | #' @param use_boot Should bootstrapping be used to compute confidence interval?
 391 | #' (default=TRUE)
 392 | #' @param merge_clusters Should similar clusters be merged if possible?
 393 | #' (default=TRUE)
 394 | #' @param correct_cpn_clusters Should clusters driven by copy number errors be
 395 | #' removed? (default=TRUE)
 396 | #' @param adjust_noisy_clusters Should noisy clusters be adjusted? (default=TRUE)
 397 | #' @param adjust_noisy_clusters_prop What is the minimum proportion of mutations
 398 | #' required to be present in a region to avoid cluster adjustment? (default=0.05)
 399 | #' @param min_ccf What is the minimum CCF threshold to consider a mutation as
 400 | #' present? (default=0.01)
 401 | #' @param min_cluster_size What is the minimum number of mutations required in a
 402 | #' cluster to be included in analysis? (default=5)
 403 | #' @param run.multi.trees Should alternative tumour phylogenies be explored?
 404 | #' (default=TRUE)
 405 | #' @param n_clusters_to_move When running multiple trees specify the maximum
 406 | #' number of clusters to attempt moving. (default=5)
 407 | #' @returns sample_pyclone_tree, an R list object containing output information
 408 | #' from CONIPHER tree building
 409 | #' @export treebuilding_run
 410 | 
 411 | treebuilding_run <- function(sample_input_list
 412 |                                   , ccf_buffer = 10
 413 |                                   , pval_cutoff = 0.01
 414 |                                   , use_boot = TRUE
 415 |                                   , merge_clusters = TRUE
 416 |                                   , correct_cpn_clusters = TRUE
 417 |                                   , adjust_noisy_clusters = FALSE
 418 |                                   , adjust_noisy_clusters_prop = 0.05
 419 |                                   , min_ccf = 0.01
 420 |                                   , min_cluster_size = 5
 421 |                                   , run.multi.trees = TRUE
 422 |                                   , n_clusters_to_move = 5
 423 | ) {
 424 |   suppressPackageStartupMessages(require(igraph))
 425 |   suppressPackageStartupMessages(require(mapplots))
 426 | 
 427 |   cat('\n\nStarting Tree Building')
 428 |   #first keep track of parameters used for this
 429 |   input_parameter_list <- list()
 430 |   input_parameter_list$sampleID                      <- sampleID               <- sample_input_list$sampleID
 431 |   input_parameter_list$prefix                        <- prefix                 <- sample_input_list$prefix
 432 |   input_parameter_list$generalSave                   <- generalSave            <- sample_input_list$generalSave
 433 |   input_parameter_list$ccf_buffer                    <- ccf_buffer
 434 |   input_parameter_list$pval_cutoff                   <- pval_cutoff
 435 |   input_parameter_list$use_boot                      <- use_boot
 436 |   input_parameter_list$merge_clusters                <- merge_clusters
 437 |   input_parameter_list$correct_cpn_clusters          <- correct_cpn_clusters
 438 |   input_parameter_list$adjust_noisy_clusters         <- adjust_noisy_clusters
 439 |   input_parameter_list$adjust_noisy_clusters_prop    <- adjust_noisy_clusters_prop
 440 |   input_parameter_list$min_ccf                       <- min_ccf
 441 |   input_parameter_list$min_cluster_size              <- min_cluster_size
 442 | 
 443 |   cat('\nFollowing parameters used for tree building:\n')
 444 |   print(do.call(rbind,input_parameter_list))
 445 | 
 446 |   # prepare the output
 447 |   output_list  <- list()
 448 |   output_list$ccf_table_pyclone                      <- sample_input_list$pyclone
 449 |   output_list$ccf_table_absolute                     <- sample_input_list$pyclone_absolute
 450 |   output_list$ccf_table_pyclone_clean                <- sample_input_list$pyclone
 451 |   output_list$ccf_table_absolute_clean               <- sample_input_list$pyclone_absolute
 452 | 
 453 |   output_list$merged_clusters                        <- sample_input_list$merged_clusters
 454 | 
 455 |   output_list$noisy_clusters_adjusted                <- NA
 456 |   output_list$cpn_removed_clusters                   <- NA
 457 |   output_list$tree_removed_clusters                  <- NA
 458 | 
 459 |   test_pyclone          <- sample_input_list$pyclone
 460 |   test_pyclone_absolute <- sample_input_list$pyclone_absolute
 461 | 
 462 |   if(adjust_noisy_clusters)
 463 |   {
 464 |     cat('\nAdjusting noisy clusters\n')
 465 |     pyclone_adj  <- clean.noisy.clusters(pyclone = test_pyclone,max.absent.prop = adjust_noisy_clusters_prop)
 466 |     test_pyclone <- pyclone_adj$corrected_pyclone
 467 | 
 468 |     output_list$ccf_table_pyclone_clean <- test_pyclone
 469 | 
 470 |     if(!is.na(pyclone_adj$corrected_cluster)[1])
 471 |     {
 472 |       tmp <- pyclone_adj$corrected_cluster
 473 |       colnames(tmp) <- c('region','cluster')
 474 |       output_list$noisy_clusters_adjusted <- tmp
 475 |       cat('\nThe following clusters were adjusted:\n')
 476 |       print(tmp)
 477 |     }
 478 | 
 479 |     pyclone_adj_absolute  <- clean.noisy.clusters(test_pyclone_absolute,max.absent.prop = adjust_noisy_clusters_prop)
 480 |     test_pyclone_absolute <- pyclone_adj_absolute$corrected_pyclone
 481 | 
 482 |     output_list$ccf_table_absolute_clean <- test_pyclone_absolute
 483 |   }
 484 | 
 485 |   # make sure you only use clean clusters
 486 |   test_pyclone          <- test_pyclone[test_pyclone[, "CleanCluster"] %in% 1, -ncol(test_pyclone)]
 487 |   test_pyclone_absolute <- test_pyclone_absolute[test_pyclone_absolute[, "CleanCluster"] %in% 1, -ncol(test_pyclone_absolute)]
 488 | 
 489 |   if (nrow(test_pyclone) < min_cluster_size) { stop('too few mutations to run tree building') }
 490 |   if (sort(table(test_pyclone[, 'PycloneCluster']), decreasing = T)[1] < min_cluster_size) { stop('too few mutations to run tree building') }
 491 | 
 492 |   clusters_with_min_cluster_sizeations <- table(test_pyclone[,'PycloneCluster'])[table(test_pyclone[,'PycloneCluster'])>=min_cluster_size]
 493 | 
 494 | 
 495 |   cat('\n\n\nDetermining nesting of clusters\n')
 496 |   nested_pyclone <- determine.cluster.nesting(pyclone = test_pyclone
 497 |                                               , prefix = prefix
 498 |                                               , min_cluster_size = max(c(2, min_cluster_size))
 499 |                                               , pval_cutoff = pval_cutoff
 500 |                                               , use_boot =use_boot
 501 |                                               , min_ccf =  min_ccf
 502 |   )
 503 | 
 504 |   # NM additional step, taking clonality into account when nesting [05/04/2022]
 505 |   directedGraph_input_full <- matrix(0, 0, 2)
 506 |   colsums <- colSums(nested_pyclone$nestedclust)
 507 |   rowsums <- rowSums(nested_pyclone$nestedclust)
 508 |   trunk_cluster <- names(colsums[which(colsums == max(colsums))])
 509 |   if (length(trunk_cluster) > 1) {
 510 |     trunk_cluster <- names(sort(rowMeans(nested_pyclone$ccf_cluster_table[trunk_cluster,, drop = F]), decreasing = T))[1]
 511 |   }
 512 | 
 513 |   clonality_table <- clonality.function(pyclone = test_pyclone
 514 |                                         ,trunk =trunk_cluster
 515 |                                         ,prefix = prefix
 516 |                                         , min_cluster_size = max(c(2, min_cluster_size))
 517 |                                         ,pval_cutoff = pval_cutoff
 518 |                                         ,use_boot =use_boot )
 519 | 
 520 | 
 521 |   nested_pyclone  <- correct.clonality.nesting(nestedlist = nested_pyclone
 522 |                                                , pyclone = test_pyclone
 523 |                                                , clonality_table = clonality_table
 524 |                                                , pval_cutoff = pval_cutoff
 525 |                                                , min_cluster_size = min_cluster_size
 526 |                                                , min_ccf = min_ccf
 527 |                                                , prefix = prefix
 528 |   )
 529 |   # finish additional step NM [05/04/2022]
 530 |   cat('\nThe following nesting identified:\n')
 531 |   print(nested_pyclone$nestedclust[,order(colSums(nested_pyclone$nestedclust),decreasing=T)])
 532 |   cat('\n')
 533 | 
 534 |   nested_pyclone_absolute <- determine.cluster.nesting(pyclone = test_pyclone_absolute
 535 |                                                        , prefix = prefix
 536 |                                                        , min_cluster_size = max(c(2, min_cluster_size))
 537 |                                                        , pval_cutoff = pval_cutoff
 538 |                                                        , use_boot =use_boot
 539 |                                                        , min_ccf =  min_ccf
 540 |   )
 541 | 
 542 | 
 543 |   # we only merge clusters if we're not using absolute.
 544 |   if(merge_clusters %in% TRUE &
 545 |      nrow(nested_pyclone$nestedclust) > 1 &
 546 |      nrow(nested_pyclone_absolute$nestedclust) > 1)
 547 |   {
 548 |     cat('\nChecking for cluster merging\n')
 549 |     out <- merge.clusters.full(test_pyclone = test_pyclone
 550 |                                ,test_pyclone_absolute = test_pyclone_absolute
 551 |                                ,nested_pyclone = nested_pyclone
 552 |                                ,nested_pyclone_absolute = nested_pyclone_absolute
 553 |                                ,prefix = prefix
 554 |                                ,min_ccf = min_ccf
 555 |                                ,p_value_cut = pval_cutoff
 556 |                                ,min_cluster_size = min_cluster_size
 557 |                                ,use_boot = use_boot
 558 |     )
 559 | 
 560 |     nested_pyclone   <- out$nested_pyclone
 561 |     test_pyclone     <- out$test_pyclone
 562 | 
 563 |     if(!is.na(out$mergedclusters[1]) & !is.na(output_list$merged_clusters))
 564 |     {
 565 |       output_list$merged_clusters <- rbind(out$mergedclusters,output_list$merged_clusters)
 566 |       cat('\nThe following clusters were merged:\n')
 567 |       print(out$mergedclusters)
 568 |       cat('\n')
 569 |     }
 570 |     else if (!is.na(out$mergedclusters[1])) {
 571 |       output_list$merged_clusters <- out$mergedclusters
 572 |       cat('\nThe following clusters were merged:\n')
 573 |       print(out$mergedclusters)
 574 |       cat('\n')
 575 |     }
 576 |     output_list$ccf_table_pyclone_clean <- test_pyclone
 577 |   }
 578 | 
 579 | 
 580 |   # remove the genomically clustered clones which may be driven by undetected subclonal copy number
 581 |   if(correct_cpn_clusters)
 582 |   {
 583 |     cat('\nChecking for chromosome clustered clusters')
 584 |     # determine clonal/trunk cluster using same method as for tree building
 585 |     directedGraph_input_full <- matrix(0, 0, 2)
 586 |     colsums <- colSums(nested_pyclone$nestedclust)
 587 |     rowsums <- rowSums(nested_pyclone$nestedclust)
 588 |     trunk_cluster <- names(colsums[which(colsums == max(colsums))])
 589 |     if (length(trunk_cluster) > 1) {
 590 |       trunk_cluster <- names(sort(rowMeans(nested_pyclone$ccf_cluster_table[trunk_cluster,, drop = F]), decreasing = T))[1]
 591 |     }
 592 |     #remove the genomically clustered clones which may be driven by undetected subclonal copy number
 593 |     new_test_pyclone <- remove_clustered_clones(test_pyclone,
 594 |                                                 clonal_cluster = trunk_cluster,
 595 |                                                 p_value_cut = 0.01,
 596 |                                                 clustering_estimate_cut = 2 )
 597 | 
 598 |     if(identical(new_test_pyclone,test_pyclone))
 599 |     {
 600 |       cat('\nNo clusters removed\n')
 601 |     }
 602 |     if(!identical(new_test_pyclone,test_pyclone))
 603 |     {
 604 |       #
 605 |       cat('\nThe following clusters removed due to genomic clustering:\n')
 606 |       # sort out if copy number cluster removed.
 607 |       cpn_removed_clusters <- names(nested_pyclone$cluster_qc[,'ClusterName'])[!nested_pyclone$cluster_qc[,'ClusterName']%in%unique(new_test_pyclone[,'PycloneCluster'])]
 608 |       nested_pyclone$cluster_qc[nested_pyclone$cluster_qc[,'ClusterName']%in%cpn_removed_clusters,'CopyNumRemove'] <- 1
 609 |       output_list$cpn_removed_clusters <- cpn_removed_clusters
 610 | 
 611 | 
 612 | 
 613 |       cat(cpn_removed_clusters)
 614 |       cat('\n')
 615 |     }
 616 | 
 617 |     test_pyclone    <- new_test_pyclone
 618 |     output_list$ccf_table_pyclone_clean <- test_pyclone
 619 |   }
 620 | 
 621 |   cat('\nBuilding trees...')
 622 |   # check whether this means all the clusters are removed.
 623 |   graph_pyclone  <- grow.trees( nestedlist = nested_pyclone
 624 |                                 , pyclone = test_pyclone
 625 |                                 , min_cluster_size = min_cluster_size
 626 |                                 , force_trunk = TRUE
 627 |                                 , ccf_buffer = ccf_buffer
 628 |   )
 629 | 
 630 |   output_list$tree_removed_clusters <-  graph_pyclone$Clusters_with_issues
 631 | 
 632 |   cat('\n------------------\n')
 633 |   cat('\nTree identified\n')
 634 | 
 635 |   clonality_table <- clonality.function(pyclone = test_pyclone
 636 |                                         ,trunk = graph_pyclone$trunk
 637 |                                         ,prefix = prefix
 638 |                                         , min_cluster_size = max(c(2, min_cluster_size))
 639 |                                         ,pval_cutoff = pval_cutoff
 640 |                                         ,use_boot = use_boot )
 641 | 
 642 |   clonality_out   <- correct.clonality.table(clonality_table = clonality_table
 643 |                                              , graph_pyclone = graph_pyclone
 644 |                                              , trunk_cluster = graph_pyclone$trunk) #TODO still may need correcting for one region cases #EC 20210509
 645 | 
 646 |   ### AH edit set CCF in ccf cluster table as well as upper and lower CIs to 0 if cluster is defined as absent
 647 |   for (region in colnames(clonality_table)) {
 648 |     tmp.absentClust <- rownames(clonality_table)[clonality_table[,region] == "absent"]
 649 |     if (any(nested_pyclone$ccf_cluster_table[tmp.absentClust, region] != 0)) {
 650 |       print("Absent clusters with meanCCF > 0. Resetting to 0 in ccf cluster table")
 651 |       nested_pyclone$ccf_cluster_table[tmp.absentClust, region] <- 0
 652 |       nested_pyclone$ccf_ci_lower[tmp.absentClust, region] <- 0
 653 |       nested_pyclone$ccf_ci_upper[tmp.absentClust, region] <- 0
 654 |     }
 655 |   }
 656 |   ### AH edit done
 657 | 
 658 |   if (run.multi.trees) {
 659 |     cat('\nExploring presence of multiple alternate trees')
 660 |     multi.trees     <- grow.multi.trees(nestedlist = nested_pyclone
 661 |                                         ,pyclone = test_pyclone
 662 |                                         ,graph_pyclone = graph_pyclone
 663 |                                         ,ccf_buffer = ccf_buffer
 664 |                                         ,n_clusters_to_move = n_clusters_to_move
 665 |     )
 666 | 
 667 |   } else {
 668 |     multi.trees <- NULL
 669 |   }
 670 | 
 671 |   graph_pyclone$alt_trees          <- multi.trees$good.trees
 672 | 
 673 | 
 674 |   if(length(multi.trees)==0)
 675 |   {
 676 |     graph_pyclone$consensus_branches <- paste(graph_pyclone$default_tree[,1],graph_pyclone$default_tree[,2],sep=":")
 677 |     graph_pyclone$nested_clust       <- nested_pyclone[[1]]
 678 | 
 679 |     # list all clone - clone relationships which are common to all alternative trees
 680 |     # This captures some tree info for clones where the exact tree position is uncertain
 681 |     graph_pyclone$consensus_relationships <- extract_consensus_relationships( list(graph_pyclone$default_tree ) )
 682 |     graph_pyclone$alt_trees <- list(graph_pyclone$default_tree)
 683 |   }
 684 | 
 685 |   if(length(multi.trees)!=0)
 686 |   {
 687 |     # check whether any repeats in alt_trees [this can happen due to level issue]
 688 |     tree_vector <- c()
 689 |     for (i in 1:length(graph_pyclone$alt_trees))
 690 |     {
 691 |       tree_vector <- c(tree_vector,PasteVector(sort(paste(graph_pyclone$alt_trees[[i]][,1],graph_pyclone$alt_trees[[i]][,2],sep=":")),sep=","))
 692 | 
 693 |     }
 694 | 
 695 |     alt_trees    <- list()
 696 |     trees_to_use <-  c(1:length(graph_pyclone$alt_trees))[!duplicated(tree_vector)]
 697 |     for (i in 1:length(trees_to_use))
 698 |     {
 699 |       alt_trees[[i]] <- graph_pyclone$alt_trees[[trees_to_use[i]]]
 700 |     }
 701 | 
 702 |     graph_pyclone$alt_trees <- alt_trees
 703 | 
 704 |     graph_pyclone$consensus_branches <- multi.trees$consensus.branches
 705 |     graph_pyclone$nested_clust       <- multi.trees$consensus.nestedclust
 706 | 
 707 |     # list all clone - clone relationships which are common to all alternative trees
 708 |     # This captures some tree info for clones where the exact tree position is uncertain
 709 |     graph_pyclone$consensus_relationships <- extract_consensus_relationships( alt_trees )
 710 | 
 711 |   }
 712 | 
 713 | 
 714 |   ### Compute alternative tree metrics:
 715 |   # 1) Compute sum condition error SCE for each alternative tree + find trees with lowest SCE:
 716 | 
 717 |   cat('\n\nComputing sum condition error for each alternative tree')
 718 |   graph_pyclone$alt_trees_sum_condition_error <- compute_sum_condition_error(tree_list = graph_pyclone$alt_trees, ccf_cluster_table = nested_pyclone$ccf_cluster_table, trunk = trunk_cluster)
 719 |   graph_pyclone$min_sce_trees <- names(which(graph_pyclone$alt_trees_sum_condition_error == min(graph_pyclone$alt_trees_sum_condition_error)))
 720 |   cat('\nTrees with minimum sum condition error: ', graph_pyclone$min_sce_trees, '\n')
 721 | 
 722 |   # 2) Compute edge probability for each alternative tree + find trees with highest edge probability:
 723 | 
 724 |   cat('\n\nComputing edge probability score for each alternative tree\n')
 725 |   graph_pyclone$alt_trees_edge_probability <- compute_tree_edge_probability(tree_list = graph_pyclone$alt_trees, edgelength = graph_pyclone$edgelength, trunk = trunk_cluster)
 726 |   graph_pyclone$max_edge_probability_trees <- names(which(graph_pyclone$alt_trees_edge_probability == max(graph_pyclone$alt_trees_edge_probability)))
 727 |   cat('\nTrees with maximum edge probability: ', graph_pyclone$max_edge_probability_trees, '\n')
 728 | 
 729 | 
 730 |   ### Compute clone proportions output:
 731 |   # 1) Compute subclone proportions from default tree:
 732 |   cat('\n\nComputing clone proportions from default tree\n')
 733 |   clone_proportion_table <- compute_subclone_proportions(tree_list = graph_pyclone$alt_trees,
 734 |                                                          ccf_cluster_table = nested_pyclone$ccf_cluster_table,
 735 |                                                          clonality_table = clonality_out$clonality_table_corrected,
 736 |                                                          trunk = trunk_cluster,
 737 |                                                          force_clonal_100 = TRUE,
 738 |                                                          tree_id = 1)
 739 | 
 740 |   # 2) Compute subclone proportions from lowest error tree:
 741 |   cat('\n\nComputing clone proportions from tree with lowest sum condition error\n')
 742 |   clone_proportions_min_sce_trees <- lapply(graph_pyclone$min_sce_trees, function(i){
 743 |     compute_subclone_proportions(tree_list = graph_pyclone$alt_trees,
 744 |                                  ccf_cluster_table = nested_pyclone$ccf_cluster_table,
 745 |                                  clonality_table = clonality_out$clonality_table_corrected,
 746 |                                  trunk = trunk_cluster,
 747 |                                  force_clonal_100 = TRUE,
 748 |                                  tree_id = as.numeric(i))
 749 |   })
 750 |   names(clone_proportions_min_sce_trees) <- graph_pyclone$min_sce_trees
 751 |   clone_proportion_out <- list(clone_proportion_table = clone_proportion_table, clone_proportions_min_sce_trees = clone_proportions_min_sce_trees)
 752 | 
 753 | 
 754 |   ### Compute subclonal expansion score:
 755 |   # 1) Compute subclonal expansion score from default tree:
 756 |   cat('\n\nComputing subclonal expansion score from default tree\n')
 757 |   subclonal_exp_score <- compute_subclonal_expansion_score(tree_list = graph_pyclone$alt_trees,
 758 |                                                            tree_id = 1,
 759 |                                                            ccf_table_pyclone_clean = output_list$ccf_table_pyclone_clean)
 760 | 
 761 |   # 2) Compute subclonal expansion score from lowest error tree:
 762 |   cat('\n\nComputing subclonal expansion score from tree with lowest sum condition error\n')
 763 |   subclonal_exp_score_min_sce_trees <- lapply(graph_pyclone$min_sce_trees, function(i){
 764 |     compute_subclonal_expansion_score(tree_list = graph_pyclone$alt_trees,
 765 |                                       tree_id = as.numeric(i),
 766 |                                       ccf_table_pyclone_clean = output_list$ccf_table_pyclone_clean)
 767 | 
 768 |   })
 769 |   names(subclonal_exp_score_min_sce_trees) <- graph_pyclone$min_sce_trees
 770 |   subclonal_exp_score_out <- list(subclonal_exp_score = subclonal_exp_score, subclonal_exp_score_min_sce_trees = subclonal_exp_score_min_sce_trees)
 771 | 
 772 | 
 773 |   ### Finally, save all tree output
 774 |   # Save sample ID
 775 |   graph_pyclone$sampleID  <- sampleID
 776 |   graph_pyclone$long_sampleID  <- trx_rename.fn(sampleID, trialID = prefix)
 777 | 
 778 |   # Saving all output to list
 779 |   output_list$graph_pyclone                           <- graph_pyclone
 780 |   output_list$parameters                              <- input_parameter_list
 781 |   output_list$nested_pyclone                          <- nested_pyclone
 782 |   output_list$clonality_table                         <- clonality_table
 783 |   output_list$clonality_out                           <- clonality_out
 784 |   output_list$clone_proportion_out                    <- clone_proportion_out
 785 |   output_list$subclonal_expansion_score_out           <- subclonal_exp_score_out
 786 | 
 787 |   #let's save the output_list
 788 |   output_rds <- file.path(generalSave, paste0(sampleID, ".tree.RDS"))
 789 |   saveRDS(output_list, file = output_rds)
 790 | 
 791 |   return(output_list)
 792 | }
 793 | 
 794 | 
 795 | #' TRACERx tree plotting function
 796 | #'
 797 | #' This function is the CONIPHER function to plot the inferred phylogenetic tree.
#' @param sample_pyclone_tree A list containing all information about the
#' tree inferred using function treebuilding_run()
 800 | #' @importFrom grDevices "colorRampPalette" "dev.off" "pdf"
 801 | #' @importFrom graphics "abline" "axis" "barplot" "layout" "legend"
 802 | #' "par" "plot.new" "segments" "text" "title"
 803 | #' @importFrom igraph "get.edgelist"
 804 | #' @export treebuilding_plot
 805 | 
 806 | treebuilding_plot <- function(sample_pyclone_tree) {
 807 |   cat('\n Plotting inferred phylogenetic tree \n')
 808 |   require(mapplots)
 809 |   sampleID  <-  sample_pyclone_tree$parameters$sampleID
 810 |   prefix <- sample_pyclone_tree$parameters$prefix
 811 |   generalSave <- sample_pyclone_tree$parameters$generalSave
 812 |   ccf_buffer <- sample_pyclone_tree$parameters$ccf_buffer
 813 |   pval_cutoff <- sample_pyclone_tree$parameters$pval_cutoff
 814 |   use_boot <- sample_pyclone_tree$parameters$use_boot
 815 |   merge_clusters <- sample_pyclone_tree$parameters$merge_clusters
 816 |   correct_cpn_clusters <- sample_pyclone_tree$parameters$correct_cpn_clusters
 817 |   adjust_noisy_clusters <- sample_pyclone_tree$parameters$adjust_noisy_clusters
 818 |   adjust_noisy_clusters_prop <- sample_pyclone_tree$parameters$adjust_noisy_clusters_prop
 819 |   min_ccf <- sample_pyclone_tree$parameters$min_ccf
 820 |   min_cluster_size           <- sample_pyclone_tree$parameters$min_cluster_size
 821 | 
 822 |   nested_pyclone <- sample_pyclone_tree$nested_pyclone
 823 |   pyclone_tree   <- sample_pyclone_tree$graph_pyclone
 824 |   clonality_table <- sample_pyclone_tree$clonality_out$clonality_table_corrected
 825 |   clonality_out <- sample_pyclone_tree$clonality_out
 826 |   test_pyclone  <- sample_pyclone_tree$ccf_table_pyclone_clean
 827 |   cpn_removed_clusters <- sample_pyclone_tree$cpn_removed_clusters
 828 |   if(length(cpn_removed_clusters)==0)
 829 |   {
 830 |     cpn_removed_clusters <- NA
 831 |   }
 832 | 
 833 |   merged_clusters <- sample_pyclone_tree$merged_clusters
 834 | 
 835 |   ### Plot trees -- AUTOMATIC
 836 |   date <- gsub('-', '', substr(Sys.time(), 1, 10))
 837 | 
 838 |   pdfname <- file.path(generalSave, 'pytree_and_bar.pdf')
 839 | 
 840 |   height.mult.factor <- ceiling(nrow(nested_pyclone$ccf_cluster_table)/25)
 841 |   width.mult.factor  <- ceiling(nrow(nested_pyclone$ccf_cluster_table)/25)
 842 | 
 843 | 
 844 |   pdf(pdfname, width=22*width.mult.factor, height=12*height.mult.factor)
 845 |   {
 846 |     par(mar=c(0,0,0,0))
 847 |     layout(cbind(1:(nrow(nested_pyclone$ccf_cluster_table)+2),rep(nrow(nested_pyclone$ccf_cluster_table)+3,nrow(nested_pyclone$ccf_cluster_table)+2),rep(nrow(nested_pyclone$ccf_cluster_table)+3,nrow(nested_pyclone$ccf_cluster_table)+2)))
 848 |     require(beeswarm)
 849 | 
 850 |     tmp <- nested_pyclone$ccf_cluster_table
 851 |     main <- paste(substr(colnames(tmp)[1], 1, 8), '\ Phylo CCF values', sep = '')
 852 |     colnames(tmp) <- gsub(paste0(substr(colnames(tmp)[1], 1, 8), "_"), "", colnames(tmp))
 853 |     suppressPackageStartupMessages(require(gplots))
 854 |     plot.new()
 855 |     par(mar=c(2,2,2,2))
 856 |     title(main, cex = 2)
 857 | 
 858 |     colours.to.use <- color.tree(1:nrow(nested_pyclone$ccf_cluster_table))
 859 | 
 860 |     par(mar=c(0.1,5,0.1,2),lend=1)
 861 | 
 862 |     for (j in 1:nrow(nested_pyclone$ccf_cluster_table))
 863 |     {
 864 | 
 865 |       if(j==1)
 866 |       {
 867 |         border.col <- ifelse(clonality_table[j,]=='clonal','black','grey')
 868 |         bp <- barplot(nested_pyclone$ccf_cluster_table[j,],las=1,col=colours.to.use[j],border=border.col,names="",ylab=paste("Cl",rownames(nested_pyclone$ccf_cluster_table)[j],sep=" "),ylim=c(0,115),yaxt='n',cex.axis=1.25)
 869 | 
 870 |       }
 871 |       if(j!=1)
 872 |       {
 873 |         border.col <- ifelse(clonality_table[j,]=='clonal','black','grey')
 874 |         bp <- barplot(nested_pyclone$ccf_cluster_table[j,],las=1,col=colours.to.use[j],border=border.col,names="",ylab=paste("Cl",rownames(nested_pyclone$ccf_cluster_table)[j],sep=" "),ylim=c(0,115),yaxt='n',cex.axis=1.25)
 875 | 
 876 |       }
 877 |       axis(side = 2,at = c(0,50,100),labels=c(c(0,50,100)),las=1)
 878 |       if(j ==nrow(nested_pyclone$ccf_cluster_table))
 879 |       {
 880 |         axis(side=1,at=bp,labels=gsub(paste0(substr(colnames(nested_pyclone$ccf_cluster_table)[1], 1, 8), "_"), "",colnames(nested_pyclone$ccf_cluster_table))
 881 |              ,tick=FALSE
 882 |              ,cex.axis=1.25)
 883 | 
 884 |       }
 885 |       abline(h=0)
 886 |       abline(h=100,lty='dashed')
 887 |       abline(h=50,lty='dashed')
 888 |       for (bar in 1:length(bp))
 889 |       {
 890 | 
 891 |         beeswarm(test_pyclone[test_pyclone[,'PycloneCluster']%in%rownames(nested_pyclone$ccf_cluster_table)[j],grep('PhyloCCF',colnames(test_pyclone))[bar]]*100
 892 |                  ,at=bp[bar]
 893 |                  ,add=TRUE
 894 |                  ,corralWidth = 0.5
 895 |                  ,method='swarm'
 896 |                  ,corral='wrap'
 897 |                  ,pch=21
 898 |                  ,col=colours.to.use[j]
 899 |                  ,bg='grey')
 900 |         segments(x0 = bp[bar],x1 = bp[bar],y0 = nested_pyclone$ccf_ci_lower[j,bar],y1 = nested_pyclone$ccf_ci_upper[j,bar],lwd=5)
 901 |         text(x=bp[bar],y=25,labels=nested_pyclone$ccf_cluster_table[j,bar],cex =1.5)
 902 |       }
 903 |     }
 904 | 
 905 |     plot.new()
 906 |     par(mar=c(2.1, 2.1, 4.1, 38), xpd=TRUE)
 907 | 
 908 |     g <- graph.data.frame(pyclone_tree$default_tree,directed = FALSE)
 909 |     indx <- V(g)$name
 910 |     vcol <- setNames(color.tree(pyclone_tree$edgelength), names(pyclone_tree$edgelength))[indx]
 911 | 
 912 |     l <- layout_as_tree(g, root = pyclone_tree$trunk)
 913 | 
 914 |     pie.size <- ncol(sample_pyclone_tree$nested_pyclone$ccf_cluster_table)
 915 |     node.shape <- setNames(rep('pie', length(vcol)), names(vcol))
 916 |     pie.slices <- lapply(1:length(vcol), function(x) rep(1, pie.size))
 917 |     empty.col = '#bdbdbd'#'white'
 918 | 
 919 |     node_size_factor <- log2(max(pyclone_tree$edgelength)) / 30
 920 |     node.size <- log2(pyclone_tree$edgelength) / node_size_factor
 921 |     node.size <- node.size[names(node.shape)]
 922 | 
 923 |     pie.colors <- sample_pyclone_tree$nested_pyclone$ccf_cluster_table[match(names(vcol), rownames(sample_pyclone_tree$nested_pyclone$ccf_cluster_table)),, drop = F]
 924 |     pie.colors <- ifelse(pie.colors>=90,99,pie.colors)
 925 |     pie.colors <- ifelse(pie.colors<10&pie.colors>=1,10,pie.colors)
 926 |     pie.colors <- lapply(1:nrow(pie.colors), function(x) {
 927 |       if(!all(is.na(pie.colors[x,]))){
 928 |         tmp     <- pie.colors[x,]
 929 |         tmp2    <- tmp
 930 |         colfunc <- colorRampPalette(c("white", vcol[rownames(pie.colors)[x]]))
 931 |         speccolours <- colfunc(100)
 932 |         tmp[tmp>0]  <- speccolours[tmp]
 933 |         tmp[tmp2 == 0] <- empty.col
 934 |         tmp
 935 |       }
 936 |     })
 937 | 
 938 | 
 939 |     g_dir <- graph.data.frame(pyclone_tree$default_tree,directed = TRUE)
 940 |     edges <- get.edgelist(g_dir)
 941 |     ecol <- setNames(rep('#bdbdbd', nrow(edges)),edges[,2])# baseline, set edge color to black
 942 |     ewidth <- rep(1,length(ecol))
 943 | 
 944 |     #label consensus edges in other colour
 945 |     ecol[paste(edges[,1],edges[,2],sep=":")%in%pyclone_tree$consensus_branches] <- '#000000'
 946 |     ewidth[paste(edges[,1],edges[,2],sep=":")%in%pyclone_tree$consensus_branches] <-150
 947 | 
 948 |     plot(g
 949 |          , layout=l
 950 |          , main = sampleID
 951 |          , vertex.color = vcol[indx]
 952 |          , vertex.frame.color=vcol[indx]
 953 |          , vertex.shape = node.shape
 954 |          , vertex.lwd=5
 955 |          , vertex.pie.lwd=3
 956 |          , vertex.pie = pie.slices
 957 |          , vertex.pie.color = lapply(pie.colors,rev)
 958 |          , vertex.size = node.size
 959 |          , edge.color=ecol
 960 |          , edge.size=ewidth
 961 |          , vertex.label.cex=2
 962 |          , vertex.label.pos=2
 963 |          , vertex.label.dist=0
 964 |          , vertex.label.family='Helvetica'
 965 |          , vertex.label.font=2
 966 |          , vertex.label.color = 'black')
 967 | 
 968 | 
 969 |     legend.pie(1,1,labels=gsub(paste0(substr(colnames(tmp)[1], 1, 8), "_"), "", colnames(tmp)), radius=0.2, bty="n", col='#bdbdbd',
 970 |                cex=1.25, label.dist=0.8
 971 |                ,border='white')
 972 | 
 973 |     snv_clusters <- sort(pyclone_tree$edgelength[indx], decreasing = T)
 974 | 
 975 |     snv_clusters_removed <- pyclone_tree$edgelength
 976 |     snv_clusters_removed <- sort(snv_clusters_removed[!names(snv_clusters_removed) %in% indx], decreasing = T)
 977 |     if(!is.na(cpn_removed_clusters[1]))
 978 |     {
 979 |       snv_clusters_removed <- c(snv_clusters_removed,table(sample_pyclone_tree$ccf_table_pyclone[,'PycloneCluster'])[cpn_removed_clusters])
 980 |     }
 981 | 
 982 |     tmp <- legend('topright', inset = c(-0.3, 0), legend = paste(names(snv_clusters), ' (', snv_clusters,' SNVs)', sep = ''), col = vcol[names(snv_clusters)], pch = 19, title = 'Clusters included:', bty = 'n')  ## inset option controls how far from x and y margins
 983 |     if (length(snv_clusters_removed) > 0) {
 984 |       if(!is.na(cpn_removed_clusters[1]))
 985 |       {
 986 |         to_plot <- table(sample_pyclone_tree$ccf_table_pyclone[,'PycloneCluster'])[cpn_removed_clusters]
 987 |         legend(x=tmp$rect$left,y = 0,inset = c(-0.3, 0),legend = paste(names(to_plot), ' (', to_plot, ' SNVs)', sep = ''), col = vcol[names(to_plot)], pch = 19, title = 'Copy# clusters removed:', bty = 'n')
 988 | 
 989 |       }
 990 |       legend('bottomright', inset = c(-0.3, 0), legend = paste(names(snv_clusters_removed), ' (', snv_clusters_removed, ' SNVs)', sep = ''), col = vcol[names(snv_clusters_removed)], pch = 19, title = 'Clusters removed:', bty = 'n')
 991 |     }
 992 | 
 993 | 
 994 | 
 995 | 
 996 |   }
 997 |   dev.off()
 998 | 
 999 |   #next, plot all the possible trees
1000 |   trees.to.plot <- pyclone_tree$alt_trees
1001 |   if(length(trees.to.plot)==0)
1002 |   {
1003 |     #nothing to plot here.
1004 |   }
1005 |   if(length(trees.to.plot)!=0)
1006 |   {
1007 | 
1008 |     date <- gsub('-', '', substr(Sys.time(), 1, 10))
1009 | 
1010 |     pdfname <- file.path(generalSave, 'pytree_multipletrees.pdf')
1011 | 
1012 |     mult.factor <- ceiling(length(trees.to.plot)/50)
1013 | 
1014 |     pdf(pdfname, width=12*mult.factor, height=12*mult.factor)
1015 |     {
1016 |       nr.trees              <- length(trees.to.plot)
1017 |       columnnum             <- 1
1018 |       rownum                <- nr.trees/columnnum
1019 | 
1020 |       if(nr.trees<=50)
1021 |       {
1022 |         nr.to.use <- nr.trees
1023 |       }
1024 | 
1025 |       if(nr.trees>50)
1026 |       {
1027 |         nr.to.use <- signif(nr.trees+5,2)
1028 |       }
1029 | 
1030 |       for(i in 1: nr.to.use) {
1031 |         if((nr.to.use %% i) == 0) {
1032 |           if((i+(nr.to.use/i))<(columnnum+rownum))
1033 |           {
1034 |             columnnum <- i
1035 |             rownum    <- nr.to.use/columnnum
1036 |           }
1037 |         }
1038 |       }
1039 | 
1040 |       if(columnnum==1)
1041 |       {
1042 |         columnnum <- ceiling(columnnum*2)
1043 |         rownum    <- ceiling(rownum/2)
1044 |       }
1045 | 
1046 |       par(mfrow=c(rownum,columnnum),xpd=TRUE,mar=c(1, 1,1, 1))
1047 |       for (i in 1:nr.trees)
1048 |       {
1049 |         auto_tree    <- trees.to.plot[[i]]
1050 |         g <- graph.data.frame(auto_tree,directed = FALSE)
1051 |         indx <- V(g)$name
1052 |         vcol <- setNames(color.tree(pyclone_tree$edgelength), names(pyclone_tree$edgelength))[indx]
1053 | 
1054 |         l <- layout_as_tree(g, root = pyclone_tree$trunk)
1055 | 
1056 |         pie.size <- ncol(nested_pyclone$ccf_cluster_table)
1057 |         node.shape <- setNames(rep('pie', length(vcol)), names(vcol))
1058 |         pie.slices <- lapply(1:length(vcol), function(x) rep(1, pie.size))
1059 |         empty.col = 'gray85'
1060 | 
1061 |         node_size_factor <- log2(max(pyclone_tree$edgelength)) / 30
1062 |         node.size <- log2(pyclone_tree$edgelength) / node_size_factor
1063 |         node.size <- node.size[names(node.shape)]
1064 | 
1065 |         pie.colors <- nested_pyclone$ccf_cluster_table[match(names(vcol), rownames(nested_pyclone$ccf_cluster_table)),, drop = F]
1066 |         pie.colors <- lapply(1:nrow(pie.colors), function(x) {
1067 |           if(!all(is.na(pie.colors[x,]))){
1068 |             tmp <- pie.colors[x,]
1069 |             tmp[tmp > 0] <- vcol[rownames(pie.colors)[x]]
1070 |             tmp[tmp == 0] <- empty.col
1071 |             tmp
1072 |           }
1073 |         })
1074 | 
1075 |         g_dir <- graph.data.frame(auto_tree,directed = TRUE)
1076 |         edges <- get.edgelist(g_dir)
1077 |         ecol <- setNames(rep('#bdbdbd', nrow(edges)),edges[,2])# baseline, set edge color to black
1078 |         ewidth <- rep(1,length(ecol))
1079 | 
1080 |         #label consensus edges in other colour
1081 |         ecol[paste(edges[,1],edges[,2],sep=":")%in%pyclone_tree$consensus_branches] <- '#000000'
1082 |         ewidth[paste(edges[,1],edges[,2],sep=":")%in%pyclone_tree$consensus_branches] <- 2
1083 | 
1084 |         plot(g, main = sampleID
1085 |              , layout = l
1086 |              , vertex.color = vcol[indx]
1087 |              , vertex.shape = node.shape
1088 |              , vertex.pie = pie.slices
1089 |              , vertex.pie.color = pie.colors
1090 |              , vertex.pie.lty = 0
1091 |              , vertex.size = node.size
1092 |              , edge.width =ewidth
1093 |              , edge.color=ecol
1094 |              , arrow.size =0
1095 |              ,arrow.width=0
1096 |              ,arrow.mode=0
1097 |         )
1098 | 
1099 |       }
1100 | 
1101 |     }
1102 |     dev.off()
1103 |   }
1104 | }
1105 | 
1106 | 
1107 | 
1108 | 
1109 | 


--------------------------------------------------------------------------------
/R/main_clustering_functions.R:
--------------------------------------------------------------------------------
   1 | #' Full clustering run function
   2 | #'
   3 | #' Runs the three main clustering steps in order: preprocess, clustering run
   4 | #' and postprocess. The output directory is created when it does not exist.
   5 | #' @param case_id Identifier of the tumour case; printed when the run starts.
   6 | #' @param out_dir Output directory. It is created (recursively) if missing.
   7 | #' @param input_tsv_loc Path to the tab-delimited input mutation table.
   8 | #' @param input_seg_tsv_loc Optional path to a segment table; passed on
   9 | #' unchanged to \code{clustering_postprocess}. Default is NULL.
  10 | #' @param subclonal_copy_correction A logical value that specifies whether
  11 | #' subclonal copy number correction should be performed. Default is TRUE.
  12 | #' @param only_truncal_subclonal_copy_correction A logical value that specifies
  13 | #' whether only truncal subclonal copy number correction should be used.
  14 | #' Default is TRUE.
  15 | #' @param pyclone_yaml_loc Path to a PyClone template yaml file. When NULL
  16 | #' (default) the template shipped with the package is used.
  17 | #' @param min_cluster_size Passed on unchanged to
  18 | #' \code{clustering_postprocess}. Default is 5.
  19 | #' @param multiple_test_correction A logical value that specifies whether
  20 | #' multiple testing correction should be applied for the copy number
  21 | #' correcting mutations. Default is TRUE.
  22 | #' @param clean_clusters Passed on unchanged to
  23 | #' \code{clustering_postprocess}. Default is TRUE.
  24 | #' @param clonal_cutOff Passed on unchanged to
  25 | #' \code{clustering_postprocess}. Default is 0.9.
  26 | #' @param propClonal_threshold Passed on unchanged to
  27 | #' \code{clustering_postprocess}. Default is 0.25.
  28 | #' @param fix_absentCCFs A logical value that specifies whether the CCF of
  29 | #' absent mutations should be set to zero. Default is TRUE.
  30 | #' @param driver_filter Comma-separated string of driver categories; it is
  31 | #' split on "," and passed to \code{clustering_postprocess} as
  32 | #' \code{driver_cat}. Default is "1A,1,2A".
  33 | #' @param burn_in Burn-in for DP clustering. Default is 1000.
  34 | #' @param seed Seed for the PyClone run. Default is 1024.
  35 | #' @param nProcs Number of parallel PyClone processes. Default is 1.
  36 | #' @param ... Currently unused.
  37 | #' @returns The value of the final \code{clustering_postprocess} call
  38 | #' (historically documented as NULL); the function is called for its side
  39 | #' effects.
  40 | #' @export conipher_clustering
  41 | 
  42 | conipher_clustering <- function(case_id,
  43 |                                 out_dir,
  44 |                                 input_tsv_loc,
  45 |                                 input_seg_tsv_loc = NULL,
  46 |                                 subclonal_copy_correction = TRUE,
  47 |                                 only_truncal_subclonal_copy_correction = TRUE,
  48 |                                 pyclone_yaml_loc = NULL,
  49 |                                 min_cluster_size = 5,
  50 |                                 multiple_test_correction = TRUE,
  51 |                                 clean_clusters = TRUE,
  52 |                                 clonal_cutOff = 0.9,
  53 |                                 propClonal_threshold = 0.25,
  54 |                                 fix_absentCCFs = TRUE,
  55 |                                 driver_filter = "1A,1,2A",
  56 |                                 burn_in = 1000,
  57 |                                 seed = 1024,
  58 |                                 nProcs = 1,
  59 |                                 ...) {
  60 |     patient              <- case_id
  61 |     # the slash-suffixed form is what every downstream step receives
  62 |     new.dir              <- paste0(out_dir, "/")
  63 |     driver_cat           <- unlist(strsplit(driver_filter, split = ","))
  64 | 
  65 |     # resolve the PyClone template: packaged default unless a path is supplied
  66 |     if (is.null(pyclone_yaml_loc)) {
  67 |         template.config.yaml <- system.file("extdata", "template.config.yaml", package = "CONIPHER", mustWork = TRUE)
  68 |     } else if (file.exists(pyclone_yaml_loc)) {
  69 |         template.config.yaml <- pyclone_yaml_loc
  70 |     } else {
  71 |         stop("PyClone template yaml file does not exist. \nPlease specify full path to file or set parameter to NULL to use default.\n")
  72 |     }
  73 | 
  74 |     cat("\nCONIPHER clustering analysis of the following tumour case:\n")
  75 |     print(patient)
  76 |     cat("\n")
  77 | 
  78 |     # Test the directory itself, not the slash-suffixed path: file.exists() on
  79 |     # "dir/" also succeeds for a regular file, and trailing separators in
  80 |     # directory names are documented as unsupported on Windows.
  81 |     if (!dir.exists(out_dir)) {
  82 |         if (!dir.create(out_dir, recursive = TRUE)) {
  83 |             stop("Unable to create root directory.\n")
  84 |         }
  85 |     }
  86 | 
  87 |     if(!file.exists(input_tsv_loc)) {
  88 |         stop("Unable to find input_tsv.\n")
  89 |     }
  90 |     input_tsv     <- read.delim(input_tsv_loc, sep = "\t", stringsAsFactors = FALSE, header = TRUE, fill = TRUE, quote = "")
  91 |     if (nrow(input_tsv) == 0) {
  92 |         stop('No mutations passed filtering, stopping PyClone phylo clustering')
  93 |     }
  94 | 
  95 |     ### fix issue with sample names including '-'
  96 |     input_tsv$SAMPLE  <- gsub("-", "\\.", input_tsv$SAMPLE)
  97 | 
  98 |     # the three pipeline stages; min_ccf_present is fixed at 0.1 here
  99 |     input_list     <- clustering_preprocess(input_tsv, new.dir = new.dir, subclonal_copy_correction = subclonal_copy_correction, multiple_test_correction = multiple_test_correction, only_truncal_subclonal_copy_correction = only_truncal_subclonal_copy_correction, fix_absentCCFs = fix_absentCCFs)
 100 |     sample.results <- clustering_run(input_list, nProcs = nProcs, new.dir = new.dir, burn_in = burn_in, pyclone_seed = seed, template.config.yaml = template.config.yaml)
 101 |     clustering_postprocess(input_list, sample.results, new.dir = new.dir, input_tsv = input_tsv, input_seg_tsv_loc = input_seg_tsv_loc, min_cluster_size = min_cluster_size, driver_cat = driver_cat, clean_clusters = clean_clusters, min_ccf_present = 0.1, clonal_cutOff = clonal_cutOff, propClonal_threshold = propClonal_threshold)
 102 | }
  65 | 
  66 | 
  67 | #' Input data preprocessing function
  68 | #'
  69 | #' This function takes the input tsv and formats the data to be compatible with
  70 | #' the main CONIPHER clustering function.
  71 | #' @param input_table A data frame of the input mutation table in the correct
  72 | #' format. Columns read here: CASE_ID, SAMPLE, CHR, POS, REF, ALT, REF_COUNT, VAR_COUNT, DEPTH, COPY_NUMBER_A, COPY_NUMBER_B, PLOIDY, ACF; MUT_TYPE is optional.
  73 | #' For more information on the input table format, please see our tree building protocol.
  74 | #' @param new.dir A character specifying the directory where the pyclone
  75 | #' output should be saved.
  76 | #' @param subclonal_copy_correction A logical value that specifies whether subclonal
  77 | #' copy number correction should be performed.
  78 | #' Default is set to TRUE
  79 | #' @param multiple_test_correction A logical value that specifies whether multiple
  80 | #' testing correction should be applied for the copy number correcting mutations.
  81 | #' Default is set to TRUE
  82 | #' @param only_truncal_subclonal_copy_correction A logical value that specifies
  83 | #' whether only truncal subclonal copy number correction should be used.
  84 | #' Default is set to TRUE
  85 | #' @param fix_absentCCFs A logical value that specifies whether CCF
  86 | #' of absent mutations should be set to zero.
  87 | #' Default is set to TRUE
  88 | #' @returns list including patient, regions.to.use, mut.table, seg.mat.copy,
  89 | #' seg.mat.phylo, phylo.region.list, simpleClusterList
  90 | #' @importFrom dplyr "%>%"
  91 | #' @export clustering_preprocess
  92 | 
  93 | clustering_preprocess <- function(input_table, new.dir, subclonal_copy_correction = TRUE, multiple_test_correction = TRUE, only_truncal_subclonal_copy_correction = TRUE, fix_absentCCFs = TRUE) {
  94 |     gender  <- "male" # fixed value handed to the CCF helpers; only chromosomes 1-22 are kept further down
  95 |     patient <- unique(input_table$CASE_ID)
  96 |     regions.to.use <- unique(input_table$SAMPLE)
  97 |     input_table[, "key"] <- paste(paste0("chr", input_table[, "CHR"]), 
  98 |                                   input_table[, "POS"], 
  99 |                                   input_table[, "REF"], 
 100 |                                   input_table[, "ALT"], 
 101 |                                   sep = ":")
 102 |     # With exactly one column name matching "MUT_TYPE", SNV/INDEL flags come from it; otherwise every mutation is flagged as an SNV.
 103 |     if (sum(grepl("MUT_TYPE", colnames(input_table))) == 1) {
 104 |         mut.table <- data.frame(key = input_table[, "key"],
 105 |                                 chr = input_table[, "CHR"],
 106 |                                 start = input_table[, "POS"],
 107 |                                 stop = input_table[, "POS"],
 108 |                                 ref = input_table[, "REF"],
 109 |                                 var = input_table[, "ALT"],
 110 |                                 is_SNV = TRUE,
 111 |                                 Use.For.Plots = (input_table[, "MUT_TYPE"] == "SNV"),
 112 |                                 Use.For.Plots.Indel = (input_table[, "MUT_TYPE"] == "INDEL"),
 113 |                                 stringsAsFactors = FALSE)
 114 |         # Widen the per-sample counts to <SAMPLE>.cov / .ref_count / .var_count / .VAF columns (VAF in percent), then take row-wise maxima.
 115 |         mut.table <- mut.table %>% 
 116 |             dplyr::full_join(input_table %>% 
 117 |                 dplyr::select(key, SAMPLE, REF_COUNT, VAR_COUNT, DEPTH) %>% 
 118 |                 dplyr::rename(cov = DEPTH, ref_count = REF_COUNT, var_count = VAR_COUNT) %>% 
 119 |                 dplyr::mutate(VAF = var_count / cov * 100) %>%
 120 |                 tidyr::pivot_wider(names_from = SAMPLE, values_from = c(cov, ref_count, var_count, VAF), names_glue = "{SAMPLE}.{.value}"), 
 121 |             by = "key") %>%
 122 |             dplyr::rowwise() %>%
 123 |             dplyr::mutate(is_blacklist = FALSE,
 124 |                           max.var_count = max(dplyr::c_across(ends_with(".var_count"))),
 125 |                           max.VAF = max(dplyr::c_across(ends_with(".VAF"))))
 126 |         # No gene or driver annotation is available from the input table, so these columns are placeholders.
 127 |         mut.table <- mut.table %>%
 128 |             dplyr::mutate(Gene.refGene = "") %>%
 129 |             dplyr::mutate(driverCategory = NA)
 130 |         # Convert the rowwise tibble back to a plain data.frame.
 131 |         mut.table <- data.frame(mut.table, stringsAsFactors = FALSE)
 132 |     } else {
 133 |         mut.table <- data.frame(key = input_table[, "key"],
 134 |                                 chr = input_table[, "CHR"],
 135 |                                 start = input_table[, "POS"],
 136 |                                 stop = input_table[, "POS"],
 137 |                                 ref = input_table[, "REF"],
 138 |                                 var = input_table[, "ALT"],
 139 |                                 is_SNV = TRUE,
 140 |                                 stringsAsFactors = FALSE)
 141 |         # Same wide reshape as in the MUT_TYPE branch.
 142 |         mut.table <- mut.table %>% 
 143 |             dplyr::full_join(input_table %>% 
 144 |                 dplyr::select(key, SAMPLE, REF_COUNT, VAR_COUNT, DEPTH) %>%
 145 |                 dplyr::rename(cov = DEPTH, ref_count = REF_COUNT, var_count = VAR_COUNT) %>%
 146 |                 dplyr::mutate(VAF = var_count / cov * 100) %>%
 147 |                 tidyr::pivot_wider(names_from = SAMPLE, values_from = c(cov, ref_count, var_count, VAF), names_glue = "{SAMPLE}.{.value}"), 
 148 |             by = "key") %>%
 149 |             dplyr::rowwise() %>%
 150 |             dplyr::mutate(is_blacklist = FALSE,
 151 |                           max.var_count = max(dplyr::c_across(ends_with(".var_count"))),
 152 |                           max.VAF = max(dplyr::c_across(ends_with(".VAF"))))
 153 |         # Without MUT_TYPE every mutation is flagged as an SNV and none as an indel.
 154 |         mut.table <- mut.table %>%
 155 |             dplyr::mutate(Gene.refGene = "") %>%
 156 |             dplyr::mutate(Use.For.Plots = TRUE, Use.For.Plots.Indel = FALSE) %>%
 157 |             dplyr::mutate(driverCategory = NA)
 158 | 
 159 |         mut.table <- data.frame(mut.table, stringsAsFactors = FALSE)
 160 |     }
 161 |     # One row per input row: each mutation is stored as a single-position segment with rounded and raw allele-specific copy numbers.
 162 |     seg.mat.copy <- data.frame(SampleID = input_table[,"SAMPLE"],
 163 |                                chr = input_table[, "CHR"],
 164 |                                startpos = input_table[, "POS"],
 165 |                                endpos = input_table[, "POS"],
 166 |                                n.het = 1,
 167 |                                cnTotal = round(input_table[, "COPY_NUMBER_A"] + input_table[, "COPY_NUMBER_B"]),
 168 |                                nMajor = round(input_table[, "COPY_NUMBER_A"]),
 169 |                                nMinor = round(input_table[, "COPY_NUMBER_B"]),
 170 |                                Ploidy = input_table[, "PLOIDY"],
 171 |                                ACF = input_table[, "ACF"],
 172 |                                COPY_NUMBER_A = input_table[, "COPY_NUMBER_A"],
 173 |                                COPY_NUMBER_B = input_table[, "COPY_NUMBER_B"], 
 174 |                                stringsAsFactors = FALSE)
 175 |     # Keep chromosomes 1-22 only.
 176 |     mut.table    <- mut.table[mut.table$chr %in% 1:22,, drop = FALSE]
 177 |     seg.mat.copy <- seg.mat.copy[seg.mat.copy$chr %in% 1:22,, drop = FALSE]
 178 |     # mutation_id leaves out the ALT allele; when ids collide, the row with the highest max.VAF is the one kept.
 179 |     mut.table$mutation_id  <- paste(patient, mut.table$chr, mut.table$start, mut.table$ref, sep = ":")
 180 |     mut.table              <- mut.table[order(mut.table$max.VAF, decreasing = TRUE),]
 181 |     mut.table              <- mut.table[!duplicated(mut.table$mutation_id),, drop = FALSE]
 182 |     rownames(mut.table)    <- mut.table$mutation_id
 183 |     # Recompute the maxima from the per-region columns that are present in mut.table.
 184 |     max.vaf        <- c()
 185 |     max.var.count  <- c()
 186 | 
 187 |     for (region in regions.to.use) {
 188 |         if (paste(region, ".VAF", sep = "") %in% colnames(mut.table)) {
 189 |             max.vaf       <- cbind(max.vaf, mut.table[, paste(region, ".VAF", sep = "")])
 190 |             max.var.count <- cbind(max.var.count, mut.table[, paste(region, ".var_count", sep = "")])
 191 |         }
 192 |     }
 193 |     # max() is called without na.rm, so a mutation with an NA in any region gets NA here and is dropped just below.
 194 |     mut.table$max.VAF        <- apply(max.vaf, 1, max)
 195 |     mut.table$max.var_count  <- apply(max.var.count, 1, max)
 196 | 
 197 |     mut.table <- mut.table[!is.na(mut.table$max.VAF),, drop = FALSE]
 198 |     # Keep only mutations flagged as SNV or indel.
 199 |     mut.table <- mut.table[mut.table$Use.For.Plots | mut.table$Use.For.Plots.Indel, ]
 200 |     # mut.table <- mut.table[!((mut.table$Use.For.Plots & mut.table$max.var_count < 10) | is.na(mut.table$max.var_count)), ]
 201 | 
 202 | 
 203 |     # presumably derives the subclonal copy number columns (fracA.., nMaj_A.., nMin_A..) overwritten below -- TODO confirm in create.subclonal.copy.number
 204 |     seg.mat.phylo      <- create.subclonal.copy.number(seg.mat.copy = seg.mat.copy,min.subclonal = 0.01)
 205 |     # %in% coerces to a common type, so a logical FALSE matches the string "FALSE". With correction off, each segment becomes one clone (fracA = 1) at the rounded integer copy numbers.
 206 |     if (subclonal_copy_correction %in% "FALSE") {
 207 |         cat('\nRunning without subclonal copy number mode')
 208 |         seg.mat.phylo$COPY_NUMBER_A <- seg.mat.phylo$nMajor
 209 |         seg.mat.phylo$COPY_NUMBER_B <- seg.mat.phylo$nMinor
 210 |         seg.mat.phylo$fracA <- 1
 211 |         seg.mat.phylo$fracB <- 0
 212 |         seg.mat.phylo$fracC <- NA
 213 |         seg.mat.phylo$fracD <- NA
 214 |         seg.mat.phylo$nMaj_A <- seg.mat.phylo$nMajor
 215 |         seg.mat.phylo$nMin_A <- seg.mat.phylo$nMinor
 216 |         seg.mat.phylo$nMaj_B <- seg.mat.phylo$nMajor
 217 |         seg.mat.phylo$nMin_B <- seg.mat.phylo$nMinor
 218 |         seg.mat.phylo$nMaj_C <- NA
 219 |         seg.mat.phylo$nMin_C <- NA
 220 |         seg.mat.phylo$nMaj_D <- NA
 221 |         seg.mat.phylo$nMin_D <- NA
 222 |         # keep seg.mat.copy in step with the integer copy numbers
 223 |         seg.mat.copy$COPY_NUMBER_A <- seg.mat.phylo$nMajor
 224 |         seg.mat.copy$COPY_NUMBER_B <- seg.mat.phylo$nMinor
 225 |     }
 226 |     # NOTE(review): patient.list is never read again, and cellularity is filled in the loop below but is not part of the returned list.
 227 |     patient.list       <- list()
 228 |     phylo.region.list  <- list()
 229 |     cellularity        <- rep(NA, length(regions.to.use))
 230 |     names(cellularity) <- regions.to.use
 231 | 
 232 |     # determine the indelCorrectionFactor (only computed, and only used, when there is more than one region)
 233 |     if (length(regions.to.use) > 1) {
 234 |         indelCorrectionFactor <- determineIndelCorrectionFactor(patient = patient, mut.table = mut.table, regions.to.use = regions.to.use, seg.mat.phylo = seg.mat.phylo, seg.mat.copy = seg.mat.copy)
 235 |         indelMuts <- rownames(mut.table[mut.table$Use.For.Plots.Indel %in% TRUE,, drop = FALSE])
 236 |     }
 237 |     # Build one phyloCCF table per region.
 238 |     for (region in regions.to.use) {
 239 |         region.mut.table <- mut.table
 240 |         region.seg.copy  <- seg.mat.copy[seg.mat.copy$SampleID %in% region,, drop = FALSE]
 241 |         region.seg.phylo <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,, drop = FALSE]
 242 |         pyclone.table    <- data.frame(t(sapply(1:nrow(region.mut.table),identify.subclonal.mut.copy.number.ascat,region.mut.table,region.seg.phylo,region,patient)), stringsAsFactors = FALSE)
 243 |         pyclone.table    <- pyclone.table[!is.na(pyclone.table$minor_cn),]
 244 |         pyclone.table    <- pyclone.table[!is.na(pyclone.table$ref_counts),]
 245 |         pyclone.table    <- pyclone.table[!duplicated(pyclone.table$mutation_id),]
 246 |         # The ACF of the region's first row is used as the sample purity.
 247 |         sample.purity    <- region.seg.copy$ACF[1]
 248 |         # Require at least one read (ref + var). NOTE(review): region.earlyLate is assigned but never read in this function.
 249 |         pyclone.table    <- pyclone.table[(as.numeric(pyclone.table$ref_counts) + as.numeric(pyclone.table$var_counts)) >= 1,, drop = FALSE]
 250 |         region.earlyLate <- earlyORlateGender(region = region, complete.mutation.table = pyclone.table, purity = sample.purity, gender = gender)
 251 |         if (multiple_test_correction %in% FALSE) {
 252 |             region.phyloCCF <- calculate_phylo_ccf(region = region, complete.mutation.table = pyclone.table, purity = sample.purity, order.by.pos = TRUE, gender = gender)
 253 |         }
 254 | 
 255 |         if (multiple_test_correction %in% TRUE) {
 256 |             region.phyloCCF <- calculate_phylo_ccf_withBH(region = region, complete.mutation.table = pyclone.table, purity = sample.purity, order.by.pos = TRUE, gender = gender)
 257 |         }
 258 |         # Scale indel phyloCCF and mutCopyNum by this region's indel correction factor.
 259 |         if (length(regions.to.use) > 1) {
 260 |             tmp <- intersect(rownames(region.phyloCCF), indelMuts)
 261 |             # let's look at indels specifically
 262 |             region.phyloCCF[tmp, 'phyloCCF'] <- region.phyloCCF[tmp, 'phyloCCF'] * as.numeric(indelCorrectionFactor[region])
 263 |             region.phyloCCF[tmp, 'mutCopyNum'] <- region.phyloCCF[tmp, 'mutCopyNum'] * as.numeric(indelCorrectionFactor[region])
 264 |         }
 265 | 
 266 |         phylo.region.list[[region]] <- region.phyloCCF
 267 |         cellularity[region]         <- sample.purity
 268 |     }
 269 | 
 270 |     # perform additional copy number correction where needed (only mutation ids from the first region's table are iterated)
 271 |     muts_to_consider <- unlist(phylo.region.list[[1]]$mutation_id)
 272 |     for (mut in muts_to_consider) {
 273 |         mut_table <- c()
 274 |         for (region in names(phylo.region.list)) {
 275 |             mut_table <- rbind(mut_table, phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut,, drop = FALSE])
 276 |         }
 277 |         # mut_table holds this mutation's rows across regions; it is a snapshot and is not refreshed by the edits to phylo.region.list below.
 278 |         if (max(mut_table$phyloCCF) > 1.5) {
 279 |             # every row has phyloCCF > 1.5, or every row has phyloCCF.0.05 >= 1 --> re-centre on 1 by falling back to the absolute CCF
 280 |             if (length(which(mut_table$phyloCCF > 1.5)) == nrow(mut_table) | length(which(mut_table$phyloCCF.0.05 >= 1)) == nrow(mut_table)) {
 281 |                 small_mut_table <- mut_table[mut_table$phyloCCF.0.05 > 1,, drop = FALSE]
 282 |                 if (nrow(small_mut_table) > 0) {
 283 |                     for (i in 1:nrow(small_mut_table)) {
 284 |                         small_row <- small_mut_table[i,, drop = FALSE]
 285 |                         region <- unlist(small_row$region)
 286 |                         region.copy <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,]
 287 |                         # The interval around absolute.ccf is widened on both sides by |phyloCCF - phyloCCF.0.05|. NOTE(review): region.copy is not used in this loop.
 288 |                         phyloCCF      <- small_row$absolute.ccf
 289 |                         phyloCCF_0.05 <- min(small_row$absolute.ccf.0.05, small_row$absolute.ccf.0.05 - abs(small_row$phyloCCF - small_row$phyloCCF.0.05))
 290 |                         phyloCCF_0.95 <- max(small_row$absolute.ccf.0.95, small_row$absolute.ccf.0.95 + abs(small_row$phyloCCF - small_row$phyloCCF.0.05))
 291 | 
 292 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.05 <- phyloCCF_0.05
 293 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.95 <- phyloCCF_0.95
 294 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF <- phyloCCF 
 295 |                     }
 296 |                 } 
 297 |             }
 298 |           
 299 |             #at least one mutation is subclonal --> properly adjust no.chrs.bearing.mut (phyloCCF and its interval are divided by no.chrs.bearing.mut)
 300 |             if (length(which(mut_table$phyloCCF.0.95 * mut_table$no.chrs.bearing.mut < 1)) >= 1) {
 301 |                 small_mut_table <- mut_table[mut_table$phyloCCF.0.05 > 1,, drop = FALSE]
 302 |                 if (nrow(small_mut_table) > 0) {
 303 |                     for (i in 1:nrow(small_mut_table)) {
 304 |                         small_row <- small_mut_table[i,, drop = FALSE]
 305 |                         region <- unlist(small_row$region)
 306 |                         region.copy <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,]
 307 |                         # Rows where phyloCCF already equals mutCopyNum are written back unchanged. NOTE(review): region.copy is not used in this loop.
 308 |                         if (small_row$phyloCCF != small_row$mutCopyNum) {
 309 |                             phyloCCF      <- small_row$phyloCCF / small_row$no.chrs.bearing.mut
 310 |                             phyloCCF_0.05 <- small_row$phyloCCF.0.05 / small_row$no.chrs.bearing.mut
 311 |                             phyloCCF_0.95 <- small_row$phyloCCF.0.95 / small_row$no.chrs.bearing.mut
 312 |                         } else {
 313 |                             phyloCCF      <- small_row$phyloCCF
 314 |                             phyloCCF_0.05 <- small_row$phyloCCF.0.05
 315 |                             phyloCCF_0.95 <- small_row$phyloCCF.0.95
 316 |                         }
 317 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.05 <- phyloCCF_0.05
 318 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF.0.95 <- phyloCCF_0.95
 319 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% small_row$mutation_id,]$phyloCCF <- phyloCCF
 320 |                     } 
 321 |                 }   
 322 |             } 
 323 |         }
 324 |         # This branch runs when at least one row has phyloCCF above mutCopyNum.
 325 |         if (TRUE %in% c(mut_table$phyloCCF > mut_table$mutCopyNum)) {
 326 |             # are all the mutations now truncal? (no row with phyloCCF.0.95 < 1 --> nothing to revert)
 327 |             if (length(which(mut_table$phyloCCF.0.95 < 1)) == 0) next
 328 |             if (only_truncal_subclonal_copy_correction %in% TRUE) {
 329 |                 if (length(which(mut_table$phyloCCF.0.95 < 1)) >= 1) {
 330 |                     # so we've performed copy number correction, but that didn't make the mutation clonal, so let's revert back to non-copy number corrected
 331 |                     muts_to_revert <- mut_table[mut_table$no.chrs.bearing.mut < 1,, drop = FALSE]
 332 |                     # Per region: recompute the CCF and its interval from the observed VAF (prop.test) assuming one mutated copy, then reset no.chrs.bearing.mut to 1.
 333 |                     for (region in unlist(muts_to_revert$region)) {
 334 |                         mut_to_revert <- muts_to_revert[unlist(muts_to_revert$region) %in% region,, drop = FALSE]
 335 |                         region.copy <- seg.mat.phylo[seg.mat.phylo$SampleID %in% region,]
 336 |                         expVAF   <- min(1 - 1e-6, c((region.copy$ACF[1]*1) / (2*(1-region.copy$ACF[1]) + region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw)))))
 337 |                         VAF_ci   <- prop.test(x = as.numeric(mut_to_revert$var_counts),n = as.numeric(mut_to_revert$ref_counts)+as.numeric(mut_to_revert$var_counts),p = expVAF)
 338 |                         phyloCCF      <- (VAF_ci$estimate *1/region.copy$ACF[1])*((region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw)))+2*(1-region.copy$ACF[1]))
 339 |                         phyloCCF_0.05 <- (VAF_ci$conf.int[1] *1/region.copy$ACF[1])*((region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw)))+2*(1-region.copy$ACF[1]))
 340 |                         phyloCCF_0.95 <- (VAF_ci$conf.int[2] *1/region.copy$ACF[1])*((region.copy$ACF[1]*(as.numeric(mut_to_revert$major_raw)+as.numeric(mut_to_revert$minor_raw)))+2*(1-region.copy$ACF[1]))
 341 | 
 342 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$no.chrs.bearing.mut <- 1
 343 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$phyloCCF.0.05 <- phyloCCF_0.05
 344 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$phyloCCF.0.95 <- phyloCCF_0.95
 345 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$phyloCCF <- phyloCCF
 346 |                         phylo.region.list[[region]][phylo.region.list[[region]]$mutation_id %in% mut_to_revert$mutation_id,]$expected.VAF <- expVAF
 347 |                     }
 348 |                 }
 349 |             }
 350 |             if (only_truncal_subclonal_copy_correction %in% FALSE) next
 351 |         }
 352 |     }
 353 |     # findSimpleClusters is defined elsewhere; its elements are read below via $clusterBinary, $RegionsInCluster and $MutationsWithCluster.
 354 |     input_list <- list(patient = patient, new.dir = new.dir, mut.table = mut.table, seg.mat.phylo = seg.mat.phylo, phylo.region.list = phylo.region.list)
 355 |     simpleClusterList <- findSimpleClusters(input_list)
 356 |     ### fail safe to remove clusters of mutations not sufficiently present in any region (clusterBinary made up of "0" only)
 357 |     simpleClusterList <- simpleClusterList[!sapply(strsplit(sapply(simpleClusterList, function(x) x$clusterBinary), split = ":"), function(y) all(y == "0"))]
 358 |     ### changing CCFs to 0 if cluster is absent: per region, mutations outside every cluster listed for that region get phyloCCF and its interval set to 0
 359 |     if (fix_absentCCFs) {
 360 |         clusterPresence.df <- Reduce(rbind, lapply(names(simpleClusterList), function(x) data.frame(clusterID = x, regions = simpleClusterList[[x]]$RegionsInCluster, stringsAsFactors = FALSE)))
 361 |         for (region in regions.to.use) {
 362 |             tmp.clusterPresence <- clusterPresence.df %>% dplyr::filter(regions %in% region) %>% dplyr::pull(clusterID)
 363 |             tmp.clusterMutations <- as.character(unlist(lapply(simpleClusterList[as.character(tmp.clusterPresence)], function(x) x$MutationsWithCluster)))
 364 |             phylo.region.list[[region]][!as.character(unlist(phylo.region.list[[region]]$mutation_id)) %in% tmp.clusterMutations, "phyloCCF"] <- 0
 365 |             phylo.region.list[[region]][!as.character(unlist(phylo.region.list[[region]]$mutation_id)) %in% tmp.clusterMutations, "phyloCCF.0.05"] <- 0
 366 |             phylo.region.list[[region]][!as.character(unlist(phylo.region.list[[region]]$mutation_id)) %in% tmp.clusterMutations, "phyloCCF.0.95"] <- 0
 367 |         }
 368 |     }
 369 |     # The returned phylo.region.list includes the zeroing above; the simple clusters were computed before it.
 370 |     return(list(patient = patient, regions.to.use = regions.to.use, mut.table = mut.table, seg.mat.copy = seg.mat.copy, seg.mat.phylo = seg.mat.phylo, phylo.region.list = phylo.region.list, simpleClusterList = simpleClusterList))
 371 | }
 372 | 
 373 | 
 374 | #' Main clustering function
 375 | #'
 376 | #' This function takes the input list created in the preprocessing along with
 377 | #' the number of cores and output directory to run the main clustering.
 378 | #' @param input_list A list created by the clustering preprocess function
 379 | #' including patient id, regions to use, phylo region list and others.
 380 | #' @param nProcs A value referring to how many parallel processes 
 381 | #' of pyclone should be run.
 382 | #' @param new.dir A character specifying the directory where the pyclone
 383 | #' output should be saved.
 384 | #' @param burn_in Burn-in for DP clustering.
 385 | #' @param pyclone_seed Seed for PyClone run.
 386 | #' @param template.config.yaml Location of the template yaml file used to run PyClone.
 387 | #' @returns sample.results which is the location of the pyclone output table.
 388 | #' @importFrom parallel mclapply
 389 | #' @export clustering_run
 390 | 
 391 | clustering_run <- function(input_list, nProcs, new.dir, burn_in, pyclone_seed, template.config.yaml) {
 392 |     patient           <- input_list$patient
 393 |     simpleClusterList <- input_list$simpleClusterList
 394 | 
 395 |     no_cores <- nProcs
 396 |     print(paste0("Number of cores that are available: ", no_cores))
 397 | 
 398 |     ### always run pyclone
 399 |     tmp <- parallel::mclapply(simpleClusterList, function(x) {
 400 |         if (length(x$MutationsWithCluster) < 5) {
 401 |             CreateOutputNoPyCloneRun(clusterName = x$clusterID, patientID = patient, SmallClusters = simpleClusterList, patientDirToUse = new.dir)
 402 |         } else {
 403 |             RunPyCloneWithSimpleClusters(clusterName = x$clusterID, patientID = patient, SmallClusters = simpleClusterList, patientDirToUse = new.dir, yamlConfigLoc = template.config.yaml, pyclone.burnin = burn_in, pyclone.seed = pyclone_seed, run.pyclone = TRUE, pyclone.module = "PyClone/0.12.3-foss-2016b-Python-2.7.12-tkinter")
 404 |         }
 405 |     }, mc.cores = no_cores)
 406 |     rm(list = c("no_cores", "tmp"))
 407 | 
 408 |     allClusters <- paste0(list.files(new.dir, pattern = paste0(patient, "_cluster"), full.names = TRUE), "/", patient, ".results.tsv")
 409 |     pyclone.results.list <- lapply(names(simpleClusterList), function(clusterID) {
 410 |         cluster.results.file <- grep(paste0("cluster", clusterID, "/"), allClusters, value = TRUE)
 411 |         pyclone.results <- read.table(cluster.results.file, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
 412 |         colnames(pyclone.results)  <- gsub('mutation_id', 'X', colnames(pyclone.results))
 413 |         pyclone.results$cluster_id <- as.numeric(clusterID) * 100 + as.numeric(pyclone.results$cluster_id)
 414 |         if (length(grep("^X$", colnames(pyclone.results))) == 0) {
 415 |             pyclone.results$X <- rownames(pyclone.results)
 416 |         }
 417 |         return(pyclone.results[, c("X", "cluster_id")])
 418 |     })
 419 |     pyclone.results <- Reduce(rbind, pyclone.results.list)
 420 |     sample.results <- paste(new.dir,"/",patient,'.results.tsv',sep="")
 421 |     write.table(pyclone.results, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE, file = sample.results)
 422 |     return(sample.results)
 423 | }
 424 | 
 425 | 
 426 | #' Postprocessing of clustering function
 427 | #'
#' This function takes the PyClone results produced by the clustering run,
#' flags mutations and clusters to remove (copy-number-driven or too small),
#' and writes the clustering output tables and plots.
 430 | #' @param input_list A list created by the clustering preprocess function
 431 | #' including patient id, regions to use, phylo region list and others.
#' @param sample.results Path to the combined pyclone output table returned by \code{clustering_run}.
 433 | #' @param new.dir A character specifying the directory where the pyclone
 434 | #' output should be saved.
 435 | #' @param input_tsv the input mutation tsv.
 436 | #' @param input_seg_tsv_loc path to a copy number segment tsv file that is used for
 437 | #' across genome copy number plotting.
 438 | #' Default NULL
 439 | #' @param min_cluster_size Minimum number of mutations needed for a cluster to be considered.
 440 | #' Default 5
 441 | #' @param driver_cat Which categories to use as driver mutations
 442 | #' Default "1"
 443 | #' @param clean_clusters should clusters be cleaned and merged?
 444 | #' Default TRUE
 445 | #' @param min_ccf_present minimum CCF to consider a mutation as present. 
 446 | #' Default 0.1
 447 | #' @param clonal_cutOff lower threshold CCF to consider mutations as clonal.
 448 | #' Default 0.9
 449 | #' @param propClonal_threshold Proportion of mutations in cluster which needs to be
 450 | #' considered clonal to merge.
 451 | #' Default 0.25
 452 | #' @returns NULL
 453 | #' @importFrom dplyr "%>%"
 454 | #' @export clustering_postprocess
 455 | 
 456 | clustering_postprocess <- function(input_list, sample.results, new.dir, input_tsv, input_seg_tsv_loc = NULL, min_cluster_size = 5, driver_cat = "1", clean_clusters = TRUE, min_ccf_present = 0.1, clonal_cutOff = 0.9, propClonal_threshold = 0.25) {
 457 |     phylo.region.list <- input_list$phylo.region.list
 458 |     mut.table         <- input_list$mut.table
 459 |     seg.mat.phylo     <- input_list$seg.mat.phylo
 460 |     seg.mat.copy      <- input_list$seg.mat.copy
 461 |     regions.to.use    <- input_list$regions.to.use
 462 |     simpleClusterList <- input_list$simpleClusterList
 463 |     patient           <- input_list$patient
 464 | 
 465 |     ITH1clust <- names(which(sapply(simpleClusterList, function(x) length(x$RegionsInCluster)) == length(phylo.region.list)))
 466 |     ITH1muts  <- simpleClusterList[[as.character(ITH1clust)]]$MutationsWithCluster
 467 | 
 468 |     pyclone.results <- read.table(sample.results, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
 469 | 
 470 |     # let's make sure the same mutations are being used. 
 471 |     colnames(pyclone.results)  <- gsub('mutation_id', 'X', colnames(pyclone.results))
 472 |     rownames(pyclone.results)  <- pyclone.results$X
 473 |     pyclone.results            <- pyclone.results[rownames(pyclone.results) %in% mut.table$mutation_id,, drop = FALSE]
 474 | 
 475 |     most.likely.cluster        <- pyclone.results$cluster_id
 476 |     names(most.likely.cluster) <- pyclone.results$X
 477 |     muts.to.remove <- c()
 478 |     # let's have a look at whether any of the clusters are explained by copy number events
 479 | 
 480 | 
 481 |     mut.pvals            <- c()
 482 |     cluster.prop.aber    <- c()
 483 |     # require(coin)
 484 | 
 485 |     for (cluster in unique(most.likely.cluster)) {
 486 |       
 487 |         tmp <- copy.driven.clusterNEW(cluster, seg.mat.copy = seg.mat.copy, most.likely.cluster = most.likely.cluster, region.earlyLate.list = phylo.region.list, min.prop.cens = 0.1, loss.thresh = 0.25, diff.thresh = 0.55)
 488 |       
 489 |         if(TRUE %in% is.na(tmp[, 1])) stop
 490 | 
 491 |         cluster.prop.aber <- c(cluster.prop.aber, length(which(tmp[, 1] < 0.05 & tmp[, 2] <= 0.5)) / nrow(tmp))
 492 |         mut.pvals <- rbind(mut.pvals,tmp)
 493 |     }
 494 | 
 495 |     names(cluster.prop.aber) <- unique(most.likely.cluster)
 496 |     # Let's also check what the minimum copy number is for each mutation that is classified as lost
 497 | 
 498 |     out <- c()
 499 |     for (mutation_id in rownames(mut.pvals)) {
 500 |         out <- rbind(out, getMinCPN(mutation_id, phylo.region.list))
 501 |     }
 502 | 
 503 |     mut.pvals <- cbind(mut.pvals, out[, 2])
 504 |     sig.pvals <- mut.pvals[which(as.numeric(mut.pvals[, 1]) < 0.05 & as.numeric(mut.pvals[, 2]) <= 0.75 & as.numeric(mut.pvals[, 3]) == 0), 1]
 505 |     sig.table <- mut.pvals[which(as.numeric(mut.pvals[, 1]) < 0.05 & as.numeric(mut.pvals[, 2]) <= 0.75 & as.numeric(mut.pvals[, 3]) == 0),, drop = FALSE]
 506 | 
 507 |     print(length(sig.pvals))
 508 |     # let's see what happens if we set any cluster with over X% sig pvals as the entire cluster missing
 509 |     clusters.to.remove <- names(which(as.numeric(cluster.prop.aber) >= 0.85))
 510 |     muts.to.remove     <- names(most.likely.cluster)[which(most.likely.cluster %in% clusters.to.remove)]
 511 |     muts.to.remove     <- unique(c(names(sig.pvals),muts.to.remove))
 512 |     muts.to.remove     <- muts.to.remove[!is.na(muts.to.remove)]
 513 |     names(muts.to.remove) <- muts.to.remove
 514 | 
 515 |     # let's be clever about this and give the clusters we're removing the highest names
 516 |     cluster.size.remove <- names(most.likely.cluster[most.likely.cluster %in% names(which(table(most.likely.cluster) < min_cluster_size))])
 517 |     small.clusters      <- most.likely.cluster[cluster.size.remove]
 518 | 
 519 |     # are any of the small clusters private
 520 |     for (small.cluster in unique(small.clusters)) {
 521 |         # create a presence absence heatmap
 522 |         binary.heatmap  <- pyclone.results[, 2:(ncol(pyclone.results) - 1), drop = FALSE]
 523 |         binary.heatmap  <- ifelse(binary.heatmap < min_ccf_present, 0, 1)
 524 |         rownames(binary.heatmap) <- pyclone.results$X
 525 |         row.names       <- pyclone.results[pyclone.results$cluster_id %in% small.cluster, 'X']
 526 | 
 527 |         if (median(rowSums(binary.heatmap[row.names,, drop = FALSE])) == 1) {
 528 |             small.clusters <- small.clusters[!small.clusters %in% small.cluster]
 529 |         }
 530 |     }
 531 | 
 532 |     cluster.size.remove          <- cluster.size.remove[cluster.size.remove %in% names(small.clusters)]
 533 | 
 534 | 
 535 |     old.most.likely.cluster      <- most.likely.cluster
 536 | 
 537 |     most.likely.cluster <- most.likely.cluster[!names(most.likely.cluster) %in% unique(c(muts.to.remove,cluster.size.remove))]
 538 | 
 539 |     if (length(most.likely.cluster) == 0) {
 540 |         stop("ClusterSize\nYou don't have any clean clusters greater than min.cluster.size")
 541 |     }
 542 | 
 543 |     # Let's rename the most likely cluster
 544 |     tmp <- most.likely.cluster
 545 |     most.likely.cluster <- as.numeric(factor(most.likely.cluster))
 546 |     names(most.likely.cluster) <- names(tmp)
 547 |     if (length(most.likely.cluster) > 1) {
 548 |         tmp                 <- as.character(1:length(unique(most.likely.cluster)))
 549 |         names(tmp)          <- names(table(most.likely.cluster)[order(table(most.likely.cluster), decreasing = TRUE)])
 550 |         tmp2                <- most.likely.cluster
 551 |         most.likely.cluster <- tmp[as.character(match(most.likely.cluster, 1:length(most.likely.cluster)))]
 552 |         names(most.likely.cluster) <- names(tmp2)
 553 |         solid.cluster.end <- max(as.numeric(most.likely.cluster))
 554 |         new.cluster.start <- max(as.numeric(most.likely.cluster)) + 1
 555 |         removed.muts      <- old.most.likely.cluster[which(!names(old.most.likely.cluster) %in% names(most.likely.cluster))]
 556 |         removed.clusters  <- c()    
 557 |         for (cluster in names(table(removed.muts)[order(table(removed.muts), decreasing = TRUE)])) {
 558 |             new.cluster <- removed.muts[removed.muts %in% cluster]
 559 |             new.cluster[new.cluster %in% cluster] <- new.cluster.start
 560 |             removed.clusters <- c(removed.clusters,new.cluster)
 561 |             new.cluster.start <- new.cluster.start + 1
 562 |         }
 563 | 
 564 |         # let's add back the ones to remove, but make these higher clusters (i.e. these will have higher number)
 565 |         # this will make their removal seem easier later hopefully
 566 | 
 567 | 
 568 |         most.likely.cluster <- c(most.likely.cluster,removed.clusters)
 569 |         most.likely.cluster <- most.likely.cluster[order(as.numeric(most.likely.cluster))]
 570 |         names.muts          <- names(most.likely.cluster)
 571 |         most.likely.cluster <- as.numeric(most.likely.cluster)
 572 |         names(most.likely.cluster) <- names.muts
 573 |     }
 574 | 
 575 |     if (length(most.likely.cluster) == 0) {
 576 |         solid.cluster.end <- 0
 577 |         new.cluster.start <- 1
 578 | 
 579 |         removed.clusters  <- c()    
 580 |         for (cluster in names(table(removed.muts)[order(table(removed.muts), decreasing = TRUE)])) {
 581 |             new.cluster <- removed.muts[removed.muts %in% cluster]
 582 |             new.cluster[new.cluster %in% cluster] <- new.cluster.start
 583 |             removed.clusters <- c(removed.clusters,new.cluster)
 584 |             new.cluster.start <- new.cluster.start + 1
 585 |         }
 586 |         most.likely.cluster <- removed.clusters
 587 |     }
 588 | 
 589 | 
 590 | 
 591 |     #Let's add the muts to remove back into the table
 592 |     v.pvals                                                          <- rep(NA, nrow(mut.table))
 593 |     names(v.pvals)                                                   <- mut.table$mutation_id
 594 |     v.pvals[intersect(names(v.pvals), rownames(mut.pvals))]          <- mut.pvals[intersect(names(v.pvals), rownames(mut.pvals)), 1]
 595 |     v.remove                                                         <- names(v.pvals) %in% muts.to.remove
 596 |     names(v.remove)                                                  <- names(v.pvals)
 597 |     v.cluster                                                        <- rep(NA, nrow(mut.table))     
 598 |     names(v.cluster)                                                 <- names(v.pvals)
 599 |     v.cluster[names(most.likely.cluster)]                            <- most.likely.cluster
 600 |     v.size.remove                                                    <- names(v.pvals) %in% cluster.size.remove
 601 |     names(v.size.remove)                                             <- names(v.pvals)
 602 |     v.minCPN                                                         <- rep(NA, nrow(mut.table))
 603 |     names(v.minCPN)                                                  <- names(v.pvals)
 604 |     v.minCPN[intersect(names(v.minCPN), rownames(mut.pvals))]        <- mut.pvals[intersect(names(v.minCPN), rownames(mut.pvals)), 3]
 605 |     v.regionLoss                                                     <- rep(NA, nrow(mut.table))
 606 |     names(v.regionLoss)                                              <- names(v.pvals)
 607 |     v.regionLoss[intersect(names(v.regionLoss), rownames(mut.pvals))]<- mut.pvals[intersect(names(v.regionLoss), rownames(mut.pvals)), 4]
 608 |      
 609 | 
 610 | 
 611 | 
 612 |     mut.table$cpn.remove.pval <- v.pvals
 613 |     mut.table$cpn.remove      <- v.remove
 614 |     mut.table$cluster         <- v.cluster
 615 |     mut.table$cluster.remove  <- v.size.remove
 616 |     mut.table$minCPN          <- v.minCPN
 617 |     mut.table$regionLoss      <- v.regionLoss
 618 | 
 619 |     mut.table.save.name <- paste(new.dir, patient, '.all.SNV.cpn.xls', sep = "")
 620 |     write.table(mut.table, file = mut.table.save.name, sep = "\t", col.names = NA)
 621 | 
 622 |     # let's plot these mutations ####
 623 |     if (length(muts.to.remove) > 1) {
 624 |         pdf(paste(new.dir, patient, ".removedCPN.muts.pdf", sep = ""), width = 8, height = 8)
 625 |         clusters.to.plot <- most.likely.cluster[mut.table[mut.table$cpn.remove %in% TRUE, 'mutation_id']]
 626 | 
 627 |         {
 628 |             # let's only plot a cluster if it has removed musted    
 629 |             print(clusters.to.plot)  
 630 |             
 631 |             lyout <- c()
 632 |             for (i in seq(1, length(regions.to.use) * 2, by = 2)) {
 633 |                 lyout <- rbind(lyout, rbind(c(rep(i, 9), i + 1)))
 634 |             }
 635 |             
 636 |             layout(lyout)
 637 |         
 638 |             for (region in regions.to.use) {
 639 |                
 640 |                 region.earlyLate  <- phylo.region.list[[region]]                                             
 641 |                 region.earlyLate  <- region.earlyLate[!is.na(region.earlyLate$phyloCCF),]
 642 |                 region.earlyLate  <- region.earlyLate[region.earlyLate$mutation_id %in% muts.to.remove,, drop = FALSE]
 643 | 
 644 |                 # Using seg file if exists for plotting
 645 |                 region.seg.copy  <- seg.mat.copy[seg.mat.copy$SampleID %in% region,, drop = FALSE]
 646 | 
 647 |                 if (!is.null(input_seg_tsv_loc)) {
 648 |                     print("Using specified seg file for plotting")
 649 |                     region.seg.copy     <- read.delim2(input_seg_tsv_loc, stringsAsFactors = FALSE)
 650 |                     region.seg.copy$SAMPLE <- gsub("-", "\\.", region.seg.copy$SAMPLE)
 651 |                     region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A)
 652 |                     region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B)
 653 |                     region.seg.copy$CHR           <- as.numeric(region.seg.copy$CHR)
 654 |                     region.seg.copy$STARTPOS      <- as.numeric(region.seg.copy$STARTPOS)
 655 |                     
 656 |                     #Sort by start position within chromosome
 657 |                     region.seg.copy <- region.seg.copy[order(region.seg.copy$CHR,
 658 |                                                              region.seg.copy$STARTPOS), ]
 659 |                     # If providing seg file, ensure the sample names match the sample names in input tsv
 660 |                     if (!any(unique(region.seg.copy$SAMPLE) %in% unique(seg.mat.copy[,1]))) {
 661 |                         stop('Sample IDs do not match between input_tsv and input_seg_tsv')
 662 |                     }
 663 |                     region.seg.copy <- region.seg.copy %>% dplyr::rename(SampleID = SAMPLE)
 664 |                 } else {
 665 |                     print("Using tsv data for plotting")
 666 |                     region.seg.copy <- seg.mat.copy
 667 |                 }
 668 | 
 669 |                 region.seg.copy <- region.seg.copy %>% dplyr::filter(SampleID %in% region)
 670 |                 # ensure raw copy number columns are numeric:
 671 |                 region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A)
 672 |                 region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B)
 673 | 
 674 |                 # Rename columns
 675 |                 sub.mat.copy               <- region.seg.copy
 676 | 
 677 |                 colnames(sub.mat.copy)[2]  <- 'Chromosome'
 678 |                 colnames(sub.mat.copy)[3]  <- 'StartPosition'
 679 |                 colnames(sub.mat.copy)[4]  <- 'EndPosition'
 680 | 
 681 |                 #pdf(early.late.pdf)
 682 |                 par(mar = c(0.5, 5, 0.5, 0.2))
 683 |                 par(lend = 1)
 684 | 
 685 |                 plot.simpleClusters.raw(seg.mat.patient = sub.mat.copy, most.likely.cluster = clusters.to.plot, TCGA.earlyLate = region.earlyLate, sub.clonal = 1)
 686 | 
 687 |                 mtexti(region, side = 2, off = 0.5)
 688 | 
 689 |                 ds <- density(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, as.numeric(region.earlyLate$mutCopyNum)))
 690 |                 ds1 <- ds
 691 |                 ds1$x <- ds$y
 692 |                 ds1$y <- ds$x
 693 |                 par(mar = c(0.5, 0, 0.5, 4))
 694 |                 A <- hist(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, as.numeric(region.earlyLate$mutCopyNum)), breaks = seq(-0.25, 6, by = 0.1), plot = FALSE)
 695 |                 plot(NULL, type = "n", xlim = c(0, max(A$density)), ylim = c(-0.25, 6), bty = 'n', xaxs = 'i', xaxt = 'n', yaxt = 'n', yaxs = 'i', xlab = "", main = "", ylab = "")
 696 |                 rect(0, A$breaks[1:(length(A$breaks) - 1)], A$density, A$breaks[2:length(A$breaks)], border = TRUE, col = "#CC6666")
 697 |                 lines(ds1)
 698 |             }
 699 |         }
 700 |       
 701 |         for (cluster in unique(clusters.to.plot)) {
 702 |             # let's only plot a cluster if it has removed musted    
 703 | 
 704 |             print(cluster)  
 705 | 
 706 |             lyout <- c()
 707 |             for (i in seq(1, length(regions.to.use) * 2, by = 2)) {
 708 |               lyout <- rbind(lyout, rbind(c(rep(i, 9), i + 1)))
 709 |             }
 710 | 
 711 |             layout(lyout)
 712 | 
 713 |             for (region in regions.to.use) {
 714 |                 region.earlyLate  <- phylo.region.list[[region]]                                             
 715 |                 region.earlyLate  <- region.earlyLate[!is.na(region.earlyLate$phyloCCF),]
 716 |                 region.earlyLate  <- region.earlyLate[region.earlyLate$mutation_id %in% muts.to.remove,, drop = FALSE]
 717 | 
 718 |                 # Using seg file if exists for plotting
 719 |                 region.seg.copy  <- seg.mat.copy[seg.mat.copy$SampleID %in% region,, drop = FALSE]
 720 | 
 721 |                 if (!is.null(input_seg_tsv_loc)) {
 722 |                     print("Using specified seg file for plotting")
 723 |                     region.seg.copy     <- read.delim2(input_seg_tsv_loc, stringsAsFactors = FALSE)
 724 |                     region.seg.copy$SAMPLE <- gsub("-", "\\.", region.seg.copy$SAMPLE)
 725 |                     region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A)
 726 |                     region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B)
 727 |                     region.seg.copy$CHR           <- as.numeric(region.seg.copy$CHR)
 728 |                     region.seg.copy$STARTPOS      <- as.numeric(region.seg.copy$STARTPOS)
 729 |                     
 730 |                     #Sort by start position within chromosome
 731 |                     region.seg.copy <- region.seg.copy[order(region.seg.copy$CHR,
 732 |                                                              region.seg.copy$STARTPOS), ]
 733 |                     # If providing seg file, ensure the sample names match the sample names in input tsv
 734 |                     if (!any(unique(region.seg.copy$SAMPLE) %in% unique(seg.mat.copy[, 1]))) {
 735 |                         stop('Sample IDs do not match between input_tsv and input_seg_tsv')
 736 |                     }
 737 |                     region.seg.copy <- region.seg.copy %>% dplyr::rename(SampleID = SAMPLE)
 738 |                 } else {
 739 |                     print("Using tsv data for plotting")
 740 |                     region.seg.copy <- seg.mat.copy
 741 |                 }
 742 | 
 743 |                 region.seg.copy <- region.seg.copy %>% dplyr::filter(SampleID %in% region)
 744 |                 # ensure raw copy number columns are numeric:
 745 |                 region.seg.copy$COPY_NUMBER_A <- as.numeric(region.seg.copy$COPY_NUMBER_A)
 746 |                 region.seg.copy$COPY_NUMBER_B <- as.numeric(region.seg.copy$COPY_NUMBER_B)
 747 | 
 748 |                 # Rename columns:
 749 |                 sub.mat.copy               <- region.seg.copy
 750 |                 colnames(sub.mat.copy)[2]  <- 'Chromosome'
 751 |                 colnames(sub.mat.copy)[3]  <- 'StartPosition'
 752 |                 colnames(sub.mat.copy)[4]  <- 'EndPosition'
 753 | 
 754 | 
 755 |                 #pdf(early.late.pdf)
 756 |                 par(mar = c(0.5, 5, 0.5, 0.2))
 757 |                 par(lend = 1)
 758 | 
 759 |                 plot.simpleClusters.raw(seg.mat.patient = sub.mat.copy, most.likely.cluster = most.likely.cluster, cluster = cluster, TCGA.earlyLate = region.earlyLate, sub.clonal = 1)
 760 | 
 761 |                 mtexti(region, side = 2, off = 0.5)
 762 | 
 763 |                 ds <- density(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, as.numeric(region.earlyLate$mutCopyNum)))
 764 |                 ds1 <- ds
 765 |                 ds1$x <- ds$y
 766 |                 ds1$y <- ds$x
 767 |                 par(mar = c(0.5, 0, 0.5, 4))
 768 |                 A <- hist(ifelse(as.numeric(region.earlyLate$mutCopyNum) > 5, 5, as.numeric(region.earlyLate$mutCopyNum)), breaks = seq(-0.25, 6, by = 0.1), plot = FALSE)
 769 |                 plot(NULL, type = "n", xlim = c(0, max(A$density)), ylim = c(-0.25, 6), bty = 'n', xaxs = 'i', xaxt = 'n', yaxt = 'n', yaxs = 'i', xlab = "", main = "", ylab = "")
 770 |                 rect(0, A$breaks[1:(length(A$breaks) - 1)], A$density, A$breaks[2:length(A$breaks)], border = TRUE, col = "#CC6666")
 771 | 
 772 |                 lines(ds1)
 773 |             }
 774 |             dev.off()
 775 |         }
 776 | 
 777 |       
 778 | 
 779 |         # let's also write these to a table
 780 |         mut.table.remove <- mut.table[mut.table$mutation_id %in% muts.to.remove,, drop = FALSE]
 781 |         write.table(mut.table.remove, file = paste(new.dir, patient, ".removed.muts.txt", sep = ""), sep = "\t", quote = FALSE, col.names = NA)
 782 |         if (TRUE %in% c(mut.table.remove$driverCategory %in% driver_cat)) {
 783 |             cat("You're removing driver muts!!")
 784 |             removed.drivers <- mut.table.remove[mut.table.remove$driverCategory %in% driver_cat,, drop = FALSE]
 785 |             write.table(removed.drivers, file = paste(new.dir, patient, ".removed.drivers.txt", sep = ""), quote = FALSE, sep = "\t", col.names = NA)
 786 |             cat('\n')
 787 |             cat(removed.drivers$Gene.refGene)
 788 |         }
 789 | 
 790 |     }
 791 | 
 792 |     no.optima = length(unique(most.likely.cluster))
 793 |     max.cols = 12
 794 |     # require(RColorBrewer)
 795 |     cols           = paste(RColorBrewer::brewer.pal(min(max.cols,no.optima),name = "Paired"), sep = "")
 796 |     cols           = rep(cols, ceiling(no.optima / max.cols))[1:no.optima]
 797 |     cols.opac      = paste(cols, '99', sep = "")
 798 | 
 799 |     clean.most.likely.clusters <- most.likely.cluster[most.likely.cluster %in% c(1:solid.cluster.end)]
 800 | 
 801 | 
 802 |     # let's get confidence intervals for each of the mutations 
 803 |     # (and also plot the confidence intervals of the driver mutations)
 804 | 
 805 |     # # first, let's import the trace files
 806 |     region.trace              <- list()
 807 |     region.preClustPosterior  <- list()
 808 |     region.postClustPosterior <- list()
 809 | 
 810 |     for (region in regions.to.use) {
 811 |         clusters.to.use <- sapply(simpleClusterList, function(x) region %in% x$RegionsInCluster)
 812 |         clusters.to.use <- names(clusters.to.use)[clusters.to.use]
 813 | 
 814 |         for (clust in clusters.to.use) {
 815 |             pyclone.tsv <- read.table(paste0(new.dir, patient, "_cluster", clust, "/", patient, '.results.tsv'), stringsAsFactors = FALSE, header = TRUE)
 816 | 
 817 |             mutation_ids <- unlist(phylo.region.list[[region]]$mutation_id)
 818 |             if (length(grep("mutation_id", colnames(pyclone.tsv))) > 0) {
 819 |               tmp          <- intersect(mutation_ids, pyclone.tsv$mutation_id)
 820 |             } else {
 821 |               tmp          <- intersect(mutation_ids, rownames(pyclone.tsv))
 822 |             }
 823 |             phylo.region.list[[region]]$phyloCCF_PyClone.cluster[mutation_ids %in% tmp] <- most.likely.cluster[tmp]
 824 |         }
 825 |     }
 826 |         
 827 | 
 828 |     # Finally, let's put this into a megatable, and write this to an appropriate place
 829 |     save(phylo.region.list, file = paste(new.dir, patient, '.PhyloRegionList.RData', sep = ""))
 830 |     save.image(file = paste(new.dir, patient, ".PyClone.RData", sep = ""))
 831 | 
 832 | 
 833 |     print("Creating human readable output")
 834 |     ### creating human readable output
 835 |     print("Running non-original output")
 836 |     tmp.phylo.region.list <- lapply(phylo.region.list, function(x) {
 837 |         tmp <- data.frame(x, stringsAsFactors = FALSE)
 838 |         rownames(tmp) <- NULL
 839 |         tmp <- tmp %>% 
 840 |             dplyr::select(mutation_id, region, Reference_Base, Alternate_Base, ref_counts, var_counts, phyloCCF, phyloCCF.0.05, phyloCCF.0.95, absolute.ccf, mutCopyNum, major_cn, minor_cn) %>%
 841 |             dplyr::mutate(mutation_id = unlist(mutation_id), ref_counts = unlist(ref_counts), var_counts = unlist(var_counts), minor_cn = unlist(minor_cn), major_cn = unlist(major_cn), region = unlist(region), Reference_Base = unlist(Reference_Base), Alternate_Base = unlist(Alternate_Base)) %>%
 842 |             dplyr::rename(SAMPLE = region, REF = Reference_Base, ALT = Alternate_Base, REF_COUNT = ref_counts, VAR_COUNT = var_counts, CCF_PHYLO = phyloCCF, CCF_OBS = absolute.ccf, MUT_COPY = mutCopyNum, COPY_NUMBER_A = major_cn, COPY_NUMBER_B = minor_cn)
 843 |         return(tmp)
 844 |     })
 845 |     output_tsv <- dplyr::bind_rows(tmp.phylo.region.list) 
 846 | 
 847 |     output_tsv <- output_tsv %>% 
 848 |         dplyr::mutate(CLUSTER = most.likely.cluster[output_tsv$mutation_id],
 849 |                       CLEAN = ifelse(output_tsv$mutation_id %in% names(clean.most.likely.clusters), TRUE, FALSE))
 850 | 
 851 |     output_tsv <- output_tsv %>% 
 852 |         dplyr::mutate(CHR = as.numeric(sapply(strsplit(unlist(output_tsv$mutation_id), split = ":"), function(x) x[2])),
 853 |                       POS = as.numeric(sapply(strsplit(unlist(output_tsv$mutation_id), split = ":"), function(x) x[3])),
 854 |                       key = paste(paste0("chr", CHR), POS, REF, ALT, sep = ":")) %>%
 855 |         dplyr::left_join(input_tsv %>% dplyr::select(CASE_ID, SAMPLE, CHR, POS, REF, ALT, DEPTH, ACF, PLOIDY) %>% mutate(CHR = as.numeric(CHR), POS = as.numeric(POS)), by = c("CHR", "POS", "REF", "ALT", "SAMPLE")) %>%
 856 |         dplyr::select(CASE_ID, SAMPLE, CHR, POS, REF, ALT, REF_COUNT, VAR_COUNT, DEPTH, CLUSTER, CCF_PHYLO, CCF_OBS, MUT_COPY, COPY_NUMBER_A, COPY_NUMBER_B, ACF, PLOIDY, CLEAN, phyloCCF.0.05, phyloCCF.0.95)
 857 | 
 858 |     write.table(output_tsv %>% dplyr::select(-phyloCCF.0.05, -phyloCCF.0.95), file = paste0(new.dir, patient, ".SCoutput.FULL.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
 859 | 
 860 |     output_tsv_clean <- output_tsv %>% dplyr::filter(CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95)
 861 |     output_tsv_dirty <- output_tsv %>% dplyr::filter(!CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95)
 862 | 
 863 |     write.table(output_tsv_clean, file = paste0(new.dir, patient, ".SCoutput.CLEAN.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
 864 |     write.table(output_tsv_dirty, file = paste0(new.dir, patient, ".SCoutput.DIRTY.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
 865 | 
 866 | 
 867 |     #finally, save a version of the table that is cleaned
 868 | 
 869 |     if(clean_clusters %in% TRUE) {
 870 |         print("Cleaning clusters")
 871 |         output_tsv <- correct.clusters.from.table(output_tsv) #kg: merging clusters if sc cn correction created a new cluster
 872 |         dirty.clusters.remove <- output_tsv %>% 
 873 |             dplyr::group_by(CLUSTER) %>% 
 874 |             dplyr::filter(all(!CLEAN)) %>% 
 875 |             dplyr::pull(CLUSTER) %>% 
 876 |             unique()
 877 |         output_tsv <- output_tsv %>% 
 878 |             dplyr::mutate(CLEAN = ifelse(!CLUSTER %in% dirty.clusters.remove, TRUE, FALSE))
 879 | 
 880 |         ### merge clusters present in all regions at given thresholds
 881 |         print("Final merging of ubiqquitous clusters")
 882 |         ### select ubiquitous mutations in clean clusters
 883 |         issue_mutations <- output_tsv %>% 
 884 |             dplyr::mutate(mutation_id = paste(CASE_ID, CHR, POS, REF, sep = ":")) %>% 
 885 |             dplyr::filter(CLEAN, mutation_id %in% ITH1muts)
 886 |         if (nrow(issue_mutations) == 0) {
 887 |             print("No additional clusters corrected")
 888 |         } else {
 889 |             clusters_to_consider <- unique(issue_mutations$CLUSTER)
 890 |             ### calculate proportion of mutations that are greater or equal to the clonal threshold in each region
 891 |             clonalProportion.df <- issue_mutations %>% 
 892 |                 dplyr::group_by(CLUSTER, SAMPLE) %>%
 893 |                 dplyr::mutate(nMuts = dplyr::n(),
 894 |                               propClonal = sum(phyloCCF.0.95 >= clonal_cutOff) / nMuts) %>%
 895 |                 dplyr::select(CLUSTER, SAMPLE, propClonal) %>%
 896 |                 unique() %>% 
 897 |                 dplyr::ungroup()
 898 | 
 899 |             ### extract the lowest proportion and filter for clusters where this is greater than the threshold
 900 |             clusters_to_change <- as.character(clonalProportion.df %>%
 901 |                 dplyr::group_by(CLUSTER) %>% 
 902 |                 dplyr::mutate(minPropClonal = min(propClonal)) %>%
 903 |                 dplyr::select(CLUSTER, minPropClonal) %>%
 904 |                 unique() %>%
 905 |                 dplyr::filter(minPropClonal > propClonal_threshold) %>%
 906 |                 dplyr::pull(CLUSTER))
 907 | 
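 908 |             ### if two or more clusters are above this threshold, merge them all into min(clusters_to_change); NOTE: clusters_to_change is character, so min() compares cluster IDs as strings (e.g. "10" sorts before "2"), not numerically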
 909 |             if (length(clusters_to_change) == 0) {
 910 |                 print("No clusters above specified thresholds")
 911 |             } else if (length(clusters_to_change) == 1) {
 912 |                 print("Only single cluster above thresholds identified. Nothing to merge")
 913 |             } else {
 914 |                 output_tsv <- output_tsv %>%
 915 |                     dplyr::mutate(CLUSTER = ifelse(CLUSTER %in% clusters_to_change, min(clusters_to_change), CLUSTER))
 916 |             }
 917 |         }
 918 | 
 919 |         write.table(output_tsv %>% dplyr::select(-phyloCCF.0.05, -phyloCCF.0.95), file = paste0(new.dir, patient, ".SCoutput.FULL.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
 920 |         output_tsv_clean <- output_tsv %>% dplyr::filter(CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95)
 921 |         output_tsv_dirty <- output_tsv %>% dplyr::filter(!CLEAN) %>% dplyr::select(-CLEAN, -phyloCCF.0.05, -phyloCCF.0.95)
 922 |         write.table(output_tsv_clean, file = paste0(new.dir, patient, ".SCoutput.CLEAN.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
 923 |         write.table(output_tsv_dirty, file = paste0(new.dir, patient, ".SCoutput.DIRTY.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
 924 | 
 925 |     }
 926 | 
 927 |     if(length(regions.to.use) > 1) {
 928 |         ### plot heatmap
 929 |         ### plot the clusters
 930 |       
 931 |       
 932 |       pdf(paste(new.dir, "/", patient, "_pyclone_cluster_assignment_ccf_dirty",".pdf",sep=""),height=4,width=4)
 933 |       plot.pyclone.clusters(patient = patient
 934 |                             ,regionList=phylo.region.list
 935 |                             ,mut.table=mut.table
 936 |                             ,regions.to.use = regions.to.use
 937 |                             ,mostLikelyClusters = most.likely.cluster
 938 |                             ,driverCat = driver_cat
 939 |                             ,ccf='absolute'
 940 |       )
 941 |       dev.off()
 942 |       
 943 |       pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_phylo_dirty",".pdf",sep=""),height=4,width=4)
 944 |       plot.pyclone.clusters(patient=patient
 945 |                             ,regionList=phylo.region.list
 946 |                             ,mut.table=mut.table
 947 |                             ,regions.to.use = regions.to.use
 948 |                             ,mostLikelyClusters = most.likely.cluster
 949 |                             ,driverCat = driver_cat
 950 |                             ,ccf='phylo'
 951 |       )
 952 |       dev.off()
 953 |       
 954 |       pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_mutCpn_dirty",".pdf",sep=""),height=8,width=8)
 955 |       plot.pycloneMutCpn.clusters(patient=patient
 956 |                             ,regionList=phylo.region.list
 957 |                             ,mut.table=mut.table
 958 |                             ,regions.to.use = regions.to.use
 959 |                             ,mostLikelyClusters = most.likely.cluster
 960 |                             ,driverCat = driver_cat
 961 |                             ,ccf='phylo'
 962 |       )
 963 |       dev.off()
 964 |       
 965 |       
 966 |       pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_mutCpn_clean",".pdf",sep=""),height=8,width=8)
 967 |       plot.pycloneMutCpn.clusters(patient=patient
 968 |                             ,regionList=phylo.region.list
 969 |                             ,mut.table=mut.table
 970 |                             ,regions.to.use = regions.to.use
 971 |                             ,mostLikelyClusters = clean.most.likely.clusters
 972 |                             ,driverCat = driver_cat
 973 |                             ,ccf='phylo'
 974 |       )
 975 |       dev.off()
 976 | 
 977 |       pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_phylo_clean",".pdf",sep=""),height=4,width=4)
 978 |       plot.pyclone.clusters(patient=patient
 979 |                             ,regionList=phylo.region.list
 980 |                             ,mut.table=mut.table
 981 |                             ,regions.to.use = regions.to.use
 982 |                             ,mostLikelyClusters = clean.most.likely.clusters
 983 |                             ,driverCat = driver_cat
 984 |                             ,ccf='phylo'
 985 |       )
 986 |       dev.off()
 987 |       
 988 |       pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_ccf_clean",".pdf",sep=""),height=4,width=4)
 989 |       plot.pyclone.clusters(patient=patient
 990 |                             ,regionList=phylo.region.list
 991 |                             ,mut.table=mut.table
 992 |                             ,regions.to.use = regions.to.use
 993 |                             ,mostLikelyClusters = clean.most.likely.clusters
 994 |                             ,driverCat = driver_cat
 995 |                             ,ccf='absolute'
 996 |       )
 997 |       dev.off()
 998 |     }
 999 | 
1000 |     # let's plot the copy number clusters
1001 |     # Using seg file if exists for plotting
1002 | 
1003 |     if (!is.null(input_seg_tsv_loc)) {
1004 |         print("Using specified seg file for plotting")
1005 |         seg.mat.copy.plot     <- read.delim2(input_seg_tsv_loc, stringsAsFactors = F)
1006 |         seg.mat.copy.plot$SAMPLE <- gsub("-", "\\.", seg.mat.copy.plot$SAMPLE)
1007 |         seg.mat.copy.plot$COPY_NUMBER_A <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_A)
1008 |         seg.mat.copy.plot$COPY_NUMBER_B <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_B)
1009 |         seg.mat.copy.plot$CHR           <- as.numeric(seg.mat.copy.plot$CHR)
1010 |         seg.mat.copy.plot$STARTPOS      <- as.numeric(seg.mat.copy.plot$STARTPOS)
1011 |         
1012 |         #Sort by start position within chromosome
1013 |         seg.mat.copy.plot <- seg.mat.copy.plot[order(seg.mat.copy.plot$CHR,
1014 |                                                      seg.mat.copy.plot$STARTPOS), ]
1015 |         # If providing seg file, check its sample names against those from the input tsv (seg.mat.copy[,1]); this stops only when NONE of them match, so a partial overlap is accepted
1016 |         if (!any(unique(seg.mat.copy.plot$SAMPLE) %in% unique(seg.mat.copy[,1]))) {
1017 |             stop('Sample IDs do not match between input_tsv and input_seg_tsv')
1018 |         }
1019 |     } else {
1020 |         print("Using tsv data for plotting")
1021 |         seg.mat.copy.plot <- seg.mat.copy
1022 |     }
1023 | 
1024 |     # ensure COPY_NUMBER_A and COPY_NUMBER_B are numeric
1025 |     seg.mat.copy.plot$COPY_NUMBER_A <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_A)
1026 |     seg.mat.copy.plot$COPY_NUMBER_B <- as.numeric(seg.mat.copy.plot$COPY_NUMBER_B)
1027 | 
1028 |     pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_copynumber_dirty",".pdf",sep=""), height = ifelse(length(regions.to.use) == 1, 5, length(regions.to.use)))
1029 |     plot.region.mutCopyNum(phylo.region.list = phylo.region.list,seg.mat.copy = seg.mat.copy.plot,mostLikelyClusters = most.likely.cluster,plot.separate.clusters = TRUE)
1030 |     dev.off()
1031 | 
1032 |     pdf(paste(new.dir,"/",patient,"_pyclone_cluster_assignment_copynumber_clean",".pdf",sep=""), height = ifelse(length(regions.to.use) == 1, 5, length(regions.to.use)))
1033 |     plot.region.mutCopyNum(phylo.region.list = phylo.region.list,seg.mat.copy = seg.mat.copy.plot,mostLikelyClusters = clean.most.likely.clusters,plot.separate.clusters = TRUE)
1034 |     dev.off()
1035 | }
1036 | 
1037 | 
1038 | 


--------------------------------------------------------------------------------