├── .gitignore ├── LICENSE ├── inst ├── .DS_Store ├── vignette-supp │ ├── unnamed-chunk-13-1.png │ ├── unnamed-chunk-14-1.png │ ├── unnamed-chunk-15-1.png │ ├── unnamed-chunk-16-1.png │ └── unnamed-chunk-17-1.png ├── CITATION └── extdata │ ├── small_seqs.fa │ └── seqs.fa.treefile ├── NAMESPACE ├── .Rbuildignore ├── rhierbaps.Rproj ├── man ├── preproc_alignment.Rd ├── calc_log_ml.Rd ├── calc_change_in_ml.Rd ├── save_lml_logs.Rd ├── load_fasta.Rd ├── plot_sub_cluster.Rd ├── join_units_2.Rd ├── split_clusters_3.Rd ├── model_search_parallel.Rd ├── reallocate_units_4.Rd ├── move_units_1.Rd └── hierBAPS.Rd ├── NEWS.md ├── cran-comments.md ├── DESCRIPTION ├── LICENSE.md ├── R ├── save_lml_logs.R ├── split_clusters_3.R ├── preproc_alignment.R ├── calc_log_ml.R ├── load_fasta.R ├── log_stirling2.R ├── plot_sub_cluster.R ├── reallocate_units_4.R ├── move_units_1.R ├── join_units_2.R ├── calc_change_in_ml.R ├── model_search_parallel.R └── hierBAPS.R ├── README.Rmd ├── .github └── workflows │ └── R-CMD-check.yaml ├── vignettes ├── bibliography.bib └── introduction.Rmd └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | inst/doc 2 | .Rproj.user 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Gerry Tonkin-Hill 3 | -------------------------------------------------------------------------------- /inst/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/.DS_Store -------------------------------------------------------------------------------- /inst/vignette-supp/unnamed-chunk-13-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-13-1.png -------------------------------------------------------------------------------- /inst/vignette-supp/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /inst/vignette-supp/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /inst/vignette-supp/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /inst/vignette-supp/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(hierBAPS) 4 | export(load_fasta) 5 | export(plot_sub_cluster) 6 | export(save_lml_logs) 7 | import(patchwork) 8 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | cran-comments.md 5 | .travis.yml 6 | README.Rmd 7 | .gitignore 8 | ^LICENSE\.md$ 9 | revdep 10 | revdep/* 11 | ^\.github$ 12 | -------------------------------------------------------------------------------- /rhierbaps.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageCheckArgs: --as-cran 19 | PackageRoxygenize: rd,collate,namespace 20 | -------------------------------------------------------------------------------- /man/preproc_alignment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/preproc_alignment.R 3 | \name{preproc_alignment} 4 | \alias{preproc_alignment} 5 | \title{preproc_alignment} 6 | \usage{ 7 | preproc_alignment(snp.matrix) 8 | } 9 | \arguments{ 10 | \item{snp.matrix}{A matrix containing SNP data. Rows indicate isolates and columns loci.} 11 | } 12 | \value{ 13 | an snp.object 14 | } 15 | \description{ 16 | Preprocessed the snp matrix for hierBAPS. 17 | } 18 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # rhierbaps 1.1.2 2 | Fixed issue with stirling2 dependency causing the algorithm to stop prematurely for very large alignments. 3 | 4 | # rhierbaps 1.1.1 5 | Removed call to ggtree in introduction vignette as waiting on patch for ggtree given the new version of tibble. 6 | 7 | # rhierbaps 1.1.0 8 | * minor bug fix that caused an error when a cluster contained only a single SNP 9 | * added functionality to load in ape DNAbin objects 10 | * added functionality to calculate individual assignment probabilities 11 | 12 | # Initial release 1.0.0devtools::release() -------------------------------------------------------------------------------- /man/calc_log_ml.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calc_log_ml.R 3 | \name{calc_log_ml} 4 | \alias{calc_log_ml} 5 | \title{calc_log_ml} 6 | \usage{ 7 | calc_log_ml(snp.object, partition) 8 | } 9 | \arguments{ 10 | \item{snp.object}{A snp.object containing the processed SNP data.} 11 | 12 | \item{partition}{An integer vector indicating a partition of the isolates.} 13 | } 14 | \value{ 15 | The log marginal likelihood of the given partition. 16 | } 17 | \description{ 18 | Calculate the log marginal likelihood assuming a Multinomial-Dirichlet distribution 19 | } 20 | -------------------------------------------------------------------------------- /man/calc_change_in_ml.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calc_change_in_ml.R 3 | \name{calc_change_in_ml} 4 | \alias{calc_change_in_ml} 5 | \title{calc_change_in_ml} 6 | \usage{ 7 | calc_change_in_ml(snp.object, partition, indexes) 8 | } 9 | \arguments{ 10 | \item{snp.object}{A snp.object containing the processed SNP data.} 11 | 12 | \item{partition}{An integer vector indicating a partition of the isolates.} 13 | 14 | \item{indexes}{Indexes of the isolates to be moved (must come from one cluster.)} 15 | } 16 | \value{ 17 | the best cluster to move indexes to. 18 | } 19 | \description{ 20 | Calculate the change in the log marginal likelihood after moving index to each possible cluster 21 | } 22 | -------------------------------------------------------------------------------- /man/save_lml_logs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/save_lml_logs.R 3 | \name{save_lml_logs} 4 | \alias{save_lml_logs} 5 | \title{save_lml_logs} 6 | \usage{ 7 | save_lml_logs(hb.object, file) 8 | } 9 | \arguments{ 10 | \item{hb.object}{The resulting object from runnign hierBAPS} 11 | 12 | \item{file}{The file you would like to save the log output to.} 13 | } 14 | \description{ 15 | Saves the log marginal likelihoods to a text file. 16 | } 17 | \examples{ 18 | snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps")) 19 | hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20) 20 | save_lml_logs(hb.result, file.path(tempdir(), "output_file.txt")) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Resubmission 2 | This is a an updated version. In this version I have: 3 | 4 | * Fixed an error cause by changes to the ggtree plotting package 5 | 6 | ## Resubmission 7 | This is a an updated version. In this version I have: 8 | 9 | * Fixed a bug that caused the algorithm to stop prematurely on large alignments 10 | 11 | ## Resubmission 12 | This is a an updated version. In this version I have: 13 | 14 | * Fixed a minor bug that caused an error when a cluster contained only a single SNP 15 | * Added functionality to accept an ape DNAbin object 16 | 17 | ## Test environments 18 | * local OS X install, R 3.6.1 19 | * ubuntu 16.04 (on travis-ci), R 3.6.1 20 | * win-builder (devel and release) 21 | 22 | ## R CMD check results 23 | There were no ERRORs, WARNINGs or NOTEs. -------------------------------------------------------------------------------- /man/load_fasta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_fasta.R 3 | \name{load_fasta} 4 | \alias{load_fasta} 5 | \title{load_fasta} 6 | \usage{ 7 | load_fasta(msa, keep.singletons = FALSE) 8 | } 9 | \arguments{ 10 | \item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered} 11 | 12 | \item{keep.singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters} 13 | } 14 | \value{ 15 | A character matrix with filtered SNP data 16 | } 17 | \description{ 18 | Loads a fasta file into matrix format ready for 19 | running the hierBAPS algorithm. 20 | } 21 | \examples{ 22 | msa <- system.file("extdata", "seqs.fa", package = "rhierbaps") 23 | snp.matrix <- load_fasta(msa) 24 | } 25 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("Currently, the best way to cite rhierbaps in publications is to use:") 2 | 3 | citEntry(entry = "Article", 4 | title = "Hierarchical and spatially explicit clustering of {DNA} sequences 5 | with {BAPS} software", 6 | author = personList(as.person("Lu Cheng"), 7 | as.person("Thomas R Connor"), 8 | as.person("Jukka Siren"), 9 | as.person("David M. Aanensen"), 10 | as.person("Jukka Corander")), 11 | journal = "Mol. Biol. Evol.", 12 | year = "2013", 13 | volume = "30", 14 | number = "5", 15 | pages = "1224--1228", 16 | url = "http://dx.doi.org/10.1093/molbev/mst028", 17 | 18 | textVersion = 19 | paste("Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013.", 20 | "Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.", 21 | "Molecular Biology and Evolution 30 (5): 1224–28.") 22 | ) 23 | 24 | 25 | -------------------------------------------------------------------------------- /man/plot_sub_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_sub_cluster.R 3 | \name{plot_sub_cluster} 4 | \alias{plot_sub_cluster} 5 | \title{plot_sub_cluster} 6 | \usage{ 7 | plot_sub_cluster(hb.object, tree, level, sub.cluster) 8 | } 9 | \arguments{ 10 | \item{hb.object}{The resulting object from running hierBAPS} 11 | 12 | \item{tree}{A phylo tree object to plot} 13 | 14 | \item{level}{The level of the subcluster to be considered.} 15 | 16 | \item{sub.cluster}{An integer representing the subcluster to be considered.} 17 | } 18 | \description{ 19 | Creates a zoom plot using ggtree focusing on a cluster. 20 | } 21 | \examples{ 22 | \donttest{ 23 | snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps")) 24 | newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps") 25 | tree <- phytools::read.newick(newick.file.name) 26 | hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20) 27 | plot_sub_cluster(hb.result, tree, level = 1, sub.cluster = 9) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/join_units_2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/join_units_2.R 3 | \name{join_units_2} 4 | \alias{join_units_2} 5 | \title{join_units_2} 6 | \usage{ 7 | join_units_2( 8 | snp.object, 9 | partition, 10 | threshold = 1e-05, 11 | n.cores = 1, 12 | comb.chache = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{snp.object}{A snp.object containing the processed SNP data.} 17 | 18 | \item{partition}{An integer vector indicating an initial partition of the isolates.} 19 | 20 | \item{threshold}{The increase in marginal log likelihood required to accept a move.} 21 | 22 | \item{n.cores}{The number of cores to use.} 23 | 24 | \item{comb.chache}{a matrix recording previous marginal llks of combining clusters} 25 | } 26 | \value{ 27 | The best partition after combining two clusters as well as 28 | a boolean value indicating whether a move increased the marginal likelihood. 29 | } 30 | \description{ 31 | Peform an iteration of the second move in the algorithm. That is combine two clusters 32 | to improve the marginal likelihood. 33 | } 34 | -------------------------------------------------------------------------------- /man/split_clusters_3.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_clusters_3.R 3 | \name{split_clusters_3} 4 | \alias{split_clusters_3} 5 | \title{split_clusters_3} 6 | \usage{ 7 | split_clusters_3( 8 | snp.object, 9 | partition, 10 | threshold = 1e-05, 11 | min.clust.size = 20, 12 | n.cores = 1 13 | ) 14 | } 15 | \arguments{ 16 | \item{snp.object}{A snp.object containing the processed SNP data.} 17 | 18 | \item{partition}{An integer vector indicating an initial partition of the isolates.} 19 | 20 | \item{threshold}{The increase in marginal log likelihood required to accept a move.} 21 | 22 | \item{min.clust.size}{Clusters smaller than min.clust.size will not be split.} 23 | 24 | \item{n.cores}{The number of cores to use.} 25 | } 26 | \value{ 27 | The best partition after splitting a cluster and re-allocating as well as 28 | a boolean value indicating whether a move increased the marginal likelihood. 29 | } 30 | \description{ 31 | Peform an iteration of the third move in the algorithm. That is split cluster in two 32 | and re-allocate one sub-cluster. 33 | } 34 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rhierbaps 2 | Type: Package 3 | Title: Clustering Genetic Sequence Data Using the HierBAPS Algorithm 4 | Version: 1.1.3 5 | Authors@R: person("Gerry", "Tonkin-Hill", email = "g.tonkinhill@gmail.com", role = c("cre", "aut")) 6 | Description: Implements the hierarchical Bayesian analysis of populations structure (hierBAPS) 7 | algorithm of Cheng et al. (2013) for clustering DNA sequences 8 | from multiple sequence alignments in FASTA format. 9 | The implementation includes improved defaults and plotting capabilities 10 | and unlike the original 'MATLAB' version removes singleton SNPs by default. 11 | License: MIT + file LICENSE 12 | Encoding: UTF-8 13 | LazyData: true 14 | Imports: 15 | ape, 16 | purrr, 17 | utils, 18 | ggplot2, 19 | matrixStats, 20 | patchwork 21 | RoxygenNote: 7.1.0 22 | Suggests: knitr, 23 | rmarkdown, 24 | markdown, 25 | ggtree, 26 | phytools, 27 | testthat, 28 | formatR 29 | VignetteBuilder: knitr 30 | URL: https://github.com/gtonkinhill/rhierbaps 31 | BugReports: https://github.com/gtonkinhill/rhierbaps/issues 32 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2018 Gerry Tonkin-Hill 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /R/save_lml_logs.R: -------------------------------------------------------------------------------- 1 | #' save_lml_logs 2 | #' 3 | #' Saves the log marginal likelihoods to a text file. 4 | #' 5 | #' 6 | #' @param hb.object The resulting object from runnign hierBAPS 7 | #' @param file The file you would like to save the log output to. 8 | #' 9 | #' @examples 10 | #' snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps")) 11 | #' hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20) 12 | #' save_lml_logs(hb.result, file.path(tempdir(), "output_file.txt")) 13 | #' 14 | #' @export 15 | save_lml_logs <- function(hb.object, file){ 16 | 17 | if (!is.character(file)) stop("Invalid file name. Must be a valid string.") 18 | if ((!is.list(hb.object) || !is.data.frame(hb.object$partition.df) 19 | ) || !is.list(hb.object$lml.list)) stop("Invalid hb.object!") 20 | 21 | sink(file) 22 | for(i in 1:length(hb.object$lml.list)){ 23 | cat(names(hb.object$lml.list)[[i]], "\n") 24 | cat("cluster names:\t", paste(names(hb.object$lml.list[[i]]), collapse = "\t"), "\n") 25 | cat("cluster log marginal likelihood:\t", paste(hb.object$lml.list[[i]], collapse = "\t"), "\n") 26 | } 27 | sink() 28 | } 29 | -------------------------------------------------------------------------------- /man/model_search_parallel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model_search_parallel.R 3 | \name{model_search_parallel} 4 | \alias{model_search_parallel} 5 | \title{model_search_parallel} 6 | \usage{ 7 | model_search_parallel( 8 | snp.object, 9 | partition, 10 | round.types, 11 | quiet = FALSE, 12 | n.extra.rounds = 0, 13 | n.cores = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{snp.object}{A snp.object containing the processed SNP data.} 18 | 19 | \item{partition}{An integer vector indicating an initial starting partition.} 20 | 21 | \item{round.types}{A vector indicating which series of moves to make.} 22 | 23 | \item{quiet}{Whether to suppress progress information (default=FALSE).} 24 | 25 | \item{n.extra.rounds}{The number of additional rounds to perform after the default hierBAPS 26 | settings (default=0). If set to Inf it will run until a local optimum is reached 27 | (this might take a long time).} 28 | 29 | \item{n.cores}{The number of cores to use.} 30 | } 31 | \value{ 32 | an optimised partition and marginal llk 33 | } 34 | \description{ 35 | Clusters DNA alignment using independent loci model 36 | } 37 | -------------------------------------------------------------------------------- /man/reallocate_units_4.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reallocate_units_4.R 3 | \name{reallocate_units_4} 4 | \alias{reallocate_units_4} 5 | \title{reallocate_units_4} 6 | \usage{ 7 | reallocate_units_4( 8 | snp.object, 9 | partition, 10 | threshold = 1e-05, 11 | min.clust.size = 20, 12 | split = FALSE, 13 | n.cores = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{snp.object}{A snp.object containing the processed SNP data.} 18 | 19 | \item{partition}{An integer vector indicating an initial partition of the isolates.} 20 | 21 | \item{threshold}{The increase in marginal log likelihood required to accept a move.} 22 | 23 | \item{min.clust.size}{Clusters smaller than min.clust.size will not be split.} 24 | 25 | \item{split}{Whether to split only into two clusters (for move type 3).} 26 | 27 | \item{n.cores}{The number of cores to use.} 28 | } 29 | \value{ 30 | The best partition after splitting a cluster and re-allocating as well as 31 | a boolean value indicating whether a move increased the marginal likelihood. 32 | } 33 | \description{ 34 | Peform an iteration of the fourth move in the algorithm. That is split cluster into n 35 | subclusters and re-allocate one sub-cluster. 36 | } 37 | -------------------------------------------------------------------------------- /man/move_units_1.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/move_units_1.R 3 | \name{move_units_1} 4 | \alias{move_units_1} 5 | \title{move_units_1} 6 | \usage{ 7 | move_units_1( 8 | snp.object, 9 | partition, 10 | threshold = 1e-05, 11 | frac.clust.searched = 0.3, 12 | min.clust.size = 20, 13 | n.cores = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{snp.object}{A snp.object containing the processed SNP data.} 18 | 19 | \item{partition}{An integer vector indicating an initial partition of the isolates.} 20 | 21 | \item{threshold}{The increase in marginal log likelihood required to accept a move.} 22 | 23 | \item{frac.clust.searched}{The percentage of a large cluster that will be moved.} 24 | 25 | \item{min.clust.size}{All isolates in clusters less than or equal to min.clus.size will be searched.} 26 | 27 | \item{n.cores}{The number of cores to use.} 28 | } 29 | \value{ 30 | The best partition after moving units from one cluster to another as well as 31 | a boolean value indicating whether a move increased the marginal likelihood. 32 | } 33 | \description{ 34 | Peform an iteration of the first move in the algorithm. That is move units from one cluster to 35 | another to improve the marginal likelihood 36 | } 37 | -------------------------------------------------------------------------------- /R/split_clusters_3.R: -------------------------------------------------------------------------------- 1 | #' split_clusters_3 2 | #' 3 | #' Peform an iteration of the third move in the algorithm. That is split cluster in two 4 | #' and re-allocate one sub-cluster. 5 | #' 6 | #' 7 | #' @param snp.object A snp.object containing the processed SNP data. 8 | #' @param partition An integer vector indicating an initial partition of the isolates. 9 | #' @param threshold The increase in marginal log likelihood required to accept a move. 10 | #' @param min.clust.size Clusters smaller than min.clust.size will not be split. 11 | #' @param n.cores The number of cores to use. 12 | #' 13 | #' @return The best partition after splitting a cluster and re-allocating as well as 14 | #' a boolean value indicating whether a move increased the marginal likelihood. 15 | #' 16 | split_clusters_3 <- function(snp.object, partition, threshold=1e-5, 17 | min.clust.size=20, n.cores=1){ 18 | #At the moment this can't create new clusters. This is the same as in the original hierBAPS 19 | #but it might be worth allowing the creation of new clusters. TODO:Ask Jukka about it. 20 | return(reallocate_units_4(snp.object, partition, threshold=1e-5, 21 | min.clust.size=min.clust.size, split=TRUE, 22 | n.cores=n.cores)) 23 | 24 | 25 | } 26 | -------------------------------------------------------------------------------- /R/preproc_alignment.R: -------------------------------------------------------------------------------- 1 | #' preproc_alignment 2 | #' 3 | #' Preprocessed the snp matrix for hierBAPS. 4 | #' 5 | #' 6 | #' @param snp.matrix A matrix containing SNP data. Rows indicate isolates and columns loci. 7 | #' 8 | #' @return an snp.object 9 | #' 10 | preproc_alignment <- function(snp.matrix){ 11 | if(!is.matrix(snp.matrix)) stop("snp.matrix is not a valid matrix") 12 | 13 | n.seq <- nrow(snp.matrix) 14 | 15 | # prior: 1/number of distinct nucleotides observed at that column 16 | prior <- apply(snp.matrix, 2, function(x) table(factor(x, levels = c("a","c","g","t")), 17 | exclude = c("-", NA))) 18 | 19 | #TODO: should we be getting rid of unique SNPs here? 20 | #now ignore conserved columns 21 | keep <- colSums(prior>0)>1 22 | 23 | if(sum(keep)==0){ 24 | #all columns are conserved 25 | return(NA) 26 | } 27 | 28 | snp.matrix <- snp.matrix[, keep, drop=FALSE] 29 | prior <- prior[, keep, drop=FALSE] 30 | 31 | #finally generate a matrix for the prior nt values 32 | prior <- 1*(prior>0) 33 | prior <- t(t(prior)/colSums(prior)) 34 | 35 | #Calculate hamming distance 36 | orig.dist <- as.matrix(ape::dist.dna(ape::as.DNAbin(snp.matrix), 37 | model = "N", pairwise.deletion = TRUE)) 38 | 39 | return(list( 40 | n.seq = n.seq, 41 | dist = orig.dist, 42 | seq.inds = 1:n.seq, 43 | prior = prior, 44 | data = snp.matrix 45 | )) 46 | } 47 | -------------------------------------------------------------------------------- /R/calc_log_ml.R: -------------------------------------------------------------------------------- 1 | #' calc_log_ml 2 | #' 3 | #' Calculate the log marginal likelihood assuming a Multinomial-Dirichlet distribution 4 | #' 5 | #' 6 | #' @param snp.object A snp.object containing the processed SNP data. 7 | #' @param partition An integer vector indicating a partition of the isolates. 8 | #' 9 | #' @return The log marginal likelihood of the given partition. 10 | #' 11 | calc_log_ml <- function(snp.object, partition){ 12 | #some checks 13 | if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!") 14 | if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!") 15 | 16 | #get allele counts for each cluster in partition 17 | mA <- t(rowsum(1*(snp.object$data=="a"), partition)) 18 | mC <- t(rowsum(1*(snp.object$data=="c"), partition)) 19 | mG <- t(rowsum(1*(snp.object$data=="g"), partition)) 20 | mT <- t(rowsum(1*(snp.object$data=="t"), partition)) 21 | 22 | prior <- snp.object$prior 23 | prior[prior==0] <- 1 #deal with zeros and resulting NAs 24 | 25 | #calculate log marginal likelihood 26 | term1 <- -lgamma(1 + mA+mC+mG+mT) 27 | term2 <- lgamma(prior["a", ] + mA) - lgamma(prior["a", ]) 28 | term2 <- term2 + lgamma(prior["c", ] + mC) - lgamma(prior["c", ]) 29 | term2 <- term2 + lgamma(prior["g", ] + mG) - lgamma(prior["g", ]) 30 | term2 <- term2 + lgamma(prior["t", ] + mT) - lgamma(prior["t", ]) 31 | 32 | #take sum over all loci and clusters 33 | ml <- sum(term1 + term2) 34 | 35 | #uniform prior on K: 36 | #prior prob of each partition is equal so must divide by the number of possible 37 | #partitions. 38 | stirling2 <- log_stirling2(nrow(snp.object$data), length(unique(partition))) 39 | stopifnot(is.finite(stirling2)) 40 | stopifnot(!is.na(stirling2)) 41 | 42 | ml <- ml - stirling2 43 | 44 | return(ml) 45 | } 46 | -------------------------------------------------------------------------------- /R/load_fasta.R: -------------------------------------------------------------------------------- 1 | #' load_fasta 2 | #' 3 | #' Loads a fasta file into matrix format ready for 4 | #' running the hierBAPS algorithm. 5 | #' 6 | #' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered 7 | #' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters 8 | #' 9 | #' @return A character matrix with filtered SNP data 10 | #' 11 | #' @examples 12 | #' msa <- system.file("extdata", "seqs.fa", package = "rhierbaps") 13 | #' snp.matrix <- load_fasta(msa) 14 | #' @export 15 | load_fasta <- function(msa, keep.singletons=FALSE) { 16 | 17 | #Check inputs 18 | if(class(msa)=="character"){ 19 | if (!file.exists(msa)) stop("Invalid msa or the file does not exist!") 20 | seqs <- ape::read.FASTA(msa) 21 | } else if(class(msa)=="matrix"){ 22 | seqs <- ape::as.DNAbin(msa) 23 | } else if(class(msa)=="DNAbin"){ 24 | seqs <- msa 25 | } else{ 26 | stop("incorrect input for msa!") 27 | } 28 | if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.") 29 | 30 | #Load sequences using ape. This does a lot of the checking for us. 31 | seq_names <- labels(seqs) 32 | seqs <- as.character(as.matrix(seqs)) 33 | rownames(seqs) <- seq_names 34 | seqs[is.na(seqs)] <- "-" 35 | 36 | if (nrow(seqs)<3) stop("Less than 3 sequences!") 37 | warning("Characters not in acgtnACGTN- will be treated as missing (-)...") 38 | 39 | #Remove conserved columns 40 | conserved <- colSums(t(t(seqs)==seqs[1,]))==nrow(seqs) 41 | seqs <- seqs[, !conserved] 42 | 43 | if(!keep.singletons){ 44 | #remove singletons as they are uninformative in the algorithm 45 | is_singleton <- apply(seqs, 2, function(x){ 46 | tab <- table(x) 47 | return(x %in% names(tab)[tab==1]) 48 | }) 49 | seqs[is_singleton] <- "-" 50 | } 51 | 52 | #Convert gaps and unknowns to same symbol 53 | seqs[seqs=="n"] <- "-" 54 | 55 | return(seqs) 56 | } 57 | -------------------------------------------------------------------------------- /man/hierBAPS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hierBAPS.R 3 | \name{hierBAPS} 4 | \alias{hierBAPS} 5 | \title{hierBAPS} 6 | \usage{ 7 | hierBAPS( 8 | snp.matrix, 9 | max.depth = 2, 10 | n.pops = floor(nrow(snp.matrix)/5), 11 | quiet = FALSE, 12 | n.extra.rounds = 0, 13 | assignment.probs = FALSE, 14 | n.cores = 1 15 | ) 16 | } 17 | \arguments{ 18 | \item{snp.matrix}{Character matrix of aligned sequences produced by \link{load_fasta}.} 19 | 20 | \item{max.depth}{Maximum depth of hierarchical search (default = 2).} 21 | 22 | \item{n.pops}{Maximum number of populations in the data (default = number of isolates/5)} 23 | 24 | \item{quiet}{Whether to suppress progress information (default=FALSE).} 25 | 26 | \item{n.extra.rounds}{The number of additional rounds to perform after the default hierBAPS 27 | settings (default=0). If set to Inf it will run until a local optimum is reached 28 | (this might take a long time).} 29 | 30 | \item{assignment.probs}{whether or not to calculate the assignment probabilities to each cluster (default=FALSE)} 31 | 32 | \item{n.cores}{The number of cores to use.} 33 | } 34 | \value{ 35 | A list containing a dataframe indicating an assignment of each sequence 36 | to hierarchical clusters as well as the log marginal likelihoods for each level. 37 | } 38 | \description{ 39 | Runs the hierBAPS algorithm of Cheng et al. 2013 40 | } 41 | \examples{ 42 | snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps")) 43 | hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE) 44 | 45 | \donttest{ 46 | snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps")) 47 | system.time({hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE)}) 48 | } 49 | 50 | } 51 | \references{ 52 | Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5): 1224–28. 53 | } 54 | \author{ 55 | Gerry Tonkin-Hill 56 | } 57 | -------------------------------------------------------------------------------- /R/log_stirling2.R: -------------------------------------------------------------------------------- 1 | #' log_stirling2 2 | #' 3 | #' @param n number of objects 4 | #' @param k number of partitions 5 | #' 6 | #' @return log of the Stirling number of the second kind 7 | #' 8 | #' 9 | log_stirling2 <- function(n, k){ 10 | if(!is.numeric(n)) stop("n is not numeric!") 11 | if(!is.numeric(k)) stop("k is not numeric!") 12 | if(k>n) stop("k must be less than n!") 13 | 14 | v <- n/k 15 | G <- lambertW(-v*exp(-v)) 16 | 17 | lS2 <- log(sqrt((v-1)/(v*(1-G)))) + 18 | (n-k)*(log(v-1)-log(v-G)) + 19 | n*log(k)-k*log(n) + 20 | k*(1-G) + 21 | lchoose(n, k) 22 | 23 | return(lS2) 24 | } 25 | 26 | # This function was written by Ben Bolker and taken from https://stat.ethz.ch/pipermail/r-help/2003-November/042793.html 27 | lambertW = function(z,b=0,maxiter=10,eps=.Machine$double.eps, 28 | min.imag=1e-9) { 29 | if (any(round(Re(b)) != b)) 30 | stop("branch number for W must be an integer") 31 | if (!is.complex(z) && any(z<0)) z=as.complex(z) 32 | ## series expansion about -1/e 33 | ## 34 | ## p = (1 - 2*abs(b)).*sqrt(2*e*z + 2); 35 | ## w = (11/72)*p; 36 | ## w = (w - 1/3).*p; 37 | ## w = (w + 1).*p - 1 38 | ## 39 | ## first-order version suffices: 40 | ## 41 | w = (1 - 2*abs(b))*sqrt(2*exp(1)*z + 2) - 1 42 | ## asymptotic expansion at 0 and Inf 43 | ## 44 | v = log(z + as.numeric(z==0 & b==0)) + 2*pi*b*1i; 45 | v = v - log(v + as.numeric(v==0)) 46 | ## choose strategy for initial guess 47 | ## 48 | c = abs(z + exp(-1)); 49 | c = (c > 1.45 - 1.1*abs(b)); 50 | c = c | (b*Im(z) > 0) | (!Im(z) & (b == 1)) 51 | w = (1 - c)*w + c*v 52 | ## Halley iteration 53 | ## 54 | for (n in 1:maxiter) { 55 | p = exp(w) 56 | t = w*p - z 57 | f = (w != -1) 58 | t = f*t/(p*(w + f) - 0.5*(w + 2.0)*t/(w + f)) 59 | w = w - t 60 | if (abs(Re(t)) < (2.48*eps)*(1.0 + abs(Re(w))) 61 | && abs(Im(t)) < (2.48*eps)*(1.0 + abs(Im(w)))) 62 | break 63 | } 64 | if (n==maxiter) warning(paste("iteration limit (",maxiter, 65 | ") reached, result of W may be inaccurate",sep="")) 66 | if (all(Im(w) 8 | 9 | ```{r, echo = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "inst/vignette-supp/", 14 | echo=TRUE, 15 | warning=FALSE, 16 | message=FALSE, 17 | tidy=TRUE 18 | ) 19 | ``` 20 | 21 | 22 | [![R-CMD-check](https://github.com/gtonkinhill/rhierbaps/workflows/R-CMD-check/badge.svg)](https://github.com/gtonkinhill/rhierbaps/actions) 23 | 24 | 25 | 26 | # rhierbaps 27 | 28 | We have recently developed a faster verion of the BAPs clustering method. It can be found [here](https://github.com/gtonkinhill/fastbaps). 29 | 30 | ## Installation 31 | `rhierbaps` is available on CRAN. 32 | ```{r, eval=FALSE} 33 | install.packages("rhierbaps") 34 | ``` 35 | 36 | The development version is available on github. It can be installed with `devtools` 37 | ```{r, eval = FALSE} 38 | install.packages("devtools") 39 | 40 | devtools::install_github("gtonkinhill/rhierbaps") 41 | ``` 42 | 43 | If you would like to also build the vignette with your installation run: 44 | 45 | ```{r, eval=FALSE} 46 | devtools::install_github("gtonkinhill/rhierbaps", build_vignettes = TRUE) 47 | ``` 48 | 49 | ## Quick Start 50 | Run hierBAPS. 51 | 52 | ```{r, fig.width =8, fig.height=6, fig.align='center'} 53 | # install.packages("rhierbaps") 54 | library(rhierbaps) 55 | 56 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps") 57 | snp.matrix <- load_fasta(fasta.file.name) 58 | hb.results <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet = TRUE) 59 | head(hb.results$partition.df) 60 | ``` 61 | 62 | ```{r, echo = FALSE} 63 | intro_rmd <- 'vignettes/introduction.Rmd' 64 | 65 | raw_rmd <- readLines(intro_rmd) 66 | 67 | # remove yaml 68 | yaml_lines <- grep("---", raw_rmd) 69 | 70 | # remove appendix (session info) 71 | appendix <- grep("Session", raw_rmd) 72 | 73 | compressed_rmd <- raw_rmd[c(-seq(yaml_lines[1], yaml_lines[2], by = 1), 74 | -seq(appendix, length(raw_rmd)))] 75 | writeLines(compressed_rmd, "child.Rmd") 76 | ``` 77 | 78 | ```{r, child = 'child.Rmd'} 79 | ``` 80 | 81 | ```{r cleanup, echo=FALSE, include=FALSE} 82 | if (file.exists("child.Rmd")) { 83 | file.remove("child.Rmd") 84 | } 85 | ``` 86 | -------------------------------------------------------------------------------- /inst/extdata/small_seqs.fa: -------------------------------------------------------------------------------- 1 | >1 2 | AGCTGTAATG 3 | >2 4 | AGCTGTAATG 5 | >3 6 | AGCTGTAATG 7 | >4 8 | AGTCGTAATG 9 | >5 10 | AATCGTAATG 11 | >6 12 | AGTCGTAATG 13 | >7 14 | AATCGTAATG 15 | >8 16 | AATCGTAATG 17 | >9 18 | AGTCGTAATG 19 | >10 20 | AGTCGTAATG 21 | >11 22 | AGTCGTAATG 23 | >12 24 | AGTCGTAATG 25 | >13 26 | AATCGTAATG 27 | >14 28 | AGTCGTAATG 29 | >15 30 | AATCGTAATG 31 | >16 32 | AGTCGTAATG 33 | >17 34 | AGTCGTAATG 35 | >18 36 | AGTCGTAATG 37 | >19 38 | AGTCGTAATG 39 | >20 40 | AGTCGTAATG 41 | >21 42 | AGTCATAATG 43 | >22 44 | AGTCGTAATG 45 | >23 46 | AGTCGTAATG 47 | >24 48 | AGTCGTAATG 49 | >25 50 | AATCGTAATG 51 | >26 52 | AGTCGTAATG 53 | >27 54 | AGTCGTAATG 55 | >28 56 | AGCCGTAATG 57 | >29 58 | AGTCGTAATG 59 | >30 60 | AGTCATAATG 61 | >31 62 | AGCCGTAATG 63 | >32 64 | AATCGTAATG 65 | >33 66 | AGTCGTAATG 67 | >34 68 | AGTCGTAATG 69 | >35 70 | AGTCATAATG 71 | >36 72 | AGTCATAATG 73 | >37 74 | AGTCATAATG 75 | >38 76 | AATCGTAATG 77 | >39 78 | AGTCGTAATG 79 | >40 80 | AGTCGTAATG 81 | >41 82 | AGTCATAATG 83 | >42 84 | AGTCATAATG 85 | >43 86 | AGTCGTAATG 87 | >44 88 | AGTCGTAATG 89 | >45 90 | AGCCGTAATG 91 | >46 92 | AGTCGTAATG 93 | >47 94 | AGTCGTAATA 95 | >48 96 | AGTCGTAATG 97 | >49 98 | AGTCGTAATG 99 | >50 100 | AGTCGTAATG 101 | >51 102 | AGTCGTAATG 103 | >52 104 | AGTCGTAATG 105 | >53 106 | AGTCGTAATG 107 | >54 108 | AGTCGTAATG 109 | >55 110 | AGTCGTAATG 111 | >56 112 | AGTCGTAATG 113 | >57 114 | AGTCGTAATG 115 | >58 116 | AGTCGTAATG 117 | >59 118 | AGTCGTAATG 119 | >60 120 | AGTCGTAATG 121 | >61 122 | AGTCGTAATG 123 | >62 124 | AGTCGTAATG 125 | >63 126 | AGTCGTAATG 127 | >64 128 | AGTCGTAATG 129 | >65 130 | AGTCGTAATG 131 | >66 132 | AATCGTAATG 133 | >67 134 | AGTCGTAATG 135 | >68 136 | AGTCGTAATG 137 | >69 138 | AGTCGTAATG 139 | >70 140 | AGTCGTAATG 141 | >71 142 | AGTCGTAATG 143 | >72 144 | AGTCGTAATG 145 | >73 146 | AGTCGTAATG 147 | >74 148 | AGTCGTAATG 149 | >75 150 | AGTCGTAATG 151 | >76 152 | AGTCGTAATG 153 | >77 154 | AGTCGTAATG 155 | >78 156 | AATCGTAATG 157 | >79 158 | AATCGTAATG 159 | >80 160 | AGTCGTAATG 161 | >81 162 | AGTCGTAATG 163 | >82 164 | AGTCATAATG 165 | >83 166 | AATCAAAGTG 167 | >84 168 | AGTCGTAATG 169 | >85 170 | AATCGTAATG 171 | >86 172 | AGTCGTAATG 173 | >87 174 | AATCAAAA-G 175 | >88 176 | AGTCATAATG 177 | >89 178 | AGTCGTAATG 179 | >90 180 | AATCGTAATG 181 | >91 182 | AGCCGTAATG 183 | >92 184 | AATCGTAATG 185 | >93 186 | AGTCGTAATG 187 | >94 188 | AGTCGTAATG 189 | >95 190 | AGTCGTAATG 191 | >96 192 | AGTCGTAATG 193 | >97 194 | AGTCGTAATG 195 | >98 196 | AATCGTAATG 197 | >99 198 | AGTCGTAATG 199 | >100 200 | AGTCGTAATG 201 | -------------------------------------------------------------------------------- /R/plot_sub_cluster.R: -------------------------------------------------------------------------------- 1 | #' plot_sub_cluster 2 | #' 3 | #' Creates a zoom plot using ggtree focusing on a cluster. 4 | #' 5 | #' @import patchwork 6 | #' 7 | #' 8 | #' @param hb.object The resulting object from running hierBAPS 9 | #' @param tree A phylo tree object to plot 10 | #' @param level The level of the subcluster to be considered. 11 | #' @param sub.cluster An integer representing the subcluster to be considered. 12 | #' 13 | #' @examples 14 | #' \donttest{ 15 | #' snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps")) 16 | #' newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps") 17 | #' tree <- phytools::read.newick(newick.file.name) 18 | #' hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20) 19 | #' plot_sub_cluster(hb.result, tree, level = 1, sub.cluster = 9) 20 | #' } 21 | #' @export 22 | plot_sub_cluster <- function(hb.object, tree, level, sub.cluster){ 23 | #Checks 24 | if ((!is.list(hb.object) || !is.data.frame(hb.object$partition.df) 25 | ) || !is.list(hb.object$lml.list)) stop("Invalid hb.object!") 26 | if (!(class(tree)=="phylo")) stop("Invalid tree object!") 27 | if ((!is.numeric(level)) || (level<1)) stop("Invalid level! Must be a positive integer.") 28 | if ((!is.numeric(sub.cluster)) || (sub.cluster<1)) stop("Invalid sub.cluster! Must be a positive integer.") 29 | 30 | level <- level+1 31 | if(!("ggtree" %in% 32 | rownames(utils::installed.packages()))) stop("This function requires ggtree to be installed") 33 | 34 | cluster.isolate <- hb.object$partition.df$Isolate[hb.object$partition.df[,level]==sub.cluster] 35 | 36 | # Plot the full tree with the clade highlighted 37 | hb.object$partition.df$is_in_cluster <- hb.object$partition.df$Isolate %in% cluster.isolate 38 | full_tree <- ggtree::`%<+%`(ggtree::ggtree(tree), hb.object$partition.df) + 39 | ggtree::geom_tippoint(ggplot2::aes_string(color="is_in_cluster"), size=0.5) + 40 | ggplot2::theme(legend.position = "none") + 41 | ggplot2::scale_colour_manual(values=c("#000000", "#e31a1c")) 42 | 43 | # Subset the tree 44 | temp_column_id <- paste(c("factor(`level ", level, "`)"), collapse = "") 45 | sub_tree <- ape::drop.tip(tree, tree$tip.label[!tree$tip.label %in% cluster.isolate]) 46 | sub_tree <- ggtree::`%<+%`(ggtree::ggtree(sub_tree), hb.object$partition.df) + 47 | ggtree::geom_tippoint(ggplot2::aes_string(color=temp_column_id)) + 48 | ggplot2::labs(color=temp_column_id) + 49 | ggplot2::theme(legend.position="right") + 50 | ggplot2::scale_color_discrete(name=paste("level ", level, collapse = "")) 51 | 52 | return(full_tree+sub_tree+patchwork::plot_layout(nrow = 1)) 53 | } 54 | -------------------------------------------------------------------------------- /R/reallocate_units_4.R: -------------------------------------------------------------------------------- 1 | #' reallocate_units_4 2 | #' 3 | #' Peform an iteration of the fourth move in the algorithm. That is split cluster into n 4 | #' subclusters and re-allocate one sub-cluster. 5 | #' 6 | #' 7 | #' @param snp.object A snp.object containing the processed SNP data. 8 | #' @param partition An integer vector indicating an initial partition of the isolates. 9 | #' @param threshold The increase in marginal log likelihood required to accept a move. 10 | #' @param min.clust.size Clusters smaller than min.clust.size will not be split. 11 | #' @param split Whether to split only into two clusters (for move type 3). 12 | #' @param n.cores The number of cores to use. 13 | #' 14 | #' @return The best partition after splitting a cluster and re-allocating as well as 15 | #' a boolean value indicating whether a move increased the marginal likelihood. 16 | #' 17 | reallocate_units_4 <- function(snp.object, partition, threshold=1e-5, 18 | min.clust.size=20, split=FALSE, 19 | n.cores=1){ 20 | 21 | #some checks 22 | if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!") 23 | if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!") 24 | if (!(class(partition)=="integer")) stop("step 4 -> partition is not an integer vector!") 25 | 26 | clusters <- unique(partition) 27 | big_clusters <- clusters[purrr::map_int(clusters, ~ sum(partition==.x)>min.clust.size)==1] 28 | 29 | is.improved <- FALSE 30 | max_ml <- calc_log_ml(snp.object, partition) 31 | 32 | for(c in big_clusters){ 33 | index <- c(1:length(partition))[partition==c] 34 | d <- snp.object$dist[partition==c, partition==c] 35 | h <- stats::hclust(stats::as.dist(d), method = "complete") 36 | if(split){ 37 | npops <- 2 38 | } else { 39 | npops <- min(20, floor(nrow(d)/5)) 40 | } 41 | sub.partition <- stats::cutree(h, k=npops) 42 | sub.clusters <- unique(sub.partition) 43 | max_ml <- calc_log_ml(snp.object, partition) 44 | 45 | temp_mls <- parallel::mclapply(sub.clusters, function(sub.c){ 46 | best.clust <- calc_change_in_ml(snp.object, partition, index[sub.partition==sub.c]) 47 | temp.partition <- partition 48 | temp.partition[index[sub.partition==sub.c]] <- as.integer(best.clust) 49 | return(list(lml=calc_log_ml(snp.object, temp.partition), 50 | best.clust=best.clust)) 51 | }, mc.cores=n.cores) 52 | 53 | arg.max <- which.max(purrr::map_dbl(temp_mls, ~ .x$lml)) 54 | 55 | if(temp_mls[[arg.max]]$lml > (max_ml+threshold)){ 56 | partition[index[sub.partition==sub.clusters[[arg.max]]]] <- as.integer(temp_mls[[arg.max]]$best.clust) 57 | max_ml <- temp_mls[[arg.max]]$lml 58 | is.improved <- TRUE 59 | } 60 | } 61 | 62 | return(list(partition=partition, is.improved=is.improved, lml=max_ml)) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /R/move_units_1.R: -------------------------------------------------------------------------------- 1 | #' move_units_1 2 | #' 3 | #' Peform an iteration of the first move in the algorithm. That is move units from one cluster to 4 | #' another to improve the marginal likelihood 5 | #' 6 | #' 7 | #' @param snp.object A snp.object containing the processed SNP data. 8 | #' @param partition An integer vector indicating an initial partition of the isolates. 9 | #' @param threshold The increase in marginal log likelihood required to accept a move. 10 | #' @param frac.clust.searched The percentage of a large cluster that will be moved. 11 | #' @param min.clust.size All isolates in clusters less than or equal to min.clus.size will be searched. 12 | #' @param n.cores The number of cores to use. 13 | #' 14 | #' @return The best partition after moving units from one cluster to another as well as 15 | #' a boolean value indicating whether a move increased the marginal likelihood. 16 | #' 17 | move_units_1 <- function(snp.object, partition, threshold=1e-5, 18 | frac.clust.searched=0.3, 19 | min.clust.size=20, 20 | n.cores=1){ 21 | 22 | #some checks 23 | if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!") 24 | if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!") 25 | if (!(class(partition)=="integer")) stop("in move 1 -> partition is not an integer vector!") 26 | 27 | nsamples <- nrow(snp.object$data) 28 | clusters <- unique(partition) 29 | cluster_size <- purrr::map_int(clusters, ~ sum(partition==.x)) 30 | names(cluster_size) <- clusters 31 | max_ml <- calc_log_ml(snp.object, partition) 32 | max_partition <- partition 33 | 34 | #identify the most divereged isolates of each cluster as candidates for moving. 35 | #as in hierBAPS we take the individuals that are most distant from each other 36 | indexes <- purrr::as_vector(purrr::imap(clusters[cluster_size>1], function(c, ind){ 37 | if(cluster_size[ind] > min.clust.size){ 38 | #big cluster so only move the most distant isolates 39 | index <- c(1:length(partition))[partition==c] 40 | d <- snp.object$dist[partition==c, partition==c] 41 | diag(d) <- NA 42 | mean.dist <- rowMeans(d, na.rm = TRUE) 43 | return(index[mean.dist > stats::quantile(mean.dist, probs=(1-frac.clust.searched))]) 44 | } else { 45 | #small cluster so consider all isolates 46 | return(c(1:length(partition))[partition==c]) 47 | } 48 | })) 49 | 50 | is.improved <- FALSE 51 | for (i in indexes){ 52 | best.cluster <- calc_change_in_ml(snp.object, max_partition, i) 53 | temp_partition <- max_partition 54 | temp_partition[i] <- as.integer(best.cluster) 55 | temp_lml <- calc_log_ml(snp.object, temp_partition) 56 | #If improvment above threshold make the swap 57 | if(temp_lml > (max_ml+threshold)){ 58 | max_ml <- temp_lml 59 | max_partition <- temp_partition 60 | is.improved <- TRUE 61 | } 62 | } 63 | 64 | return(list(partition=max_partition, is.improved=is.improved, lml=max_ml)) 65 | } 66 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | schedule: 13 | - cron: '1 1 1 * *' 14 | 15 | name: R-CMD-check 16 | 17 | jobs: 18 | R-CMD-check: 19 | runs-on: ${{ matrix.config.os }} 20 | 21 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 22 | 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | config: 27 | - {os: windows-latest, r: 'release'} 28 | - {os: macOS-latest, r: 'release'} 29 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 30 | 31 | env: 32 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 33 | RSPM: ${{ matrix.config.rspm }} 34 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | steps: 37 | - uses: actions/checkout@v2 38 | 39 | - uses: r-lib/actions/setup-r@v1 40 | with: 41 | r-version: ${{ matrix.config.r }} 42 | 43 | - uses: r-lib/actions/setup-pandoc@v1 44 | 45 | - name: Query dependencies 46 | run: | 47 | install.packages('remotes') 48 | install.packages('BiocManager') 49 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 50 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 51 | shell: Rscript {0} 52 | 53 | - name: Cache R packages 54 | if: runner.os != 'Windows' 55 | uses: actions/cache@v2 56 | with: 57 | path: ${{ env.R_LIBS_USER }} 58 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 59 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 60 | 61 | - name: Install system dependencies 62 | if: runner.os == 'Linux' 63 | run: | 64 | while read -r cmd 65 | do 66 | eval sudo $cmd 67 | done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') 68 | 69 | - name: Install dependencies 70 | run: | 71 | remotes::install_deps(dependencies = TRUE) 72 | remotes::install_cran("rcmdcheck") 73 | BiocManager::install("ggtree") 74 | shell: Rscript {0} 75 | 76 | - name: Check 77 | env: 78 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 79 | run: | 80 | options(crayon.enabled = TRUE) 81 | rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 82 | shell: Rscript {0} 83 | 84 | - name: Upload check results 85 | if: failure() 86 | uses: actions/upload-artifact@main 87 | with: 88 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 89 | path: check 90 | -------------------------------------------------------------------------------- /R/join_units_2.R: -------------------------------------------------------------------------------- 1 | #' join_units_2 2 | #' 3 | #' Peform an iteration of the second move in the algorithm. That is combine two clusters 4 | #' to improve the marginal likelihood. 5 | #' 6 | #' 7 | #' @param snp.object A snp.object containing the processed SNP data. 8 | #' @param partition An integer vector indicating an initial partition of the isolates. 9 | #' @param threshold The increase in marginal log likelihood required to accept a move. 10 | #' @param n.cores The number of cores to use. 11 | #' @param comb.chache a matrix recording previous marginal llks of combining clusters 12 | #' 13 | #' @return The best partition after combining two clusters as well as 14 | #' a boolean value indicating whether a move increased the marginal likelihood. 15 | #' 16 | join_units_2 <- function(snp.object, partition, threshold=1e-5, n.cores=1, comb.chache=NULL){ 17 | 18 | #some checks 19 | if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!") 20 | if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!") 21 | 22 | max_ml <- calc_log_ml(snp.object, partition) 23 | is.improved <- FALSE 24 | clusters <- unique(partition) 25 | 26 | stopifnot(length(clusters) > 1) 27 | 28 | 29 | combinations <- utils::combn(clusters, 2, simplify = TRUE) 30 | 31 | if(is.null(comb.chache)){ 32 | comb.chache <- matrix(NA, nrow=max(clusters), ncol=max(clusters)) 33 | } 34 | 35 | temp.combinations <- combinations[, is.na(comb.chache[t(combinations)]), drop=FALSE] 36 | 37 | temp_mls <- parallel::mcmapply(function(p1, p2){ 38 | temp_partition <- partition 39 | temp_partition[temp_partition==p2] <- p1 40 | return(calc_log_ml(snp.object, temp_partition)) 41 | }, temp.combinations[1,], temp.combinations[2,], mc.cores = n.cores) 42 | 43 | comb.chache[t(temp.combinations)] <- temp_mls 44 | 45 | arg.max <- which(comb.chache == max(comb.chache, na.rm = TRUE), arr.ind = TRUE)[1,] 46 | 47 | if(comb.chache[arg.max[[1]], arg.max[[2]]] > (max_ml+threshold)){ 48 | partition[partition==arg.max[[2]]] <- arg.max[[1]] 49 | is.improved <- TRUE 50 | if (length(clusters)<=2){ 51 | diff <- (comb.chache[arg.max[[1]], arg.max[[2]]] - 52 | max_ml + 53 | 2*log_stirling2(nrow(snp.object$data), length(clusters)-1) - 54 | log_stirling2(nrow(snp.object$data), length(clusters)) 55 | ) 56 | } else { 57 | diff <- (comb.chache[arg.max[[1]], arg.max[[2]]] - 58 | max_ml + 59 | 2*log_stirling2(nrow(snp.object$data), length(clusters)-1) - 60 | log_stirling2(nrow(snp.object$data), length(clusters)) - 61 | log_stirling2(nrow(snp.object$data), length(clusters)-2) 62 | ) 63 | } 64 | 65 | max_ml <- comb.chache[arg.max[[1]], arg.max[[2]]] 66 | comb.chache <- comb.chache + diff 67 | comb.chache[arg.max[[1]], ] <- NA 68 | comb.chache[arg.max[[2]], ] <- NA 69 | comb.chache[, arg.max[[1]]] <- NA 70 | comb.chache[, arg.max[[2]]] <- NA 71 | } 72 | 73 | return(list(partition=partition, is.improved=is.improved, lml=max_ml, comb.chache=comb.chache)) 74 | } 75 | -------------------------------------------------------------------------------- /R/calc_change_in_ml.R: -------------------------------------------------------------------------------- 1 | #' calc_change_in_ml 2 | #' 3 | #' Calculate the change in the log marginal likelihood after moving index to each possible cluster 4 | #' 5 | #' 6 | #' @param snp.object A snp.object containing the processed SNP data. 7 | #' @param partition An integer vector indicating a partition of the isolates. 8 | #' @param indexes Indexes of the isolates to be moved (must come from one cluster.) 9 | #' 10 | #' @return the best cluster to move indexes to. 11 | #' 12 | calc_change_in_ml <- function(snp.object, partition, indexes){ 13 | #some checks 14 | if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!") 15 | if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!") 16 | if (!(all(indexes %in% 1:length(partition)))) stop("indexes outside of partiton range!") 17 | if (!(class(partition)=="integer")) stop("partition is not an integer vector!") 18 | 19 | original_cluster <- unique(partition[indexes]) 20 | if(length(original_cluster)!=1) stop("there was not a unique cluster in the index set!") 21 | 22 | #create temporary partition with indexes in a seperate cluster 23 | temp_partition <- partition 24 | temp_cluster <- max(partition)+1 25 | temp_partition[indexes] <- temp_cluster 26 | 27 | #get allele counts for each cluster in partition 28 | mA <- t(rowsum(1*(snp.object$data=="a"), temp_partition)) 29 | mC <- t(rowsum(1*(snp.object$data=="c"), temp_partition)) 30 | mG <- t(rowsum(1*(snp.object$data=="g"), temp_partition)) 31 | mT <- t(rowsum(1*(snp.object$data=="t"), temp_partition)) 32 | 33 | #now add counts from the temp cluster to every other cluster and take away from its own. 34 | mA <- mA[,colnames(mA)!=temp_cluster, drop=FALSE] + mA[, colnames(mA)==temp_cluster] 35 | mC <- mC[,colnames(mC)!=temp_cluster, drop=FALSE] + mC[, colnames(mC)==temp_cluster] 36 | mG <- mG[,colnames(mG)!=temp_cluster, drop=FALSE] + mG[, colnames(mG)==temp_cluster] 37 | mT <- mT[,colnames(mT)!=temp_cluster, drop=FALSE] + mT[, colnames(mT)==temp_cluster] 38 | 39 | prior <- snp.object$prior 40 | prior[prior==0] <- 1 #deal with zeros and resulting NAs 41 | 42 | #calculate log marginal likelihood 43 | term1 <- -lgamma(1 + mA+mC+mG+mT) 44 | term2 <- lgamma(prior["a", ] + mA) - lgamma(prior["a", ]) 45 | term2 <- term2 + lgamma(prior["c", ] + mC) - lgamma(prior["c", ]) 46 | term2 <- term2 + lgamma(prior["g", ] + mG) - lgamma(prior["g", ]) 47 | term2 <- term2 + lgamma(prior["t", ] + mT) - lgamma(prior["t", ]) 48 | 49 | new_columnwise_lml <- colSums(term1 + term2) 50 | names(new_columnwise_lml) <- colnames(mA) 51 | 52 | #now calculate original columnwise llm before the move 53 | #get allele counts for each cluster in partition 54 | mA <- t(rowsum(1*(snp.object$data=="a"), partition)) 55 | mC <- t(rowsum(1*(snp.object$data=="c"), partition)) 56 | mG <- t(rowsum(1*(snp.object$data=="g"), partition)) 57 | mT <- t(rowsum(1*(snp.object$data=="t"), partition)) 58 | term1 <- -lgamma(1 + mA+mC+mG+mT) 59 | term2 <- lgamma(prior["a", ] + mA) - lgamma(prior["a", ]) 60 | term2 <- term2 + lgamma(prior["c", ] + mC) - lgamma(prior["c", ]) 61 | term2 <- term2 + lgamma(prior["g", ] + mG) - lgamma(prior["g", ]) 62 | term2 <- term2 + lgamma(prior["t", ] + mT) - lgamma(prior["t", ]) 63 | orignal_columnwise_lml <- colSums(term1 + term2) 64 | names(orignal_columnwise_lml) <- colnames(mA) 65 | new_columnwise_lml <- new_columnwise_lml[names(orignal_columnwise_lml)] 66 | 67 | diff_columnwise_lml <- orignal_columnwise_lml - new_columnwise_lml 68 | diff_columnwise_lml[original_cluster] <- Inf 69 | 70 | best_column <- names(orignal_columnwise_lml)[which.min(diff_columnwise_lml)] 71 | 72 | return(best_column) 73 | } 74 | -------------------------------------------------------------------------------- /R/model_search_parallel.R: -------------------------------------------------------------------------------- 1 | #' model_search_parallel 2 | #' 3 | #' Clusters DNA alignment using independent loci model 4 | #' 5 | #' 6 | #' @param snp.object A snp.object containing the processed SNP data. 7 | #' @param partition An integer vector indicating an initial starting partition. 8 | #' @param round.types A vector indicating which series of moves to make. 9 | #' @param quiet Whether to suppress progress information (default=FALSE). 10 | #' @param n.extra.rounds The number of additional rounds to perform after the default hierBAPS 11 | #' settings (default=0). If set to Inf it will run until a local optimum is reached 12 | #' (this might take a long time). 13 | #' @param n.cores The number of cores to use. 14 | #' 15 | #' @return an optimised partition and marginal llk 16 | #' 17 | model_search_parallel <- function(snp.object, partition, round.types, 18 | quiet=FALSE, n.extra.rounds=0, n.cores=1){ 19 | if(!all(round.types %in% c(1,2,3,4))) stop("Invalid round type!") 20 | 21 | was.updated <- rep(TRUE, 4) 22 | move.count <- 0 23 | max.ml <- calc_log_ml(snp.object, partition) 24 | comb.chache <- NULL 25 | if(!quiet){ 26 | cat('\r', paste(c( 27 | "Round: ", move.count, "/", length(round.types), " Type: ", "none", " Log marginal likelihood: ", max.ml 28 | ), collapse = "")) 29 | } 30 | 31 | while(move.count < (length(round.types)+n.extra.rounds)){ 32 | 33 | move.count <- move.count + 1 34 | if(move.count>length(round.types)){ 35 | r <- sample(1:4, 1, replace = TRUE) 36 | } else { 37 | r <- round.types[[move.count]] 38 | } 39 | 40 | 41 | n.clusters <- length(unique(partition)) 42 | if (length(unique(partition))<=1){ 43 | next 44 | } 45 | if((r==1) && was.updated[[1]]){ 46 | update <- move_units_1(snp.object, partition, 47 | n.cores=n.cores) 48 | if(!update$is.improved){ 49 | was.updated[[1]] <- FALSE 50 | } else{ 51 | comb.chache <- NULL 52 | partition <- update$partition 53 | max.ml <- update$lml 54 | was.updated <- rep(TRUE, 4) 55 | } 56 | } else if(r==2 && was.updated[[2]]){ 57 | update <- join_units_2(snp.object, partition, 58 | n.cores=n.cores, comb.chache=comb.chache) 59 | comb.chache <- update$comb.chache 60 | if(!update$is.improved){ 61 | was.updated[[2]] <- FALSE 62 | } else{ 63 | partition <- update$partition 64 | max.ml <- update$lml 65 | was.updated <- rep(TRUE, 4) 66 | } 67 | } else if(r==3 && was.updated[[3]]){ 68 | update <- split_clusters_3(snp.object, partition, 69 | n.cores=n.cores) 70 | if(!update$is.improved){ 71 | was.updated[[3]] <- FALSE 72 | } else{ 73 | comb.chache <- NULL 74 | partition <- update$partition 75 | max.ml <- update$lml 76 | was.updated <- rep(TRUE, 4) 77 | } 78 | } else if(r==4 && was.updated[[4]]){ 79 | update <- reallocate_units_4(snp.object, partition, 80 | n.cores=n.cores) 81 | if(!update$is.improved){ 82 | was.updated[[4]] <- FALSE 83 | } else{ 84 | comb.chache <- NULL 85 | partition <- update$partition 86 | max.ml <- update$lml 87 | was.updated <- rep(TRUE, 4) 88 | } 89 | } 90 | #Print current status 91 | if(!quiet){ 92 | cat('\r', paste(c( 93 | "Round: ", move.count, "/", length(round.types), " Type: ", r, " Log marginal likelihood: ", max.ml 94 | ), collapse = "")) 95 | utils::flush.console() 96 | } 97 | 98 | #Check for local convergence 99 | if (sum(was.updated)==0){ 100 | if (!quiet) print("Converged locally!") 101 | break 102 | } 103 | } 104 | return(list(partition=partition, lml=calc_log_ml(snp.object, partition))) 105 | } 106 | -------------------------------------------------------------------------------- /vignettes/bibliography.bib: -------------------------------------------------------------------------------- 1 | @ARTICLE{Cheng2013-mp, 2 | title = "Hierarchical and spatially explicit clustering of {DNA} 3 | sequences with {BAPS} software", 4 | author = "Cheng, Lu and Connor, Thomas R and Sir{\'e}n, Jukka and 5 | Aanensen, David M and Corander, Jukka", 6 | affiliation = "Department of Mathematics and Statistics, University of 7 | Helsinki, Helsinki, Finland.", 8 | journal = "Mol. Biol. Evol.", 9 | volume = 30, 10 | number = 5, 11 | pages = "1224--1228", 12 | month = may, 13 | year = 2013, 14 | url = "http://dx.doi.org/10.1093/molbev/mst028", 15 | language = "en", 16 | issn = "0737-4038, 1537-1719", 17 | pmid = "23408797", 18 | doi = "10.1093/molbev/mst028", 19 | pmc = "PMC3670731" 20 | } 21 | 22 | @ARTICLE{Paradis2004-ck, 23 | title = "{APE}: Analyses of Phylogenetics and Evolution in {R} language", 24 | author = "Paradis, Emmanuel and Claude, Julien and Strimmer, Korbinian", 25 | affiliation = "Laboratoire de Pal{\'e}ontologie, Pal{\'e}obiologie and 26 | Phylog{\'e}nie, Institut des Sciences de l'Evolution, 27 | Universit{\'e} Montpellier II, F-34095 Montpellier c{\'e}dex 28 | 05, France. paradis@isem.univ-montp2.fr", 29 | journal = "Bioinformatics", 30 | volume = 20, 31 | number = 2, 32 | pages = "289--290", 33 | month = jan, 34 | year = 2004, 35 | url = "http://www.ncbi.nlm.nih.gov/pubmed/14734327", 36 | issn = "1367-4803", 37 | pmid = "14734327", 38 | doi = "10.1093/bioinformatics/btg412" 39 | } 40 | 41 | 42 | @ARTICLE{Yu2017-bf, 43 | title = "ggtree: an r package for visualization and annotation of 44 | phylogenetic trees with their covariates and other associated data", 45 | author = "Yu, Guangchuang and Smith, David K and Zhu, Huachen and Guan, Yi 46 | and Lam, Tommy Tsan-Yuk", 47 | journal = "Methods Ecol. Evol.", 48 | volume = 8, 49 | number = 1, 50 | pages = "28--36", 51 | month = jan, 52 | year = 2017, 53 | url = "http://dx.doi.org/10.1111/2041-210X.12628", 54 | issn = "2041-210X", 55 | doi = "10.1111/2041-210X.12628" 56 | } 57 | 58 | 59 | @ARTICLE{Revell2012-ik, 60 | title = "phytools: an {R} package for phylogenetic comparative biology 61 | (and other things)", 62 | author = "Revell, Liam J", 63 | journal = "Methods Ecol. Evol.", 64 | publisher = "Blackwell Publishing Ltd", 65 | volume = 3, 66 | number = 2, 67 | pages = "217--223", 68 | month = apr, 69 | year = 2012, 70 | url = "http://dx.doi.org/10.1111/j.2041-210X.2011.00169.x", 71 | issn = "2041-210X", 72 | doi = "10.1111/j.2041-210X.2011.00169.x" 73 | } 74 | 75 | 76 | @ARTICLE{Kalyaanamoorthy2017-go, 77 | title = "{ModelFinder}: fast model selection for accurate phylogenetic 78 | estimates", 79 | author = "Kalyaanamoorthy, Subha and Minh, Bui Quang and Wong, Thomas K 80 | F and von Haeseler, Arndt and Jermiin, Lars S", 81 | affiliation = "Land \&Water, CSIRO, Canberra, Australian Capital Territory, 82 | Australia. Faculty of Pharmacy \&Pharmaceutical Sciences, 83 | University of Alberta, Edmonton, Alberta, Canada. Center for 84 | Integrative Bioinformatics Vienna, Max F. Perutz Laboratories, 85 | University of Vienna \&Medical University of Vienna, Vienna, 86 | Austria. Research School of Biology, Australian National 87 | University, Canberra, Australian Capital Territory, Australia. 88 | Bioinformatics and Computational Biology, Faculty of 89 | Computational Science, University of Vienna, Vienna, Austria.", 90 | journal = "Nat. Methods", 91 | volume = 14, 92 | number = 6, 93 | pages = "587--589", 94 | month = jun, 95 | year = 2017, 96 | url = "http://dx.doi.org/10.1038/nmeth.4285", 97 | language = "en", 98 | issn = "1548-7091, 1548-7105", 99 | pmid = "28481363", 100 | doi = "10.1038/nmeth.4285", 101 | pmc = "PMC5453245" 102 | } 103 | 104 | -------------------------------------------------------------------------------- /vignettes/introduction.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to rhierbaps" 3 | author: "Gerry Tonkin-Hill" 4 | date: "`r Sys.Date()`" 5 | bibliography: bibliography.bib 6 | output: 7 | html_document: 8 | fig_width: 12 9 | fig_height: 8 10 | vignette: > 11 | %\VignetteIndexEntry{Introduction to rhierbaps} 12 | %\VignetteEncoding{UTF-8} 13 | %\VignetteEngine{knitr::rmarkdown} 14 | editor_options: 15 | chunk_output_type: console 16 | --- 17 | 18 | ```{r global_options, include=FALSE} 19 | knitr::opts_chunk$set(fig.width=12, fig.height=8, 20 | echo=TRUE, warning=FALSE, message=FALSE, 21 | tidy=TRUE) 22 | ``` 23 | 24 | The hierBAPS algorithm was introduced in [@Cheng2013-mp] and provides a method for hierarchically clustering DNA sequence data to reveal nested population structure. Previously the algorithm was available as a compiled MATLAB binary. We provide a convenient R implementation and include a number of useful additional options including the ability to use multiple cores, save the log marginal likelihood scores and run the algorithm until local convergence. Furthermore, we provide a wrapper to a ggtree plotting function allowing for easy exploration of sub-clusters. 25 | 26 | *** 27 | 28 | **Things to keep in mind before running hierBAPS** 29 | 30 | 1. hierBAPS uses a uniform prior for K. 31 | 2. The prior for a site depend on the available snps, i.e. if a site only has 'AC', then the prior for 'ACGT' is (1/2, 1/2, 0, 0) 32 | 3. The initial sequence partition is generated using hierarchical clustering with complete linkage based on a Hamming distance matrix. 33 | 4. The initial number of populations should be set much higher than the expected number of populations. 34 | 5. More search rounds of the algorithm can be added using the `n.extra.rounds` parameter. 35 | 6. To get reproducible results the seed in R must be set. 36 | 37 | ## Libraries 38 | 39 | ```{r} 40 | library(rhierbaps) 41 | library(ggtree) 42 | library(phytools) 43 | library(ape) 44 | 45 | set.seed(1234) 46 | ``` 47 | 48 | ## Loading data 49 | 50 | We first need to load a multiple sequence alignment in fasta format. We can then generate the required SNP matrix. 51 | 52 | ```{r} 53 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps") 54 | snp.matrix <- load_fasta(fasta.file.name) 55 | ``` 56 | 57 | If you wish to include singleton SNPs (those that appear in only one isolate) then set `keep.singletons=FALSE`. However, this is currently advised against as these SNPs lead to a higher number of parameters in the model and do not provide information about shared ancestry. 58 | 59 | It is also possible to load an ape DNAbin object. Here me make use of the woodmouse dataset in ape. 60 | 61 | ```{r} 62 | data(woodmouse) 63 | woodmouse.snp.matrix <- load_fasta(woodmouse) 64 | ``` 65 | 66 | ## Running hierBAPS 67 | 68 | We now need to decide how many levels of clustering we are interested in and the number of initial clusters to start from. It is a good idea to choose `n.pops` to be significantly larger than the number of clusters you expect. 69 | 70 | To run hierBAPS with $2$ levels and $20$ initial clusters we run 71 | 72 | ```{r} 73 | hb.results <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet = TRUE) 74 | head(hb.results$partition.df) 75 | ``` 76 | 77 | This produces a list which includes a data frame indicating the resulting partition of the isolates at the difference levels. The isolate names in this data frame are taken from the fasta headers and thus for plotting it is important that these match the isolate names in any tree used later. This function also outputs the log marginal likelihoods at the different levels of clustering. 78 | 79 | hierBAPS can also be run until the algorithm converges to a local optimum as 80 | 81 | ```{r, eval=FALSE} 82 | hb.results <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, n.extra.rounds = Inf, 83 | quiet = TRUE) 84 | ``` 85 | 86 | We can also check how long hierBAPS takes to run on the test dataset of 515 samples and 744 SNPs. 87 | 88 | ```{r} 89 | system.time(hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet = TRUE)) 90 | ``` 91 | 92 | ## Plotting results 93 | 94 | To plot the results it is useful to consider a tree of the same isolates. We clustered the example isolates using Iqtree [@Kalyaanamoorthy2017-go]. The ggtree [@Yu2017-bf] package then allows us to plot the results. 95 | 96 | First we need to load the newick file. 97 | 98 | ```{r} 99 | newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps") 100 | iqtree <- phytools::read.newick(newick.file.name) 101 | ``` 102 | 103 | A simple coloured tree allows us to see the top level cluster assignment from hierBAPS. 104 | 105 | ```{r} 106 | gg <- ggtree(iqtree, layout="circular") 107 | gg <- gg %<+% hb.results$partition.df 108 | gg <- gg + geom_tippoint(aes(color=factor(`level 1`))) 109 | gg 110 | ``` 111 | 112 | As there are many more clusters at the second level using colours to distinguish them can get confusing. Instead we can label the tips with their corresponding clusters. 113 | 114 | ```{r} 115 | gg <- ggtree(iqtree, layout="circular", branch.length = "none") 116 | gg <- gg %<+% hb.results$partition.df 117 | gg <- gg + geom_tippoint(aes(color=factor(`level 1`))) 118 | gg <- gg + theme(legend.position="right") 119 | gg <- gg + geom_tiplab(aes(label = `level 2`), size = 1, offset = 1) 120 | gg 121 | ``` 122 | 123 | We can also zoom in on a particular top level cluster to get a better idea of how it is partitioned at the lower level. As an example we zoom in on sub cluster 9 at level 1. 124 | 125 | ```{r} 126 | plot_sub_cluster(hb.results, iqtree, level = 1, sub.cluster = 9) 127 | ``` 128 | 129 | Finally, we can inspect the log marginal likelihoods given for each level. 130 | 131 | ```{r} 132 | hb.results$lml.list 133 | ``` 134 | 135 | ## Caculating assignment probabilities 136 | 137 | We can also calculate the individual probabilities of assignment to each cluster. Here we make use of the woodmouse dataset loaded earlier. 138 | 139 | ```{r} 140 | hb.results.woodmouse <- hierBAPS(woodmouse.snp.matrix, max.depth=2, n.extra.rounds = Inf, 141 | quiet = TRUE, assignment.probs = TRUE) 142 | head(hb.results.woodmouse$cluster.assignment.prob[[1]]) 143 | ``` 144 | 145 | ## Saving results 146 | 147 | For runs that take a long time it is a good idea to save the output. We can save the partition file as 148 | 149 | ```{r, eval=FALSE} 150 | write.csv(hb.results$partition.df, file=file.path(tempdir(), "hierbaps_partition.csv"), 151 | col.names = TRUE, row.names = FALSE) 152 | 153 | save_lml_logs(hb.results, file.path(tempdir(), "hierbaps_logML.txt")) 154 | ``` 155 | 156 | ## Citing rhierbaps 157 | 158 | If you use rhierbaps in a research publication please cite both 159 | 160 | Tonkin-Hill, Gerry, John A. Lees, Stephen D. Bentley, Simon D. W. Frost, and Jukka Corander. 2018. “RhierBAPS: An R Implementation of the Population Clustering Algorithm hierBAPS.” Wellcome Open Research 3 (July): 93. 161 | 162 | Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5): 1224–28. 163 | 164 | ## References 165 | 166 | --- 167 | nocite: '@*' 168 | ... 169 | 170 | ## Session Information 171 | 172 | ```{r} 173 | sessionInfo() 174 | ``` 175 | 176 | 177 | -------------------------------------------------------------------------------- /R/hierBAPS.R: -------------------------------------------------------------------------------- 1 | #' hierBAPS 2 | #' 3 | #' Runs the hierBAPS algorithm of Cheng et al. 2013 4 | #' 5 | #' 6 | #' @param snp.matrix Character matrix of aligned sequences produced by \link{load_fasta}. 7 | #' @param max.depth Maximum depth of hierarchical search (default = 2). 8 | #' @param n.pops Maximum number of populations in the data (default = number of isolates/5) 9 | #' @param quiet Whether to suppress progress information (default=FALSE). 10 | #' @param n.extra.rounds The number of additional rounds to perform after the default hierBAPS 11 | #' settings (default=0). If set to Inf it will run until a local optimum is reached 12 | #' (this might take a long time). 13 | #' @param assignment.probs whether or not to calculate the assignment probabilities to each cluster (default=FALSE) 14 | #' @param n.cores The number of cores to use. 15 | #' 16 | #' @return A list containing a dataframe indicating an assignment of each sequence 17 | #' to hierarchical clusters as well as the log marginal likelihoods for each level. 18 | #' 19 | #' @examples 20 | #' snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps")) 21 | #' hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE) 22 | #' 23 | #' \donttest{ 24 | #' snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps")) 25 | #' system.time({hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE)}) 26 | #' } 27 | #' 28 | #'@author Gerry Tonkin-Hill 29 | #'@references Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5): 1224–28. 30 | #' 31 | #' @export 32 | hierBAPS <- function(snp.matrix, max.depth=2, n.pops=floor(nrow(snp.matrix)/5), 33 | quiet=FALSE, n.extra.rounds=0, assignment.probs=FALSE, n.cores=1){ 34 | 35 | #Check inputs 36 | if (!is.matrix(snp.matrix)) stop("snp.matrix is not a matrix!") 37 | if ((!is.numeric(max.depth)) || (max.depth<1)) stop("Invalid max.depth! Must be a positive integer.") 38 | if ((!is.numeric(n.pops)) || (n.pops<1)) stop("Invalid n.pops! Must be a positive integer.") 39 | if (!is.logical(quiet)) stop("Invalid quiet! Must be one of TRUE/FALSE.") 40 | if ((!is.numeric(n.extra.rounds)) || (n.extra.rounds<0)) stop("Invalid n.extra.rounds! 41 | Must be a non-negative integer.") 42 | if ((!is.numeric(n.cores)) || (n.cores<1)) stop("Invalid n.cores! Must be a positive integer.") 43 | 44 | # search operators 45 | round.types <- c(2*rep(1, n.pops), 46 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 47 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 48 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 49 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4, 50 | 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 51 | 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 52 | 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 53 | 4, 3, 4, 3, 4, 3, 4, 3, 4, 1, 1, 1, 1, 1, 2, 3, 4, 1, 2, 3, 4, 54 | 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 55 | 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 56 | 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 57 | 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 58 | 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 59 | 2, 3, 4, 1, 2, 3, 4) 60 | 61 | #Don't split clusters with the less than 4 memebers 62 | MIN.CLUSTER.SIZE <- 4 63 | 64 | #iterate over levels 65 | all.partition.matrix <- matrix(0, nrow=nrow(snp.matrix), ncol=max.depth) 66 | lml.list <- list() 67 | 68 | for (cur.depth in seq(0, max.depth-1)){ 69 | 70 | if (cur.depth==0){ 71 | snp.object <- preproc_alignment(snp.matrix) 72 | if(is.na(snp.object[[1]])){ 73 | stop("All sites are conserved!") 74 | } 75 | cur.part <- rep(1, snp.object$n.seq) 76 | avail.cluster.ids <- 1 77 | snp.object$heds <- rownames(snp.matrix) 78 | } else{ 79 | cur.part <- all.partition.matrix[snp.object$seq.inds, cur.depth] 80 | avail.cluster.ids <- unique(cur.part) 81 | } 82 | 83 | if(!quiet) print(paste(c("---- Current depth: ", cur.depth, " ----"), collapse = "")) 84 | 85 | if(length(cur.part)==0){ 86 | if(!quiet) print(paste(c("all sequences are clustered. Quit! There are ", cur.depth, 87 | " layers in total."), collapse = "")) 88 | break 89 | } 90 | 91 | local.label.offset <- 1 92 | 93 | for (i in 1:length(avail.cluster.ids)){ 94 | cluid <- avail.cluster.ids[i] 95 | if(!quiet) print(paste(c("Current depth: ", cur.depth, " Cluster ID: ", cluid), collapse = "")) 96 | 97 | if (sum(cur.part==cluid) < MIN.CLUSTER.SIZE){ 98 | #we dont split any further. 99 | all.partition.matrix[cur.part==cluid, cur.depth+1] <- local.label.offset 100 | local.label.offset <- local.label.offset+1 101 | if(length(lml.list)<(cur.depth+1)){ 102 | lml.list[[cur.depth+1]] <- NA 103 | } else { 104 | lml.list[[cur.depth+1]] <- c(lml.list[[cur.depth+1]], NA) 105 | } 106 | next 107 | } 108 | 109 | if (cur.depth==0){ 110 | tmp.snp.object <- snp.object 111 | } else { 112 | tmp.snp.object <- preproc_alignment(snp.object$data[cur.part==cluid, ]) 113 | if(is.na(tmp.snp.object[[1]])){ 114 | #all sites are conserved at this partition so dont split further 115 | all.partition.matrix[cur.part==cluid, cur.depth+1] <- local.label.offset 116 | local.label.offset <- local.label.offset+1 117 | if(length(lml.list)<(cur.depth+1)){ 118 | lml.list[[cur.depth+1]] <- NA 119 | } else { 120 | lml.list[[cur.depth+1]] <- c(lml.list[[cur.depth+1]], NA) 121 | } 122 | next 123 | } 124 | } 125 | 126 | 127 | 128 | tmp.z.hclust <- stats::hclust(stats::as.dist(tmp.snp.object$dist), method = 'complete') 129 | 130 | if (tmp.snp.object$n.seq > (3*n.pops)){ 131 | tmp.init.part <- stats::cutree(tmp.z.hclust, k = n.pops) 132 | } else { 133 | tmp.num = min(floor(tmp.snp.object$n.seq/2), n.pops) 134 | tmp.init.part <- stats::cutree(tmp.z.hclust, k = tmp.num) 135 | } 136 | 137 | temp_partition = model_search_parallel(tmp.snp.object, tmp.init.part, round.types, 138 | quiet, n.extra.rounds, n.cores) 139 | 140 | if (!quiet){ 141 | print(paste(c("Best partition: Nclusters ", length(unique(temp_partition$partition)), 142 | " Log(ml*prior) ", temp_partition$lml), collapse = "")) 143 | } 144 | 145 | if(length(lml.list)<(cur.depth+1)){ 146 | lml.list[[cur.depth+1]] <- temp_partition$lml 147 | } else { 148 | lml.list[[cur.depth+1]] <- c(lml.list[[cur.depth+1]], temp_partition$lml) 149 | } 150 | 151 | 152 | for (clust in unique(temp_partition$partition)){ 153 | cluster_index <- rep(FALSE, length(cur.part)) 154 | cluster_index[cur.part==cluid][clust==temp_partition$partition] <- TRUE 155 | all.partition.matrix[cluster_index, cur.depth+1] <- local.label.offset 156 | local.label.offset <- local.label.offset+1 157 | } 158 | } 159 | 160 | names(lml.list[[cur.depth+1]]) <- avail.cluster.ids 161 | 162 | } 163 | names(lml.list) <- paste("Depth", seq(0, max.depth-1)) 164 | partition.df <- data.frame(isolates=snp.object$heds, all.partition.matrix) 165 | colnames(partition.df) <- c("Isolate", paste("level", 1:ncol(all.partition.matrix))) 166 | 167 | if(assignment.probs){ 168 | #calculate assignment probabilities 169 | colnames(snp.object$data) <- 1:ncol(snp.object$data) 170 | consensus <- apply(snp.object$data, 2, function(x) table(factor(x, levels = c("a","c","g","t")), 171 | exclude = c("-", NA))) 172 | consensus <- rownames(consensus)[apply(consensus,2,which.max)] 173 | cluster.ass.prob.list <- lapply(1:max.depth, function(l){ 174 | partitions <- split(1:nrow(snp.matrix), partition.df[,l+1]) 175 | cluster.ass.probs <- do.call(cbind, lapply(partitions, function(p){ 176 | cluster.allele.freqs <- apply(snp.object$data[p,,drop=FALSE], 2, function(x) table(factor(x, levels = c("a","c","g","t")), 177 | exclude = c("-", NA))) 178 | cluster.allele.freqs <- log((cluster.allele.freqs + snp.object$prior)/(length(p)+1)) 179 | apply(snp.object$data, 1, function(seq){ 180 | seq[seq=="-"] <- consensus[seq=="-"] 181 | return(sum(cluster.allele.freqs[cbind(seq, 1:ncol(cluster.allele.freqs))])) 182 | }) 183 | })) 184 | cluster.ass.probs <- cluster.ass.probs-apply(cluster.ass.probs, 1, matrixStats::logSumExp) 185 | cluster.ass.probs <- exp(cluster.ass.probs) 186 | cluster.ass.probs[cluster.ass.probs<1e-16] <- 0 187 | rownames(cluster.ass.probs) <- snp.object$heds 188 | colnames(cluster.ass.probs) <- paste("Cluster", 1:ncol(cluster.ass.probs)) 189 | return(cluster.ass.probs) 190 | }) 191 | 192 | return(list(partition.df=partition.df, 193 | cluster.assignment.prob=cluster.ass.prob.list, 194 | lml.list=lml.list)) 195 | } 196 | 197 | return(list(partition.df=partition.df, 198 | lml.list=lml.list)) 199 | } 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | [![R-CMD-check](https://github.com/gtonkinhill/rhierbaps/workflows/R-CMD-check/badge.svg)](https://github.com/gtonkinhill/rhierbaps/actions) 6 | 7 | 8 | # rhierbaps 9 | 10 | We have recently developed a faster verion of the BAPs clustering 11 | method. It can be found [here](https://github.com/gtonkinhill/fastbaps). 12 | 13 | ## Installation 14 | 15 | `rhierbaps` is available on CRAN. 16 | 17 | ``` r 18 | install.packages("rhierbaps") 19 | ``` 20 | 21 | The development version is available on github. It can be installed with 22 | `devtools` 23 | 24 | ``` r 25 | install.packages("devtools") 26 | 27 | devtools::install_github("gtonkinhill/rhierbaps") 28 | ``` 29 | 30 | If you would like to also build the vignette with your installation run: 31 | 32 | ``` r 33 | devtools::install_github("gtonkinhill/rhierbaps", build_vignettes = TRUE) 34 | ``` 35 | 36 | ## Quick Start 37 | 38 | Run hierBAPS. 39 | 40 | ``` r 41 | # install.packages('rhierbaps') 42 | library(rhierbaps) 43 | 44 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps") 45 | snp.matrix <- load_fasta(fasta.file.name) 46 | hb.results <- hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, quiet = TRUE) 47 | head(hb.results$partition.df) 48 | #> Isolate level 1 level 2 49 | #> 1 1 1 1 50 | #> 2 2 1 1 51 | #> 3 3 1 1 52 | #> 4 4 2 5 53 | #> 5 5 3 9 54 | #> 6 6 3 9 55 | ``` 56 | 57 | The hierBAPS algorithm was introduced in (Cheng et al. 2013) and 58 | provides a method for hierarchically clustering DNA sequence data to 59 | reveal nested population structure. Previously the algorithm was 60 | available as a compiled MATLAB binary. We provide a convenient R 61 | implementation and include a number of useful additional options 62 | including the ability to use multiple cores, save the log marginal 63 | likelihood scores and run the algorithm until local convergence. 64 | Furthermore, we provide a wrapper to a ggtree plotting function allowing 65 | for easy exploration of sub-clusters. 66 | 67 | ------------------------------------------------------------------------ 68 | 69 | **Things to keep in mind before running hierBAPS** 70 | 71 | 1. hierBAPS uses a uniform prior for K. 72 | 2. The prior for a site depend on the available snps, i.e. if a site 73 | only has ‘AC,’ then the prior for ‘ACGT’ is (1/2, 1/2, 0, 0) 74 | 3. The initial sequence partition is generated using hierarchical 75 | clustering with complete linkage based on a Hamming distance matrix. 76 | 4. The initial number of populations should be set much higher than the 77 | expected number of populations. 78 | 5. More search rounds of the algorithm can be added using the 79 | `n.extra.rounds` parameter. 80 | 6. To get reproducible results the seed in R must be set. 81 | 82 | ## Libraries 83 | 84 | ``` r 85 | library(rhierbaps) 86 | library(ggtree) 87 | library(phytools) 88 | library(ape) 89 | 90 | set.seed(1234) 91 | ``` 92 | 93 | ## Loading data 94 | 95 | We first need to load a multiple sequence alignment in fasta format. We 96 | can then generate the required SNP matrix. 97 | 98 | ``` r 99 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps") 100 | snp.matrix <- load_fasta(fasta.file.name) 101 | ``` 102 | 103 | If you wish to include singleton SNPs (those that appear in only one 104 | isolate) then set `keep.singletons=FALSE`. However, this is currently 105 | advised against as these SNPs lead to a higher number of parameters in 106 | the model and do not provide information about shared ancestry. 107 | 108 | It is also possible to load an ape DNAbin object. Here me make use of 109 | the woodmouse dataset in ape. 110 | 111 | ``` r 112 | data(woodmouse) 113 | woodmouse.snp.matrix <- load_fasta(woodmouse) 114 | ``` 115 | 116 | ## Running hierBAPS 117 | 118 | We now need to decide how many levels of clustering we are interested in 119 | and the number of initial clusters to start from. It is a good idea to 120 | choose `n.pops` to be significantly larger than the number of clusters 121 | you expect. 122 | 123 | To run hierBAPS with 2 levels and 20 initial clusters we run 124 | 125 | ``` r 126 | hb.results <- hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, quiet = TRUE) 127 | head(hb.results$partition.df) 128 | #> Isolate level 1 level 2 129 | #> 1 1 1 1 130 | #> 2 2 1 1 131 | #> 3 3 1 1 132 | #> 4 4 2 5 133 | #> 5 5 3 9 134 | #> 6 6 3 9 135 | ``` 136 | 137 | This produces a list which includes a data frame indicating the 138 | resulting partition of the isolates at the difference levels. The 139 | isolate names in this data frame are taken from the fasta headers and 140 | thus for plotting it is important that these match the isolate names in 141 | any tree used later. This function also outputs the log marginal 142 | likelihoods at the different levels of clustering. 143 | 144 | hierBAPS can also be run until the algorithm converges to a local 145 | optimum as 146 | 147 | ``` r 148 | hb.results <- hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, n.extra.rounds = Inf, 149 | quiet = TRUE) 150 | ``` 151 | 152 | We can also check how long hierBAPS takes to run on the test dataset of 153 | 515 samples and 744 SNPs. 154 | 155 | ``` r 156 | system.time(hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, quiet = TRUE)) 157 | #> user system elapsed 158 | #> 84.184 17.152 102.219 159 | ``` 160 | 161 | ## Plotting results 162 | 163 | To plot the results it is useful to consider a tree of the same 164 | isolates. We clustered the example isolates using Iqtree 165 | (Kalyaanamoorthy et al. 2017). The ggtree (Yu et al. 2017) package then 166 | allows us to plot the results. 167 | 168 | First we need to load the newick file. 169 | 170 | ``` r 171 | newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps") 172 | iqtree <- phytools::read.newick(newick.file.name) 173 | ``` 174 | 175 | A simple coloured tree allows us to see the top level cluster assignment 176 | from hierBAPS. 177 | 178 | ``` r 179 | gg <- ggtree(iqtree, layout = "circular") 180 | gg <- gg %<+% hb.results$partition.df 181 | gg <- gg + geom_tippoint(aes(color = factor(`level 1`))) 182 | gg 183 | ``` 184 | 185 | ![](inst/vignette-supp/unnamed-chunk-15-1.png) 186 | 187 | As there are many more clusters at the second level using colours to 188 | distinguish them can get confusing. Instead we can label the tips with 189 | their corresponding clusters. 190 | 191 | ``` r 192 | gg <- ggtree(iqtree, layout = "circular", branch.length = "none") 193 | gg <- gg %<+% hb.results$partition.df 194 | gg <- gg + geom_tippoint(aes(color = factor(`level 1`))) 195 | gg <- gg + theme(legend.position = "right") 196 | gg <- gg + geom_tiplab(aes(label = `level 2`), size = 1, offset = 1) 197 | gg 198 | ``` 199 | 200 | ![](inst/vignette-supp/unnamed-chunk-16-1.png) 201 | 202 | We can also zoom in on a particular top level cluster to get a better 203 | idea of how it is partitioned at the lower level. As an example we zoom 204 | in on sub cluster 9 at level 1. 205 | 206 | ``` r 207 | plot_sub_cluster(hb.results, iqtree, level = 1, sub.cluster = 9) 208 | ``` 209 | 210 | ![](inst/vignette-supp/unnamed-chunk-17-1.png) 211 | 212 | Finally, we can inspect the log marginal likelihoods given for each 213 | level. 214 | 215 | ``` r 216 | hb.results$lml.list 217 | #> $`Depth 0` 218 | #> 1 219 | #> -50858.92 220 | #> 221 | #> $`Depth 1` 222 | #> 1 2 3 4 5 6 7 223 | #> -2121.8599 -4012.3594 -4237.7639 -3095.1865 -1525.7356 -3180.7572 -4015.5020 224 | #> 8 9 10 11 12 13 225 | #> -2104.5277 -1736.0192 -780.0635 -810.7793 -688.5214 -163.3198 226 | ``` 227 | 228 | ## Caculating assignment probabilities 229 | 230 | We can also calculate the individual probabilities of assignment to each 231 | cluster. Here we make use of the woodmouse dataset loaded earlier. 232 | 233 | ``` r 234 | hb.results.woodmouse <- hierBAPS(woodmouse.snp.matrix, max.depth = 2, n.extra.rounds = Inf, 235 | quiet = TRUE, assignment.probs = TRUE) 236 | head(hb.results.woodmouse$cluster.assignment.prob[[1]]) 237 | #> Cluster 1 Cluster 2 Cluster 3 238 | #> No305 9.997868e-01 2.104112e-04 2.805482e-06 239 | #> No304 5.620947e-06 9.999944e-01 1.699254e-11 240 | #> No306 8.996214e-03 9.910038e-01 9.626735e-09 241 | #> No0906S 9.965743e-01 3.425673e-03 1.902359e-08 242 | #> No0908S 9.911304e-01 8.869359e-03 2.068655e-07 243 | #> No0909S 2.615477e-09 1.105831e-10 1.000000e+00 244 | ``` 245 | 246 | ## Saving results 247 | 248 | For runs that take a long time it is a good idea to save the output. We 249 | can save the partition file as 250 | 251 | ``` r 252 | write.csv(hb.results$partition.df, file = file.path(tempdir(), "hierbaps_partition.csv"), 253 | col.names = TRUE, row.names = FALSE) 254 | 255 | save_lml_logs(hb.results, file.path(tempdir(), "hierbaps_logML.txt")) 256 | ``` 257 | 258 | ## Citing rhierbaps 259 | 260 | If you use rhierbaps in a research publication please cite both 261 | 262 | Tonkin-Hill, Gerry, John A. Lees, Stephen D. Bentley, Simon D. W. Frost, 263 | and Jukka Corander. 2018. “RhierBAPS: An R Implementation of the 264 | Population Clustering Algorithm hierBAPS.” Wellcome Open Research 3 265 | (July): 93. 266 | 267 | Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka 268 | Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA 269 | Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5): 270 | 1224–28. 271 | 272 | ## References 273 | 274 |
275 | 276 |
277 | 278 | Cheng, Lu, Thomas R Connor, Jukka Sirén, David M Aanensen, and Jukka 279 | Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA 280 | Sequences with BAPS Software.” *Mol. Biol. Evol.* 30 (5): 1224–28. 281 | . 282 | 283 |
284 | 285 |
286 | 287 | Kalyaanamoorthy, Subha, Bui Quang Minh, Thomas K F Wong, Arndt von 288 | Haeseler, and Lars S Jermiin. 2017. “ModelFinder: Fast Model Selection 289 | for Accurate Phylogenetic Estimates.” *Nat. Methods* 14 (6): 587–89. 290 | . 291 | 292 |
293 | 294 |
295 | 296 | Paradis, Emmanuel, Julien Claude, and Korbinian Strimmer. 2004. “APE: 297 | Analyses of Phylogenetics and Evolution in R Language.” *Bioinformatics* 298 | 20 (2): 289–90. . 299 | 300 |
301 | 302 |
303 | 304 | Revell, Liam J. 2012. “Phytools: An R Package for Phylogenetic 305 | Comparative Biology (and Other Things).” *Methods Ecol. Evol.* 3 (2): 306 | 217–23. . 307 | 308 |
309 | 310 |
311 | 312 | Yu, Guangchuang, David K Smith, Huachen Zhu, Yi Guan, and Tommy Tsan-Yuk 313 | Lam. 2017. “Ggtree: An r Package for Visualization and Annotation of 314 | Phylogenetic Trees with Their Covariates and Other Associated Data.” 315 | *Methods Ecol. Evol.* 8 (1): 28–36. 316 | . 317 | 318 |
319 | 320 |
321 | -------------------------------------------------------------------------------- /inst/extdata/seqs.fa.treefile: -------------------------------------------------------------------------------- 1 | (1:0.0000003922,((2:0.0007035735,(((((((((((((((((((((((((((((((((((((4:0.0000003922,179:0.0003535671):0.000354,464:0.0003539488):0.000000,(100:0.0000003922,450:0.0003536100):0.000709):0.000000,142:0.0000003922):0.000354,(((((24:0.0003529467,496:0.0003550324):0.000362,184:0.0003548929):0.000710,220:0.0000003922):0.000000,155:0.0003547216):0.000710,((((73:0.0000003922,94:0.0003539096):0.000000,99:0.0032322991):0.000357,224:0.0003523452):0.000711,((177:0.0000003922,459:0.0003534055):0.000000,386:0.0065951662):0.000709):0.000361):0.000000):0.000000,(176:0.0000003922,395:0.0007089763):0.000354):0.000354,22:0.0007095167):0.000000,392:0.0010662049):0.000000,427:0.0007094620):0.000721,(((14:0.0025555280,255:0.0032846947):0.000679,(19:0.0007091635,((((((((29:0.0011246004,503:0.0032581496):0.002499,221:0.0010906324):0.000001,((((34:0.0000003922,(((64:0.0010707535,(174:0.0003525606,(((226:0.0003782272,422:0.0010400592):0.000708,244:0.0003770596):0.001397,(360:0.0014285156,361:0.0018050544):0.000354):0.000353):0.000717):0.000347,(381:0.0000003922,493:0.0007078393):0.001064):0.000000,239:0.0007072111):0.000708):0.000000,89:0.0007073133):0.003624,101:0.0003521745):0.000000,96:0.0000003922):0.001413):0.003272,95:0.0010707559):0.001427,512:0.0003624883):0.000353,372:0.0003561372):0.000709,50:0.0003554568):0.000706,511:0.0000003922):0.000709):0.000001):0.000352,(149:0.0007089470,(153:0.0003509940,363:0.0007129547):0.000356):0.000000):0.000000):0.002924,(93:0.0007081535,370:0.0014309426):0.001816):0.001434,17:0.0025121258):0.000000,(((((46:0.0007141307,(((53:0.0010688883,(143:0.0000003922,446:0.0003531549):0.000709):0.000354,472:0.0036145825):0.000000,379:0.0007086590):0.000353):0.000358,86:0.0007113670):0.001976,52:0.0031424499):0.000610,48:0.0006367453):0.000863,154:0.0009981955):0.000430):0.001961,(((((((((((((8:0.0000003922,((25:0.0003539114,((195:0.0000003922,359:0.0062335276):0.000354,(199:0.0003556274,(((297:0.0137869584,407:0.0000003922):0.000001,376:0.0032301154):0.003661,344:0.0021602983):0.000701):0.000709):0.000000):0.000000,398:0.0025191055):0.000000):0.000000,317:0.0003538645):0.000000,420:0.0069804024):0.000000,15:0.0007090655):0.000353,399:0.0010710014):0.001076,501:0.0003544501):0.000354,((98:0.0000003922,162:0.0003536528):0.000708,(157:0.0043728611,159:0.0007091857):0.000361):0.000000):0.000353,364:0.0007099404):0.000000,506:0.0017822738):0.000711,((13:0.0010658510,(((((((((((18:0.0000003922,((54:0.0003543339,((138:0.0000003922,241:0.0003540590):0.000354,458:0.0010677907):0.000000):0.000000,225:0.0007102141):0.000000):0.000350,((39:0.0007121253,40:0.0003550740):0.000358,(385:0.0007157663,465:0.0003503058):0.000355):0.000000):0.001465,(84:0.0003539348,166:0.0000003922):0.001838):0.000337,(139:0.0010686173,156:0.0007300277):0.001074):0.000340,373:0.0043836922):0.001087,((80:0.0101764131,148:0.0047715177):0.009532,(124:0.0007113996,253:0.0000003922):0.004854):0.008877):0.000000,(33:0.0003440857,(((59:0.0010753338,193:0.0007064301):0.001426,68:0.0010664840):0.000000,175:0.0007099116):0.001438):0.001444):0.000717,(66:0.0010687895,212:0.0007119085):0.000717):0.000714,137:0.0010695360):0.000354,(((44:0.0000003922,232:0.0014211805):0.000353,238:0.0010641979):0.000362,(466:0.0003588210,473:0.0000003922):0.000708):0.000000):0.000354,201:0.0017824052):0.000000):0.000354,((85:0.0000003922,(118:0.0000003922,(168:0.0003561767,263:0.0010664386):0.000710):0.000354):0.000354,119:0.0003547149):0.000355):0.000000):0.000352,251:0.0010742405):0.000711,136:0.0003578704):0.000871,51:0.0025998047):0.003820):0.002858,(20:0.0005672263,71:0.0015775113):0.001991):0.000833,((((((((((((((9:0.0000003922,189:0.0010658690):0.004700,(((((16:0.0003535928,197:0.0000003922):0.000000,265:0.0003536964):0.000354,23:0.0000003922):0.000000,346:0.0062176476):0.000000,203:0.0021493938):0.001068):0.000385,257:0.0000003922):0.000353,((56:0.0000003922,61:0.0003536736):0.000353,((230:0.0049678814,264:0.0054550950):0.005289,324:0.0000003922):0.001780):0.000000):0.000000,12:0.0000003922):0.008297,55:0.0038298064):0.001777,((49:0.0007206788,((65:0.0020411338,198:0.0023537088):0.004383,(192:0.0003577465,202:0.0003542944):0.001568):0.000639):0.001435,67:0.0007167978):0.000473):0.001164,69:0.0008702073):0.002225,70:0.0015668742):0.000001,(213:0.0030015320,240:0.0055881828):0.003858):0.005166,437:0.0044411917):0.001363,435:0.0011025525):0.000386,(((((((10:0.0000003922,256:0.0003556787):0.001800,(((43:0.0007185045,(((146:0.0000003922,210:0.0007131044):0.000354,161:0.0014348301):0.000358,243:0.0000003922):0.000355):0.000358,500:0.0007137950):0.000728,150:0.0014491195):0.000338):0.000727,((((242:0.0003523982,397:0.0003566649):0.000710,507:0.0003533463):0.000000,502:0.0000003922):0.000721,485:0.0017953811):0.000708):0.000000,(191:0.0000003922,(204:0.0003553595,236:0.0010771425):0.000000):0.000712):0.000000,58:0.0003551845):0.000355,63:0.0014359595):0.000001,(117:0.0003156831,206:0.0040539968):0.002564):0.000667):0.000001,(((97:0.0000003922,229:0.0007106387):0.000000,479:0.0062127532):0.002474,(468:0.0003545272,509:0.0000003922):0.004116):0.004147):0.003698):0.001259,170:0.0046551960):0.005336,(194:0.0054161588,400:0.0031284103):0.010861):0.003903,390:0.0137043954):0.003887,439:0.0048070035):0.008872,(((28:0.0031275198,188:0.0019492323):0.006273,79:0.0030622297):0.003104,440:0.0097959906):0.004713):0.008037,((((((((((72:0.0003555136,(111:0.0000003922,((((218:0.0003555378,223:0.0000003922):0.000000,(470:0.0007133693,504:0.0000003922):0.000356):0.000000,467:0.0003556020):0.000000,388:0.0003557119):0.000356):0.000000):0.000000,(((((((158:0.0000003922,208:0.0003551764):0.000355,209:0.0014317715):0.000357,227:0.0000003922):0.001070,(((183:0.0039827828,471:0.0000003922):0.000000,495:0.0003562274):0.000272,478:0.0033386852):0.002975):0.000000,(233:0.0014377683,(487:0.0010761675,510:0.0003574562):0.000713):0.000355):0.000355,(((228:0.0010716653,281:0.0003544590):0.000357,(231:0.0007127089,505:0.0000003922):0.000000):0.000355,258:0.0014310076):0.000000):0.000000,(278:0.0003628785,342:0.0029004655):0.000350):0.000355):0.000000,268:0.0003554775):0.000000,249:0.0014340364):0.003167,((307:0.0047266113,345:0.0000003922):0.001431,319:0.0000003922):0.001171):0.003931,490:0.0080616410):0.001937,(((291:0.0007138777,306:0.0000003922):0.001073,301:0.0000003922):0.002298,((299:0.0000003922,347:0.0007160197):0.000848,(335:0.0089801523,338:0.0000003922):0.000588):0.003841):0.004057):0.004520,293:0.0010258793):0.001438,330:0.0055949249):0.004097,315:0.0000003922):0.025384):0.004622,30:0.0148853615):0.006402,(((((((((((21:0.0040979179,403:0.0032749688):0.005810,322:0.0048392161):0.007156,(((((((((35:0.0014337559,(((((((41:0.0000003922,425:0.0003563539):0.000715,(327:0.0014335668,421:0.0003562179):0.000000):0.001441,(410:0.0000003922,431:0.0003562976):0.000727):0.001797,(341:0.0000003922,456:0.0010707248):0.000000):0.000356,(((305:0.0000003922,426:0.0003560967):0.000355,406:0.0003579949):0.000717,(325:0.0000003922,(414:0.0000003922,415:0.0003559011):0.000356):0.000000):0.000000):0.000000,(302:0.0000003922,311:0.0003561402):0.000715):0.000321,457:0.0095624942):0.001835):0.000000,187:0.0000003922):0.000000,116:0.0014374313):0.000723,88:0.0010786580):0.000351,222:0.0018179920):0.001094,321:0.0007255894):0.001477,((((((((114:0.0009669077,((269:0.0000003922,(272:0.0000003922,(328:0.0000003922,433:0.0003549640):0.000356):0.000355):0.001685,(313:0.0032189765,424:0.0037976678):0.002832):0.001320):0.000459,(316:0.0003540297,334:0.0003560015):0.000716):0.000000,(215:0.0010727560,404:0.0007154284):0.001441):0.000354,298:0.0007121893):0.000000,436:0.0007086915):0.000715,(190:0.0000003922,((196:0.0000003922,(((401:0.0025222145,402:0.0000003922):0.000765,447:0.0010218913):0.005188,412:0.0121389872):0.000000):0.000000,455:0.0010682576):0.000354):0.000000):0.002149,((405:0.0014354208,((408:0.0007116608,413:0.0000003922):0.000355,411:0.0000003922):0.000000):0.000355,432:0.0000003922):0.001070):0.000000,(235:0.0005408902,248:0.0080571026):0.003448):0.000307):0.001423,(((254:0.0000003922,326:0.0014302488):0.001150,423:0.0035685320):0.001595,318:0.0088782434):0.001988):0.000792,350:0.0020978916):0.001152):0.000000,286:0.0003488210):0.000720,448:0.0000003922):0.000406,(296:0.0000003922,(393:0.0003557367,416:0.0000003922):0.001438):0.002117):0.005612,36:0.0088682001):0.005346,((((((((((((37:0.0016960315,((((115:0.0000007616,454:0.0036722083):0.004644,442:0.0077154247):0.002401,(200:0.0000008194,276:0.0021652525):0.000407):0.001999,329:0.0024396608):0.002516):0.005905,449:0.0018097567):0.008172,337:0.0000003922):0.001240,((((((277:0.0003562376,280:0.0000003922):0.004443,(300:0.0007056579,(303:0.0003833841,(304:0.0003817471,358:0.0003309443):0.000716):0.003645):0.002252):0.001090,(284:0.0003547891,357:0.0000003922):0.000316):0.003313,279:0.0000003922):0.000000,287:0.0003553810):0.001536,283:0.0036843772):0.000653):0.001688,271:0.0029171109):0.010508,((((323:0.0007117425,354:0.0003551338):0.000000,355:0.0014351687):0.000000,336:0.0000003922):0.000355,356:0.0000003922):0.000362):0.000349,(273:0.0000003922,333:0.0003542776):0.001068):0.000358,282:0.0000003922):0.000711,285:0.0003573814):0.005890,(42:0.0012569486,320:0.0125937335):0.006447):0.002720,288:0.0083601291):0.001659,(((270:0.0014301558,(((((((289:0.0039816990,428:0.0000003922):0.001443,417:0.0014340222):0.001809,(418:0.0000003922,429:0.0003535193):0.000353):0.000298,((((332:0.0000003922,349:0.0007124447):0.000356,(352:0.0003559396,453:0.0003564005):0.000355):0.000718,348:0.0003575134):0.000348,353:0.0029133473):0.001136):0.007450,292:0.0000003922):0.000710,(((314:0.0000003922,409:0.0007107416):0.000350,(339:0.0000003922,340:0.0059051010):0.007000):0.000000,(419:0.0007057688,434:0.0003588989):0.000355):0.000359):0.000000,331:0.0003543693):0.000710):0.000357,274:0.0000003922):0.000355,430:0.0003546822):0.004194):0.010859):0.018064,((((((83:0.0020910351,(260:0.0003602016,261:0.0000003922):0.002691):0.001368,87:0.0179077213):0.000976,259:0.0006714356):0.015538,(488:0.0004118642,497:0.0003078671):0.007439):0.004085,438:0.0087469973):0.027489,445:0.0034077469):0.098877):0.008066,(((((((480:0.0000003922,(483:0.0000003922,(484:0.0003570504,491:0.0007140612):0.000356):0.000356):0.000000,499:0.0014361772):0.000356,481:0.0007134974):0.000000,492:0.0000003922):0.000000,489:0.0007138816):0.000884,515:0.0204426203):0.018870,((482:0.0003553582,(494:0.0000003922,508:0.0010706387):0.000000):0.001464,486:0.0044704565):0.023417):0.013776):0.008326,((74:0.0102453981,(151:0.0071566798,476:0.0036438395):0.004565):0.006955,514:0.0173481691):0.009175):0.016346):0.000952,(((((((82:0.0095358550,126:0.0001695106):0.000693,309:0.0031149149):0.002205,(((((131:0.0135107407,343:0.0068122580):0.001366,(217:0.0003108555,245:0.0007543639):0.006445):0.001462,(((160:0.0007518170,451:0.0048446200):0.003949,252:0.0033675686):0.003263,262:0.0017525040):0.001117):0.001303,295:0.0029159083):0.000766,((((178:0.0013179481,216:0.0004774830):0.000727,(290:0.0094782992,(312:0.0000003922,375:0.0003542309):0.000714):0.002799):0.002693,275:0.0021738254):0.001072,250:0.0010855613):0.001407):0.001572):0.003133,((((186:0.0057115408,(310:0.0000003922,351:0.0007146345):0.006215):0.007276,387:0.0028268292):0.005968,308:0.0079786128):0.006882,207:0.0059950918):0.003323):0.000361,(125:0.0053683351,(152:0.0003541843,294:0.0000003922):0.007442):0.002287):0.004504,((((128:0.0085515995,133:0.0044046083):0.004326,513:0.0120825469):0.001298,377:0.0053292902):0.002420,469:0.0107939202):0.002079):0.003756,((102:0.0051532810,378:0.0029255086):0.002567,247:0.0125389116):0.005180):0.010196):0.007513,((((((((5:0.0014192083,((7:0.0025262258,32:0.0003474027):0.001427,90:0.0010624589):0.000000):0.001064,120:0.0003558736):0.000352,172:0.0007105200):0.000355,92:0.0000003922):0.001423,127:0.0007204773):0.003853,452:0.0111906573):0.001938,(180:0.0078895223,211:0.0057259314):0.000955):0.001383,6:0.0053836891):0.002219):0.006962,234:0.0021829859):0.001180,(169:0.0032923008,(219:0.0007096769,394:0.0007095456):0.000001):0.001817):0.006909,(((38:0.0000003922,103:0.0003527159):0.001622,475:0.0034399883):0.000866,((((78:0.0010664316,132:0.0014386796):0.000706,((104:0.0003527199,463:0.0000003922):0.000354,477:0.0017892280):0.000348):0.001062,((105:0.0014202094,391:0.0007096957):0.001067,(362:0.0010681810,396:0.0007093988):0.000347):0.000362):0.000001,((123:0.0007054523,441:0.0106917594):0.000000,369:0.0003519569):0.001066):0.002525):0.003121):0.004496,383:0.0019474631):0.006980,(((((26:0.0000003922,165:0.0003529737):0.003635,31:0.0010716812):0.003639,(((47:0.0003519547,374:0.0000006985):0.003385,(171:0.0039466215,246:0.0000003922):0.002902):0.002531,371:0.0010645969):0.001123):0.001054,(81:0.0003835303,((((144:0.0021391886,164:0.0000003922):0.000353,380:0.0000003922):0.004587,(443:0.0022167469,444:0.0033157915):0.011942):0.002764,266:0.0017489979):0.001447):0.007394):0.001707,(((((45:0.0000003922,185:0.0003515018):0.000352,((((121:0.0007064108,122:0.0000003922):0.000706,181:0.0000003922):0.000706,182:0.0000003922):0.001007,384:0.0065038652):0.002578):0.000000,368:0.0007056582):0.000000,205:0.0000003922):0.000000,(((91:0.0000003922,140:0.0003514798):0.002086,141:0.0004072188):0.000357,462:0.0002919864):0.000412):0.002045):0.003459):0.006149,(110:0.0028092992,((112:0.0021656979,460:0.0006862366):0.000628,214:0.0083347195):0.002370):0.004954):0.002283,((((((((11:0.0003523216,(109:0.0000003922,129:0.0014165574):0.000705):0.000353,(173:0.0043093443,365:0.0000003922):0.000000):0.000000,((27:0.0002245806,(167:0.0000003922,(367:0.0003537019,382:0.0003536319):0.000000):0.000485):0.006064,147:0.0134288708):0.004476):0.000000,(75:0.0000003922,163:0.0014158343):0.000705):0.001064,(62:0.0000003922,108:0.0014155285):0.000000):0.000937,(((((106:0.0028377863,(113:0.0011069698,461:0.0017558736):0.000714):0.000754,366:0.0024953469):0.001082,498:0.0007107242):0.000340,237:0.0007163468):0.000908,267:0.0012401247):0.000650):0.001022,((57:0.0001841001,76:0.0005209020):0.000365,60:0.0034186489):0.002611):0.000000,((77:0.0035140507,474:0.0008099525):0.006578,389:0.0077028762):0.003763):0.008049):0.001040,145:0.0007543394):0.001070,130:0.0000003922):0.000000,107:0.0017683320):0.000703,135:0.0003509915):0.000000):0.000000,134:0.0003510809):0.000000,3:0.0003510272); 2 | --------------------------------------------------------------------------------