├── .gitignore
├── LICENSE
├── inst
    ├── .DS_Store
    ├── vignette-supp
    │   ├── unnamed-chunk-13-1.png
    │   ├── unnamed-chunk-14-1.png
    │   ├── unnamed-chunk-15-1.png
    │   ├── unnamed-chunk-16-1.png
    │   └── unnamed-chunk-17-1.png
    ├── CITATION
    └── extdata
    │   ├── small_seqs.fa
    │   └── seqs.fa.treefile
├── NAMESPACE
├── .Rbuildignore
├── rhierbaps.Rproj
├── man
    ├── preproc_alignment.Rd
    ├── calc_log_ml.Rd
    ├── calc_change_in_ml.Rd
    ├── save_lml_logs.Rd
    ├── load_fasta.Rd
    ├── plot_sub_cluster.Rd
    ├── join_units_2.Rd
    ├── split_clusters_3.Rd
    ├── model_search_parallel.Rd
    ├── reallocate_units_4.Rd
    ├── move_units_1.Rd
    └── hierBAPS.Rd
├── NEWS.md
├── cran-comments.md
├── DESCRIPTION
├── LICENSE.md
├── R
    ├── save_lml_logs.R
    ├── split_clusters_3.R
    ├── preproc_alignment.R
    ├── calc_log_ml.R
    ├── load_fasta.R
    ├── log_stirling2.R
    ├── plot_sub_cluster.R
    ├── reallocate_units_4.R
    ├── move_units_1.R
    ├── join_units_2.R
    ├── calc_change_in_ml.R
    ├── model_search_parallel.R
    └── hierBAPS.R
├── README.Rmd
├── .github
    └── workflows
    │   └── R-CMD-check.yaml
├── vignettes
    ├── bibliography.bib
    └── introduction.Rmd
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | inst/doc
2 | .Rproj.user
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Gerry Tonkin-Hill
3 | 


--------------------------------------------------------------------------------
/inst/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/.DS_Store


--------------------------------------------------------------------------------
/inst/vignette-supp/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-13-1.png


--------------------------------------------------------------------------------
/inst/vignette-supp/unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-14-1.png


--------------------------------------------------------------------------------
/inst/vignette-supp/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/inst/vignette-supp/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/inst/vignette-supp/unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gtonkinhill/rhierbaps/HEAD/inst/vignette-supp/unnamed-chunk-17-1.png


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(hierBAPS)
4 | export(load_fasta)
5 | export(plot_sub_cluster)
6 | export(save_lml_logs)
7 | import(patchwork)
8 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^CRAN-RELEASE$
 2 | ^.*\.Rproj$
 3 | ^\.Rproj\.user$
 4 | cran-comments.md
 5 | .travis.yml
 6 | README.Rmd
 7 | .gitignore
 8 | ^LICENSE\.md$
 9 | revdep
10 | revdep/*
11 | ^\.github$
12 | 


--------------------------------------------------------------------------------
/rhierbaps.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | PackageCheckArgs: --as-cran
19 | PackageRoxygenize: rd,collate,namespace
20 | 


--------------------------------------------------------------------------------
/man/preproc_alignment.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/preproc_alignment.R
 3 | \name{preproc_alignment}
 4 | \alias{preproc_alignment}
 5 | \title{preproc_alignment}
 6 | \usage{
 7 | preproc_alignment(snp.matrix)
 8 | }
 9 | \arguments{
10 | \item{snp.matrix}{A matrix containing SNP data. Rows indicate isolates and columns loci.}
11 | }
12 | \value{
13 | an snp.object
14 | }
15 | \description{
16 | Preprocessed the snp matrix for hierBAPS.
17 | }
18 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # rhierbaps 1.1.2
 2 | Fixed issue with stirling2 dependency causing the algorithm to stop prematurely for very large alignments.
 3 | 
 4 | # rhierbaps 1.1.1
 5 | Removed call to ggtree in introduction vignette as waiting on patch for ggtree given the new version of tibble.
 6 | 
 7 | # rhierbaps 1.1.0
 8 | * minor bug fix that caused an error when a cluster contained only a single SNP
 9 | * added functionality to load in ape DNAbin objects
10 | * added functionality to calculate individual assignment probabilities
11 | 
12 | # Initial release 1.0.0devtools::release()


--------------------------------------------------------------------------------
/man/calc_log_ml.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/calc_log_ml.R
 3 | \name{calc_log_ml}
 4 | \alias{calc_log_ml}
 5 | \title{calc_log_ml}
 6 | \usage{
 7 | calc_log_ml(snp.object, partition)
 8 | }
 9 | \arguments{
10 | \item{snp.object}{A snp.object containing the processed SNP data.}
11 | 
12 | \item{partition}{An integer vector indicating a partition of the isolates.}
13 | }
14 | \value{
15 | The log marginal likelihood of the given partition.
16 | }
17 | \description{
18 | Calculate the log marginal likelihood assuming a Multinomial-Dirichlet distribution
19 | }
20 | 


--------------------------------------------------------------------------------
/man/calc_change_in_ml.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/calc_change_in_ml.R
 3 | \name{calc_change_in_ml}
 4 | \alias{calc_change_in_ml}
 5 | \title{calc_change_in_ml}
 6 | \usage{
 7 | calc_change_in_ml(snp.object, partition, indexes)
 8 | }
 9 | \arguments{
10 | \item{snp.object}{A snp.object containing the processed SNP data.}
11 | 
12 | \item{partition}{An integer vector indicating a partition of the isolates.}
13 | 
14 | \item{indexes}{Indexes of the isolates to be moved (must come from one cluster.)}
15 | }
16 | \value{
17 | the best cluster to move indexes to.
18 | }
19 | \description{
20 | Calculate the change in the log marginal likelihood after moving index to each possible cluster
21 | }
22 | 


--------------------------------------------------------------------------------
/man/save_lml_logs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/save_lml_logs.R
 3 | \name{save_lml_logs}
 4 | \alias{save_lml_logs}
 5 | \title{save_lml_logs}
 6 | \usage{
 7 | save_lml_logs(hb.object, file)
 8 | }
 9 | \arguments{
10 | \item{hb.object}{The resulting object from runnign hierBAPS}
11 | 
12 | \item{file}{The file you would like to save the log output to.}
13 | }
14 | \description{
15 | Saves the log marginal likelihoods to a text file.
16 | }
17 | \examples{
18 | snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps"))
19 | hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20)
20 | save_lml_logs(hb.result,  file.path(tempdir(), "output_file.txt"))
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | ## Resubmission
 2 | This is a an updated version. In this version I have:
 3 | 
 4 | * Fixed an error cause by changes to the ggtree plotting package
 5 | 
 6 | ## Resubmission
 7 | This is a an updated version. In this version I have:
 8 | 
 9 | * Fixed a bug that caused the  algorithm to stop prematurely on large alignments
10 | 
11 | ## Resubmission
12 | This is a an updated version. In this version I have:
13 | 
14 | * Fixed a minor bug that caused an error when a cluster contained only a single SNP
15 | * Added functionality to accept an ape DNAbin object
16 | 
17 | ## Test environments
18 | * local OS X install, R 3.6.1
19 | * ubuntu 16.04 (on travis-ci), R 3.6.1
20 | * win-builder (devel and release) 
21 | 
22 | ## R CMD check results
23 | There were no ERRORs, WARNINGs or NOTEs.


--------------------------------------------------------------------------------
/man/load_fasta.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/load_fasta.R
 3 | \name{load_fasta}
 4 | \alias{load_fasta}
 5 | \title{load_fasta}
 6 | \usage{
 7 | load_fasta(msa, keep.singletons = FALSE)
 8 | }
 9 | \arguments{
10 | \item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered}
11 | 
12 | \item{keep.singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters}
13 | }
14 | \value{
15 | A character matrix with filtered SNP data
16 | }
17 | \description{
18 | Loads a fasta file into matrix format ready for
19 | running the hierBAPS algorithm.
20 | }
21 | \examples{
22 | msa <- system.file("extdata", "seqs.fa", package = "rhierbaps")
23 | snp.matrix <- load_fasta(msa)
24 | }
25 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("Currently, the best way to cite rhierbaps in publications is to use:")
 2 | 
 3 | citEntry(entry = "Article",
 4 | title        = "Hierarchical and spatially explicit clustering of {DNA} sequences
 5 |               with {BAPS} software",
 6 | author       = personList(as.person("Lu Cheng"),
 7 |                  as.person("Thomas R Connor"),
 8 |                  as.person("Jukka Siren"),
 9 |                  as.person("David M. Aanensen"),
10 |                  as.person("Jukka Corander")),
11 | journal      = "Mol. Biol. Evol.",
12 | year         = "2013",
13 | volume       = "30",
14 | number       = "5",
15 | pages        = "1224--1228",
16 | url          = "http://dx.doi.org/10.1093/molbev/mst028",
17 | 
18 | textVersion  =
19 | paste("Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013.",
20 | "Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.",
21 | "Molecular Biology and Evolution 30 (5): 1224–28.")
22 | )
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/man/plot_sub_cluster.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_sub_cluster.R
 3 | \name{plot_sub_cluster}
 4 | \alias{plot_sub_cluster}
 5 | \title{plot_sub_cluster}
 6 | \usage{
 7 | plot_sub_cluster(hb.object, tree, level, sub.cluster)
 8 | }
 9 | \arguments{
10 | \item{hb.object}{The resulting object from running hierBAPS}
11 | 
12 | \item{tree}{A phylo tree object to plot}
13 | 
14 | \item{level}{The level of the subcluster to be considered.}
15 | 
16 | \item{sub.cluster}{An integer representing the subcluster to be considered.}
17 | }
18 | \description{
19 | Creates a zoom plot using ggtree focusing on a cluster.
20 | }
21 | \examples{
22 | \donttest{
23 | snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps"))
24 | newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps")
25 | tree <- phytools::read.newick(newick.file.name)
26 | hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20)
27 | plot_sub_cluster(hb.result, tree, level = 1, sub.cluster = 9)
28 | }
29 | }
30 | 


--------------------------------------------------------------------------------
/man/join_units_2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/join_units_2.R
 3 | \name{join_units_2}
 4 | \alias{join_units_2}
 5 | \title{join_units_2}
 6 | \usage{
 7 | join_units_2(
 8 |   snp.object,
 9 |   partition,
10 |   threshold = 1e-05,
11 |   n.cores = 1,
12 |   comb.chache = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{snp.object}{A snp.object containing the processed SNP data.}
17 | 
18 | \item{partition}{An integer vector indicating an initial partition of the isolates.}
19 | 
20 | \item{threshold}{The increase in marginal log likelihood required to accept a move.}
21 | 
22 | \item{n.cores}{The number of cores to use.}
23 | 
24 | \item{comb.chache}{a matrix recording previous marginal llks of combining clusters}
25 | }
26 | \value{
27 | The best partition after combining two clusters as well as
28 | a boolean value indicating whether a move increased the marginal likelihood.
29 | }
30 | \description{
31 | Peform an iteration of the second move in the algorithm. That is combine two clusters
32 | to improve the marginal likelihood.
33 | }
34 | 


--------------------------------------------------------------------------------
/man/split_clusters_3.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/split_clusters_3.R
 3 | \name{split_clusters_3}
 4 | \alias{split_clusters_3}
 5 | \title{split_clusters_3}
 6 | \usage{
 7 | split_clusters_3(
 8 |   snp.object,
 9 |   partition,
10 |   threshold = 1e-05,
11 |   min.clust.size = 20,
12 |   n.cores = 1
13 | )
14 | }
15 | \arguments{
16 | \item{snp.object}{A snp.object containing the processed SNP data.}
17 | 
18 | \item{partition}{An integer vector indicating an initial partition of the isolates.}
19 | 
20 | \item{threshold}{The increase in marginal log likelihood required to accept a move.}
21 | 
22 | \item{min.clust.size}{Clusters smaller than min.clust.size will not be split.}
23 | 
24 | \item{n.cores}{The number of cores to use.}
25 | }
26 | \value{
27 | The best partition after splitting a cluster and re-allocating as well as
28 | a boolean value indicating whether a move increased the marginal likelihood.
29 | }
30 | \description{
31 | Peform an iteration of the third move in the algorithm. That is split cluster in two
32 | and re-allocate one sub-cluster.
33 | }
34 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: rhierbaps
 2 | Type: Package
 3 | Title: Clustering Genetic Sequence Data Using the HierBAPS Algorithm
 4 | Version: 1.1.3
 5 | Authors@R: person("Gerry", "Tonkin-Hill", email = "g.tonkinhill@gmail.com", role = c("cre", "aut"))
 6 | Description: Implements the hierarchical Bayesian analysis of populations structure (hierBAPS) 
 7 |   algorithm of Cheng et al. (2013) <doi:10.1093/molbev/mst028> for clustering DNA sequences 
 8 |   from multiple sequence alignments in FASTA format. 
 9 |   The implementation includes improved defaults and plotting capabilities 
10 |   and unlike the original 'MATLAB' version removes singleton SNPs by default.
11 | License: MIT + file LICENSE
12 | Encoding: UTF-8
13 | LazyData: true
14 | Imports: 
15 |     ape,
16 |     purrr,
17 |     utils,
18 |     ggplot2,
19 |     matrixStats,
20 |     patchwork
21 | RoxygenNote: 7.1.0
22 | Suggests: knitr,
23 |     rmarkdown,
24 |     markdown,
25 |     ggtree,
26 |     phytools,
27 |     testthat,
28 |     formatR
29 | VignetteBuilder: knitr
30 | URL: https://github.com/gtonkinhill/rhierbaps
31 | BugReports: https://github.com/gtonkinhill/rhierbaps/issues
32 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2018 Gerry Tonkin-Hill
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/R/save_lml_logs.R:
--------------------------------------------------------------------------------
 1 | #' save_lml_logs
 2 | #'
 3 | #' Saves the log marginal likelihoods to a text file.
 4 | #'
 5 | #'
 6 | #' @param hb.object The resulting object from runnign hierBAPS
 7 | #' @param file The file you would like to save the log output to.
 8 | #'
 9 | #' @examples
10 | #' snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps"))
11 | #' hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20)
12 | #' save_lml_logs(hb.result,  file.path(tempdir(), "output_file.txt"))
13 | #'
14 | #' @export
15 | save_lml_logs <- function(hb.object, file){
16 | 
17 |   if (!is.character(file)) stop("Invalid file name. Must be a valid string.")
18 |   if ((!is.list(hb.object) || !is.data.frame(hb.object$partition.df)
19 |        ) || !is.list(hb.object$lml.list)) stop("Invalid hb.object!")
20 | 
21 |   sink(file)
22 |   for(i in 1:length(hb.object$lml.list)){
23 |     cat(names(hb.object$lml.list)[[i]], "\n")
24 |     cat("cluster names:\t", paste(names(hb.object$lml.list[[i]]), collapse = "\t"), "\n")
25 |     cat("cluster log marginal likelihood:\t", paste(hb.object$lml.list[[i]], collapse = "\t"), "\n")
26 |   }
27 |   sink()
28 | }
29 | 


--------------------------------------------------------------------------------
/man/model_search_parallel.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/model_search_parallel.R
 3 | \name{model_search_parallel}
 4 | \alias{model_search_parallel}
 5 | \title{model_search_parallel}
 6 | \usage{
 7 | model_search_parallel(
 8 |   snp.object,
 9 |   partition,
10 |   round.types,
11 |   quiet = FALSE,
12 |   n.extra.rounds = 0,
13 |   n.cores = 1
14 | )
15 | }
16 | \arguments{
17 | \item{snp.object}{A snp.object containing the processed SNP data.}
18 | 
19 | \item{partition}{An integer vector indicating an initial starting partition.}
20 | 
21 | \item{round.types}{A vector indicating which series of moves to make.}
22 | 
23 | \item{quiet}{Whether to suppress progress information (default=FALSE).}
24 | 
25 | \item{n.extra.rounds}{The number of additional rounds to perform after the default hierBAPS
26 | settings (default=0). If set to Inf it will run until a local optimum is reached
27 | (this might take a long time).}
28 | 
29 | \item{n.cores}{The number of cores to use.}
30 | }
31 | \value{
32 | an optimised partition and marginal llk
33 | }
34 | \description{
35 | Clusters DNA alignment using independent loci model
36 | }
37 | 


--------------------------------------------------------------------------------
/man/reallocate_units_4.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reallocate_units_4.R
 3 | \name{reallocate_units_4}
 4 | \alias{reallocate_units_4}
 5 | \title{reallocate_units_4}
 6 | \usage{
 7 | reallocate_units_4(
 8 |   snp.object,
 9 |   partition,
10 |   threshold = 1e-05,
11 |   min.clust.size = 20,
12 |   split = FALSE,
13 |   n.cores = 1
14 | )
15 | }
16 | \arguments{
17 | \item{snp.object}{A snp.object containing the processed SNP data.}
18 | 
19 | \item{partition}{An integer vector indicating an initial partition of the isolates.}
20 | 
21 | \item{threshold}{The increase in marginal log likelihood required to accept a move.}
22 | 
23 | \item{min.clust.size}{Clusters smaller than min.clust.size will not be split.}
24 | 
25 | \item{split}{Whether to split only into two clusters (for move type 3).}
26 | 
27 | \item{n.cores}{The number of cores to use.}
28 | }
29 | \value{
30 | The best partition after splitting a cluster and re-allocating as well as
31 | a boolean value indicating whether a move increased the marginal likelihood.
32 | }
33 | \description{
34 | Peform an iteration of the fourth move in the algorithm. That is split cluster into n
35 | subclusters and re-allocate one sub-cluster.
36 | }
37 | 


--------------------------------------------------------------------------------
/man/move_units_1.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/move_units_1.R
 3 | \name{move_units_1}
 4 | \alias{move_units_1}
 5 | \title{move_units_1}
 6 | \usage{
 7 | move_units_1(
 8 |   snp.object,
 9 |   partition,
10 |   threshold = 1e-05,
11 |   frac.clust.searched = 0.3,
12 |   min.clust.size = 20,
13 |   n.cores = 1
14 | )
15 | }
16 | \arguments{
17 | \item{snp.object}{A snp.object containing the processed SNP data.}
18 | 
19 | \item{partition}{An integer vector indicating an initial partition of the isolates.}
20 | 
21 | \item{threshold}{The increase in marginal log likelihood required to accept a move.}
22 | 
23 | \item{frac.clust.searched}{The percentage of a large cluster that will be moved.}
24 | 
25 | \item{min.clust.size}{All isolates in clusters less than or equal to min.clus.size will be searched.}
26 | 
27 | \item{n.cores}{The number of cores to use.}
28 | }
29 | \value{
30 | The best partition after moving units from one cluster to another as well as
31 | a boolean value indicating whether a move increased the marginal likelihood.
32 | }
33 | \description{
34 | Peform an iteration of the first move in the algorithm. That is move units from one cluster to
35 | another to improve the marginal likelihood
36 | }
37 | 


--------------------------------------------------------------------------------
/R/split_clusters_3.R:
--------------------------------------------------------------------------------
 1 | #' split_clusters_3
 2 | #'
 3 | #' Peform an iteration of the third move in the algorithm. That is split cluster in two
 4 | #' and re-allocate one sub-cluster.
 5 | #'
 6 | #'
 7 | #' @param snp.object A snp.object containing the processed SNP data.
 8 | #' @param partition An integer vector indicating an initial partition of the isolates.
 9 | #' @param threshold The increase in marginal log likelihood required to accept a move.
10 | #' @param min.clust.size Clusters smaller than min.clust.size will not be split.
11 | #' @param n.cores The number of cores to use.
12 | #'
13 | #' @return The best partition after splitting a cluster and re-allocating as well as
14 | #' a boolean value indicating whether a move increased the marginal likelihood.
15 | #'
16 | split_clusters_3 <- function(snp.object, partition, threshold=1e-5,
17 |                              min.clust.size=20, n.cores=1){
18 |   #At the moment this can't create new clusters. This is the same as in the original hierBAPS
19 |   #but it might be worth allowing the creation of new clusters. TODO:Ask Jukka about it.
20 |   return(reallocate_units_4(snp.object, partition, threshold=1e-5,
21 |                             min.clust.size=min.clust.size, split=TRUE,
22 |                             n.cores=n.cores))
23 | 
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/R/preproc_alignment.R:
--------------------------------------------------------------------------------
 1 | #' preproc_alignment
 2 | #'
 3 | #' Preprocessed the snp matrix for hierBAPS.
 4 | #'
 5 | #'
 6 | #' @param snp.matrix A matrix containing SNP data. Rows indicate isolates and columns loci.
 7 | #'
 8 | #' @return an snp.object
 9 | #'
10 | preproc_alignment <- function(snp.matrix){
11 |   if(!is.matrix(snp.matrix)) stop("snp.matrix is not a valid matrix")
12 | 
13 |   n.seq <- nrow(snp.matrix)
14 | 
15 |   # prior: 1/number of distinct nucleotides observed at that column
16 |   prior <- apply(snp.matrix, 2, function(x) table(factor(x, levels = c("a","c","g","t")),
17 |                                                   exclude = c("-", NA)))
18 | 
19 |   #TODO: should we be getting rid of unique SNPs here?
20 |   #now ignore conserved columns
21 |   keep <- colSums(prior>0)>1
22 |   
23 |   if(sum(keep)==0){
24 |     #all columns are conserved
25 |     return(NA)
26 |   }
27 |   
28 |   snp.matrix <- snp.matrix[, keep, drop=FALSE]
29 |   prior <- prior[, keep, drop=FALSE]
30 | 
31 |   #finally generate a matrix for the prior nt values
32 |   prior <- 1*(prior>0)
33 |   prior <- t(t(prior)/colSums(prior))
34 | 
35 |   #Calculate hamming distance
36 |   orig.dist <- as.matrix(ape::dist.dna(ape::as.DNAbin(snp.matrix),
37 |                                        model = "N", pairwise.deletion = TRUE))
38 | 
39 |   return(list(
40 |     n.seq = n.seq,
41 |     dist = orig.dist,
42 |     seq.inds = 1:n.seq,
43 |     prior = prior,
44 |     data = snp.matrix
45 |   ))
46 | }
47 | 


--------------------------------------------------------------------------------
/R/calc_log_ml.R:
--------------------------------------------------------------------------------
 1 | #' calc_log_ml
 2 | #'
 3 | #' Calculate the log marginal likelihood assuming a Multinomial-Dirichlet distribution
 4 | #'
 5 | #'
 6 | #' @param snp.object A snp.object containing the processed SNP data.
 7 | #' @param partition An integer vector indicating a partition of the isolates.
 8 | #'
 9 | #' @return The log marginal likelihood of the given partition.
10 | #'
11 | calc_log_ml <- function(snp.object, partition){
12 |   #some checks
13 |   if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!")
14 |   if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!")
15 | 
16 |   #get allele counts for each cluster in partition
17 |   mA <- t(rowsum(1*(snp.object$data=="a"), partition))
18 |   mC <- t(rowsum(1*(snp.object$data=="c"), partition))
19 |   mG <- t(rowsum(1*(snp.object$data=="g"), partition))
20 |   mT <- t(rowsum(1*(snp.object$data=="t"), partition))
21 | 
22 |   prior <- snp.object$prior
23 |   prior[prior==0] <- 1 #deal with zeros and resulting NAs
24 | 
25 |   #calculate log marginal likelihood
26 |   term1 <- -lgamma(1 + mA+mC+mG+mT)
27 |   term2 <- lgamma(prior["a", ] + mA) - lgamma(prior["a", ])
28 |   term2 <- term2 + lgamma(prior["c", ] + mC) - lgamma(prior["c", ])
29 |   term2 <- term2 + lgamma(prior["g", ] + mG) - lgamma(prior["g", ])
30 |   term2 <- term2 + lgamma(prior["t", ] + mT) - lgamma(prior["t", ])
31 | 
32 |   #take sum over all loci and clusters
33 |   ml <- sum(term1 + term2)
34 | 
35 |   #uniform prior on K:
36 |   #prior prob of each partition is equal so must divide by the number of possible
37 |   #partitions.
38 |   stirling2 <- log_stirling2(nrow(snp.object$data), length(unique(partition)))
39 |   stopifnot(is.finite(stirling2))
40 |   stopifnot(!is.na(stirling2))
41 |   
42 |   ml <- ml - stirling2
43 | 
44 |   return(ml)
45 | }
46 | 


--------------------------------------------------------------------------------
/R/load_fasta.R:
--------------------------------------------------------------------------------
 1 | #' load_fasta
 2 | #'
 3 | #' Loads a fasta file into matrix format ready for
 4 | #' running the hierBAPS algorithm.
 5 | #'
 6 | #' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
 7 | #' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters
 8 | #'
 9 | #' @return A character matrix with filtered SNP data
10 | #'
11 | #' @examples
12 | #' msa <- system.file("extdata", "seqs.fa", package = "rhierbaps")
13 | #' snp.matrix <- load_fasta(msa)
14 | #' @export
15 | load_fasta <- function(msa, keep.singletons=FALSE) {
16 | 
17 |   #Check inputs
18 |   if(class(msa)=="character"){
19 |     if (!file.exists(msa)) stop("Invalid msa or the file does not exist!")
20 |     seqs <- ape::read.FASTA(msa)
21 |   } else if(class(msa)=="matrix"){
22 |     seqs <- ape::as.DNAbin(msa)
23 |   } else if(class(msa)=="DNAbin"){
24 |     seqs <- msa
25 |   } else{
26 |     stop("incorrect input for msa!")
27 |   }
28 |   if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.")
29 | 
30 |   #Load sequences using ape. This does a lot of the checking for us.
31 |   seq_names <- labels(seqs)
32 |   seqs <- as.character(as.matrix(seqs))
33 |   rownames(seqs) <- seq_names
34 |   seqs[is.na(seqs)] <- "-"
35 | 
36 |   if (nrow(seqs)<3) stop("Less than 3 sequences!")
37 |   warning("Characters not in acgtnACGTN- will be treated as missing (-)...")
38 | 
39 |   #Remove conserved columns
40 |   conserved <- colSums(t(t(seqs)==seqs[1,]))==nrow(seqs)
41 |   seqs <- seqs[, !conserved]
42 | 
43 |   if(!keep.singletons){
44 |     #remove singletons as they are uninformative in the algorithm
45 |     is_singleton <- apply(seqs, 2, function(x){
46 |       tab <- table(x)
47 |       return(x %in% names(tab)[tab==1])
48 |     })
49 |     seqs[is_singleton] <- "-"
50 |   }
51 | 
52 |   #Convert gaps and unknowns to same symbol
53 |   seqs[seqs=="n"] <- "-"
54 | 
55 |   return(seqs)
56 | }
57 | 


--------------------------------------------------------------------------------
/man/hierBAPS.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/hierBAPS.R
 3 | \name{hierBAPS}
 4 | \alias{hierBAPS}
 5 | \title{hierBAPS}
 6 | \usage{
 7 | hierBAPS(
 8 |   snp.matrix,
 9 |   max.depth = 2,
10 |   n.pops = floor(nrow(snp.matrix)/5),
11 |   quiet = FALSE,
12 |   n.extra.rounds = 0,
13 |   assignment.probs = FALSE,
14 |   n.cores = 1
15 | )
16 | }
17 | \arguments{
18 | \item{snp.matrix}{Character matrix of aligned sequences produced by \link{load_fasta}.}
19 | 
20 | \item{max.depth}{Maximum depth of hierarchical search (default = 2).}
21 | 
22 | \item{n.pops}{Maximum number of populations in the data (default = number of isolates/5)}
23 | 
24 | \item{quiet}{Whether to suppress progress information (default=FALSE).}
25 | 
26 | \item{n.extra.rounds}{The number of additional rounds to perform after the default hierBAPS
27 | settings (default=0). If set to Inf it will run until a local optimum is reached
28 | (this might take a long time).}
29 | 
30 | \item{assignment.probs}{whether or not to calculate the assignment probabilities to each cluster (default=FALSE)}
31 | 
32 | \item{n.cores}{The number of cores to use.}
33 | }
34 | \value{
35 | A list containing a dataframe indicating an assignment of each sequence
36 | to hierarchical clusters as well as the log marginal likelihoods for each level.
37 | }
38 | \description{
39 | Runs the hierBAPS algorithm of Cheng et al. 2013
40 | }
41 | \examples{
42 | snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps"))
43 | hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE)
44 | 
45 | \donttest{
46 | snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps"))
47 | system.time({hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE)})
48 | }
49 | 
50 | }
51 | \references{
52 | Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5): 1224–28.
53 | }
54 | \author{
55 | Gerry Tonkin-Hill
56 | }
57 | 


--------------------------------------------------------------------------------
/R/log_stirling2.R:
--------------------------------------------------------------------------------
 1 | #' log_stirling2
 2 | #'
 3 | #' @param n number of objects
 4 | #' @param k number of partitions
 5 | #'
 6 | #' @return log of the Stirling number of the second kind
 7 | #' 
 8 | #'
 9 | log_stirling2 <- function(n, k){
10 |   if(!is.numeric(n)) stop("n is not numeric!")
11 |   if(!is.numeric(k)) stop("k is not numeric!")
12 |   if(k>n) stop("k must be less than n!")
13 |   
14 |   v <- n/k
15 |   G <- lambertW(-v*exp(-v))
16 |   
17 |   lS2 <- log(sqrt((v-1)/(v*(1-G)))) +
18 |     (n-k)*(log(v-1)-log(v-G)) +
19 |     n*log(k)-k*log(n) +
20 |     k*(1-G) +
21 |     lchoose(n, k)
22 | 
23 |   return(lS2)
24 | }
25 | 
26 | # This function was written by Ben Bolker and taken from https://stat.ethz.ch/pipermail/r-help/2003-November/042793.html
27 | lambertW = function(z,b=0,maxiter=10,eps=.Machine$double.eps,
28 |                     min.imag=1e-9) {
29 |   if (any(round(Re(b)) != b))
30 |     stop("branch number for W must be an integer")
31 |   if (!is.complex(z) && any(z<0)) z=as.complex(z)
32 |   ## series expansion about -1/e
33 |   ##
34 |   ## p = (1 - 2*abs(b)).*sqrt(2*e*z + 2);
35 |   ## w = (11/72)*p;
36 |   ## w = (w - 1/3).*p;
37 |   ## w = (w + 1).*p - 1
38 |   ##
39 |   ## first-order version suffices:
40 |   ##
41 |   w = (1 - 2*abs(b))*sqrt(2*exp(1)*z + 2) - 1
42 |   ## asymptotic expansion at 0 and Inf
43 |   ##
44 |   v = log(z + as.numeric(z==0 & b==0)) + 2*pi*b*1i;
45 |   v = v - log(v + as.numeric(v==0))
46 |   ## choose strategy for initial guess
47 |   ##
48 |   c = abs(z + exp(-1));
49 |   c = (c > 1.45 - 1.1*abs(b));
50 |   c = c | (b*Im(z) > 0) | (!Im(z) & (b == 1))
51 |   w = (1 - c)*w + c*v
52 |   ## Halley iteration
53 |   ##
54 |   for (n in 1:maxiter) {
55 |     p = exp(w)
56 |     t = w*p - z
57 |     f = (w != -1)
58 |     t = f*t/(p*(w + f) - 0.5*(w + 2.0)*t/(w + f))
59 |     w = w - t
60 |     if (abs(Re(t)) < (2.48*eps)*(1.0 + abs(Re(w)))
61 |         && abs(Im(t)) < (2.48*eps)*(1.0 + abs(Im(w))))
62 |       break
63 |   }
64 |   if (n==maxiter) warning(paste("iteration limit (",maxiter,
65 |                                 ") reached, result of W may be inaccurate",sep=""))
66 |   if (all(Im(w)<min.imag)) w = as.numeric(w)
67 |   return(w)
68 | }
69 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | bibliography: vignettes/bibliography.bib
 3 | output: 
 4 |   github_document
 5 | ---
 6 | 
 7 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 8 | 
 9 | ```{r, echo = FALSE}
10 | knitr::opts_chunk$set(
11 |   collapse = TRUE,
12 |   comment = "#>",
13 |   fig.path = "inst/vignette-supp/",
14 |   echo=TRUE, 
15 |   warning=FALSE, 
16 |   message=FALSE,
17 |   tidy=TRUE
18 | )
19 | ```
20 | 
21 | <!-- badges: start -->
22 |   [![R-CMD-check](https://github.com/gtonkinhill/rhierbaps/workflows/R-CMD-check/badge.svg)](https://github.com/gtonkinhill/rhierbaps/actions)
23 | <!-- badges: end -->
24 | 
25 | 
26 | # rhierbaps
27 | 
28 | We have recently developed a faster verion of the BAPs clustering method. It can be found [here](https://github.com/gtonkinhill/fastbaps). 
29 | 
30 | ## Installation
31 | `rhierbaps` is available on CRAN.
32 | ```{r, eval=FALSE}
33 | install.packages("rhierbaps")
34 | ```
35 | 
36 | The development version is available on github. It can be installed with `devtools`
37 | ```{r, eval = FALSE}
38 | install.packages("devtools")
39 | 
40 | devtools::install_github("gtonkinhill/rhierbaps")
41 | ```
42 | 
43 | If you would like to also build the vignette with your installation run:
44 | 
45 | ```{r, eval=FALSE}
46 | devtools::install_github("gtonkinhill/rhierbaps", build_vignettes = TRUE)
47 | ```
48 | 
49 | ## Quick Start
50 | Run hierBAPS.
51 | 
52 | ```{r, fig.width =8, fig.height=6, fig.align='center'}
53 | # install.packages("rhierbaps")
54 | library(rhierbaps)
55 | 
56 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps")
57 | snp.matrix <- load_fasta(fasta.file.name)
58 | hb.results <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet = TRUE)
59 | head(hb.results$partition.df)
60 | ```
61 | 
62 | ```{r, echo = FALSE}
63 | intro_rmd <- 'vignettes/introduction.Rmd'
64 | 
65 | raw_rmd <- readLines(intro_rmd)
66 | 
67 | # remove yaml 
68 | yaml_lines <- grep("---", raw_rmd)
69 | 
70 | # remove appendix (session info)
71 | appendix <- grep("Session", raw_rmd)
72 | 
73 | compressed_rmd <- raw_rmd[c(-seq(yaml_lines[1], yaml_lines[2], by = 1), 
74 |                             -seq(appendix, length(raw_rmd)))]
75 | writeLines(compressed_rmd, "child.Rmd")
76 | ```
77 | 
78 | ```{r, child = 'child.Rmd'}
79 | ```
80 | 
81 | ```{r cleanup, echo=FALSE, include=FALSE}
82 | if (file.exists("child.Rmd")) {
83 |   file.remove("child.Rmd")
84 | }
85 | ```
86 | 


--------------------------------------------------------------------------------
/inst/extdata/small_seqs.fa:
--------------------------------------------------------------------------------
  1 | >1
  2 | AGCTGTAATG
  3 | >2
  4 | AGCTGTAATG
  5 | >3
  6 | AGCTGTAATG
  7 | >4
  8 | AGTCGTAATG
  9 | >5
 10 | AATCGTAATG
 11 | >6
 12 | AGTCGTAATG
 13 | >7
 14 | AATCGTAATG
 15 | >8
 16 | AATCGTAATG
 17 | >9
 18 | AGTCGTAATG
 19 | >10
 20 | AGTCGTAATG
 21 | >11
 22 | AGTCGTAATG
 23 | >12
 24 | AGTCGTAATG
 25 | >13
 26 | AATCGTAATG
 27 | >14
 28 | AGTCGTAATG
 29 | >15
 30 | AATCGTAATG
 31 | >16
 32 | AGTCGTAATG
 33 | >17
 34 | AGTCGTAATG
 35 | >18
 36 | AGTCGTAATG
 37 | >19
 38 | AGTCGTAATG
 39 | >20
 40 | AGTCGTAATG
 41 | >21
 42 | AGTCATAATG
 43 | >22
 44 | AGTCGTAATG
 45 | >23
 46 | AGTCGTAATG
 47 | >24
 48 | AGTCGTAATG
 49 | >25
 50 | AATCGTAATG
 51 | >26
 52 | AGTCGTAATG
 53 | >27
 54 | AGTCGTAATG
 55 | >28
 56 | AGCCGTAATG
 57 | >29
 58 | AGTCGTAATG
 59 | >30
 60 | AGTCATAATG
 61 | >31
 62 | AGCCGTAATG
 63 | >32
 64 | AATCGTAATG
 65 | >33
 66 | AGTCGTAATG
 67 | >34
 68 | AGTCGTAATG
 69 | >35
 70 | AGTCATAATG
 71 | >36
 72 | AGTCATAATG
 73 | >37
 74 | AGTCATAATG
 75 | >38
 76 | AATCGTAATG
 77 | >39
 78 | AGTCGTAATG
 79 | >40
 80 | AGTCGTAATG
 81 | >41
 82 | AGTCATAATG
 83 | >42
 84 | AGTCATAATG
 85 | >43
 86 | AGTCGTAATG
 87 | >44
 88 | AGTCGTAATG
 89 | >45
 90 | AGCCGTAATG
 91 | >46
 92 | AGTCGTAATG
 93 | >47
 94 | AGTCGTAATA
 95 | >48
 96 | AGTCGTAATG
 97 | >49
 98 | AGTCGTAATG
 99 | >50
100 | AGTCGTAATG
101 | >51
102 | AGTCGTAATG
103 | >52
104 | AGTCGTAATG
105 | >53
106 | AGTCGTAATG
107 | >54
108 | AGTCGTAATG
109 | >55
110 | AGTCGTAATG
111 | >56
112 | AGTCGTAATG
113 | >57
114 | AGTCGTAATG
115 | >58
116 | AGTCGTAATG
117 | >59
118 | AGTCGTAATG
119 | >60
120 | AGTCGTAATG
121 | >61
122 | AGTCGTAATG
123 | >62
124 | AGTCGTAATG
125 | >63
126 | AGTCGTAATG
127 | >64
128 | AGTCGTAATG
129 | >65
130 | AGTCGTAATG
131 | >66
132 | AATCGTAATG
133 | >67
134 | AGTCGTAATG
135 | >68
136 | AGTCGTAATG
137 | >69
138 | AGTCGTAATG
139 | >70
140 | AGTCGTAATG
141 | >71
142 | AGTCGTAATG
143 | >72
144 | AGTCGTAATG
145 | >73
146 | AGTCGTAATG
147 | >74
148 | AGTCGTAATG
149 | >75
150 | AGTCGTAATG
151 | >76
152 | AGTCGTAATG
153 | >77
154 | AGTCGTAATG
155 | >78
156 | AATCGTAATG
157 | >79
158 | AATCGTAATG
159 | >80
160 | AGTCGTAATG
161 | >81
162 | AGTCGTAATG
163 | >82
164 | AGTCATAATG
165 | >83
166 | AATCAAAGTG
167 | >84
168 | AGTCGTAATG
169 | >85
170 | AATCGTAATG
171 | >86
172 | AGTCGTAATG
173 | >87
174 | AATCAAAA-G
175 | >88
176 | AGTCATAATG
177 | >89
178 | AGTCGTAATG
179 | >90
180 | AATCGTAATG
181 | >91
182 | AGCCGTAATG
183 | >92
184 | AATCGTAATG
185 | >93
186 | AGTCGTAATG
187 | >94
188 | AGTCGTAATG
189 | >95
190 | AGTCGTAATG
191 | >96
192 | AGTCGTAATG
193 | >97
194 | AGTCGTAATG
195 | >98
196 | AATCGTAATG
197 | >99
198 | AGTCGTAATG
199 | >100
200 | AGTCGTAATG
201 | 


--------------------------------------------------------------------------------
/R/plot_sub_cluster.R:
--------------------------------------------------------------------------------
 1 | #' plot_sub_cluster
 2 | #'
 3 | #' Creates a zoom plot using ggtree focusing on a cluster.
 4 | #' 
 5 | #' @import patchwork
 6 | #'
 7 | #'
 8 | #' @param hb.object The resulting object from running hierBAPS
 9 | #' @param tree A phylo tree object to plot
10 | #' @param level The level of the subcluster to be considered.
11 | #' @param sub.cluster An integer representing the subcluster to be considered.
12 | #'
13 | #' @examples
14 | #' \donttest{
15 | #' snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps"))
16 | #' newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps")
17 | #' tree <- phytools::read.newick(newick.file.name)
18 | #' hb.result <- hierBAPS(snp.matrix, max.depth=2, n.pops=20)
19 | #' plot_sub_cluster(hb.result, tree, level = 1, sub.cluster = 9)
20 | #' }
21 | #' @export
22 | plot_sub_cluster <- function(hb.object, tree, level, sub.cluster){
23 |   #Checks
24 |   if ((!is.list(hb.object) || !is.data.frame(hb.object$partition.df)
25 |   ) || !is.list(hb.object$lml.list)) stop("Invalid hb.object!")
26 |   if (!(class(tree)=="phylo")) stop("Invalid tree object!")
27 |   if ((!is.numeric(level)) || (level<1)) stop("Invalid level! Must be a positive integer.")
28 |   if ((!is.numeric(sub.cluster)) || (sub.cluster<1)) stop("Invalid sub.cluster! Must be a positive integer.")
29 | 
30 |   level <- level+1
31 |   if(!("ggtree" %in%
32 |        rownames(utils::installed.packages()))) stop("This function requires ggtree to be installed")
33 | 
34 |   cluster.isolate <- hb.object$partition.df$Isolate[hb.object$partition.df[,level]==sub.cluster]
35 |   
36 |   # Plot the full tree with the clade highlighted
37 |   hb.object$partition.df$is_in_cluster <- hb.object$partition.df$Isolate %in% cluster.isolate
38 |   full_tree <- ggtree::`%<+%`(ggtree::ggtree(tree), hb.object$partition.df) +
39 |     ggtree::geom_tippoint(ggplot2::aes_string(color="is_in_cluster"), size=0.5) +
40 |     ggplot2::theme(legend.position = "none") + 
41 |     ggplot2::scale_colour_manual(values=c("#000000", "#e31a1c"))
42 |   
43 |   # Subset the tree
44 |   temp_column_id <- paste(c("factor(`level ", level, "`)"), collapse = "")
45 |   sub_tree <- ape::drop.tip(tree, tree$tip.label[!tree$tip.label %in% cluster.isolate])
46 |   sub_tree <- ggtree::`%<+%`(ggtree::ggtree(sub_tree), hb.object$partition.df) +
47 |     ggtree::geom_tippoint(ggplot2::aes_string(color=temp_column_id)) +
48 |     ggplot2::labs(color=temp_column_id) + 
49 |     ggplot2::theme(legend.position="right") +
50 |     ggplot2::scale_color_discrete(name=paste("level ", level, collapse = ""))
51 | 
52 |   return(full_tree+sub_tree+patchwork::plot_layout(nrow = 1))
53 | }
54 | 


--------------------------------------------------------------------------------
/R/reallocate_units_4.R:
--------------------------------------------------------------------------------
 1 | #' reallocate_units_4
 2 | #'
 3 | #' Peform an iteration of the fourth move in the algorithm. That is split cluster into n
 4 | #' subclusters and re-allocate one sub-cluster.
 5 | #' 
 6 | #'
 7 | #' @param snp.object A snp.object containing the processed SNP data.
 8 | #' @param partition An integer vector indicating an initial partition of the isolates.
 9 | #' @param threshold The increase in marginal log likelihood required to accept a move.
10 | #' @param min.clust.size Clusters smaller than min.clust.size will not be split.
11 | #' @param split Whether to split only into two clusters (for move type 3).
12 | #' @param n.cores The number of cores to use.
13 | #'
14 | #' @return The best partition after splitting a cluster and re-allocating as well as
15 | #' a boolean value indicating whether a move increased the marginal likelihood.
16 | #'
17 | reallocate_units_4 <- function(snp.object, partition, threshold=1e-5,
18 |                                min.clust.size=20, split=FALSE,
19 |                                n.cores=1){
20 | 
21 |   #some checks
22 |   if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!")
23 |   if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!")
24 |   if (!(class(partition)=="integer")) stop("step 4 -> partition is not an integer vector!")
25 | 
26 |   clusters <- unique(partition)
27 |   big_clusters <- clusters[purrr::map_int(clusters, ~ sum(partition==.x)>min.clust.size)==1]
28 | 
29 |   is.improved <- FALSE
30 |   max_ml <- calc_log_ml(snp.object, partition)
31 | 
32 |   for(c in big_clusters){
33 |     index <- c(1:length(partition))[partition==c]
34 |     d <- snp.object$dist[partition==c, partition==c]
35 |     h <- stats::hclust(stats::as.dist(d), method = "complete")
36 |     if(split){
37 |       npops <- 2
38 |     } else {
39 |       npops <- min(20, floor(nrow(d)/5))
40 |     }
41 |     sub.partition <- stats::cutree(h, k=npops)
42 |     sub.clusters <- unique(sub.partition)
43 |     max_ml <- calc_log_ml(snp.object, partition)
44 | 
45 |     temp_mls <- parallel::mclapply(sub.clusters, function(sub.c){
46 |       best.clust <- calc_change_in_ml(snp.object, partition, index[sub.partition==sub.c])
47 |       temp.partition <- partition
48 |       temp.partition[index[sub.partition==sub.c]] <- as.integer(best.clust)
49 |       return(list(lml=calc_log_ml(snp.object, temp.partition),
50 |                   best.clust=best.clust))
51 |     }, mc.cores=n.cores)
52 | 
53 |     arg.max <- which.max(purrr::map_dbl(temp_mls, ~ .x$lml))
54 | 
55 |     if(temp_mls[[arg.max]]$lml > (max_ml+threshold)){
56 |       partition[index[sub.partition==sub.clusters[[arg.max]]]] <- as.integer(temp_mls[[arg.max]]$best.clust)
57 |       max_ml <- temp_mls[[arg.max]]$lml
58 |       is.improved <- TRUE
59 |     }
60 |   }
61 | 
62 |   return(list(partition=partition, is.improved=is.improved, lml=max_ml))
63 | 
64 | }
65 | 


--------------------------------------------------------------------------------
/R/move_units_1.R:
--------------------------------------------------------------------------------
 1 | #' move_units_1
 2 | #'
 3 | #' Peform an iteration of the first move in the algorithm. That is move units from one cluster to
 4 | #' another to improve the marginal likelihood
 5 | #'
 6 | #'
 7 | #' @param snp.object A snp.object containing the processed SNP data.
 8 | #' @param partition An integer vector indicating an initial partition of the isolates.
 9 | #' @param threshold The increase in marginal log likelihood required to accept a move.
10 | #' @param frac.clust.searched The percentage of a large cluster that will be moved.
11 | #' @param min.clust.size All isolates in clusters less than or equal to min.clus.size will be searched.
12 | #' @param n.cores The number of cores to use.
13 | #'
14 | #' @return The best partition after moving units from one cluster to another as well as
15 | #' a boolean value indicating whether a move increased the marginal likelihood.
16 | #'
17 | move_units_1 <- function(snp.object, partition, threshold=1e-5,
18 |                          frac.clust.searched=0.3,
19 |                          min.clust.size=20,
20 |                          n.cores=1){
21 | 
22 |   #some checks
23 |   if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!")
24 |   if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!")
25 |   if (!(class(partition)=="integer")) stop("in move 1 -> partition is not an integer vector!")
26 | 
27 |   nsamples <- nrow(snp.object$data)
28 |   clusters <- unique(partition)
29 |   cluster_size <- purrr::map_int(clusters, ~ sum(partition==.x))
30 |   names(cluster_size) <- clusters
31 |   max_ml <- calc_log_ml(snp.object, partition)
32 |   max_partition <- partition
33 | 
34 |   #identify the most divereged isolates of each cluster as candidates for moving.
35 |   #as in hierBAPS we take the individuals that are most distant from each other
36 |   indexes <- purrr::as_vector(purrr::imap(clusters[cluster_size>1], function(c, ind){
37 |     if(cluster_size[ind] > min.clust.size){
38 |       #big cluster so only move the most distant isolates
39 |       index <- c(1:length(partition))[partition==c]
40 |       d <- snp.object$dist[partition==c, partition==c]
41 |       diag(d) <- NA
42 |       mean.dist <- rowMeans(d, na.rm = TRUE)
43 |       return(index[mean.dist > stats::quantile(mean.dist, probs=(1-frac.clust.searched))])
44 |     } else {
45 |       #small cluster so consider all isolates
46 |       return(c(1:length(partition))[partition==c])
47 |     }
48 |   }))
49 | 
50 |   is.improved <- FALSE
51 |   for (i in indexes){
52 |     best.cluster <- calc_change_in_ml(snp.object, max_partition, i)
53 |     temp_partition <- max_partition
54 |     temp_partition[i] <- as.integer(best.cluster)
55 |     temp_lml <- calc_log_ml(snp.object, temp_partition)
56 |     #If improvment above threshold make the swap
57 |     if(temp_lml > (max_ml+threshold)){
58 |       max_ml <- temp_lml
59 |       max_partition <- temp_partition
60 |       is.improved <- TRUE
61 |     }
62 |   }
63 | 
64 |   return(list(partition=max_partition, is.improved=is.improved, lml=max_ml))
65 | }
66 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - master
 8 |   pull_request:
 9 |     branches:
10 |       - main
11 |       - master
12 |   schedule:
13 |     - cron:  '1 1 1 * *'
14 | 
15 | name: R-CMD-check
16 | 
17 | jobs:
18 |   R-CMD-check:
19 |     runs-on: ${{ matrix.config.os }}
20 | 
21 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
22 | 
23 |     strategy:
24 |       fail-fast: false
25 |       matrix:
26 |         config:
27 |           - {os: windows-latest, r: 'release'}
28 |           - {os: macOS-latest, r: 'release'}
29 |           - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
30 | 
31 |     env:
32 |       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
33 |       RSPM: ${{ matrix.config.rspm }}
34 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
35 | 
36 |     steps:
37 |       - uses: actions/checkout@v2
38 | 
39 |       - uses: r-lib/actions/setup-r@v1
40 |         with:
41 |           r-version: ${{ matrix.config.r }}
42 | 
43 |       - uses: r-lib/actions/setup-pandoc@v1
44 | 
45 |       - name: Query dependencies
46 |         run: |
47 |           install.packages('remotes')
48 |           install.packages('BiocManager')
49 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
50 |           writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
51 |         shell: Rscript {0}
52 | 
53 |       - name: Cache R packages
54 |         if: runner.os != 'Windows'
55 |         uses: actions/cache@v2
56 |         with:
57 |           path: ${{ env.R_LIBS_USER }}
58 |           key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
59 |           restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
60 | 
61 |       - name: Install system dependencies
62 |         if: runner.os == 'Linux'
63 |         run: |
64 |           while read -r cmd
65 |           do
66 |             eval sudo $cmd
67 |           done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))')
68 | 
69 |       - name: Install dependencies
70 |         run: |
71 |           remotes::install_deps(dependencies = TRUE)
72 |           remotes::install_cran("rcmdcheck")
73 |           BiocManager::install("ggtree")
74 |         shell: Rscript {0}
75 | 
76 |       - name: Check
77 |         env:
78 |           _R_CHECK_CRAN_INCOMING_REMOTE_: false
79 |         run: |
80 |           options(crayon.enabled = TRUE)
81 |           rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
82 |         shell: Rscript {0}
83 | 
84 |       - name: Upload check results
85 |         if: failure()
86 |         uses: actions/upload-artifact@main
87 |         with:
88 |           name: ${{ runner.os }}-r${{ matrix.config.r }}-results
89 |           path: check
90 | 


--------------------------------------------------------------------------------
/R/join_units_2.R:
--------------------------------------------------------------------------------
 1 | #' join_units_2
 2 | #'
 3 | #' Peform an iteration of the second move in the algorithm. That is combine two clusters
 4 | #' to improve the marginal likelihood.
 5 | #'
 6 | #'
 7 | #' @param snp.object A snp.object containing the processed SNP data.
 8 | #' @param partition An integer vector indicating an initial partition of the isolates.
 9 | #' @param threshold The increase in marginal log likelihood required to accept a move.
10 | #' @param n.cores The number of cores to use.
11 | #' @param comb.chache a matrix recording previous marginal llks of combining clusters
12 | #'
13 | #' @return The best partition after combining two clusters as well as
14 | #' a boolean value indicating whether a move increased the marginal likelihood.
15 | #'
16 | join_units_2 <- function(snp.object, partition, threshold=1e-5, n.cores=1, comb.chache=NULL){
17 | 
18 |   #some checks
19 |   if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!")
20 |   if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!")
21 | 
22 |   max_ml <- calc_log_ml(snp.object, partition)
23 |   is.improved <- FALSE
24 |   clusters <- unique(partition)
25 | 
26 |   stopifnot(length(clusters) > 1)
27 |   
28 |   
29 |   combinations <- utils::combn(clusters, 2, simplify = TRUE)
30 |   
31 |   if(is.null(comb.chache)){
32 |     comb.chache <- matrix(NA, nrow=max(clusters), ncol=max(clusters))
33 |   }
34 |   
35 |   temp.combinations <- combinations[, is.na(comb.chache[t(combinations)]), drop=FALSE]
36 |   
37 |   temp_mls <- parallel::mcmapply(function(p1, p2){
38 |     temp_partition <- partition
39 |     temp_partition[temp_partition==p2] <- p1
40 |     return(calc_log_ml(snp.object, temp_partition))
41 |   }, temp.combinations[1,], temp.combinations[2,], mc.cores = n.cores)
42 |   
43 |   comb.chache[t(temp.combinations)] <- temp_mls
44 | 
45 |   arg.max <- which(comb.chache == max(comb.chache, na.rm = TRUE), arr.ind = TRUE)[1,]
46 | 
47 |   if(comb.chache[arg.max[[1]], arg.max[[2]]] > (max_ml+threshold)){
48 |     partition[partition==arg.max[[2]]] <- arg.max[[1]]
49 |     is.improved <- TRUE
50 |     if (length(clusters)<=2){
51 |       diff <- (comb.chache[arg.max[[1]], arg.max[[2]]] - 
52 |                  max_ml + 
53 |                  2*log_stirling2(nrow(snp.object$data), length(clusters)-1) - 
54 |                  log_stirling2(nrow(snp.object$data), length(clusters))
55 |       )
56 |     } else {
57 |       diff <- (comb.chache[arg.max[[1]], arg.max[[2]]] - 
58 |                  max_ml + 
59 |                  2*log_stirling2(nrow(snp.object$data), length(clusters)-1) - 
60 |                  log_stirling2(nrow(snp.object$data), length(clusters)) -
61 |                  log_stirling2(nrow(snp.object$data), length(clusters)-2)
62 |       )
63 |     }
64 |     
65 |     max_ml <- comb.chache[arg.max[[1]], arg.max[[2]]]
66 |     comb.chache <- comb.chache + diff
67 |     comb.chache[arg.max[[1]], ] <- NA
68 |     comb.chache[arg.max[[2]], ] <- NA
69 |     comb.chache[, arg.max[[1]]] <- NA
70 |     comb.chache[, arg.max[[2]]] <- NA
71 |   }
72 | 
73 |   return(list(partition=partition, is.improved=is.improved, lml=max_ml, comb.chache=comb.chache))
74 | }
75 | 


--------------------------------------------------------------------------------
/R/calc_change_in_ml.R:
--------------------------------------------------------------------------------
 1 | #' calc_change_in_ml
 2 | #'
 3 | #' Calculate the change in the log marginal likelihood after moving index to each possible cluster
 4 | #'
 5 | #'
 6 | #' @param snp.object A snp.object containing the processed SNP data.
 7 | #' @param partition An integer vector indicating a partition of the isolates.
 8 | #' @param indexes Indexes of the isolates to be moved (must come from one cluster.)
 9 | #'
10 | #' @return the best cluster to move indexes to.
11 | #'
12 | calc_change_in_ml <- function(snp.object, partition, indexes){
13 |   #some checks
14 |   if (ncol(snp.object$prior)!=ncol(snp.object$data)) stop("ncol mismatch bwtn prior and data!")
15 |   if (length(partition)!=nrow(snp.object$data)) stop("mismatch bwtn partition and data!")
16 |   if (!(all(indexes %in% 1:length(partition)))) stop("indexes outside of partiton range!")
17 |   if (!(class(partition)=="integer")) stop("partition is not an integer vector!")
18 | 
19 |   original_cluster <- unique(partition[indexes])
20 |   if(length(original_cluster)!=1) stop("there was not a unique cluster in the index set!")
21 | 
22 |   #create temporary partition with indexes in a seperate cluster
23 |   temp_partition <- partition
24 |   temp_cluster <- max(partition)+1
25 |   temp_partition[indexes] <- temp_cluster
26 | 
27 |   #get allele counts for each cluster in partition
28 |   mA <- t(rowsum(1*(snp.object$data=="a"), temp_partition))
29 |   mC <- t(rowsum(1*(snp.object$data=="c"), temp_partition))
30 |   mG <- t(rowsum(1*(snp.object$data=="g"), temp_partition))
31 |   mT <- t(rowsum(1*(snp.object$data=="t"), temp_partition))
32 | 
33 |   #now add counts from the temp cluster to every other cluster and take away from its own.
34 |   mA <- mA[,colnames(mA)!=temp_cluster, drop=FALSE] + mA[, colnames(mA)==temp_cluster]
35 |   mC <- mC[,colnames(mC)!=temp_cluster, drop=FALSE] + mC[, colnames(mC)==temp_cluster]
36 |   mG <- mG[,colnames(mG)!=temp_cluster, drop=FALSE] + mG[, colnames(mG)==temp_cluster]
37 |   mT <- mT[,colnames(mT)!=temp_cluster, drop=FALSE] + mT[, colnames(mT)==temp_cluster]
38 | 
39 |   prior <- snp.object$prior
40 |   prior[prior==0] <- 1 #deal with zeros and resulting NAs
41 | 
42 |   #calculate log marginal likelihood
43 |   term1 <- -lgamma(1 + mA+mC+mG+mT)
44 |   term2 <- lgamma(prior["a", ] + mA) - lgamma(prior["a", ])
45 |   term2 <- term2 + lgamma(prior["c", ] + mC) - lgamma(prior["c", ])
46 |   term2 <- term2 + lgamma(prior["g", ] + mG) - lgamma(prior["g", ])
47 |   term2 <- term2 + lgamma(prior["t", ] + mT) - lgamma(prior["t", ])
48 | 
49 |   new_columnwise_lml <- colSums(term1 + term2)
50 |   names(new_columnwise_lml) <- colnames(mA)
51 | 
52 |   #now calculate original columnwise llm before the move
53 |   #get allele counts for each cluster in partition
54 |   mA <- t(rowsum(1*(snp.object$data=="a"), partition))
55 |   mC <- t(rowsum(1*(snp.object$data=="c"), partition))
56 |   mG <- t(rowsum(1*(snp.object$data=="g"), partition))
57 |   mT <- t(rowsum(1*(snp.object$data=="t"), partition))
58 |   term1 <- -lgamma(1 + mA+mC+mG+mT)
59 |   term2 <- lgamma(prior["a", ] + mA) - lgamma(prior["a", ])
60 |   term2 <- term2 + lgamma(prior["c", ] + mC) - lgamma(prior["c", ])
61 |   term2 <- term2 + lgamma(prior["g", ] + mG) - lgamma(prior["g", ])
62 |   term2 <- term2 + lgamma(prior["t", ] + mT) - lgamma(prior["t", ])
63 |   orignal_columnwise_lml <- colSums(term1 + term2)
64 |   names(orignal_columnwise_lml) <- colnames(mA)
65 |   new_columnwise_lml <- new_columnwise_lml[names(orignal_columnwise_lml)]
66 | 
67 |   diff_columnwise_lml <- orignal_columnwise_lml - new_columnwise_lml
68 |   diff_columnwise_lml[original_cluster] <- Inf
69 | 
70 |   best_column <- names(orignal_columnwise_lml)[which.min(diff_columnwise_lml)]
71 | 
72 |   return(best_column)
73 | }
74 | 


--------------------------------------------------------------------------------
/R/model_search_parallel.R:
--------------------------------------------------------------------------------
  1 | #' model_search_parallel
  2 | #'
  3 | #' Clusters DNA alignment using independent loci model
  4 | #'
  5 | #'
  6 | #' @param snp.object A snp.object containing the processed SNP data.
  7 | #' @param partition An integer vector indicating an initial starting partition.
  8 | #' @param round.types A vector indicating which series of moves to make.
  9 | #' @param quiet Whether to suppress progress information (default=FALSE).
 10 | #' @param n.extra.rounds The number of additional rounds to perform after the default hierBAPS
 11 | #' settings (default=0). If set to Inf it will run until a local optimum is reached
 12 | #' (this might take a long time).
 13 | #' @param n.cores The number of cores to use.
 14 | #'
 15 | #' @return an optimised partition and marginal llk
 16 | #'
 17 | model_search_parallel <- function(snp.object, partition, round.types,
 18 |                                   quiet=FALSE, n.extra.rounds=0, n.cores=1){
 19 |   if(!all(round.types %in% c(1,2,3,4))) stop("Invalid round type!")
 20 | 
 21 |   was.updated <- rep(TRUE, 4)
 22 |   move.count <- 0
 23 |   max.ml <- calc_log_ml(snp.object, partition)
 24 |   comb.chache <- NULL
 25 |   if(!quiet){
 26 |     cat('\r', paste(c(
 27 |       "Round: ", move.count, "/", length(round.types), " Type: ", "none", " Log marginal likelihood: ", max.ml
 28 |     ), collapse = ""))
 29 |   }
 30 | 
 31 |   while(move.count < (length(round.types)+n.extra.rounds)){
 32 | 
 33 |     move.count <- move.count + 1
 34 |     if(move.count>length(round.types)){
 35 |       r <- sample(1:4, 1, replace = TRUE)
 36 |     } else {
 37 |       r <- round.types[[move.count]]
 38 |     }
 39 | 
 40 | 
 41 |     n.clusters <- length(unique(partition))
 42 |     if (length(unique(partition))<=1){
 43 |       next
 44 |     }
 45 |     if((r==1) && was.updated[[1]]){
 46 |       update <- move_units_1(snp.object, partition,
 47 |                              n.cores=n.cores)
 48 |       if(!update$is.improved){
 49 |         was.updated[[1]] <- FALSE
 50 |       } else{
 51 |         comb.chache <- NULL
 52 |         partition <- update$partition
 53 |         max.ml <- update$lml
 54 |         was.updated <- rep(TRUE, 4)
 55 |       }
 56 |     } else if(r==2 && was.updated[[2]]){
 57 |       update <- join_units_2(snp.object, partition,
 58 |                              n.cores=n.cores, comb.chache=comb.chache)
 59 |       comb.chache <- update$comb.chache
 60 |       if(!update$is.improved){
 61 |         was.updated[[2]] <- FALSE
 62 |       } else{
 63 |         partition <- update$partition
 64 |         max.ml <- update$lml
 65 |         was.updated <- rep(TRUE, 4)
 66 |       }
 67 |     } else if(r==3 && was.updated[[3]]){
 68 |       update <- split_clusters_3(snp.object, partition,
 69 |                                  n.cores=n.cores)
 70 |       if(!update$is.improved){
 71 |         was.updated[[3]] <- FALSE
 72 |       } else{
 73 |         comb.chache <- NULL
 74 |         partition <- update$partition
 75 |         max.ml <- update$lml
 76 |         was.updated <- rep(TRUE, 4)
 77 |       }
 78 |     } else if(r==4 && was.updated[[4]]){
 79 |       update <- reallocate_units_4(snp.object, partition,
 80 |                                    n.cores=n.cores)
 81 |       if(!update$is.improved){
 82 |         was.updated[[4]] <- FALSE
 83 |       } else{
 84 |         comb.chache <- NULL
 85 |         partition <- update$partition
 86 |         max.ml <- update$lml
 87 |         was.updated <- rep(TRUE, 4)
 88 |       }
 89 |     }
 90 |     #Print current status
 91 |     if(!quiet){
 92 |       cat('\r', paste(c(
 93 |         "Round: ", move.count, "/", length(round.types), " Type: ", r, " Log marginal likelihood: ", max.ml
 94 |       ), collapse = ""))
 95 |       utils::flush.console()
 96 |     }
 97 | 
 98 |     #Check for local convergence
 99 |     if (sum(was.updated)==0){
100 |       if (!quiet) print("Converged locally!")
101 |       break
102 |     }
103 |   }
104 |   return(list(partition=partition, lml=calc_log_ml(snp.object, partition)))
105 | }
106 | 


--------------------------------------------------------------------------------
/vignettes/bibliography.bib:
--------------------------------------------------------------------------------
  1 | @ARTICLE{Cheng2013-mp,
  2 |   title       = "Hierarchical and spatially explicit clustering of {DNA}
  3 |                  sequences with {BAPS} software",
  4 |   author      = "Cheng, Lu and Connor, Thomas R and Sir{\'e}n, Jukka and
  5 |                  Aanensen, David M and Corander, Jukka",
  6 |   affiliation = "Department of Mathematics and Statistics, University of
  7 |                  Helsinki, Helsinki, Finland.",
  8 |   journal     = "Mol. Biol. Evol.",
  9 |   volume      =  30,
 10 |   number      =  5,
 11 |   pages       = "1224--1228",
 12 |   month       =  may,
 13 |   year        =  2013,
 14 |   url         = "http://dx.doi.org/10.1093/molbev/mst028",
 15 |   language    = "en",
 16 |   issn        = "0737-4038, 1537-1719",
 17 |   pmid        = "23408797",
 18 |   doi         = "10.1093/molbev/mst028",
 19 |   pmc         = "PMC3670731"
 20 | }
 21 | 
 22 | @ARTICLE{Paradis2004-ck,
 23 |   title       = "{APE}: Analyses of Phylogenetics and Evolution in {R} language",
 24 |   author      = "Paradis, Emmanuel and Claude, Julien and Strimmer, Korbinian",
 25 |   affiliation = "Laboratoire de Pal{\'e}ontologie, Pal{\'e}obiologie and
 26 |                  Phylog{\'e}nie, Institut des Sciences de l'Evolution,
 27 |                  Universit{\'e} Montpellier II, F-34095 Montpellier c{\'e}dex
 28 |                  05, France. paradis@isem.univ-montp2.fr",
 29 |   journal     = "Bioinformatics",
 30 |   volume      =  20,
 31 |   number      =  2,
 32 |   pages       = "289--290",
 33 |   month       =  jan,
 34 |   year        =  2004,
 35 |   url         = "http://www.ncbi.nlm.nih.gov/pubmed/14734327",
 36 |   issn        = "1367-4803",
 37 |   pmid        = "14734327",
 38 |   doi         = "10.1093/bioinformatics/btg412"
 39 | }
 40 | 
 41 | 
 42 | @ARTICLE{Yu2017-bf,
 43 |   title   = "ggtree: an r package for visualization and annotation of
 44 |              phylogenetic trees with their covariates and other associated data",
 45 |   author  = "Yu, Guangchuang and Smith, David K and Zhu, Huachen and Guan, Yi
 46 |              and Lam, Tommy Tsan-Yuk",
 47 |   journal = "Methods Ecol. Evol.",
 48 |   volume  =  8,
 49 |   number  =  1,
 50 |   pages   = "28--36",
 51 |   month   =  jan,
 52 |   year    =  2017,
 53 |   url     = "http://dx.doi.org/10.1111/2041-210X.12628",
 54 |   issn    = "2041-210X",
 55 |   doi     = "10.1111/2041-210X.12628"
 56 | }
 57 | 
 58 | 
 59 | @ARTICLE{Revell2012-ik,
 60 |   title     = "phytools: an {R} package for phylogenetic comparative biology
 61 |                (and other things)",
 62 |   author    = "Revell, Liam J",
 63 |   journal   = "Methods Ecol. Evol.",
 64 |   publisher = "Blackwell Publishing Ltd",
 65 |   volume    =  3,
 66 |   number    =  2,
 67 |   pages     = "217--223",
 68 |   month     =  apr,
 69 |   year      =  2012,
 70 |   url       = "http://dx.doi.org/10.1111/j.2041-210X.2011.00169.x",
 71 |   issn      = "2041-210X",
 72 |   doi       = "10.1111/j.2041-210X.2011.00169.x"
 73 | }
 74 | 
 75 | 
 76 | @ARTICLE{Kalyaanamoorthy2017-go,
 77 |   title       = "{ModelFinder}: fast model selection for accurate phylogenetic
 78 |                  estimates",
 79 |   author      = "Kalyaanamoorthy, Subha and Minh, Bui Quang and Wong, Thomas K
 80 |                  F and von Haeseler, Arndt and Jermiin, Lars S",
 81 |   affiliation = "Land \&Water, CSIRO, Canberra, Australian Capital Territory,
 82 |                  Australia. Faculty of Pharmacy \&Pharmaceutical Sciences,
 83 |                  University of Alberta, Edmonton, Alberta, Canada. Center for
 84 |                  Integrative Bioinformatics Vienna, Max F. Perutz Laboratories,
 85 |                  University of Vienna \&Medical University of Vienna, Vienna,
 86 |                  Austria. Research School of Biology, Australian National
 87 |                  University, Canberra, Australian Capital Territory, Australia.
 88 |                  Bioinformatics and Computational Biology, Faculty of
 89 |                  Computational Science, University of Vienna, Vienna, Austria.",
 90 |   journal     = "Nat. Methods",
 91 |   volume      =  14,
 92 |   number      =  6,
 93 |   pages       = "587--589",
 94 |   month       =  jun,
 95 |   year        =  2017,
 96 |   url         = "http://dx.doi.org/10.1038/nmeth.4285",
 97 |   language    = "en",
 98 |   issn        = "1548-7091, 1548-7105",
 99 |   pmid        = "28481363",
100 |   doi         = "10.1038/nmeth.4285",
101 |   pmc         = "PMC5453245"
102 | }
103 | 
104 | 


--------------------------------------------------------------------------------
/vignettes/introduction.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Introduction to rhierbaps"
  3 | author: "Gerry Tonkin-Hill"
  4 | date: "`r Sys.Date()`"
  5 | bibliography: bibliography.bib
  6 | output: 
  7 |   html_document:
  8 |     fig_width: 12
  9 |     fig_height: 8
 10 | vignette: >
 11 |   %\VignetteIndexEntry{Introduction to rhierbaps}
 12 |   %\VignetteEncoding{UTF-8}
 13 |   %\VignetteEngine{knitr::rmarkdown}
 14 | editor_options: 
 15 |   chunk_output_type: console
 16 | ---
 17 | 
 18 | ```{r global_options, include=FALSE}
 19 | knitr::opts_chunk$set(fig.width=12, fig.height=8,
 20 |                       echo=TRUE, warning=FALSE, message=FALSE,
 21 |                       tidy=TRUE)
 22 | ```
 23 | 
 24 | The hierBAPS algorithm was introduced in [@Cheng2013-mp] and provides a method for hierarchically clustering DNA sequence data to reveal nested population structure. Previously the algorithm was available as a compiled MATLAB binary. We provide a convenient R implementation and include a number of useful additional options including the ability to use multiple cores, save the log marginal likelihood scores and run the algorithm until local convergence. Furthermore, we provide a wrapper to a ggtree plotting function allowing for easy exploration of sub-clusters.
 25 | 
 26 | ***
 27 | 
 28 | **Things to keep in mind before running hierBAPS**
 29 | 
 30 | 1. hierBAPS uses a uniform prior for K.
 31 | 2. The prior for a site depend on the available snps, i.e. if a site only has 'AC', then the prior for 'ACGT' is (1/2, 1/2, 0, 0)
 32 | 3. The initial sequence partition is generated using hierarchical clustering with complete linkage based on a Hamming distance matrix.
 33 | 4. The initial number of populations should be set much higher than the expected number of populations.
 34 | 5. More search rounds of the algorithm can be added using the `n.extra.rounds` parameter.
 35 | 6. To get reproducible results the seed in R must be set.
 36 | 
 37 | ## Libraries
 38 | 
 39 | ```{r}
 40 | library(rhierbaps)
 41 | library(ggtree)
 42 | library(phytools)
 43 | library(ape)
 44 | 
 45 | set.seed(1234)
 46 | ```
 47 | 
 48 | ## Loading data
 49 | 
 50 | We first need to load a multiple sequence alignment in fasta format. We can then generate the required SNP matrix.
 51 | 
 52 | ```{r}
 53 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps")
 54 | snp.matrix <- load_fasta(fasta.file.name)
 55 | ```
 56 | 
 57 | If you wish to include singleton SNPs (those that appear in only one isolate) then set `keep.singletons=FALSE`. However, this is currently advised against as these SNPs lead to a higher number of parameters in the model and do not provide information about shared ancestry.
 58 | 
 59 | It is also possible to load an ape DNAbin object. Here me make use of the woodmouse dataset in ape.
 60 | 
 61 | ```{r}
 62 | data(woodmouse)
 63 | woodmouse.snp.matrix <- load_fasta(woodmouse)
 64 | ```
 65 | 
 66 | ## Running hierBAPS
 67 | 
 68 | We now need to decide how many levels of clustering we are interested in and the number of initial clusters to start from. It is a good idea to choose `n.pops` to be significantly larger than the number of clusters you expect.
 69 | 
 70 | To run hierBAPS with $2$ levels and $20$ initial clusters we run
 71 | 
 72 | ```{r}
 73 | hb.results <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet = TRUE)
 74 | head(hb.results$partition.df)
 75 | ```
 76 | 
 77 | This produces a list which includes a data frame indicating the resulting partition of the isolates at the difference levels. The isolate names in this data frame are taken from the fasta headers and thus for plotting it is important that these match the isolate names in any tree used later. This function also outputs the log marginal likelihoods at the different levels of clustering.
 78 | 
 79 | hierBAPS can also be run until the algorithm converges to a local optimum as
 80 | 
 81 | ```{r, eval=FALSE}
 82 | hb.results <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, n.extra.rounds = Inf,
 83 |                        quiet = TRUE)
 84 | ```
 85 | 
 86 | We can also check how long hierBAPS takes to run on the test dataset of 515 samples and 744 SNPs.
 87 | 
 88 | ```{r}
 89 | system.time(hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet = TRUE))
 90 | ```
 91 | 
 92 | ## Plotting results
 93 | 
 94 | To plot the results it is useful to consider a tree of the same isolates. We clustered the example isolates using Iqtree [@Kalyaanamoorthy2017-go]. The ggtree [@Yu2017-bf] package then allows us to plot the results.
 95 | 
 96 | First we need to load the newick file.
 97 | 
 98 | ```{r}
 99 | newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps")
100 | iqtree <- phytools::read.newick(newick.file.name)
101 | ```
102 | 
103 | A simple coloured tree allows us to see the top level cluster assignment from hierBAPS.
104 | 
105 | ```{r}
106 | gg <- ggtree(iqtree, layout="circular")
107 | gg <- gg %<+% hb.results$partition.df
108 | gg <- gg + geom_tippoint(aes(color=factor(`level 1`)))
109 | gg
110 | ```
111 | 
112 | As there are many more clusters at the second level using colours to distinguish them can get confusing. Instead we can label the tips with their corresponding clusters.
113 | 
114 | ```{r}
115 | gg <- ggtree(iqtree, layout="circular", branch.length = "none")
116 | gg <- gg %<+% hb.results$partition.df
117 | gg <- gg + geom_tippoint(aes(color=factor(`level 1`)))
118 | gg <- gg + theme(legend.position="right")
119 | gg <- gg + geom_tiplab(aes(label = `level 2`), size = 1, offset = 1)
120 | gg
121 | ```
122 | 
123 | We can also zoom in on a particular top level cluster to get a better idea of how it is partitioned at the lower level. As an example we zoom in on sub cluster 9 at level 1.
124 | 
125 | ```{r}
126 | plot_sub_cluster(hb.results, iqtree, level = 1, sub.cluster = 9)
127 | ```
128 | 
129 | Finally, we can inspect the log marginal likelihoods given for each level.
130 | 
131 | ```{r}
132 | hb.results$lml.list
133 | ```
134 | 
135 | ## Caculating assignment probabilities
136 | 
137 | We can also calculate the individual probabilities of assignment to each cluster. Here we make use of the woodmouse dataset loaded earlier.
138 | 
139 | ```{r}
140 | hb.results.woodmouse <- hierBAPS(woodmouse.snp.matrix, max.depth=2, n.extra.rounds = Inf,
141 |                                  quiet = TRUE, assignment.probs = TRUE)
142 | head(hb.results.woodmouse$cluster.assignment.prob[[1]])
143 | ```
144 | 
145 | ## Saving results
146 | 
147 | For runs that take a long time it is a good idea to save the output. We can save the partition file as
148 | 
149 | ```{r, eval=FALSE}
150 | write.csv(hb.results$partition.df, file=file.path(tempdir(), "hierbaps_partition.csv"), 
151 |           col.names = TRUE, row.names = FALSE)
152 | 
153 | save_lml_logs(hb.results,  file.path(tempdir(), "hierbaps_logML.txt"))
154 | ```
155 | 
156 | ## Citing rhierbaps
157 | 
158 | If you use rhierbaps in a research publication please cite both
159 | 
160 | Tonkin-Hill, Gerry, John A. Lees, Stephen D. Bentley, Simon D. W. Frost, and Jukka Corander. 2018. “RhierBAPS: An R Implementation of the Population Clustering Algorithm hierBAPS.” Wellcome Open Research 3 (July): 93.
161 | 
162 | Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5): 1224–28.
163 | 
164 | ## References
165 | 
166 | ---
167 | nocite: '@*'
168 | ...
169 | 
170 | ## Session Information
171 | 
172 | ```{r}
173 | sessionInfo()
174 | ```
175 | 
176 | 
177 | 


--------------------------------------------------------------------------------
/R/hierBAPS.R:
--------------------------------------------------------------------------------
  1 | #' hierBAPS
  2 | #'
  3 | #' Runs the hierBAPS algorithm of Cheng et al. 2013
  4 | #'
  5 | #'
  6 | #' @param snp.matrix Character matrix of aligned sequences produced by \link{load_fasta}.
  7 | #' @param max.depth Maximum depth of hierarchical search (default = 2).
  8 | #' @param n.pops Maximum number of populations in the data (default = number of isolates/5)
  9 | #' @param quiet Whether to suppress progress information (default=FALSE).
 10 | #' @param n.extra.rounds The number of additional rounds to perform after the default hierBAPS
 11 | #' settings (default=0). If set to Inf it will run until a local optimum is reached
 12 | #' (this might take a long time).
 13 | #' @param assignment.probs whether or not to calculate the assignment probabilities to each cluster (default=FALSE)
 14 | #' @param n.cores The number of cores to use.
 15 | #'
 16 | #' @return A list containing a dataframe indicating an assignment of each sequence
 17 | #' to hierarchical clusters as well as the log marginal likelihoods for each level.
 18 | #'
 19 | #' @examples
 20 | #' snp.matrix <- load_fasta(system.file("extdata", "small_seqs.fa", package = "rhierbaps"))
 21 | #' hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE)
 22 | #'
 23 | #' \donttest{
 24 | #' snp.matrix <- load_fasta(system.file("extdata", "seqs.fa", package = "rhierbaps"))
 25 | #' system.time({hb <- hierBAPS(snp.matrix, max.depth=2, n.pops=20, quiet=FALSE)})
 26 | #' }
 27 | #'
 28 | #'@author Gerry Tonkin-Hill
 29 | #'@references Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5): 1224–28.
 30 | #'
 31 | #' @export
 32 | hierBAPS <- function(snp.matrix, max.depth=2, n.pops=floor(nrow(snp.matrix)/5),
 33 |                      quiet=FALSE, n.extra.rounds=0, assignment.probs=FALSE, n.cores=1){
 34 | 
 35 |   #Check inputs
 36 |   if (!is.matrix(snp.matrix)) stop("snp.matrix is not a matrix!")
 37 |   if ((!is.numeric(max.depth)) || (max.depth<1)) stop("Invalid max.depth! Must be a positive integer.")
 38 |   if ((!is.numeric(n.pops)) || (n.pops<1)) stop("Invalid n.pops! Must be a positive integer.")
 39 |   if (!is.logical(quiet)) stop("Invalid quiet! Must be one of TRUE/FALSE.")
 40 |   if ((!is.numeric(n.extra.rounds)) || (n.extra.rounds<0)) stop("Invalid n.extra.rounds!
 41 |                                                                 Must be a non-negative integer.")
 42 |   if ((!is.numeric(n.cores)) || (n.cores<1)) stop("Invalid n.cores! Must be a positive integer.")
 43 | 
 44 |   # search operators
 45 |   round.types <- c(2*rep(1, n.pops),
 46 |                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 47 |                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 48 |                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 49 |                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4,
 50 |                  3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
 51 |                  4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
 52 |                  3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
 53 |                  4, 3, 4, 3, 4, 3, 4, 3, 4, 1, 1, 1, 1, 1, 2, 3, 4, 1, 2, 3, 4,
 54 |                  1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
 55 |                  2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
 56 |                  3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
 57 |                  4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
 58 |                  1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
 59 |                  2, 3, 4, 1, 2, 3, 4)
 60 |   
 61 |   #Don't split clusters with the less than 4 memebers
 62 |   MIN.CLUSTER.SIZE <- 4
 63 | 
 64 |   #iterate over levels
 65 |   all.partition.matrix <- matrix(0, nrow=nrow(snp.matrix), ncol=max.depth)
 66 |   lml.list <- list()
 67 | 
 68 |   for (cur.depth in seq(0, max.depth-1)){
 69 | 
 70 |     if (cur.depth==0){
 71 |       snp.object <- preproc_alignment(snp.matrix)
 72 |       if(is.na(snp.object[[1]])){
 73 |         stop("All sites are conserved!")
 74 |       }
 75 |       cur.part <- rep(1, snp.object$n.seq)
 76 |       avail.cluster.ids <- 1
 77 |       snp.object$heds <- rownames(snp.matrix)
 78 |     } else{
 79 |       cur.part <- all.partition.matrix[snp.object$seq.inds, cur.depth]
 80 |       avail.cluster.ids <- unique(cur.part)
 81 |     }
 82 | 
 83 |     if(!quiet) print(paste(c("---- Current depth: ", cur.depth, " ----"), collapse = ""))
 84 | 
 85 |     if(length(cur.part)==0){
 86 |       if(!quiet) print(paste(c("all sequences are clustered. Quit! There are ", cur.depth,
 87 |                   " layers in total."), collapse = ""))
 88 |       break
 89 |     }
 90 | 
 91 |     local.label.offset <- 1
 92 | 
 93 |     for (i in 1:length(avail.cluster.ids)){
 94 |       cluid <- avail.cluster.ids[i]
 95 |       if(!quiet) print(paste(c("Current depth: ", cur.depth, " Cluster ID: ", cluid), collapse = ""))
 96 | 
 97 |       if (sum(cur.part==cluid) < MIN.CLUSTER.SIZE){
 98 |         #we dont split any further.
 99 |         all.partition.matrix[cur.part==cluid, cur.depth+1] <- local.label.offset
100 |         local.label.offset <- local.label.offset+1
101 |         if(length(lml.list)<(cur.depth+1)){
102 |           lml.list[[cur.depth+1]] <- NA
103 |         } else {
104 |           lml.list[[cur.depth+1]] <- c(lml.list[[cur.depth+1]], NA)
105 |         }
106 |         next
107 |       }
108 | 
109 |       if (cur.depth==0){
110 |         tmp.snp.object <- snp.object
111 |       } else {
112 |         tmp.snp.object <- preproc_alignment(snp.object$data[cur.part==cluid, ])
113 |         if(is.na(tmp.snp.object[[1]])){
114 |           #all sites are conserved at this partition so dont split further
115 |           all.partition.matrix[cur.part==cluid, cur.depth+1] <- local.label.offset
116 |           local.label.offset <- local.label.offset+1
117 |           if(length(lml.list)<(cur.depth+1)){
118 |             lml.list[[cur.depth+1]] <- NA
119 |           } else {
120 |             lml.list[[cur.depth+1]] <- c(lml.list[[cur.depth+1]], NA)
121 |           }
122 |           next
123 |         }
124 |       }
125 |       
126 |       
127 | 
128 |       tmp.z.hclust <- stats::hclust(stats::as.dist(tmp.snp.object$dist), method = 'complete')
129 | 
130 |       if (tmp.snp.object$n.seq > (3*n.pops)){
131 |         tmp.init.part <- stats::cutree(tmp.z.hclust, k = n.pops)
132 |       } else {
133 |         tmp.num = min(floor(tmp.snp.object$n.seq/2), n.pops)
134 |         tmp.init.part <- stats::cutree(tmp.z.hclust, k = tmp.num)
135 |       }
136 | 
137 |       temp_partition = model_search_parallel(tmp.snp.object, tmp.init.part, round.types,
138 |                                              quiet, n.extra.rounds, n.cores)
139 | 
140 |       if (!quiet){
141 |         print(paste(c("Best partition: Nclusters ", length(unique(temp_partition$partition)),
142 |                       " Log(ml*prior) ", temp_partition$lml), collapse = ""))
143 |       }
144 | 
145 |       if(length(lml.list)<(cur.depth+1)){
146 |         lml.list[[cur.depth+1]] <- temp_partition$lml
147 |       } else {
148 |         lml.list[[cur.depth+1]] <- c(lml.list[[cur.depth+1]], temp_partition$lml)
149 |       }
150 | 
151 | 
152 |       for (clust in unique(temp_partition$partition)){
153 |         cluster_index <- rep(FALSE, length(cur.part))
154 |         cluster_index[cur.part==cluid][clust==temp_partition$partition] <- TRUE
155 |         all.partition.matrix[cluster_index, cur.depth+1] <- local.label.offset
156 |         local.label.offset <- local.label.offset+1
157 |       }
158 |     }
159 | 
160 |     names(lml.list[[cur.depth+1]]) <- avail.cluster.ids
161 | 
162 |   }
163 |   names(lml.list) <- paste("Depth", seq(0, max.depth-1))
164 |   partition.df <- data.frame(isolates=snp.object$heds, all.partition.matrix)
165 |   colnames(partition.df) <- c("Isolate", paste("level", 1:ncol(all.partition.matrix)))
166 |   
167 |   if(assignment.probs){
168 |     #calculate assignment probabilities
169 |     colnames(snp.object$data) <- 1:ncol(snp.object$data)
170 |     consensus <- apply(snp.object$data, 2, function(x) table(factor(x, levels = c("a","c","g","t")),
171 |                                                              exclude = c("-", NA)))
172 |     consensus <- rownames(consensus)[apply(consensus,2,which.max)]
173 |     cluster.ass.prob.list <- lapply(1:max.depth, function(l){
174 |       partitions <- split(1:nrow(snp.matrix), partition.df[,l+1])
175 |       cluster.ass.probs <- do.call(cbind, lapply(partitions, function(p){
176 |         cluster.allele.freqs <- apply(snp.object$data[p,,drop=FALSE], 2, function(x) table(factor(x, levels = c("a","c","g","t")),
177 |                                                                                            exclude = c("-", NA)))
178 |         cluster.allele.freqs <- log((cluster.allele.freqs + snp.object$prior)/(length(p)+1))
179 |         apply(snp.object$data, 1, function(seq){
180 |           seq[seq=="-"] <- consensus[seq=="-"]
181 |           return(sum(cluster.allele.freqs[cbind(seq, 1:ncol(cluster.allele.freqs))]))
182 |         })
183 |       }))
184 |       cluster.ass.probs <- cluster.ass.probs-apply(cluster.ass.probs, 1, matrixStats::logSumExp)
185 |       cluster.ass.probs <- exp(cluster.ass.probs)
186 |       cluster.ass.probs[cluster.ass.probs<1e-16] <- 0
187 |       rownames(cluster.ass.probs) <- snp.object$heds
188 |       colnames(cluster.ass.probs) <- paste("Cluster", 1:ncol(cluster.ass.probs))
189 |       return(cluster.ass.probs)
190 |     })
191 |     
192 |     return(list(partition.df=partition.df,
193 |                 cluster.assignment.prob=cluster.ass.prob.list,
194 |                 lml.list=lml.list))
195 |   }
196 |   
197 |   return(list(partition.df=partition.df,
198 |               lml.list=lml.list))
199 | }
200 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | <!-- badges: start -->
  4 | 
  5 | [![R-CMD-check](https://github.com/gtonkinhill/rhierbaps/workflows/R-CMD-check/badge.svg)](https://github.com/gtonkinhill/rhierbaps/actions)
  6 | <!-- badges: end -->
  7 | 
  8 | # rhierbaps
  9 | 
 10 | We have recently developed a faster verion of the BAPs clustering
 11 | method. It can be found [here](https://github.com/gtonkinhill/fastbaps).
 12 | 
 13 | ## Installation
 14 | 
 15 | `rhierbaps` is available on CRAN.
 16 | 
 17 | ``` r
 18 | install.packages("rhierbaps")
 19 | ```
 20 | 
 21 | The development version is available on github. It can be installed with
 22 | `devtools`
 23 | 
 24 | ``` r
 25 | install.packages("devtools")
 26 | 
 27 | devtools::install_github("gtonkinhill/rhierbaps")
 28 | ```
 29 | 
 30 | If you would like to also build the vignette with your installation run:
 31 | 
 32 | ``` r
 33 | devtools::install_github("gtonkinhill/rhierbaps", build_vignettes = TRUE)
 34 | ```
 35 | 
 36 | ## Quick Start
 37 | 
 38 | Run hierBAPS.
 39 | 
 40 | ``` r
 41 | # install.packages('rhierbaps')
 42 | library(rhierbaps)
 43 | 
 44 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps")
 45 | snp.matrix <- load_fasta(fasta.file.name)
 46 | hb.results <- hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, quiet = TRUE)
 47 | head(hb.results$partition.df)
 48 | #>   Isolate level 1 level 2
 49 | #> 1       1       1       1
 50 | #> 2       2       1       1
 51 | #> 3       3       1       1
 52 | #> 4       4       2       5
 53 | #> 5       5       3       9
 54 | #> 6       6       3       9
 55 | ```
 56 | 
 57 | The hierBAPS algorithm was introduced in (Cheng et al. 2013) and
 58 | provides a method for hierarchically clustering DNA sequence data to
 59 | reveal nested population structure. Previously the algorithm was
 60 | available as a compiled MATLAB binary. We provide a convenient R
 61 | implementation and include a number of useful additional options
 62 | including the ability to use multiple cores, save the log marginal
 63 | likelihood scores and run the algorithm until local convergence.
 64 | Furthermore, we provide a wrapper to a ggtree plotting function allowing
 65 | for easy exploration of sub-clusters.
 66 | 
 67 | ------------------------------------------------------------------------
 68 | 
 69 | **Things to keep in mind before running hierBAPS**
 70 | 
 71 | 1.  hierBAPS uses a uniform prior for K.
 72 | 2.  The prior for a site depend on the available snps, i.e. if a site
 73 |     only has ‘AC,’ then the prior for ‘ACGT’ is (1/2, 1/2, 0, 0)
 74 | 3.  The initial sequence partition is generated using hierarchical
 75 |     clustering with complete linkage based on a Hamming distance matrix.
 76 | 4.  The initial number of populations should be set much higher than the
 77 |     expected number of populations.
 78 | 5.  More search rounds of the algorithm can be added using the
 79 |     `n.extra.rounds` parameter.
 80 | 6.  To get reproducible results the seed in R must be set.
 81 | 
 82 | ## Libraries
 83 | 
 84 | ``` r
 85 | library(rhierbaps)
 86 | library(ggtree)
 87 | library(phytools)
 88 | library(ape)
 89 | 
 90 | set.seed(1234)
 91 | ```
 92 | 
 93 | ## Loading data
 94 | 
 95 | We first need to load a multiple sequence alignment in fasta format. We
 96 | can then generate the required SNP matrix.
 97 | 
 98 | ``` r
 99 | fasta.file.name <- system.file("extdata", "seqs.fa", package = "rhierbaps")
100 | snp.matrix <- load_fasta(fasta.file.name)
101 | ```
102 | 
103 | If you wish to include singleton SNPs (those that appear in only one
104 | isolate) then set `keep.singletons=FALSE`. However, this is currently
105 | advised against as these SNPs lead to a higher number of parameters in
106 | the model and do not provide information about shared ancestry.
107 | 
108 | It is also possible to load an ape DNAbin object. Here me make use of
109 | the woodmouse dataset in ape.
110 | 
111 | ``` r
112 | data(woodmouse)
113 | woodmouse.snp.matrix <- load_fasta(woodmouse)
114 | ```
115 | 
116 | ## Running hierBAPS
117 | 
118 | We now need to decide how many levels of clustering we are interested in
119 | and the number of initial clusters to start from. It is a good idea to
120 | choose `n.pops` to be significantly larger than the number of clusters
121 | you expect.
122 | 
123 | To run hierBAPS with 2 levels and 20 initial clusters we run
124 | 
125 | ``` r
126 | hb.results <- hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, quiet = TRUE)
127 | head(hb.results$partition.df)
128 | #>   Isolate level 1 level 2
129 | #> 1       1       1       1
130 | #> 2       2       1       1
131 | #> 3       3       1       1
132 | #> 4       4       2       5
133 | #> 5       5       3       9
134 | #> 6       6       3       9
135 | ```
136 | 
137 | This produces a list which includes a data frame indicating the
138 | resulting partition of the isolates at the difference levels. The
139 | isolate names in this data frame are taken from the fasta headers and
140 | thus for plotting it is important that these match the isolate names in
141 | any tree used later. This function also outputs the log marginal
142 | likelihoods at the different levels of clustering.
143 | 
144 | hierBAPS can also be run until the algorithm converges to a local
145 | optimum as
146 | 
147 | ``` r
148 | hb.results <- hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, n.extra.rounds = Inf, 
149 |     quiet = TRUE)
150 | ```
151 | 
152 | We can also check how long hierBAPS takes to run on the test dataset of
153 | 515 samples and 744 SNPs.
154 | 
155 | ``` r
156 | system.time(hierBAPS(snp.matrix, max.depth = 2, n.pops = 20, quiet = TRUE))
157 | #>    user  system elapsed 
158 | #>  84.184  17.152 102.219
159 | ```
160 | 
161 | ## Plotting results
162 | 
163 | To plot the results it is useful to consider a tree of the same
164 | isolates. We clustered the example isolates using Iqtree
165 | (Kalyaanamoorthy et al. 2017). The ggtree (Yu et al. 2017) package then
166 | allows us to plot the results.
167 | 
168 | First we need to load the newick file.
169 | 
170 | ``` r
171 | newick.file.name <- system.file("extdata", "seqs.fa.treefile", package = "rhierbaps")
172 | iqtree <- phytools::read.newick(newick.file.name)
173 | ```
174 | 
175 | A simple coloured tree allows us to see the top level cluster assignment
176 | from hierBAPS.
177 | 
178 | ``` r
179 | gg <- ggtree(iqtree, layout = "circular")
180 | gg <- gg %<+% hb.results$partition.df
181 | gg <- gg + geom_tippoint(aes(color = factor(`level 1`)))
182 | gg
183 | ```
184 | 
185 | ![](inst/vignette-supp/unnamed-chunk-15-1.png)<!-- -->
186 | 
187 | As there are many more clusters at the second level using colours to
188 | distinguish them can get confusing. Instead we can label the tips with
189 | their corresponding clusters.
190 | 
191 | ``` r
192 | gg <- ggtree(iqtree, layout = "circular", branch.length = "none")
193 | gg <- gg %<+% hb.results$partition.df
194 | gg <- gg + geom_tippoint(aes(color = factor(`level 1`)))
195 | gg <- gg + theme(legend.position = "right")
196 | gg <- gg + geom_tiplab(aes(label = `level 2`), size = 1, offset = 1)
197 | gg
198 | ```
199 | 
200 | ![](inst/vignette-supp/unnamed-chunk-16-1.png)<!-- -->
201 | 
202 | We can also zoom in on a particular top level cluster to get a better
203 | idea of how it is partitioned at the lower level. As an example we zoom
204 | in on sub cluster 9 at level 1.
205 | 
206 | ``` r
207 | plot_sub_cluster(hb.results, iqtree, level = 1, sub.cluster = 9)
208 | ```
209 | 
210 | ![](inst/vignette-supp/unnamed-chunk-17-1.png)<!-- -->
211 | 
212 | Finally, we can inspect the log marginal likelihoods given for each
213 | level.
214 | 
215 | ``` r
216 | hb.results$lml.list
217 | #> $`Depth 0`
218 | #>         1 
219 | #> -50858.92 
220 | #> 
221 | #> $`Depth 1`
222 | #>          1          2          3          4          5          6          7 
223 | #> -2121.8599 -4012.3594 -4237.7639 -3095.1865 -1525.7356 -3180.7572 -4015.5020 
224 | #>          8          9         10         11         12         13 
225 | #> -2104.5277 -1736.0192  -780.0635  -810.7793  -688.5214  -163.3198
226 | ```
227 | 
228 | ## Caculating assignment probabilities
229 | 
230 | We can also calculate the individual probabilities of assignment to each
231 | cluster. Here we make use of the woodmouse dataset loaded earlier.
232 | 
233 | ``` r
234 | hb.results.woodmouse <- hierBAPS(woodmouse.snp.matrix, max.depth = 2, n.extra.rounds = Inf, 
235 |     quiet = TRUE, assignment.probs = TRUE)
236 | head(hb.results.woodmouse$cluster.assignment.prob[[1]])
237 | #>            Cluster 1    Cluster 2    Cluster 3
238 | #> No305   9.997868e-01 2.104112e-04 2.805482e-06
239 | #> No304   5.620947e-06 9.999944e-01 1.699254e-11
240 | #> No306   8.996214e-03 9.910038e-01 9.626735e-09
241 | #> No0906S 9.965743e-01 3.425673e-03 1.902359e-08
242 | #> No0908S 9.911304e-01 8.869359e-03 2.068655e-07
243 | #> No0909S 2.615477e-09 1.105831e-10 1.000000e+00
244 | ```
245 | 
246 | ## Saving results
247 | 
248 | For runs that take a long time it is a good idea to save the output. We
249 | can save the partition file as
250 | 
251 | ``` r
252 | write.csv(hb.results$partition.df, file = file.path(tempdir(), "hierbaps_partition.csv"), 
253 |     col.names = TRUE, row.names = FALSE)
254 | 
255 | save_lml_logs(hb.results, file.path(tempdir(), "hierbaps_logML.txt"))
256 | ```
257 | 
258 | ## Citing rhierbaps
259 | 
260 | If you use rhierbaps in a research publication please cite both
261 | 
262 | Tonkin-Hill, Gerry, John A. Lees, Stephen D. Bentley, Simon D. W. Frost,
263 | and Jukka Corander. 2018. “RhierBAPS: An R Implementation of the
264 | Population Clustering Algorithm hierBAPS.” Wellcome Open Research 3
265 | (July): 93.
266 | 
267 | Cheng, Lu, Thomas R. Connor, Jukka Sirén, David M. Aanensen, and Jukka
268 | Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA
269 | Sequences with BAPS Software.” Molecular Biology and Evolution 30 (5):
270 | 1224–28.
271 | 
272 | ## References
273 | 
274 | <div id="refs" class="references csl-bib-body hanging-indent">
275 | 
276 | <div id="ref-Cheng2013-mp" class="csl-entry">
277 | 
278 | Cheng, Lu, Thomas R Connor, Jukka Sirén, David M Aanensen, and Jukka
279 | Corander. 2013. “Hierarchical and Spatially Explicit Clustering of DNA
280 | Sequences with BAPS Software.” *Mol. Biol. Evol.* 30 (5): 1224–28.
281 | <https://doi.org/10.1093/molbev/mst028>.
282 | 
283 | </div>
284 | 
285 | <div id="ref-Kalyaanamoorthy2017-go" class="csl-entry">
286 | 
287 | Kalyaanamoorthy, Subha, Bui Quang Minh, Thomas K F Wong, Arndt von
288 | Haeseler, and Lars S Jermiin. 2017. “ModelFinder: Fast Model Selection
289 | for Accurate Phylogenetic Estimates.” *Nat. Methods* 14 (6): 587–89.
290 | <https://doi.org/10.1038/nmeth.4285>.
291 | 
292 | </div>
293 | 
294 | <div id="ref-Paradis2004-ck" class="csl-entry">
295 | 
296 | Paradis, Emmanuel, Julien Claude, and Korbinian Strimmer. 2004. “APE:
297 | Analyses of Phylogenetics and Evolution in R Language.” *Bioinformatics*
298 | 20 (2): 289–90. <https://doi.org/10.1093/bioinformatics/btg412>.
299 | 
300 | </div>
301 | 
302 | <div id="ref-Revell2012-ik" class="csl-entry">
303 | 
304 | Revell, Liam J. 2012. “Phytools: An R Package for Phylogenetic
305 | Comparative Biology (and Other Things).” *Methods Ecol. Evol.* 3 (2):
306 | 217–23. <https://doi.org/10.1111/j.2041-210X.2011.00169.x>.
307 | 
308 | </div>
309 | 
310 | <div id="ref-Yu2017-bf" class="csl-entry">
311 | 
312 | Yu, Guangchuang, David K Smith, Huachen Zhu, Yi Guan, and Tommy Tsan-Yuk
313 | Lam. 2017. “Ggtree: An r Package for Visualization and Annotation of
314 | Phylogenetic Trees with Their Covariates and Other Associated Data.”
315 | *Methods Ecol. Evol.* 8 (1): 28–36.
316 | <https://doi.org/10.1111/2041-210X.12628>.
317 | 
318 | </div>
319 | 
320 | </div>
321 | 


--------------------------------------------------------------------------------
/inst/extdata/seqs.fa.treefile:
--------------------------------------------------------------------------------
1 | (1:0.0000003922,((2:0.0007035735,(((((((((((((((((((((((((((((((((((((4:0.0000003922,179:0.0003535671):0.000354,464:0.0003539488):0.000000,(100:0.0000003922,450:0.0003536100):0.000709):0.000000,142:0.0000003922):0.000354,(((((24:0.0003529467,496:0.0003550324):0.000362,184:0.0003548929):0.000710,220:0.0000003922):0.000000,155:0.0003547216):0.000710,((((73:0.0000003922,94:0.0003539096):0.000000,99:0.0032322991):0.000357,224:0.0003523452):0.000711,((177:0.0000003922,459:0.0003534055):0.000000,386:0.0065951662):0.000709):0.000361):0.000000):0.000000,(176:0.0000003922,395:0.0007089763):0.000354):0.000354,22:0.0007095167):0.000000,392:0.0010662049):0.000000,427:0.0007094620):0.000721,(((14:0.0025555280,255:0.0032846947):0.000679,(19:0.0007091635,((((((((29:0.0011246004,503:0.0032581496):0.002499,221:0.0010906324):0.000001,((((34:0.0000003922,(((64:0.0010707535,(174:0.0003525606,(((226:0.0003782272,422:0.0010400592):0.000708,244:0.0003770596):0.001397,(360:0.0014285156,361:0.0018050544):0.000354):0.000353):0.000717):0.000347,(381:0.0000003922,493:0.0007078393):0.001064):0.000000,239:0.0007072111):0.000708):0.000000,89:0.0007073133):0.003624,101:0.0003521745):0.000000,96:0.0000003922):0.001413):0.003272,95:0.0010707559):0.001427,512:0.0003624883):0.000353,372:0.0003561372):0.000709,50:0.0003554568):0.000706,511:0.0000003922):0.000709):0.000001):0.000352,(149:0.0007089470,(153:0.0003509940,363:0.0007129547):0.000356):0.000000):0.000000):0.002924,(93:0.0007081535,370:0.0014309426):0.001816):0.001434,17:0.0025121258):0.000000,(((((46:0.0007141307,(((53:0.0010688883,(143:0.0000003922,446:0.0003531549):0.000709):0.000354,472:0.0036145825):0.000000,379:0.0007086590):0.000353):0.000358,86:0.0007113670):0.001976,52:0.0031424499):0.000610,48:0.0006367453):0.000863,154:0.0009981955):0.000430):0.001961,(((((((((((((8:0.0000003922,((25:0.0003539114,((195:0.0000003922,359:0.0062335276):0.000354,(199:0.0003556274,(((297:0.0137869584,407:0.0000003922):0.000001,376:0.0032301154):0.003661,344:0.0021602983):0.000701):0.000709):0.000000):0.000000,398:0.0025191055):0.000000):0.000000,317:0.0003538645):0.000000,420:0.0069804024):0.000000,15:0.0007090655):0.000353,399:0.0010710014):0.001076,501:0.0003544501):0.000354,((98:0.0000003922,162:0.0003536528):0.000708,(157:0.0043728611,159:0.0007091857):0.000361):0.000000):0.000353,364:0.0007099404):0.000000,506:0.0017822738):0.000711,((13:0.0010658510,(((((((((((18:0.0000003922,((54:0.0003543339,((138:0.0000003922,241:0.0003540590):0.000354,458:0.0010677907):0.000000):0.000000,225:0.0007102141):0.000000):0.000350,((39:0.0007121253,40:0.0003550740):0.000358,(385:0.0007157663,465:0.0003503058):0.000355):0.000000):0.001465,(84:0.0003539348,166:0.0000003922):0.001838):0.000337,(139:0.0010686173,156:0.0007300277):0.001074):0.000340,373:0.0043836922):0.001087,((80:0.0101764131,148:0.0047715177):0.009532,(124:0.0007113996,253:0.0000003922):0.004854):0.008877):0.000000,(33:0.0003440857,(((59:0.0010753338,193:0.0007064301):0.001426,68:0.0010664840):0.000000,175:0.0007099116):0.001438):0.001444):0.000717,(66:0.0010687895,212:0.0007119085):0.000717):0.000714,137:0.0010695360):0.000354,(((44:0.0000003922,232:0.0014211805):0.000353,238:0.0010641979):0.000362,(466:0.0003588210,473:0.0000003922):0.000708):0.000000):0.000354,201:0.0017824052):0.000000):0.000354,((85:0.0000003922,(118:0.0000003922,(168:0.0003561767,263:0.0010664386):0.000710):0.000354):0.000354,119:0.0003547149):0.000355):0.000000):0.000352,251:0.0010742405):0.000711,136:0.0003578704):0.000871,51:0.0025998047):0.003820):0.002858,(20:0.0005672263,71:0.0015775113):0.001991):0.000833,((((((((((((((9:0.0000003922,189:0.0010658690):0.004700,(((((16:0.0003535928,197:0.0000003922):0.000000,265:0.0003536964):0.000354,23:0.0000003922):0.000000,346:0.0062176476):0.000000,203:0.0021493938):0.001068):0.000385,257:0.0000003922):0.000353,((56:0.0000003922,61:0.0003536736):0.000353,((230:0.0049678814,264:0.0054550950):0.005289,324:0.0000003922):0.001780):0.000000):0.000000,12:0.0000003922):0.008297,55:0.0038298064):0.001777,((49:0.0007206788,((65:0.0020411338,198:0.0023537088):0.004383,(192:0.0003577465,202:0.0003542944):0.001568):0.000639):0.001435,67:0.0007167978):0.000473):0.001164,69:0.0008702073):0.002225,70:0.0015668742):0.000001,(213:0.0030015320,240:0.0055881828):0.003858):0.005166,437:0.0044411917):0.001363,435:0.0011025525):0.000386,(((((((10:0.0000003922,256:0.0003556787):0.001800,(((43:0.0007185045,(((146:0.0000003922,210:0.0007131044):0.000354,161:0.0014348301):0.000358,243:0.0000003922):0.000355):0.000358,500:0.0007137950):0.000728,150:0.0014491195):0.000338):0.000727,((((242:0.0003523982,397:0.0003566649):0.000710,507:0.0003533463):0.000000,502:0.0000003922):0.000721,485:0.0017953811):0.000708):0.000000,(191:0.0000003922,(204:0.0003553595,236:0.0010771425):0.000000):0.000712):0.000000,58:0.0003551845):0.000355,63:0.0014359595):0.000001,(117:0.0003156831,206:0.0040539968):0.002564):0.000667):0.000001,(((97:0.0000003922,229:0.0007106387):0.000000,479:0.0062127532):0.002474,(468:0.0003545272,509:0.0000003922):0.004116):0.004147):0.003698):0.001259,170:0.0046551960):0.005336,(194:0.0054161588,400:0.0031284103):0.010861):0.003903,390:0.0137043954):0.003887,439:0.0048070035):0.008872,(((28:0.0031275198,188:0.0019492323):0.006273,79:0.0030622297):0.003104,440:0.0097959906):0.004713):0.008037,((((((((((72:0.0003555136,(111:0.0000003922,((((218:0.0003555378,223:0.0000003922):0.000000,(470:0.0007133693,504:0.0000003922):0.000356):0.000000,467:0.0003556020):0.000000,388:0.0003557119):0.000356):0.000000):0.000000,(((((((158:0.0000003922,208:0.0003551764):0.000355,209:0.0014317715):0.000357,227:0.0000003922):0.001070,(((183:0.0039827828,471:0.0000003922):0.000000,495:0.0003562274):0.000272,478:0.0033386852):0.002975):0.000000,(233:0.0014377683,(487:0.0010761675,510:0.0003574562):0.000713):0.000355):0.000355,(((228:0.0010716653,281:0.0003544590):0.000357,(231:0.0007127089,505:0.0000003922):0.000000):0.000355,258:0.0014310076):0.000000):0.000000,(278:0.0003628785,342:0.0029004655):0.000350):0.000355):0.000000,268:0.0003554775):0.000000,249:0.0014340364):0.003167,((307:0.0047266113,345:0.0000003922):0.001431,319:0.0000003922):0.001171):0.003931,490:0.0080616410):0.001937,(((291:0.0007138777,306:0.0000003922):0.001073,301:0.0000003922):0.002298,((299:0.0000003922,347:0.0007160197):0.000848,(335:0.0089801523,338:0.0000003922):0.000588):0.003841):0.004057):0.004520,293:0.0010258793):0.001438,330:0.0055949249):0.004097,315:0.0000003922):0.025384):0.004622,30:0.0148853615):0.006402,(((((((((((21:0.0040979179,403:0.0032749688):0.005810,322:0.0048392161):0.007156,(((((((((35:0.0014337559,(((((((41:0.0000003922,425:0.0003563539):0.000715,(327:0.0014335668,421:0.0003562179):0.000000):0.001441,(410:0.0000003922,431:0.0003562976):0.000727):0.001797,(341:0.0000003922,456:0.0010707248):0.000000):0.000356,(((305:0.0000003922,426:0.0003560967):0.000355,406:0.0003579949):0.000717,(325:0.0000003922,(414:0.0000003922,415:0.0003559011):0.000356):0.000000):0.000000):0.000000,(302:0.0000003922,311:0.0003561402):0.000715):0.000321,457:0.0095624942):0.001835):0.000000,187:0.0000003922):0.000000,116:0.0014374313):0.000723,88:0.0010786580):0.000351,222:0.0018179920):0.001094,321:0.0007255894):0.001477,((((((((114:0.0009669077,((269:0.0000003922,(272:0.0000003922,(328:0.0000003922,433:0.0003549640):0.000356):0.000355):0.001685,(313:0.0032189765,424:0.0037976678):0.002832):0.001320):0.000459,(316:0.0003540297,334:0.0003560015):0.000716):0.000000,(215:0.0010727560,404:0.0007154284):0.001441):0.000354,298:0.0007121893):0.000000,436:0.0007086915):0.000715,(190:0.0000003922,((196:0.0000003922,(((401:0.0025222145,402:0.0000003922):0.000765,447:0.0010218913):0.005188,412:0.0121389872):0.000000):0.000000,455:0.0010682576):0.000354):0.000000):0.002149,((405:0.0014354208,((408:0.0007116608,413:0.0000003922):0.000355,411:0.0000003922):0.000000):0.000355,432:0.0000003922):0.001070):0.000000,(235:0.0005408902,248:0.0080571026):0.003448):0.000307):0.001423,(((254:0.0000003922,326:0.0014302488):0.001150,423:0.0035685320):0.001595,318:0.0088782434):0.001988):0.000792,350:0.0020978916):0.001152):0.000000,286:0.0003488210):0.000720,448:0.0000003922):0.000406,(296:0.0000003922,(393:0.0003557367,416:0.0000003922):0.001438):0.002117):0.005612,36:0.0088682001):0.005346,((((((((((((37:0.0016960315,((((115:0.0000007616,454:0.0036722083):0.004644,442:0.0077154247):0.002401,(200:0.0000008194,276:0.0021652525):0.000407):0.001999,329:0.0024396608):0.002516):0.005905,449:0.0018097567):0.008172,337:0.0000003922):0.001240,((((((277:0.0003562376,280:0.0000003922):0.004443,(300:0.0007056579,(303:0.0003833841,(304:0.0003817471,358:0.0003309443):0.000716):0.003645):0.002252):0.001090,(284:0.0003547891,357:0.0000003922):0.000316):0.003313,279:0.0000003922):0.000000,287:0.0003553810):0.001536,283:0.0036843772):0.000653):0.001688,271:0.0029171109):0.010508,((((323:0.0007117425,354:0.0003551338):0.000000,355:0.0014351687):0.000000,336:0.0000003922):0.000355,356:0.0000003922):0.000362):0.000349,(273:0.0000003922,333:0.0003542776):0.001068):0.000358,282:0.0000003922):0.000711,285:0.0003573814):0.005890,(42:0.0012569486,320:0.0125937335):0.006447):0.002720,288:0.0083601291):0.001659,(((270:0.0014301558,(((((((289:0.0039816990,428:0.0000003922):0.001443,417:0.0014340222):0.001809,(418:0.0000003922,429:0.0003535193):0.000353):0.000298,((((332:0.0000003922,349:0.0007124447):0.000356,(352:0.0003559396,453:0.0003564005):0.000355):0.000718,348:0.0003575134):0.000348,353:0.0029133473):0.001136):0.007450,292:0.0000003922):0.000710,(((314:0.0000003922,409:0.0007107416):0.000350,(339:0.0000003922,340:0.0059051010):0.007000):0.000000,(419:0.0007057688,434:0.0003588989):0.000355):0.000359):0.000000,331:0.0003543693):0.000710):0.000357,274:0.0000003922):0.000355,430:0.0003546822):0.004194):0.010859):0.018064,((((((83:0.0020910351,(260:0.0003602016,261:0.0000003922):0.002691):0.001368,87:0.0179077213):0.000976,259:0.0006714356):0.015538,(488:0.0004118642,497:0.0003078671):0.007439):0.004085,438:0.0087469973):0.027489,445:0.0034077469):0.098877):0.008066,(((((((480:0.0000003922,(483:0.0000003922,(484:0.0003570504,491:0.0007140612):0.000356):0.000356):0.000000,499:0.0014361772):0.000356,481:0.0007134974):0.000000,492:0.0000003922):0.000000,489:0.0007138816):0.000884,515:0.0204426203):0.018870,((482:0.0003553582,(494:0.0000003922,508:0.0010706387):0.000000):0.001464,486:0.0044704565):0.023417):0.013776):0.008326,((74:0.0102453981,(151:0.0071566798,476:0.0036438395):0.004565):0.006955,514:0.0173481691):0.009175):0.016346):0.000952,(((((((82:0.0095358550,126:0.0001695106):0.000693,309:0.0031149149):0.002205,(((((131:0.0135107407,343:0.0068122580):0.001366,(217:0.0003108555,245:0.0007543639):0.006445):0.001462,(((160:0.0007518170,451:0.0048446200):0.003949,252:0.0033675686):0.003263,262:0.0017525040):0.001117):0.001303,295:0.0029159083):0.000766,((((178:0.0013179481,216:0.0004774830):0.000727,(290:0.0094782992,(312:0.0000003922,375:0.0003542309):0.000714):0.002799):0.002693,275:0.0021738254):0.001072,250:0.0010855613):0.001407):0.001572):0.003133,((((186:0.0057115408,(310:0.0000003922,351:0.0007146345):0.006215):0.007276,387:0.0028268292):0.005968,308:0.0079786128):0.006882,207:0.0059950918):0.003323):0.000361,(125:0.0053683351,(152:0.0003541843,294:0.0000003922):0.007442):0.002287):0.004504,((((128:0.0085515995,133:0.0044046083):0.004326,513:0.0120825469):0.001298,377:0.0053292902):0.002420,469:0.0107939202):0.002079):0.003756,((102:0.0051532810,378:0.0029255086):0.002567,247:0.0125389116):0.005180):0.010196):0.007513,((((((((5:0.0014192083,((7:0.0025262258,32:0.0003474027):0.001427,90:0.0010624589):0.000000):0.001064,120:0.0003558736):0.000352,172:0.0007105200):0.000355,92:0.0000003922):0.001423,127:0.0007204773):0.003853,452:0.0111906573):0.001938,(180:0.0078895223,211:0.0057259314):0.000955):0.001383,6:0.0053836891):0.002219):0.006962,234:0.0021829859):0.001180,(169:0.0032923008,(219:0.0007096769,394:0.0007095456):0.000001):0.001817):0.006909,(((38:0.0000003922,103:0.0003527159):0.001622,475:0.0034399883):0.000866,((((78:0.0010664316,132:0.0014386796):0.000706,((104:0.0003527199,463:0.0000003922):0.000354,477:0.0017892280):0.000348):0.001062,((105:0.0014202094,391:0.0007096957):0.001067,(362:0.0010681810,396:0.0007093988):0.000347):0.000362):0.000001,((123:0.0007054523,441:0.0106917594):0.000000,369:0.0003519569):0.001066):0.002525):0.003121):0.004496,383:0.0019474631):0.006980,(((((26:0.0000003922,165:0.0003529737):0.003635,31:0.0010716812):0.003639,(((47:0.0003519547,374:0.0000006985):0.003385,(171:0.0039466215,246:0.0000003922):0.002902):0.002531,371:0.0010645969):0.001123):0.001054,(81:0.0003835303,((((144:0.0021391886,164:0.0000003922):0.000353,380:0.0000003922):0.004587,(443:0.0022167469,444:0.0033157915):0.011942):0.002764,266:0.0017489979):0.001447):0.007394):0.001707,(((((45:0.0000003922,185:0.0003515018):0.000352,((((121:0.0007064108,122:0.0000003922):0.000706,181:0.0000003922):0.000706,182:0.0000003922):0.001007,384:0.0065038652):0.002578):0.000000,368:0.0007056582):0.000000,205:0.0000003922):0.000000,(((91:0.0000003922,140:0.0003514798):0.002086,141:0.0004072188):0.000357,462:0.0002919864):0.000412):0.002045):0.003459):0.006149,(110:0.0028092992,((112:0.0021656979,460:0.0006862366):0.000628,214:0.0083347195):0.002370):0.004954):0.002283,((((((((11:0.0003523216,(109:0.0000003922,129:0.0014165574):0.000705):0.000353,(173:0.0043093443,365:0.0000003922):0.000000):0.000000,((27:0.0002245806,(167:0.0000003922,(367:0.0003537019,382:0.0003536319):0.000000):0.000485):0.006064,147:0.0134288708):0.004476):0.000000,(75:0.0000003922,163:0.0014158343):0.000705):0.001064,(62:0.0000003922,108:0.0014155285):0.000000):0.000937,(((((106:0.0028377863,(113:0.0011069698,461:0.0017558736):0.000714):0.000754,366:0.0024953469):0.001082,498:0.0007107242):0.000340,237:0.0007163468):0.000908,267:0.0012401247):0.000650):0.001022,((57:0.0001841001,76:0.0005209020):0.000365,60:0.0034186489):0.002611):0.000000,((77:0.0035140507,474:0.0008099525):0.006578,389:0.0077028762):0.003763):0.008049):0.001040,145:0.0007543394):0.001070,130:0.0000003922):0.000000,107:0.0017683320):0.000703,135:0.0003509915):0.000000):0.000000,134:0.0003510809):0.000000,3:0.0003510272);
2 | 


--------------------------------------------------------------------------------