├── vignettes ├── .gitignore ├── cols.rds ├── emb.rds ├── r_face.rds ├── thresh.rds ├── r_nodes.png ├── r_shape.rds ├── rips_all.rds ├── shape_rt.rds ├── rips_cycle.rds ├── theta_nodes.png ├── rips_secondary.rds ├── cols_respiratory.rds └── cols_time_since_last_block.rds ├── src ├── Makevars.win ├── ANN.o ├── perf.o ├── bd_tree.o ├── brute.o ├── figtree.o ├── kd_dump.o ├── kd_tree.o ├── kd_util.o ├── bd_search.o ├── kd_search.o ├── kd_split.o ├── RcppExports.o ├── TDApplied.so ├── bd_pr_search.o ├── kd_pr_search.o ├── KCenterClustering.o ├── bd_fix_rad_search.o ├── kd_fix_rad_search.o ├── RcppExports.cpp ├── kd_fix_rad_search.h ├── kd_pr_search.h ├── kd_search.h ├── bd_search.cpp ├── bd_pr_search.cpp ├── kd_split.h ├── bd_fix_rad_search.cpp ├── bd_tree.h ├── brute.cpp ├── pr_queue.h ├── pr_queue_k.h ├── kd_util.h ├── perf.cpp ├── KCenterClustering.h └── ANNx.h ├── tests ├── testthat.R └── testthat │ ├── test-convert.R │ ├── test-enclosing.R │ ├── test-utilities.R │ ├── test-plot.R │ ├── test-MDS.R │ ├── test-kernel.R │ └── test-python.R ├── .gitignore ├── CRAN-SUBMISSION ├── R ├── zzz.R ├── TDApplied-package.R ├── RcppExports.R ├── convert.R ├── enclosing_rad.R └── kernel_calculations.R ├── .Rbuildignore ├── TDApplied.Rproj ├── man ├── check_ripser.Rd ├── import_ripser.Rd ├── check_PyH_setup.Rd ├── enclosing_radius.Rd ├── TDApplied-package.Rd ├── diagram_to_df.Rd ├── loss.Rd ├── diagram_kernel.Rd ├── predict_diagram_kkmeans.Rd ├── plot_diagram.Rd ├── gram_matrix.Rd ├── PyH.Rd ├── vr_graphs.Rd ├── predict_diagram_ksvm.Rd ├── predict_diagram_kpca.Rd ├── distance_matrix.Rd ├── diagram_kkmeans.Rd ├── diagram_distance.Rd ├── plot_vr_graph.Rd ├── analyze_representatives.Rd ├── independence_test.Rd ├── diagram_kpca.Rd ├── diagram_mds.Rd ├── universal_null.Rd ├── permutation_model_inference.Rd ├── bootstrap_persistence_thresholds.Rd ├── permutation_test.Rd └── diagram_ksvm.Rd ├── cran-comments.md ├── DESCRIPTION ├── NAMESPACE ├── exec └── parallel_with_approximation.R └── NEWS.md /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS += -DFIGTREE_DLL_EXPORTS -DDLL_EXPORTS -------------------------------------------------------------------------------- /src/ANN.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/ANN.o -------------------------------------------------------------------------------- /src/perf.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/perf.o -------------------------------------------------------------------------------- /src/bd_tree.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_tree.o -------------------------------------------------------------------------------- /src/brute.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/brute.o -------------------------------------------------------------------------------- /src/figtree.o: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/figtree.o -------------------------------------------------------------------------------- /src/kd_dump.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_dump.o -------------------------------------------------------------------------------- /src/kd_tree.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_tree.o -------------------------------------------------------------------------------- /src/kd_util.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_util.o -------------------------------------------------------------------------------- /src/bd_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_search.o -------------------------------------------------------------------------------- /src/kd_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_search.o -------------------------------------------------------------------------------- /src/kd_split.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_split.o -------------------------------------------------------------------------------- /src/RcppExports.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/RcppExports.o -------------------------------------------------------------------------------- /src/TDApplied.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/TDApplied.so -------------------------------------------------------------------------------- /src/bd_pr_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_pr_search.o -------------------------------------------------------------------------------- /src/kd_pr_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_pr_search.o -------------------------------------------------------------------------------- /vignettes/cols.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/cols.rds -------------------------------------------------------------------------------- /vignettes/emb.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/emb.rds -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(TDApplied) 3 | 4 | test_check("TDApplied") 5 | -------------------------------------------------------------------------------- /vignettes/r_face.rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/r_face.rds -------------------------------------------------------------------------------- /vignettes/thresh.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/thresh.rds -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | /doc/ 7 | /Meta/ 8 | -------------------------------------------------------------------------------- /src/KCenterClustering.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/KCenterClustering.o -------------------------------------------------------------------------------- /src/bd_fix_rad_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_fix_rad_search.o -------------------------------------------------------------------------------- /src/kd_fix_rad_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_fix_rad_search.o -------------------------------------------------------------------------------- /vignettes/r_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/r_nodes.png -------------------------------------------------------------------------------- /vignettes/r_shape.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/r_shape.rds -------------------------------------------------------------------------------- /vignettes/rips_all.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/rips_all.rds -------------------------------------------------------------------------------- /vignettes/shape_rt.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/shape_rt.rds -------------------------------------------------------------------------------- /vignettes/rips_cycle.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/rips_cycle.rds -------------------------------------------------------------------------------- /vignettes/theta_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/theta_nodes.png -------------------------------------------------------------------------------- /vignettes/rips_secondary.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/rips_secondary.rds -------------------------------------------------------------------------------- /CRAN-SUBMISSION: 
-------------------------------------------------------------------------------- 1 | Version: 3.0.4 2 | Date: 2024-10-27 22:04:37 UTC 3 | SHA: f72e303388b8467a96701d8746df308db4029c69 4 | -------------------------------------------------------------------------------- /vignettes/cols_respiratory.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/cols_respiratory.rds -------------------------------------------------------------------------------- /vignettes/cols_time_since_last_block.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/cols_time_since_last_block.rds -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | 2 | # unload C++ DLL for proper cleanup 3 | .onUnload <- function (libpath) { 4 | library.dynam.unload("TDApplied", libpath) 5 | } -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^cran-comments\.md$ 5 | ^CRAN-SUBMISSION$ 6 | ^doc$ 7 | ^Meta$ 8 | ^LICENSE\.md$ 9 | -------------------------------------------------------------------------------- /R/TDApplied-package.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib TDApplied, .registration = TRUE 2 | #' @docType package 3 | #' @keywords internal 4 | "_PACKAGE" 5 | 6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | figtree <- function(X, h, Q, Y, epsilon, G) { 5 | .Call(`_TDApplied_figtree`, X, h, Q, Y, epsilon, G) 6 | } 7 | 8 | -------------------------------------------------------------------------------- /TDApplied.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/check_ripser.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{check_ripser} 4 | \alias{check_ripser} 5 | \title{Verify an imported ripser module.} 6 | \usage{ 7 | check_ripser(ripser) 8 | } 9 | \arguments{ 10 | \item{ripser}{the ripser module object.} 11 | } 12 | \description{ 13 | Verify an imported ripser module. 
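Together with `check_PyH_setup` and `import_ripser` (documented just below), the intended setup flow can be sketched as follows. This is only a sketch: `check_PyH_setup` and `check_ripser` are documented but not listed among the NAMESPACE exports, so the `:::` access and the ordering of the calls are assumptions rather than package-prescribed usage.

library(TDApplied)

# confirm reticulate, python and the python module ripser are all available
TDApplied:::check_PyH_setup()

# import ripser (reticulate::import("ripser") plus additional checks)
ripser <- import_ripser()

# verify the imported module object before passing it to PyH()
TDApplied:::check_ripser(ripser)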
14 | } 15 | \author{ 16 | Shael Brown - \email{shaelebrown@gmail.com} 17 | } 18 | -------------------------------------------------------------------------------- /man/import_ripser.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{import_ripser} 4 | \alias{import_ripser} 5 | \title{Import the python module ripser.} 6 | \usage{ 7 | import_ripser() 8 | } 9 | \value{ 10 | the python ripser module. 11 | } 12 | \description{ 13 | The ripser module is needed for fast persistent cohomology calculations with the PyH function. 14 | } 15 | \details{ 16 | Same as "reticulate::import("ripser")", just with additional checks. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | # import ripser 21 | ripser <- import_ripser() 22 | } 23 | } 24 | \author{ 25 | Shael Brown - \email{shaelebrown@gmail.com} 26 | } 27 | -------------------------------------------------------------------------------- /man/check_PyH_setup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{check_PyH_setup} 4 | \alias{check_PyH_setup} 5 | \title{Make sure that python has been configured correctly for persistent homology calculations.} 6 | \usage{ 7 | check_PyH_setup() 8 | } 9 | \description{ 10 | Ensures that the reticulate package has been installed, that python is available to be used 11 | by reticulate functions, and that the python module "ripser" has been installed. 12 | } 13 | \details{ 14 | An error message will be thrown if any of the above conditions are not met. 15 | } 16 | \author{ 17 | Shael Brown - \email{shaelebrown@gmail.com} 18 | } 19 | -------------------------------------------------------------------------------- /tests/testthat/test-convert.R: -------------------------------------------------------------------------------- 1 | 2 | # test_that("diagram_to_df can accept the right kinds of input",{ 3 | # 4 | # skip_if_not_installed("TDA") 5 | # skip_if_not_installed("TDAstats") 6 | # D <- TDA::circleUnif(n = 20,r = 1) 7 | # phom_TDA <- TDA::ripsDiag(X = D,maxdimension = 1,maxscale = 2) 8 | # phom_TDAstats <- TDAstats::calculate_homology(mat = D,threshold = 2) 9 | # simulated_PyH_phom <- list(diagram = diagram_to_df(phom_TDA),representatives = list()) 10 | # expect_s3_class(diagram_to_df(phom_TDA),"data.frame") 11 | # expect_s3_class(diagram_to_df(phom_TDAstats),"data.frame") 12 | # expect_s3_class(diagram_to_df(diagram_to_df(phom_TDA)),"data.frame") 13 | # expect_s3_class(diagram_to_df(simulated_PyH_phom),"data.frame") 14 | # 15 | # }) 16 | 17 | test_that("diagram_to_df can detect incorrect parameters properly",{ 18 | 19 | expect_error(diagram_to_df(2),"computation") 20 | 21 | }) -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | 2 | ## Test environments 3 | * local Mac OS X install, R 4.1.2 4 | * win-builder (devel and release) 5 | * rhub windows virtual machine 6 | * rhub macos virtual machine 7 | * rhub linux virtual machine 8 | * rhub ubuntu-release, valgrind, ubuntu-clang, clang19 and atlas containers 9 | 10 | ## R CMD check results 11 | 12 | 0 errors | 0 warnings | 1 note 13 | 14 | ## NOTES 15 | 16 | * the note on R CMD check is for large sub directory size (necessary for the extensive 
documentation needed for journal publication). 17 | * on rhub there are build errors for gcc14 (Fedora Linux R devel) and macos-arm64, seemingly because some of the package dependencies are not available on those platforms. 18 | * some of the examples run for over 5s, however these examples have been made as small and fast as possible without throwing errors. 19 | * there are domain-specific words and author names in ML_and_Inference.Rmd which were flagged by devtools::check_spelling() but to the author's knowledge they are all spelled correctly. 20 | 21 | 22 | -------------------------------------------------------------------------------- /man/enclosing_radius.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/enclosing_rad.R 3 | \name{enclosing_radius} 4 | \alias{enclosing_radius} 5 | \title{Compute the enclosing radius for a dataset.} 6 | \usage{ 7 | enclosing_radius(X, distance_mat = FALSE) 8 | } 9 | \arguments{ 10 | \item{X}{the input dataset, must either be a matrix or data frame.} 11 | 12 | \item{distance_mat}{whether or not `X` is a distance matrix, default FALSE.} 13 | } 14 | \value{ 15 | the numeric enclosing radius. 16 | } 17 | \description{ 18 | The enclosing radius is the minimum (Euclidean distance) radius beyond which no topological changes will occur. 19 | } 20 | \examples{ 21 | 22 | # create a persistence diagram from a 2D Gaussian 23 | df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 24 | 25 | # compute the enclosing radius from the point cloud 26 | enc_rad <- enclosing_radius(df, distance_mat = FALSE) 27 | 28 | # compute the distance matrix manually, stored as a matrix 29 | dist_df <- as.matrix(dist(df)) 30 | 31 | # compute the enclosing radius from the distance matrix 32 | enc_rad <- enclosing_radius(dist_df, distance_mat = TRUE) 33 | } 34 | \author{ 35 | Shael Brown - \email{shaelebrown@gmail.com} 36 | } 37 | -------------------------------------------------------------------------------- /man/TDApplied-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TDApplied-package.R 3 | \docType{package} 4 | \name{TDApplied-package} 5 | \alias{TDApplied} 6 | \alias{TDApplied-package} 7 | \title{TDApplied: Machine Learning and Inference for Topological Data Analysis} 8 | \description{ 9 | Topological data analysis is a powerful tool for finding non-linear global structure in whole datasets. The main tool of topological data analysis is persistent homology, which computes a topological shape descriptor of a dataset called a persistence diagram. 'TDApplied' provides useful and efficient methods for analyzing groups of persistence diagrams with machine learning and statistical inference, and these functions can also interface with other data science packages to form flexible and integrated topological data analysis pipelines. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/shaelebrown/TDApplied} 15 | \item Report bugs at \url{https://github.com/shaelebrown/TDApplied/issues} 16 | } 17 | 18 | } 19 | \author{ 20 | \strong{Maintainer}: Shael Brown \email{shaelebrown@gmail.com} 21 | 22 | Authors: 23 | \itemize{ 24 | \item Dr. 
Reza Farivar \email{reza.farivar@mcgill.ca} [funder]
25 | }
26 | 
27 | }
28 | \keyword{internal}
29 | 
--------------------------------------------------------------------------------
/tests/testthat/test-enclosing.R:
--------------------------------------------------------------------------------
1 | 
2 | test_that("enclosing_radius can detect incorrect inputs",{
3 | 
4 |   expect_error(enclosing_radius(NULL, NULL), "distance_mat")
5 |   expect_error(enclosing_radius(NULL, c(T,F)), "single")
6 |   expect_error(enclosing_radius(NULL, NA), "NA")
7 |   expect_error(enclosing_radius(NULL, T), "X")
8 |   expect_error(enclosing_radius(data.frame(),T),"X")
9 |   expect_error(enclosing_radius(data.frame(x = 1),T),"X")
10 |   expect_error(enclosing_radius(data.frame(x = c(1,2)),T),"X")
11 |   expect_error(enclosing_radius(X = NULL,T),"X")
12 |   expect_error(enclosing_radius(X = data.frame(x = c(1,NA)),T),"missing")
13 |   expect_error(enclosing_radius(data.frame(x = c(1),y = c(2)),T),"two")
14 |   expect_error(enclosing_radius(data.frame(x = c(1,2,3),y = c(2,1,2)),T),"square")
15 | 
16 | })
17 | 
18 | test_that("enclosing_radius is computing properly",{
19 | 
20 |   X <- data.frame(x = c(1:10),y = c(1:10))
21 |   dist_X <- as.matrix(dist(X))
22 |   expect_equal(enclosing_radius(X, F), dist_X[1,6])
23 |   expect_equal(enclosing_radius(dist_X, T), dist_X[1,6])
24 | 
25 |   theta <- runif(n = 100,min = 0,max = 2*pi)
26 |   x <- cos(theta)
27 |   y <- sin(theta)
28 |   df <- data.frame(x = x,y = y)
29 |   dist_df <- as.matrix(dist(df))
30 |   expect_equal(enclosing_radius(df, F),enclosing_radius(dist_df, T))
31 | 
32 | })
33 | 
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: TDApplied
2 | Type: Package
3 | Title: Machine Learning and Inference for Topological Data Analysis
4 | Version: 3.0.4
5 | Authors@R: c(person("Shael", "Brown", email = "shaelebrown@gmail.com", role = c("aut","cre")),
6 |     person("Dr. Reza", "Farivar", email = "reza.farivar@mcgill.ca", role = c("aut","fnd")))
7 | Author: Shael Brown [aut, cre],
8 |     Dr. Reza Farivar [aut, fnd]
9 | Maintainer: Shael Brown <shaelebrown@gmail.com>
10 | Description: Topological data analysis is a powerful tool for finding non-linear global structure
11 |     in whole datasets. The main tool of topological data analysis is persistent homology, which computes
12 |     a topological shape descriptor of a dataset called a persistence diagram. 'TDApplied' provides
13 |     useful and efficient methods for analyzing groups of persistence diagrams with machine learning and statistical inference,
14 |     and these functions can also interface with other data science packages to form flexible and integrated
15 |     topological data analysis pipelines.
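Returning to `enclosing_radius()` and the expectations in test-enclosing.R above: the tested values are consistent with reading the enclosing radius as the smallest "farthest-point" distance in the dataset, i.e. the minimum over data points of the maximum distance to any other point. A minimal sketch of that reading, computed directly from a distance matrix (an illustration, not the package's internal implementation):

X <- data.frame(x = 1:10, y = 1:10)
dist_X <- as.matrix(dist(X))

# smallest row-wise maximum of the distance matrix: 5*sqrt(2), i.e. dist_X[1,6]
min(apply(dist_X, 1, max))

# should agree with the package function on the same input
enclosing_radius(X, distance_mat = FALSE)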
16 | Depends: R (>= 3.5.0)
17 | Imports: parallel, doParallel, foreach, clue, rdist, parallelly, kernlab, iterators, methods, stats, utils, Rcpp (>= 0.11.0)
18 | License: GPL (>= 3)
19 | URL: https://github.com/shaelebrown/TDApplied
20 | BugReports: https://github.com/shaelebrown/TDApplied/issues
21 | Encoding: UTF-8
22 | NeedsCompilation: yes
23 | RoxygenNote: 7.3.2
24 | Suggests: 
25 |     rmarkdown,
26 |     knitr,
27 |     testthat (>= 3.0.0),
28 |     TDAstats,
29 |     reticulate,
30 |     TDA,
31 |     igraph
32 | LinkingTo: Rcpp
33 | VignetteBuilder: knitr, rmarkdown
34 | Config/testthat/edition: 3
35 | 
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | #include <Rcpp.h>
5 | 
6 | using namespace Rcpp;
7 | 
8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
9 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 | 
13 | // figtree
14 | std::vector<double> figtree(std::vector<double> X, double h, std::vector<double> Q, std::vector<double> Y, double epsilon, std::vector<double> G);
15 | RcppExport SEXP _TDApplied_figtree(SEXP XSEXP, SEXP hSEXP, SEXP QSEXP, SEXP YSEXP, SEXP epsilonSEXP, SEXP GSEXP) {
16 | BEGIN_RCPP
17 |     Rcpp::RObject rcpp_result_gen;
18 |     Rcpp::RNGScope rcpp_rngScope_gen;
19 |     Rcpp::traits::input_parameter< std::vector<double> >::type X(XSEXP);
20 |     Rcpp::traits::input_parameter< double >::type h(hSEXP);
21 |     Rcpp::traits::input_parameter< std::vector<double> >::type Q(QSEXP);
22 |     Rcpp::traits::input_parameter< std::vector<double> >::type Y(YSEXP);
23 |     Rcpp::traits::input_parameter< double >::type epsilon(epsilonSEXP);
24 |     Rcpp::traits::input_parameter< std::vector<double> >::type G(GSEXP);
25 |     rcpp_result_gen = Rcpp::wrap(figtree(X, h, Q, Y, epsilon, G));
26 |     return rcpp_result_gen;
27 | END_RCPP
28 | }
29 | 
30 | static const R_CallMethodDef CallEntries[] = {
31 |     {"_TDApplied_figtree", (DL_FUNC) &_TDApplied_figtree, 6},
32 |     {NULL, NULL, 0}
33 | };
34 | 
35 | RcppExport void R_init_TDApplied(DllInfo *dll) {
36 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
37 |     R_useDynamicSymbols(dll, FALSE);
38 | }
39 | 
--------------------------------------------------------------------------------
/man/diagram_to_df.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/convert.R
3 | \name{diagram_to_df}
4 | \alias{diagram_to_df}
5 | \title{Convert a TDA/TDAstats persistence diagram to a data frame.}
6 | \usage{
7 | diagram_to_df(d)
8 | }
9 | \arguments{
10 | \item{d}{the output of a TDA/TDAstats homology calculation, like ripsDiag or \code{\link[TDAstats]{calculate_homology}}.}
11 | }
12 | \value{
13 | a 3-column data frame, with each row representing a topological feature. The first column is the feature dimension (a non-negative integer), the second column is the birth radius of the feature and the third column is the death radius.
14 | }
15 | \description{
16 | The outputs of homology calculations from the R packages TDA
17 | and TDAstats are not data frames. This function converts these
18 | outputs into a data frame either for further usage in this package or
19 | for personalized analyses.
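In addition to the TDAstats example further down in this help file, the commented-out test in tests/testthat/test-convert.R (earlier in this listing) suggests the analogous TDA workflow; a sketch along those lines, assuming the TDA package is installed:

if(require("TDA"))
{
  # sample 20 points from the unit circle
  D <- TDA::circleUnif(n = 20, r = 1)

  # persistent homology via TDA; any location information is dropped on conversion
  phom_TDA <- TDA::ripsDiag(X = D, maxdimension = 1, maxscale = 2)

  # convert to the standard dimension/birth/death data frame
  phom_TDA_df <- diagram_to_df(d = phom_TDA)
}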
20 | } 21 | \details{ 22 | If a diagram is constructed using a TDA function like ripsDiag 23 | with the `location` parameter set to true then the return value will ignore the location information. 24 | } 25 | \examples{ 26 | 27 | if(require("TDAstats")) 28 | { 29 | # create a persistence diagram from a 2D Gaussian 30 | df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 31 | 32 | # compute persistence diagram with calculate_homology from package TDAstats 33 | phom_TDAstats = TDAstats::calculate_homology(mat = df,dim = 0,threshold = 1) 34 | 35 | # convert to data frame 36 | phom_TDAstats_df = diagram_to_df(d = phom_TDAstats) 37 | } 38 | } 39 | \author{ 40 | Shael Brown - \email{shaelebrown@gmail.com} 41 | } 42 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(PyH) 4 | export(bootstrap_persistence_thresholds) 5 | export(diagram_distance) 6 | export(diagram_kernel) 7 | export(diagram_kkmeans) 8 | export(diagram_kpca) 9 | export(diagram_ksvm) 10 | export(diagram_mds) 11 | export(diagram_to_df) 12 | export(distance_matrix) 13 | export(enclosing_radius) 14 | export(gram_matrix) 15 | export(import_ripser) 16 | export(independence_test) 17 | export(permutation_model_inference) 18 | export(permutation_test) 19 | export(plot_diagram) 20 | export(plot_vr_graph) 21 | export(predict_diagram_kkmeans) 22 | export(predict_diagram_kpca) 23 | export(predict_diagram_ksvm) 24 | export(universal_null) 25 | export(vr_graphs) 26 | import(Rcpp) 27 | importFrom(clue,solve_LSAP) 28 | importFrom(doParallel,registerDoParallel) 29 | importFrom(doParallel,stopImplicitCluster) 30 | importFrom(foreach,"%:%") 31 | importFrom(foreach,"%do%") 32 | importFrom(foreach,"%dopar%") 33 | importFrom(foreach,foreach) 34 | importFrom(graphics,legend) 35 | importFrom(graphics,lines) 36 | importFrom(graphics,points) 37 | importFrom(graphics,rect) 38 | importFrom(iterators,iter) 39 | importFrom(kernlab,as.kernelMatrix) 40 | importFrom(kernlab,kkmeans) 41 | importFrom(kernlab,kpca) 42 | importFrom(kernlab,ksvm) 43 | importFrom(kernlab,predict) 44 | importFrom(methods,is) 45 | importFrom(parallel,clusterEvalQ) 46 | importFrom(parallel,clusterExport) 47 | importFrom(parallel,makeCluster) 48 | importFrom(parallel,stopCluster) 49 | importFrom(parallelly,availableCores) 50 | importFrom(rdist,cdist) 51 | importFrom(stats,as.dendrogram) 52 | importFrom(stats,as.dist) 53 | importFrom(stats,cmdscale) 54 | importFrom(stats,complete.cases) 55 | importFrom(stats,dist) 56 | importFrom(stats,hclust) 57 | importFrom(stats,heatmap) 58 | importFrom(stats,order.dendrogram) 59 | importFrom(stats,pgamma) 60 | importFrom(stats,quantile) 61 | importFrom(utils,combn) 62 | useDynLib(TDApplied, .registration = TRUE) 63 | -------------------------------------------------------------------------------- /src/kd_fix_rad_search.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_fix_rad_search.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Standard kd-tree fixed-radius kNN search 5 | // Last modified: 05/03/05 (Version 1.1) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. 
All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 1.1 05/03/05 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_fix_rad_search_H 26 | #define ANN_kd_fix_rad_search_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | #include "kd_util.h" // kd-tree utilities 30 | #include "pr_queue_k.h" // k-element priority queue 31 | 32 | #include "ANNperf.h" // performance evaluation 33 | 34 | //---------------------------------------------------------------------- 35 | // Global variables 36 | // These are active for the life of each call to 37 | // annRangeSearch(). They are set to save the number of 38 | // variables that need to be passed among the various search 39 | // procedures. 40 | //---------------------------------------------------------------------- 41 | 42 | extern ANNpoint ANNkdFRQ; // query point (static copy) 43 | 44 | #endif -------------------------------------------------------------------------------- /tests/testthat/test-utilities.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("utilities are working properly",{ 3 | 4 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c("1","2","3"),death = c(1,2,3))),"numeric") 5 | expect_error(check_diagram(data.frame(dimension = c(1.1,2,3),birth = c(1,2,3),death = c(1,2,3))),"whole") 6 | expect_error(check_diagram(data.frame(dimension = c(-1,2,3),birth = c(1,2,3),death = c(1,2,3))),">= 0") 7 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c(1,-2,3),death = c(1,2,3))),">= 0") 8 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c(1,2,3),death = c(1,2,NA))),"missing") 9 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c(1,2,3),death = c(1,2,2.9))),"larger") 10 | expect_error(check_param(param_name = "test",param = "T",numeric = F),"T or F") 11 | 12 | }) 13 | 14 | test_that("check_matrix works",{ 15 | 16 | d1 = data.frame(dimension = rep(0,5),birth = 1:5,death = 1:5 + 0.1) 17 | d2 = data.frame(dimension = rep(0,5),birth = 1:5,death = 1:5 + 0.2) 18 | D = distance_matrix(list(d1,d2),dim = 0,num_workers = 2) 19 | K = gram_matrix(list(d1,d2),dim = 0,num_workers = 2) 20 | expect_error(check_matrix(D,"D"),"kernel") 21 | expect_error(check_matrix(K,"K","matrix"),"matrix") 22 | expect_error(check_matrix(rbind(D,c(1,2)),"D","matrix"),"rows") 23 | D[1,2] = NA 24 | D[2,1] = NaN 25 | expect_error(check_matrix(D,"D","matrix"),"missing") 26 | D = distance_matrix(list(d1,d2),dim = 0,num_workers = 2) 27 | D[1,1] = 1 28 | expect_error(check_matrix(D,"D","matrix"),"0's") 29 | D[1,1] = 0 30 | K[1,1] = 0 31 | expect_error(check_matrix(K,"K"),"1's") 32 | K[1,1] = 1 33 | K[1,2] = 1 34 | expect_error(check_matrix(K,"K"),"symmetric") 35 | D[1,2] = 0 36 | expect_error(check_matrix(D,"D","matrix"),"symmetric") 37 | expect_silent(check_matrix(D,"D",type = 
"matrix",symmetric = F)) 38 | expect_error(check_matrix(D[0,],"D",type = "matrix"),"at least") 39 | 40 | }) -------------------------------------------------------------------------------- /src/kd_pr_search.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_pr_search.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Priority kd-tree search 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_pr_search_H 26 | #define ANN_kd_pr_search_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | #include "kd_util.h" // kd-tree utilities 30 | #include "pr_queue.h" // priority queue declarations 31 | #include "pr_queue_k.h" // k-element priority queue 32 | 33 | #include "ANNperf.h" // performance evaluation 34 | 35 | //---------------------------------------------------------------------- 36 | // Global variables 37 | // Active for the life of each call to Appx_Near_Neigh() or 38 | // Appx_k_Near_Neigh(). 39 | //---------------------------------------------------------------------- 40 | 41 | extern double ANNprEps; // the error bound 42 | extern int ANNprDim; // dimension of space 43 | extern ANNpoint ANNprQ; // query point 44 | extern double ANNprMaxErr; // max tolerable squared error 45 | extern ANNpointArray ANNprPts; // the points 46 | extern ANNpr_queue *ANNprBoxPQ; // priority queue for boxes 47 | extern ANNmin_k *ANNprPointMK; // set of k closest points 48 | 49 | #endif -------------------------------------------------------------------------------- /src/kd_search.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_search.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Standard kd-tree search 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. 
It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_search_H 26 | #define ANN_kd_search_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | #include "kd_util.h" // kd-tree utilities 30 | #include "pr_queue_k.h" // k-element priority queue 31 | 32 | #include "ANNperf.h" // performance evaluation 33 | 34 | //---------------------------------------------------------------------- 35 | // More global variables 36 | // These are active for the life of each call to annkSearch(). They 37 | // are set to save the number of variables that need to be passed 38 | // among the various search procedures. 39 | //---------------------------------------------------------------------- 40 | 41 | extern int ANNkdDim; // dimension of space (static copy) 42 | extern ANNpoint ANNkdQ; // query point (static copy) 43 | extern double ANNkdMaxErr; // max tolerable squared error 44 | extern ANNpointArray ANNkdPts; // the points (static copy) 45 | extern ANNmin_k *ANNkdPointMK; // set of k closest points 46 | extern int ANNptsVisited; // number of points visited 47 | 48 | #endif -------------------------------------------------------------------------------- /man/loss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distance_calculations.R 3 | \name{loss} 4 | \alias{loss} 5 | \title{Turner loss function for a list of groups (lists) of persistence diagrams.} 6 | \usage{ 7 | loss( 8 | diagram_groups, 9 | dist_mats, 10 | dims, 11 | p, 12 | q, 13 | distance, 14 | sigma, 15 | rho, 16 | num_workers, 17 | group_sizes 18 | ) 19 | } 20 | \arguments{ 21 | \item{diagram_groups}{groups (lists/vectors) of persistence diagrams, stored as lists of a data frame and 22 | an index of the diagram in all the diagrams across all groups.} 23 | 24 | \item{dist_mats}{distance matrices between all possible pairs of persistence diagrams across and within groups 25 | storing the current distances which have been pre-computed.} 26 | 27 | \item{dims}{a numeric vector of which homological dimensions in which the loss function is to be computed.} 28 | 29 | \item{p}{a number representing the wasserstein parameter, at least 1, and if Inf then the bottleneck distance is calculated.} 30 | 31 | \item{q}{a finite number at least 1.} 32 | 33 | \item{distance}{a string which determines which type of distance calculation to carry out, either "wasserstein" (default) or "fisher".} 34 | 35 | \item{sigma}{the positive bandwidth for the persistence Fisher distance.} 36 | 37 | \item{rho}{the approximation heuristic for Fisher information metric, results in sequential computation.} 38 | 39 | \item{num_workers}{the number of cores used for parallel computation.} 40 | 41 | \item{group_sizes}{for when using precomputed distance matrices.} 42 | } 43 | \value{ 44 | the numeric value of the Turner loss function. 45 | } 46 | \description{ 47 | An internal function to calculate the normalized sum of within-group exponentiated distances 48 | between pairs of persistence diagrams (stored as data frames) 49 | for an arbitrary number of groups in parallel. Note that this function may run 50 | into memory issues for large numbers of diagrams. 
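As a rough illustration of "normalized sum of within-group exponentiated distances", the following sketch computes a Turner-style statistic from a precomputed distance matrix. The normalization shown (mean exponentiated distance over within-group pairs) is an assumption made for illustration; the internal `loss` function and the surrounding permutation machinery follow Robinson and Turner 2017 as cited below.

# illustrative only -- not the package's internal loss()
# d_mat: distance matrix over all diagrams; groups: list of index vectors; q >= 1
toy_loss <- function(d_mat, groups, q = 2) {
  sum(sapply(groups, function(idx) {
    pairs <- utils::combn(idx, 2)           # all within-group pairs
    sum(d_mat[t(pairs)]^q) / ncol(pairs)    # mean exponentiated within-group distance
  }))
}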
51 | } 52 | \details{ 53 | The Turner loss function is described in Robinson and Turner 2017 54 | (\url{https://link.springer.com/article/10.1007/s41468-017-0008-7}), and is used 55 | in the `permutation_test` function to describe how well-separated a particular 56 | grouping of persistence diagrams is. When the `distance` parameter is "fisher", 57 | `sigma` must not be NULL. 58 | } 59 | \references{ 60 | Robinson T, Turner K (2017). "Hypothesis testing for topological data analysis." \url{https://link.springer.com/article/10.1007/s41468-017-0008-7}. 61 | } 62 | \author{ 63 | Shael Brown - \email{shaelebrown@gmail.com} 64 | } 65 | \keyword{internal} 66 | -------------------------------------------------------------------------------- /man/diagram_kernel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernel_calculations.R 3 | \name{diagram_kernel} 4 | \alias{diagram_kernel} 5 | \title{Calculate persistence Fisher kernel value between a pair of persistence diagrams.} 6 | \usage{ 7 | diagram_kernel(D1, D2, dim = 0, sigma = 1, t = 1, rho = NULL) 8 | } 9 | \arguments{ 10 | \item{D1}{the first persistence diagram.} 11 | 12 | \item{D2}{the second persistence diagram.} 13 | 14 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 15 | 16 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 17 | 18 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 19 | 20 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL.} 21 | } 22 | \value{ 23 | the numeric kernel value. 24 | } 25 | \description{ 26 | Returns the persistence Fisher kernel value between a pair of persistence diagrams 27 | in a particular homological dimension, each of which is either the output from a \code{\link{diagram_to_df}} 28 | function call or from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}. 29 | } 30 | \details{ 31 | The persistence Fisher kernel is calculated from the Fisher information metric according to the formula 32 | \eqn{k_{PF}(D_1,D_2) = exp(-t*d_{FIM}(D_1,D_2))}, resembling a radial basis kernel for standard 33 | Euclidean spaces. 34 | } 35 | \examples{ 36 | 37 | if(require("TDAstats")) 38 | { 39 | # create two diagrams 40 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 41 | dim = 1,threshold = 2) 42 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 43 | dim = 1,threshold = 2) 44 | 45 | # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 1 46 | diagram_kernel(D1,D2,dim = 1,sigma = 2,t = 2) 47 | # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 0 48 | diagram_kernel(D1,D2,dim = 0,sigma = 2,t = 2) 49 | } 50 | } 51 | \references{ 52 | Le T, Yamada M (2018). "Persistence fisher kernel: a riemannian manifold kernel for persistence diagrams." \url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}. 53 | 54 | Murphy, K. "Machine learning: a probabilistic perspective", MIT press (2012). 55 | } 56 | \seealso{ 57 | \code{\link{gram_matrix}} for Gram (i.e. kernel) matrix calculations. 
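Given the formula in the details section above, the kernel value should also be recoverable from the Fisher information metric itself; a quick sanity-check sketch, assuming `diagram_distance` accepts `distance = "fisher"` and `sigma` arguments as the `loss` documentation above indicates:

if(require("TDAstats"))
{
  D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
                                     dim = 1,threshold = 2)
  D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
                                     dim = 1,threshold = 2)

  # k_PF(D1,D2) = exp(-t*d_FIM(D1,D2)) with sigma = 2, t = 2 in dimension 1
  d_fim <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 2)
  all.equal(diagram_kernel(D1,D2,dim = 1,sigma = 2,t = 2), exp(-2*d_fim))
}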
58 | } 59 | \author{ 60 | Shael Brown - \email{shaelebrown@gmail.com} 61 | } 62 | -------------------------------------------------------------------------------- /tests/testthat/test-plot.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("plot_diagram can detect incorrect parameters",{ 3 | 4 | expect_error(plot_diagram(D = data.frame(dimension = c(0:13),birth = rep(0,14),death = rep(1,14))),"12") 5 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,Inf))),"finite") 6 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),title = NA),"NA") 7 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),title = 2),"character") 8 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = NA),"numeric") 9 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = c(1,2)),"single") 10 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = Inf),"finite") 11 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = -1),"positive") 12 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),legend = NULL),"NULL") 13 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),legend = c(T,F)),"single") 14 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(0,1,2,NA)),"NA") 15 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(thresholds = c(0,1,2,NA))),"NA") 16 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(foo = c(1,2,3))),"list element") 17 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(thresholds = c(1,2,3))),"element") 18 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(1,2,3)),"element") 19 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(1,2,3,"5")),"numeric") 20 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(thresholds = c(1,2,3,"5"))),"numeric") 21 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(1,2,3,NA)),"NA") 22 | 23 | }) 24 | 25 | test_that("plot_diagram is working correctly",{ 26 | 27 | expect_identical(plot_diagram(D = data.frame(dimension = numeric(),birth = numeric(),death = numeric())),NULL) 28 | expect_identical(plot_diagram(D = data.frame(dimension = c(0),birth = c(0),death = c(1))),NULL) 29 | 30 | }) 31 | 32 | -------------------------------------------------------------------------------- /R/convert.R: -------------------------------------------------------------------------------- 1 | #### CONVERT PERSISTENCE DIAGRAMS INTO DATA FRAMES#### 2 | #' Convert a TDA/TDAstats persistence diagram to a data frame. 
3 | #' 4 | #' The output of homology calculations from the R packages TDA 5 | #' and TDAstats are not dataframes. This function converts these 6 | #' outputs into a data frame either for further usage in this package or 7 | #' for personalized analyses. 8 | #' 9 | #' If a diagram is constructed using a TDA function like ripsDiag 10 | #' with the `location` parameter set to true then the return value will ignore the location information. 11 | #' 12 | #' @param d the output of a TDA/TDAstats homology calculation, like ripsDiag or \code{\link[TDAstats]{calculate_homology}}. 13 | #' @return a 3-column data frame, with each row representing a topological feature. The first column is the feature dimension (a non-negative integer), the second column is the birth radius of the feature and the third column is the death radius. 14 | #' @export 15 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 16 | #' @examples 17 | #' 18 | #' if(require("TDAstats")) 19 | #' { 20 | #' # create a persistence diagram from a 2D Gaussian 21 | #' df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 22 | #' 23 | #' # compute persistence diagram with calculate_homology from package TDAstats 24 | #' phom_TDAstats = TDAstats::calculate_homology(mat = df,dim = 0,threshold = 1) 25 | #' 26 | #' # convert to data frame 27 | #' phom_TDAstats_df = diagram_to_df(d = phom_TDAstats) 28 | #' } 29 | 30 | diagram_to_df <- function(d){ 31 | 32 | # function to convert d to a data frame with standardized column names 33 | # d is a diagram from library TDA or TDAstats 34 | 35 | # preliminary check, mostly for internal methods 36 | if(inherits(d,"data.frame")) 37 | { 38 | return(d) 39 | } 40 | 41 | if((is.list(d) && ((length(d) == 1 && all(names(d) %in% "diagram") && (inherits(d$diagram,"diagram")) || inherits(d$diagram,"data.frame")) || ((length(d) == 4 && all(names(d) %in% c("diagram","birthLocation","deathLocation","cycleLocation")) && inherits(d$diagram,"diagram"))))) == F && (inherits(d,"matrix") && inherits(d,"array") & all(colnames(d) %in% c("dimension","birth","death"))) == F) 42 | { 43 | stop("Diagrams must either be the output of a TDA/TDAstats/PyH computation.") 44 | } 45 | 46 | if(inherits(d,"matrix") & inherits(d,"array")) 47 | { 48 | # diagram was the output of a TDAstats calculation 49 | return(as.data.frame(d)) 50 | } 51 | 52 | if("diagram" %in% names(d)) 53 | { 54 | if(inherits(d$diagram,"data.frame")) 55 | { 56 | # diagram was the output of a PyH calculation, with representatives 57 | return(d$diagram) 58 | } 59 | } 60 | 61 | # else d was the output of a TDA calculation 62 | d <- d[[1]] 63 | class(d) <- "matrix" 64 | d <- as.data.frame(d) 65 | colnames(d) <- c("dimension","birth","death") 66 | 67 | return(d) 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/bd_search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: bd_search.cpp 6 | // Programmer: David Mount 7 | // Description: Standard bd-tree search 8 | // Last modified: 01/04/05 (Version 1.0) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). 
This software is provided under
15 | // the provisions of the Lesser GNU Public License (LGPL). See the
16 | // file ../ReadMe.txt for further information.
17 | //
18 | // The University of Maryland (U.M.) and the authors make no
19 | // representations about the suitability or fitness of this software for
20 | // any purpose. It is provided "as is" without express or implied
21 | // warranty.
22 | //----------------------------------------------------------------------
23 | // History:
24 | //    Revision 0.1  03/04/98
25 | //    Initial release
26 | //----------------------------------------------------------------------
27 | 
28 | #include "bd_tree.h"    // bd-tree declarations
29 | #include "kd_search.h"    // kd-tree search declarations
30 | 
31 | //----------------------------------------------------------------------
32 | // Approximate searching for bd-trees.
33 | // See the file kd_search.cpp for general information on the
34 | // approximate nearest neighbor search algorithm. Here we
35 | // include the extensions for shrinking nodes.
36 | //----------------------------------------------------------------------
37 | 
38 | //----------------------------------------------------------------------
39 | // bd_shrink::ann_search - search a shrinking node
40 | //----------------------------------------------------------------------
41 | 
42 | void ANNbd_shrink::ann_search(ANNdist box_dist)
43 | {
44 |     // check dist calc term cond.
45 |     if (ANNmaxPtsVisited != 0 && ANNptsVisited > ANNmaxPtsVisited) return;
46 | 
47 |     ANNdist inner_dist = 0;    // distance to inner box
48 |     for (int i = 0; i < n_bnds; i++) {    // is query point in the box?
49 |         if (bnds[i].out(ANNkdQ)) {    // outside this bounding side?
50 |             // add to inner distance
51 |             inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNkdQ));
52 |         }
53 |     }
54 |     if (inner_dist <= box_dist) {    // if inner box is closer
55 |         child[ANN_IN]->ann_search(inner_dist);    // search inner child first
56 |         child[ANN_OUT]->ann_search(box_dist);    // ...then outer child
57 |     }
58 |     else {    // if outer box is closer
59 |         child[ANN_OUT]->ann_search(box_dist);    // search outer child first
60 |         child[ANN_IN]->ann_search(inner_dist);    // ...then inner child
61 |     }
62 |     ANN_FLOP(3*n_bnds)    // increment floating ops
63 |     ANN_SHR(1)    // one more shrinking node
64 | }
--------------------------------------------------------------------------------
/src/bd_pr_search.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 | //----------------------------------------------------------------------
4 | // File: bd_pr_search.cpp
5 | // Programmer: David Mount
6 | // Description: Priority search for bd-trees
7 | // Last modified: 01/04/05 (Version 1.0)
8 | //----------------------------------------------------------------------
9 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
10 | // David Mount. All Rights Reserved.
11 | //
12 | // This software and related documentation is part of the Approximate
13 | // Nearest Neighbor Library (ANN). This software is provided under
14 | // the provisions of the Lesser GNU Public License (LGPL). See the
15 | // file ../ReadMe.txt for further information.
16 | //
17 | // The University of Maryland (U.M.) and the authors make no
18 | // representations about the suitability or fitness of this software for
19 | // any purpose. It is provided "as is" without express or implied
20 | // warranty.
21 | //---------------------------------------------------------------------- 22 | //History: 23 | // Revision 0.1 03/04/98 24 | // Initial release 25 | //---------------------------------------------------------------------- 26 | 27 | #include "bd_tree.h" // bd-tree declarations 28 | #include "kd_pr_search.h" // kd priority search declarations 29 | 30 | //---------------------------------------------------------------------- 31 | // Approximate priority searching for bd-trees. 32 | // See the file kd_pr_search.cc for general information on the 33 | // approximate nearest neighbor priority search algorithm. Here 34 | // we include the extensions for shrinking nodes. 35 | //---------------------------------------------------------------------- 36 | 37 | //---------------------------------------------------------------------- 38 | // bd_shrink::ann_search - search a shrinking node 39 | //---------------------------------------------------------------------- 40 | 41 | void ANNbd_shrink::ann_pri_search(ANNdist box_dist) 42 | { 43 | ANNdist inner_dist = 0; // distance to inner box 44 | for (int i = 0; i < n_bnds; i++) { // is query point in the box? 45 | if (bnds[i].out(ANNprQ)) { // outside this bounding side? 46 | // add to inner distance 47 | inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNprQ)); 48 | } 49 | } 50 | if (inner_dist <= box_dist) { // if inner box is closer 51 | if (child[ANN_OUT] != KD_TRIVIAL) // enqueue outer if not trivial 52 | ANNprBoxPQ->insert(box_dist,child[ANN_OUT]); 53 | // continue with inner child 54 | child[ANN_IN]->ann_pri_search(inner_dist); 55 | } 56 | else { // if outer box is closer 57 | if (child[ANN_IN] != KD_TRIVIAL) // enqueue inner if not trivial 58 | ANNprBoxPQ->insert(inner_dist,child[ANN_IN]); 59 | // continue with outer child 60 | child[ANN_OUT]->ann_pri_search(box_dist); 61 | } 62 | ANN_FLOP(3*n_bnds) // increment floating ops 63 | ANN_SHR(1) // one more shrinking node 64 | } -------------------------------------------------------------------------------- /man/predict_diagram_kkmeans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{predict_diagram_kkmeans} 4 | \alias{predict_diagram_kkmeans} 5 | \title{Predict the cluster labels for new persistence diagrams using a pre-computed clustering.} 6 | \usage{ 7 | predict_diagram_kkmeans( 8 | new_diagrams, 9 | K = NULL, 10 | clustering, 11 | num_workers = parallelly::availableCores(omit = 1) 12 | ) 13 | } 14 | \arguments{ 15 | \item{new_diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `new_diagrams` and `K` need to be supplied.} 16 | 17 | \item{K}{an optional precomputed cross Gram matrix of the new diagrams and the diagrams used in `clustering`, default NULL. If not NULL then `new_diagrams` does not need to be supplied.} 18 | 19 | \item{clustering}{the output of a \code{\link{diagram_kkmeans}} function call, of class 'diagram_kkmeans'.} 20 | 21 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 22 | } 23 | \value{ 24 | a vector of the predicted cluster labels for the new diagrams. 
25 | } 26 | \description{ 27 | Returns the nearest (highest kernel value) \code{\link[kernlab]{kkmeans}} cluster center label for new persistence diagrams. 28 | This allows for reusing old cluster models for new tasks, or for performing cross validation. 29 | } 30 | \examples{ 31 | 32 | if(require("TDAstats")) 33 | { 34 | # create two diagrams 35 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 36 | dim = 1,threshold = 2) 37 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 38 | dim = 1,threshold = 2) 39 | g <- list(D1,D1,D2,D2) 40 | 41 | # calculate kmeans clusters with centers = 2, and sigma = t = 2 in dimension 0 42 | clust <- diagram_kkmeans(diagrams = g,centers = 2,dim = 0,t = 2,sigma = 2,num_workers = 2) 43 | 44 | # create two new diagrams 45 | D3 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 46 | dim = 1,threshold = 2) 47 | D4 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 48 | dim = 1,threshold = 2) 49 | g_new <- list(D3,D4) 50 | 51 | # predict cluster labels 52 | predict_diagram_kkmeans(new_diagrams = g_new,clustering = clust,num_workers = 2) 53 | 54 | # predict cluster labels with precomputed Gram matrix, gives same result but 55 | # much faster 56 | K <- gram_matrix(diagrams = g_new,other_diagrams = clust$diagrams, 57 | dim = clust$dim,t = clust$t,sigma = clust$sigma, 58 | num_workers = 2) 59 | predict_diagram_kkmeans(K = K,clustering = clust) 60 | 61 | } 62 | } 63 | \seealso{ 64 | \code{\link{diagram_kkmeans}} for clustering persistence diagrams. 65 | } 66 | \author{ 67 | Shael Brown - \email{shaelebrown@gmail.com} 68 | } 69 | -------------------------------------------------------------------------------- /man/plot_diagram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{plot_diagram} 4 | \alias{plot_diagram} 5 | \title{Plot persistence diagrams} 6 | \usage{ 7 | plot_diagram( 8 | D, 9 | title = NULL, 10 | max_radius = NULL, 11 | legend = TRUE, 12 | thresholds = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{D}{a persistence diagram, either outputted from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}} or from \code{\link{diagram_to_df}}, with 17 | maximum dimension at most 12.} 18 | 19 | \item{title}{the character string plot title, default NULL.} 20 | 21 | \item{max_radius}{the x and y limits of the plot are defined as `c(0,max_radius)`, and the default value of `max_radius` is the maximum death value in `D`.} 22 | 23 | \item{legend}{a logical indicating whether to include a legend of feature dimensions, default TRUE.} 24 | 25 | \item{thresholds}{either a numeric vector with one persistence threshold for each dimension in `D` or the output of a \code{\link{bootstrap_persistence_thresholds}} function call, default NULL.} 26 | } 27 | \description{ 28 | Plots a persistence diagram outputted from either a persistent homology calculation or from diagram_to_df, with 29 | maximum homological dimension no more than 12 (otherwise the legend doesn't fit in the plot). 30 | Each homological dimension has its own color (the rcartocolor color-blind safe color palette) and point type, 31 | and the main plot title can be altered via the `title` parameter.
Each feature is plotted with 32 | a black point at its center in order to distinguish between overlapping features and easily compare 33 | features to their persistence thresholds. 34 | } 35 | \details{ 36 | The `thresholds` parameter, if not NULL, can either be a user-defined numeric vector, with 37 | one entry (persistence threshold) for each dimension in `D`, or the output of 38 | \code{\link{bootstrap_persistence_thresholds}}. Points whose persistence are greater than or equal to their dimension's 39 | threshold will be plotted in their dimension's color, and in gray otherwise. 40 | } 41 | \examples{ 42 | 43 | if(require("TDAstats")) 44 | { 45 | # create a sample diagram from the unit circle 46 | df <- TDAstats::circle2d[sample(1:100,50),] 47 | diag <- TDAstats::calculate_homology(df,threshold = 2) 48 | 49 | # plot without title 50 | plot_diagram(diag) 51 | 52 | # plot with title 53 | plot_diagram(diag,title = "Example diagram") 54 | 55 | # determine persistence thresholds 56 | thresholds <- bootstrap_persistence_thresholds(X = df,maxdim = 1, 57 | thresh = 2,num_samples = 3, 58 | num_workers = 2) 59 | 60 | # plot with bootstrap persistence thresholds 61 | plot_diagram(diag,title = "Example diagram with thresholds",thresholds = thresholds) 62 | 63 | #' # plot with personalized persistence thresholds 64 | plot_diagram(diag,title = "Example diagram with personalized thresholds",thresholds = c(0.5,1)) 65 | } 66 | } 67 | \author{ 68 | Shael Brown - \email{shaelebrown@gmail.com} 69 | } 70 | -------------------------------------------------------------------------------- /man/gram_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernel_calculations.R 3 | \name{gram_matrix} 4 | \alias{gram_matrix} 5 | \title{Compute the gram matrix for a group of persistence diagrams.} 6 | \usage{ 7 | gram_matrix( 8 | diagrams, 9 | other_diagrams = NULL, 10 | dim = 0, 11 | sigma = 1, 12 | t = 1, 13 | rho = NULL, 14 | num_workers = parallelly::availableCores(omit = 1) 15 | ) 16 | } 17 | \arguments{ 18 | \item{diagrams}{a list of persistence diagrams, where each diagram is either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 19 | 20 | \item{other_diagrams}{either NULL (default) or another list of persistence diagrams to compute a cross-Gram matrix.} 21 | 22 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 23 | 24 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 25 | 26 | \item{t}{a positive number representing the scale for the kernel, default 1.} 27 | 28 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, code execution is sequential, but functions in the "exec" directory 29 | of the package can be loaded to calculate distance matrices in parallel with approximation.} 30 | 31 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 32 | } 33 | \value{ 34 | the numeric (cross) Gram matrix of class 'kernelMatrix'. 35 | } 36 | \description{ 37 | Calculate the Gram matrix \eqn{K} for either a single list of persistence diagrams \eqn{(D_1,D_2,\dots,D_n)}, i.e. 
\eqn{K[i,j] = k_{PF}(D_i,D_j)}, 38 | or between two lists of persistence diagrams, \eqn{(D_1,D_2,\dots,D_n)} and \eqn{(D'_1,D'_2,\dots,D'_n)}, \eqn{K[i,j] = k_{PF}(D_i,D'_j)}, in parallel. 39 | } 40 | \details{ 41 | Gram matrices are used in downstream analyses, like in the `diagram_kkmeans`, `diagram_nearest_cluster`,`diagram_kpca`, 42 | `predict_diagram_kpca`, `predict_diagram_ksvm` and `independence_test` functions. 43 | } 44 | \examples{ 45 | 46 | if(require("TDAstats")) 47 | { 48 | # create two diagrams 49 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 50 | dim = 1,threshold = 2) 51 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 52 | dim = 1,threshold = 2) 53 | g <- list(D1,D2) 54 | 55 | # calculate the Gram matrix in dimension 0 with sigma = 2, t = 2 56 | G <- gram_matrix(diagrams = g,dim = 0,sigma = 2,t = 2,num_workers = 2) 57 | 58 | # calculate cross-Gram matrix, which is the same as G 59 | G_cross <- gram_matrix(diagrams = g,other_diagrams = g,dim = 0,sigma = 2, 60 | t = 2,num_workers = 2) 61 | } 62 | } 63 | \seealso{ 64 | \code{\link{diagram_kernel}} for individual persistence Fisher kernel calculations. 65 | } 66 | \author{ 67 | Shael Brown - \email{shaelebrown@gmail.com} 68 | } 69 | -------------------------------------------------------------------------------- /man/PyH.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{PyH} 4 | \alias{PyH} 5 | \title{Fast persistent homology calculations with python.} 6 | \usage{ 7 | PyH( 8 | X, 9 | maxdim = 1, 10 | thresh, 11 | distance_mat = FALSE, 12 | ripser, 13 | ignore_infinite_cluster = TRUE, 14 | calculate_representatives = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{X}{either a matrix or dataframe, representing either point cloud data or a distance matrix. In either case there 19 | must be at least two rows and 1 column.} 20 | 21 | \item{maxdim}{the non-negative integer maximum dimension for persistent homology, default 1.} 22 | 23 | \item{thresh}{the non-negative numeric radius threshold for the Vietoris-Rips filtration.} 24 | 25 | \item{distance_mat}{a boolean representing whether the input X is a distance matrix or not, default FALSE.} 26 | 27 | \item{ripser}{the ripser python module.} 28 | 29 | \item{ignore_infinite_cluster}{a boolean representing whether to remove clusters (0 dimensional cycles) which 30 | die at the threshold value. Default is TRUE as this is the default for TDAstats homology calculations, but can be set to 31 | FALSE which is the default for python ripser.} 32 | 33 | \item{calculate_representatives}{a boolean representing whether to return a list of representative cocycles for the 34 | topological features found in the persistence diagram, default FALSE.} 35 | } 36 | \value{ 37 | Either a dataframe containing the persistence diagram if `calculate_representatives` is `FALSE` (the default), otherwise a list with two elements: 38 | diagram of class diagram, containing the persistence diagram, 39 | and representatives, a list containing the edges, triangles etc. contained in each representative cocycle. 40 | } 41 | \description{ 42 | This function is a wrapper of the python wrapper of the ripser engine for persistent cohomology, 43 | but is still faster than using the R package TDAstats (see the TDApplied package vignette for details). 
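(A minimal sketch of the `calculate_representatives` option documented above, assuming a working python setup so that the ripser module imports correctly; the `diagram` and `representatives` list elements are those described in the value section.)

# sketch: request representative cocycles alongside the diagram
ripser <- import_ripser()
ang <- seq(0,2*pi,length.out = 25)
df <- data.frame(x = cos(ang),y = sin(ang))
phom <- PyH(X = df,maxdim = 1,thresh = 2,ripser = ripser,calculate_representatives = TRUE)
phom$diagram          # the persistence diagram
phom$representatives  # representative cocycles, ignoring dimension 0 points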
44 | } 45 | \details{ 46 | If `distance_mat` is `TRUE` then `X` must be a square matrix. The `ripser` parameter should be the 47 | result of an `import_ripser` function call, but since that function is slow the ripser object should 48 | be explicitly created before a PyH function call (see examples). Cohomology is computed over Z2, 49 | as is the case for the TDAstats function \code{\link[TDAstats]{calculate_homology}} (this is also the 50 | default for ripser in C++). If representative cocycles are returned, then they are stored in a list with 51 | one element for each point in the persistence diagram, ignoring dimension 0 points. Each representative of 52 | a dimension d cocycle (1 for loops, 2 for voids, etc.) is a kxd dimension matrix/array containing the row number-labelled 53 | edges, triangles etc. in the cocycle. 54 | } 55 | \examples{ 56 | \dontrun{ 57 | # create sample data 58 | df <- data.frame(x = 1:10,y = 1:10) 59 | 60 | # import the ripser module 61 | ripser <- import_ripser() 62 | 63 | # calculate persistence diagram up to dimension 1 with a maximum 64 | # radius of 5 65 | phom <- PyH(X = df,thresh = 5,ripser = ripser) 66 | } 67 | } 68 | \author{ 69 | Shael Brown - \email{shaelebrown@gmail.com} 70 | } 71 | -------------------------------------------------------------------------------- /exec/parallel_with_approximation.R: -------------------------------------------------------------------------------- 1 | 2 | # functions to calculate Fisher information distance matrices and Gram matrices 3 | # in parallel with a fast approximation 4 | 5 | # these matrices can then be input into TDApplied functions directly 6 | 7 | parallel_approx_distance_matrix <- function(diagrams,other_diagrams = NULL,dim = 0,sigma = 1,rho = 1e-3,num_workers = parallelly::availableCores(omit = 1)){ 8 | 9 | # create cluster 10 | cl <- parallel::makeCluster(num_workers) 11 | doParallel::registerDoParallel(cl) 12 | 13 | # calculate distances in parallel 14 | # clusters are closed if there is an error 15 | tryCatch(expr = { 16 | 17 | if(is.null(other_diagrams)) 18 | { 19 | # not cross distance matrix, only need to compute the upper diagonal 20 | # since the matrix is symmetric 21 | d <- matrix(data = 0,nrow = length(diagrams),ncol = length(diagrams)) 22 | u <- which(upper.tri(d),arr.ind = T) 23 | R <- lapply(X = 1:nrow(u),FUN = function(X){ 24 | 25 | return(list(diagrams[[u[[X,1]]]],diagrams[[u[[X,2]]]])) 26 | 27 | }) 28 | 29 | # remove diagrams to preserve memory 30 | rm(diagrams) 31 | 32 | # calculate distances in parallel, export TDApplied to nodes 33 | d_off_diag <- foreach::`%dopar%`(obj = foreach::foreach(r = R,.combine = c,.packages = c("TDApplied")),ex = {TDApplied::diagram_distance(D1 = r[[1]],D2 = r[[2]],dim = dim,distance = "fisher",sigma = sigma,rho = rho)}) 34 | 35 | # store results in matrix 36 | d[upper.tri(d)] <- d_off_diag 37 | d[which(upper.tri(d),arr.ind = T)[,c("col","row")]] <- d_off_diag 38 | diag(d) <- rep(0,nrow(d)) 39 | }else 40 | { 41 | # cross distance matrix, need to compute all entries 42 | u <- expand.grid(1:length(other_diagrams),1:length(diagrams)) 43 | R <- lapply(X = 1:nrow(u),FUN = function(X){ 44 | 45 | return(list(other_diagrams[[u[X,1]]],diagrams[[u[X,2]]])) 46 | 47 | }) 48 | d <- matrix(data = 0,nrow = length(other_diagrams),ncol = length(diagrams)) # allocate the cross distance matrix before the diagram lists are removed 49 | # remove diagrams and other_diagrams to preserve memory 50 | rm(list = c("diagrams","other_diagrams")) 51 | 52 | # store distance calculations in matrix 53 | d[as.matrix(u)] <- foreach::`%dopar%`(foreach::foreach(r = R,.combine = cbind,.packages = c("TDApplied")),ex =
{TDApplied::diagram_distance(D1 = r[[1]],D2 = r[[2]],dim = dim,distance = "fisher",sigma = sigma,rho = rho)}) 54 | 55 | } 56 | 57 | }, warning = function(w){warning(w)}, 58 | error = function(e){stop(e)}, 59 | finally = { 60 | # close cluster 61 | doParallel::stopImplicitCluster() 62 | parallel::stopCluster(cl) 63 | 64 | }) 65 | 66 | return(d) 67 | 68 | } 69 | 70 | parallel_approx_gram_matrix <- function(diagrams,other_diagrams = NULL,dim = 0,sigma = 1,t = 1,rho = 1e-3,num_workers = parallelly::availableCores(omit = 1)){ 71 | 72 | # compute gram matrix from distance matrix 73 | K <- exp(-t*parallel_approx_distance_matrix(diagrams = diagrams,other_diagrams = other_diagrams,dim = dim,sigma = sigma,rho = rho,num_workers = num_workers)) 74 | 75 | # update class for interfacing with kernlab package 76 | class(K) <- "kernelMatrix" 77 | 78 | return(K) 79 | 80 | } 81 | -------------------------------------------------------------------------------- /man/vr_graphs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rips_complexes.R 3 | \name{vr_graphs} 4 | \alias{vr_graphs} 5 | \title{Compute Vietoris-Rips graphs of a dataset at particular epsilon radius values.} 6 | \usage{ 7 | vr_graphs(X, distance_mat = FALSE, eps, return_clusters = TRUE) 8 | } 9 | \arguments{ 10 | \item{X}{either a point cloud data frame/matrix, or a distance matrix.} 11 | 12 | \item{distance_mat}{a boolean representing if the input `X` is a distance matrix, default value is `FALSE`.} 13 | 14 | \item{eps}{a numeric vector of the positive scales at which to compute the Rips-Vietoris complexes, i.e. including all edges of length at most the specified values.} 15 | 16 | \item{return_clusters}{a boolean determining if the connected components (i.e. data clusters) of the complex should be explicitly returned, default is `TRUE`.} 17 | } 18 | \value{ 19 | A list with a `vertices` field, containing the rownames of `X`, and then a list `graphs` with one (named) entry for each value in `eps`. Each entry is a list with a `graph` field, storing the (undirected) edges in the Rips-Vietoris complex in matrix format, and a `clusters` field, containing vectors of the data indices (or row names) in each connected component of the Rips graph. 20 | } 21 | \description{ 22 | Persistence diagrams computed from Rips-Vietoris filtrations contain information about 23 | distance radius scales at which topological features of a dataset exist, but the features 24 | can be challenging to visualize, analyze and interpret. In order to help solve this problem the `vr_graphs` 25 | function computes the 1-skeleton (i.e. graph) of Rips complexes at particular radii, called "Vietoris-Rips graphs" (VR graphs) in the literature. 26 | } 27 | \details{ 28 | This function may be used in conjunction with the igraph package to visualize the graphs (see \code{\link{plot_vr_graph}}).
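(A minimal usage sketch for the exec/parallel_with_approximation.R script above, whose header comments state that its output can be passed directly into TDApplied functions. The source() path is an assumption, written relative to the package source tree, and the sketch assumes TDAstats, foreach, doParallel and parallelly are installed; the diagram construction and the K argument of diagram_kkmeans follow the package documentation elsewhere in this dump.)

if(require("TDAstats"))
{
  # load the helper functions (path relative to the package source tree, adjust as needed)
  source("exec/parallel_with_approximation.R")

  # small example diagrams, as in the package documentation
  D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],dim = 1,threshold = 2)
  D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],dim = 1,threshold = 2)
  g <- list(D1,D1,D2,D2)

  # approximate Gram matrix computed in parallel...
  K <- parallel_approx_gram_matrix(diagrams = g,dim = 0,sigma = 2,t = 2,rho = 1e-3,num_workers = 2)

  # ...then reused directly via the K argument of a downstream function
  clust <- diagram_kkmeans(diagrams = g,K = K,centers = 2,dim = 0,sigma = 2,t = 2)
}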
29 | } 30 | \examples{ 31 | 32 | if(require("TDAstats") & require("igraph")) 33 | { 34 | # simulate data from the unit circle and calculate 35 | # its diagram 36 | df <- TDAstats::circle2d[sample(1:100,25),] 37 | diag <- TDAstats::calculate_homology(df, 38 | dim = 1, 39 | threshold = 2) 40 | 41 | # get minimum death radius of any data cluster 42 | min_death_H0 <- 43 | min(diag[which(diag[,1] == 0),3L]) 44 | 45 | # get birth and death radius of the loop 46 | loop_birth <- as.numeric(diag[nrow(diag),2L]) 47 | loop_death <- as.numeric(diag[nrow(diag),3L]) 48 | 49 | # compute VR graphs at radii half of 50 | # min_death_H0 and the mean of loop_birth and 51 | # loop_death, returning clusters 52 | graphs <- vr_graphs(X = df,eps = 53 | c(0.5*min_death_H0,(loop_birth + loop_death)/2)) 54 | 55 | # verify that there are 25 clusters for the smaller radius 56 | length(graphs$graphs[[1]]$clusters) 57 | 58 | } 59 | } 60 | \references{ 61 | A Zomorodian, The tidy set: A minimal simplicial set for computing homology of clique complexes in Proceedings of the Twenty-Sixth Annual Symposium on Computational Geometry, SoCG ’10. (Association for Computing Machinery, New York, NY, USA), p. 257–266 (2010). 62 | } 63 | \seealso{ 64 | \code{\link{plot_vr_graph}} for plotting VR graphs. 65 | } 66 | \author{ 67 | Shael Brown - \email{shaelebrown@gmail.com} 68 | } 69 | -------------------------------------------------------------------------------- /man/predict_diagram_ksvm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{predict_diagram_ksvm} 4 | \alias{predict_diagram_ksvm} 5 | \title{Predict the outcome labels for a list of persistence diagrams using a pre-trained diagram ksvm model.} 6 | \usage{ 7 | predict_diagram_ksvm( 8 | new_diagrams, 9 | model, 10 | K = NULL, 11 | num_workers = parallelly::availableCores(omit = 1) 12 | ) 13 | } 14 | \arguments{ 15 | \item{new_diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `new_diagrams` and `K` need to be supplied.} 16 | 17 | \item{model}{the output of a \code{\link{diagram_ksvm}} function call, of class 'diagram_ksvm'.} 18 | 19 | \item{K}{an optional cross-Gram matrix of the new diagrams and the diagrams in `model`, default NULL. If not NULL then `new_diagrams` does not need to be supplied.} 20 | 21 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 22 | } 23 | \value{ 24 | a vector containing the output of \code{\link[kernlab]{predict.ksvm}} on the cross Gram matrix of the new diagrams and the support vector diagrams stored in the model. 25 | } 26 | \description{ 27 | Returns the predicted response vector of the model on the new diagrams. 28 | } 29 | \details{ 30 | This function is a wrapper of the kernlab \code{\link{predict}} function. 
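(A short illustrative complement to the example below, which predicts from a precomputed cross-Gram matrix: prediction can also be run directly from the new diagrams, at the cost of recomputing the kernel internally. `model_svm` and `g_new` are assumed to be the objects built in that example.)

# hypothetical sketch, assuming model_svm and g_new from the example below
predict_diagram_ksvm(new_diagrams = g_new,model = model_svm,num_workers = 2)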
31 | } 32 | \examples{ 33 | 34 | if(require("TDAstats")) 35 | { 36 | # create four diagrams 37 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 38 | dim = 1,threshold = 2) 39 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 40 | dim = 1,threshold = 2) 41 | D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 42 | dim = 1,threshold = 2) 43 | D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 44 | dim = 1,threshold = 2) 45 | g <- list(D1,D2,D3,D4) 46 | 47 | # create response vector 48 | y <- as.factor(c("circle","circle","sphere","sphere")) 49 | 50 | # fit model without cross validation 51 | model_svm <- diagram_ksvm(diagrams = g,cv = 1,dim = c(0), 52 | y = y,sigma = c(1),t = c(1), 53 | num_workers = 2) 54 | 55 | # create two new diagrams 56 | D5 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 57 | dim = 1,threshold = 2) 58 | D6 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 59 | dim = 1,threshold = 2) 60 | g_new <- list(D5,D6) 61 | 62 | # predict with precomputed Gram matrix 63 | K <- gram_matrix(diagrams = g_new,other_diagrams = model_svm$diagrams, 64 | dim = model_svm$best_model$dim,sigma = model_svm$best_model$sigma, 65 | t = model_svm$best_model$t,num_workers = 2) 66 | predict_diagram_ksvm(K = K,model = model_svm,num_workers = 2) 67 | } 68 | } 69 | \seealso{ 70 | \code{\link{diagram_ksvm}} for training a SVM model on a training set of persistence diagrams and labels. 71 | } 72 | \author{ 73 | Shael Brown - \email{shaelebrown@gmail.com} 74 | } 75 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | > All changes to TDApplied are documented here. 2 | 3 | > Additions referenced with relevant [GitHub Issue](https://github.com/shaelebrown/TDApplied/issues) or 4 | [Pull Request](https://github.com/shaelebrown/TDApplied/pulls) number. 5 | Please see those for more details. 6 | 7 | # 3.0.4 8 | - fixed distance exponentiation and group permuting in permutation test 9 | - added permutation model inference procedure 10 | - added universal null and enclosing radius functions 11 | 12 | # 3.0.3 13 | - Updated documentation for JOSS paper release 14 | 15 | # 3.0.2 16 | - all CRAN issues for this update were caused by the rho parameter, which invokes external C++ code. We therefore fixed these issues by removing the rho parameter from the predict_diagram_kpca and diagram_distance examples, from all tests and in the ML_and_inference.Rmd file. 
This parameter has been kept and is still tested, just not tested on CRAN 17 | - sped up the independence_test example by only showing the Gram-matrix approach 18 | - removed warnings from benchmarking plots in Speed.Rmd 19 | - removed dependency on package TDA which is currently unavailable on CRAN 20 | 21 | # 3.0.1 22 | - same updates as 3.0.0 but with more efficient vignette building 23 | 24 | # 3.0.0 25 | - added ability to precompute distance/Gram matrices for ML and inference functions 26 | - added fast approximation to Fisher information metric 27 | - added vignettes for speedups, HCP analysis, personalized analyses and distance calculation comparisons (and removed those parts from the main vignette) 28 | - fixed issues with cv model fitting in diagram_ksvm 29 | - added automatic calculation of t parameters in diagram_ksvm 30 | - decreased memory load on parallel functions (except for permutation test loss function) 31 | - added checks for 0 variance distance matrices in diagram_ksvm 32 | - added comparisons against package rgudhi 33 | - updated DESCRIPTION 34 | - added interpretations tools for vr graphs and multiple representative (co) cycles 35 | - improved HCP analysis 36 | - resolved some distance 0 cases in diagram_distance 37 | 38 | # 2.0.4 39 | - fixed build issues related to use of suggested packages in tests, examples and vignettes 40 | 41 | # 2.0.3 42 | - fixed bootstrap reference in vignette 43 | 44 | # 2.0.2 45 | - set seed in vignette for reproducibility (which is reset at the end) 46 | - added more examples of TDA applications in publications 47 | 48 | # 2.0.1 49 | - increased testing coverage 50 | - fixed issue with th parameter in diagram_kpca 51 | - fixed issue with gamma distribution in independence_test 52 | - added applied analysis of TDApplied on HCP data to package vignette 53 | 54 | # 2.0.0 55 | 56 | - added PyH function for fast persistence diagram calculations with python 57 | - added bootstrap_persistence_thresholds for finding "real" topological features in a data set 58 | - added plot_diagram function for plotting persistence diagrams, with or without persistence thresholds 59 | - fixed problem with diagram_distance in which one of the two diagrams was empty in the 60 | desired dimension 61 | 62 | # 0.1.3 63 | 64 | - fixed small bug with computing mean cv model error for svm 65 | - added tryCatch's around parallelized code to ensure that clusters are closed even when errors occur 66 | 67 | # 0.1.2 68 | 69 | - fixed bug with mds test and properly cleaned up parallelization clusters 70 | 71 | # 0.1.1 72 | 73 | - Fixed bug with one diagram_mds test, although code was working properly 74 | 75 | # 0.1.0 76 | 77 | - Initial version -------------------------------------------------------------------------------- /R/enclosing_rad.R: -------------------------------------------------------------------------------- 1 | #### COMPUTE enclosing RADIUS #### 2 | #' Compute the enclosing radius for a dataset. 3 | #' 4 | #' The enclosing radius is the minimum (Euclidean distance) radius beyond which no topological changes will occur. 5 | #' 6 | #' @param X the input dataset, must either be a matrix or data frame. 7 | #' @param distance_mat whether or not `X` is a distance matrix, default FALSE. 8 | #' @return the numeric enclosing radius. 
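# Illustrative aside, not taken from the package source: since the enclosing radius
# described above is the radius beyond which no topological changes occur, it is
# typically passed as the threshold of a persistent homology calculation.
# A minimal sketch, assuming TDAstats is installed:
if(require("TDAstats"))
{
  df <- data.frame(x = rnorm(20),y = rnorm(20))
  enc_rad <- enclosing_radius(df,distance_mat = FALSE)
  diag <- TDAstats::calculate_homology(df,dim = 1,threshold = enc_rad)
}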
9 | #' @export 10 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 11 | #' @examples 12 | #' 13 | #' # create a persistence diagram from a 2D Gaussian 14 | #' df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 15 | #' 16 | #' # compute the enclosing radius from the point cloud 17 | #' enc_rad <- enclosing_radius(df, distance_mat = FALSE) 18 | #' 19 | #' # compute the distance matrix manually, stored as a matrix 20 | #' dist_df <- as.matrix(dist(df)) 21 | #' 22 | #' # compute the enclosing radius from the distance matrix 23 | #' enc_rad <- enclosing_radius(dist_df, distance_mat = TRUE) 24 | enclosing_radius <- function(X, distance_mat = FALSE){ 25 | 26 | # error check parameters 27 | if(is.null(distance_mat)) 28 | { 29 | stop("distance_mat must not be NULL.") 30 | } 31 | if(length(distance_mat) > 1 | !inherits(distance_mat,"logical")) 32 | { 33 | stop("distance_mat must be a single logical (i.e. T or F).") 34 | } 35 | if(is.na(distance_mat) | is.nan(distance_mat) ) 36 | { 37 | stop("distance_mat must not be NA/NAN.") 38 | } 39 | 40 | if(!inherits(X,"data.frame") & !inherits(X,"matrix")) 41 | { 42 | stop("X must either be a dataframe or a matrix.") 43 | } 44 | if(nrow(X) < 2 | ncol(X) < 1) 45 | { 46 | stop("X must have at least two rows and one column.") 47 | } 48 | if(length(which(stats::complete.cases(X) == F)) > 0) 49 | { 50 | stop("X must not contain any missing values.") 51 | } 52 | if(distance_mat == T & (ncol(X) != nrow(X) | !inherits(X,"matrix"))) 53 | { 54 | stop("if distance_mat is TRUE then X must be a square matrix.") 55 | } 56 | if((inherits(X,"matrix") & !inherits(X[1,1],"numeric")) | (inherits(X,"data.frame") & length(which(unlist(lapply(X,is.numeric)))) < ncol(X))) 57 | { 58 | stop("X must have only numeric entries.") 59 | } 60 | 61 | # if X is not a distance matrix, compute distance mat 62 | if(!distance_mat) 63 | { 64 | X <- as.matrix(dist(X)) 65 | # dist_X <- dist(X) 66 | # n <- nrow(X) 67 | # return(min(sapply(1:n,FUN = function(X){ 68 | # 69 | # col_inds <- c() 70 | # if(X > 1) 71 | # { 72 | # num_cols <- X - 1 73 | # col <- 1 74 | # pos <- X - 1 75 | # while(col < num_cols) 76 | # { 77 | # col_inds <- c(col_inds, pos) 78 | # col <- col + 1 79 | # pos <- pos + n - col 80 | # } 81 | # } 82 | # 83 | # row_inds <- c() 84 | # if(X < n) 85 | # { 86 | # lower_bound <- n*(X - 1) - X*(X - 1)/2 + 1 87 | # upper_bound <- lower_bound + n - X 88 | # if(X == n - 1) 89 | # { 90 | # upper_bound <- upper_bound - 1 91 | # } 92 | # row_inds <- c(lower_bound:upper_bound) 93 | # } 94 | # inds <- c(row_inds, col_inds) 95 | # 96 | # return(max(dist_X[inds])) 97 | # 98 | # }))) 99 | } 100 | 101 | enc_rad <- min(apply(X, MARGIN = 1L, max)) 102 | return(enc_rad) 103 | 104 | } 105 | -------------------------------------------------------------------------------- /man/predict_diagram_kpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{predict_diagram_kpca} 4 | \alias{predict_diagram_kpca} 5 | \title{Project persistence diagrams into a low-dimensional space via a pre-computed kernel PCA embedding.} 6 | \usage{ 7 | predict_diagram_kpca( 8 | new_diagrams, 9 | K = NULL, 10 | embedding, 11 | num_workers = parallelly::availableCores(omit = 1) 12 | ) 13 | } 14 | \arguments{ 15 | \item{new_diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like 
ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `new_diagrams` and `K` need to be supplied.} 16 | 17 | \item{K}{an optional precomputed cross-Gram matrix of the new diagrams and the ones used in `embedding`, default NULL. If not NULL then `new_diagrams` does not need to be supplied.} 18 | 19 | \item{embedding}{the output of a \code{\link{diagram_kpca}} function call, of class 'diagram_kpca'.} 20 | 21 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 22 | } 23 | \value{ 24 | the data projection (rotation), stored as a numeric matrix. Each row corresponds to the same-index diagram in `new_diagrams`. 25 | } 26 | \description{ 27 | Compute the location in low-dimensional space of each element of a list of new persistence diagrams using a 28 | previously-computed kernel PCA embedding (from the \code{\link{diagram_kpca}} function). 29 | } 30 | \examples{ 31 | 32 | if(require("TDAstats")) 33 | { 34 | # create six diagrams 35 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 36 | dim = 1,threshold = 2) 37 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 38 | dim = 1,threshold = 2) 39 | D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 40 | dim = 1,threshold = 2) 41 | D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 42 | dim = 1,threshold = 2) 43 | D5 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 44 | dim = 1,threshold = 2) 45 | D6 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 46 | dim = 1,threshold = 2) 47 | g <- list(D1,D2,D3,D4,D5,D6) 48 | 49 | # calculate their 2D PCA embedding with sigma = t = 2 in dimension 0 50 | pca <- diagram_kpca(diagrams = g,dim = 1,t = 2,sigma = 2, 51 | features = 2,num_workers = 2,th = 1e-6) 52 | 53 | # project two new diagrams onto old model 54 | D7 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,50),], 55 | dim = 0,threshold = 2) 56 | D8 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,50),], 57 | dim = 0,threshold = 2) 58 | g_new <- list(D7,D8) 59 | 60 | # calculate new embedding coordinates 61 | new_pca <- predict_diagram_kpca(new_diagrams = g_new,embedding = pca,num_workers = 2) 62 | 63 | # repeat with precomputed Gram matrix, gives same result but much faster 64 | K <- gram_matrix(diagrams = g_new,other_diagrams = pca$diagrams,dim = pca$dim, 65 | t = pca$t,sigma = pca$sigma,num_workers = 2) 66 | new_pca <- predict_diagram_kpca(K = K,embedding = pca,num_workers = 2) 67 | } 68 | } 69 | \seealso{ 70 | \code{\link{diagram_kpca}} for embedding persistence diagrams into a low-dimensional space. 
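(A brief illustrative follow-on: because the return value is a plain numeric matrix with one row per new diagram, the projected coordinates can be plotted directly. `new_pca` is assumed to be the object computed in the example above with `features = 2`.)

# hypothetical sketch, assuming new_pca from the example above
plot(new_pca[,1],new_pca[,2],xlab = "embedding coordinate 1",ylab = "embedding coordinate 2")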
71 | } 72 | \author{ 73 | Shael Brown - \email{shaelebrown@gmail.com} 74 | } 75 | -------------------------------------------------------------------------------- /tests/testthat/test-MDS.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("diagram_mds detects incorrect parameters correctly",{ 3 | 4 | D <- data.frame(dimension = c(0),birth = c(0),death = c(1)) 5 | expect_error(diagram_mds(diagrams = list(D,D,"D"),num_workers = 2),"Diagrams") 6 | expect_error(diagram_mds(diagrams = list(),num_workers = 2),"2") 7 | expect_error(diagram_mds(diagrams = list(D,D,D),distance = NaN,num_workers = 2),"distance") 8 | expect_error(diagram_mds(diagrams = list(D,D,D),distance = "fisher",sigma = NULL,num_workers = 2),"sigma") 9 | expect_error(diagram_mds(diagrams = list(D,D,D),p = NaN,num_workers = 2),"p") 10 | expect_error(diagram_mds(diagrams = list(D,D,D),k = -1,num_workers = 2),"k") 11 | 12 | }) 13 | 14 | test_that("diagram_mds is computing correctly",{ 15 | 16 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 17 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 18 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 19 | d12 <- diagram_distance(D1,D2,dim = 0) # 2-wasserstein 20 | d13 <- diagram_distance(D1,D3,dim = 0) 21 | d23 <- diagram_distance(D2,D3,dim = 0) 22 | D <- matrix(data = c(0,d12,d13,d12,0,d23,d13,d23,0),byrow = T,nrow = 3,ncol = 3)^2 23 | D <- scale(D,center = T,scale = F) 24 | D <- t(scale(t(D),center = T,scale = F)) 25 | S <- -D/2 26 | ev <- eigen(S) 27 | embedding <- -1*t(diag(sqrt(ev$values[1:2])) %*% t(ev$vectors[,1:2])) 28 | dimnames(embedding) <- list(NULL,NULL) 29 | dmds <- diagram_mds(diagrams = list(D1,D2,D3),num_workers = 2) 30 | if(embedding[1,1] < 0) 31 | { 32 | embedding[,1] <- embedding[,1]/-1 33 | } 34 | if(dmds[1,1] < 0) 35 | { 36 | dmds[,1] <- dmds[,1]/-1 37 | } 38 | if(embedding[1,2] < 0) 39 | { 40 | embedding[,2] <- embedding[,2]/-1 41 | } 42 | if(dmds[1,2] < 0) 43 | { 44 | dmds[,2] <- dmds[,2]/-1 45 | } 46 | expect_equal((abs(dmds[1,1])-abs(embedding[1,1]))+(abs(dmds[2,1])-abs(embedding[2,1]))+(abs(dmds[3,1])-abs(embedding[3,1])) + (abs(dmds[1,2])-abs(embedding[1,2]))+(abs(dmds[2,2])-abs(embedding[2,2]))+(abs(dmds[3,2])-abs(embedding[3,2])),0) 47 | 48 | }) 49 | 50 | # test_that("diagram_mds can accept inputs from TDA, TDAstats and diagram_to_df",{ 51 | # 52 | # skip_if_not_installed("TDA") 53 | # skip_if_not_installed("TDAstats") 54 | # 55 | # D1 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1) 56 | # D2 = TDA::alphaComplexDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxdimension = 1) 57 | # D3 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1,library = "dionysus",location = T) 58 | # D4 = TDAstats::calculate_homology(data.frame(x = runif(50,0,1),y = runif(50,0,1)),threshold = 1) 59 | # expect_type(diagram_mds(diagrams = list(D1,D2,D3,D4),dim = 1,num_workers = 2),"double") 60 | # expect_error(diagram_mds(diagrams = list(D1,D2,D3,D4),dim = 0,num_workers = 2),"Inf") 61 | # 62 | # }) 63 | 64 | # test_that("diagram_mds can take distance matrix input",{ 65 | # 66 | # skip_if_not_installed("TDA") 67 | # skip_if_not_installed("TDAstats") 68 | # 69 | # D1 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1) 70 | # D2 = TDA::alphaComplexDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxdimension = 1) 71 | # D3 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = 
runif(50,0,1)),maxscale = 1,maxdimension = 1,library = "dionysus",location = T) 72 | # D4 = TDAstats::calculate_homology(data.frame(x = runif(50,0,1),y = runif(50,0,1)),threshold = 1) 73 | # D = distance_matrix(list(D1,D2,D3,D4),dim = 1,num_workers = 2) 74 | # expect_type(diagram_mds(D = D,dim = 1,num_workers = 2),"double") 75 | # 76 | # }) 77 | 78 | -------------------------------------------------------------------------------- /man/distance_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distance_calculations.R 3 | \name{distance_matrix} 4 | \alias{distance_matrix} 5 | \title{Compute a distance matrix from a list of persistence diagrams.} 6 | \usage{ 7 | distance_matrix( 8 | diagrams, 9 | other_diagrams = NULL, 10 | dim = 0, 11 | distance = "wasserstein", 12 | p = 2, 13 | sigma = NULL, 14 | rho = NULL, 15 | num_workers = parallelly::availableCores(omit = 1) 16 | ) 17 | } 18 | \arguments{ 19 | \item{diagrams}{a list of persistence diagrams, either the output of persistent homology calculations like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 20 | 21 | \item{other_diagrams}{either NULL (default) or another list of persistence diagrams to compute a cross-distance matrix.} 22 | 23 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 24 | 25 | \item{distance}{a character determining which metric to use, either "wasserstein" (default) or "fisher".} 26 | 27 | \item{p}{a number representing the wasserstein power parameter, at least 1 and default 2.} 28 | 29 | \item{sigma}{a positive number representing the bandwidth of the Fisher information metric, default NULL.} 30 | 31 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If not NULL then matrix is calculated sequentially, but functions in the "exec" directory 32 | of the package can be loaded to calculate distance matrices in parallel with approximation.} 33 | 34 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 35 | } 36 | \value{ 37 | the numeric distance matrix. 38 | } 39 | \description{ 40 | Calculate the distance matrix \eqn{d} for either a single list of persistence diagrams \eqn{(D_1,D_2,\dots,D_n)}, i.e. \eqn{d[i,j] = d(D_i,D_j)}, 41 | or between two lists, \eqn{(D_1,D_2,\dots,D_n)} and \eqn{(D'_1,D'_2,\dots,D'_n)}, \eqn{d[i,j] = d(D_i,D'_j)}, in parallel. 42 | } 43 | \details{ 44 | Distance matrices of persistence diagrams are used in downstream analyses, like in the 45 | \code{\link{diagram_mds}}, \code{\link{permutation_test}} and \code{\link{diagram_ksvm}} functions. 46 | If `distance` is "fisher" then `sigma` must not be NULL. Since the matrix is computed sequentially when 47 | approximating the Fisher information metric this is only recommended when the persistence diagrams 48 | contain many points and when the number of available cores is small. 
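(An illustrative sketch of the downstream reuse mentioned in the details above: a distance matrix is computed once and then fed into diagram_mds. Passing the matrix via a `D` argument mirrors the commented-out test in tests/testthat/test-MDS.R earlier in this dump, so treat that argument name as an assumption rather than settled API; the distance_matrix call itself follows the examples below.)

if(require("TDAstats"))
{
  D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),],dim = 0,threshold = 2)
  D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),],dim = 0,threshold = 2)
  g <- list(D1,D2,D1,D2)

  # compute the 2-wasserstein distance matrix once...
  D <- distance_matrix(diagrams = g,dim = 0,distance = "wasserstein",p = 2,num_workers = 2)

  # ...then reuse it for multidimensional scaling (D argument assumed, see note above)
  emb <- diagram_mds(D = D,k = 2,num_workers = 2)
}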
49 | } 50 | \examples{ 51 | 52 | if(require("TDAstats")) 53 | { 54 | # create two diagrams 55 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 56 | dim = 0,threshold = 2) 57 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 58 | dim = 0,threshold = 2) 59 | g <- list(D1,D2) 60 | 61 | # calculate their distance matrix in dimension 0 with the persistence Fisher metric 62 | # using 2 cores 63 | D <- distance_matrix(diagrams = g,dim = 0,distance = "fisher",sigma = 1,num_workers = 2) 64 | 65 | # calculate their distance matrix in dimension 0 with the 2-wasserstein metric 66 | # using 2 cores 67 | D <- distance_matrix(diagrams = g,dim = 0,distance = "wasserstein",p = 2,num_workers = 2) 68 | 69 | # now do the cross distance matrix, which is the same as the previous 70 | D_cross <- distance_matrix(diagrams = g,other_diagrams = g, 71 | dim = 0,distance = "wasserstein", 72 | p = 2,num_workers = 2) 73 | } 74 | } 75 | \seealso{ 76 | \code{\link{diagram_distance}} for individual distance calculations. 77 | } 78 | \author{ 79 | Shael Brown - \email{shaelebrown@gmail.com} 80 | } 81 | -------------------------------------------------------------------------------- /src/kd_split.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_split.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Methods for splitting kd-trees 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_KD_SPLIT_H 26 | #define ANN_KD_SPLIT_H 27 | 28 | #include "kd_tree.h" // kd-tree definitions 29 | 30 | //---------------------------------------------------------------------- 31 | // External entry points 32 | // These are all splitting procedures for kd-trees. 
33 | //---------------------------------------------------------------------- 34 | 35 | void kd_split( // standard (optimized) kd-splitter 36 | ANNpointArray pa, // point array (unaltered) 37 | ANNidxArray pidx, // point indices (permuted on return) 38 | const ANNorthRect &bnds, // bounding rectangle for cell 39 | int n, // number of points 40 | int dim, // dimension of space 41 | int &cut_dim, // cutting dimension (returned) 42 | ANNcoord &cut_val, // cutting value (returned) 43 | int &n_lo); // num of points on low side (returned) 44 | 45 | void midpt_split( // midpoint kd-splitter 46 | ANNpointArray pa, // point array (unaltered) 47 | ANNidxArray pidx, // point indices (permuted on return) 48 | const ANNorthRect &bnds, // bounding rectangle for cell 49 | int n, // number of points 50 | int dim, // dimension of space 51 | int &cut_dim, // cutting dimension (returned) 52 | ANNcoord &cut_val, // cutting value (returned) 53 | int &n_lo); // num of points on low side (returned) 54 | 55 | void sl_midpt_split( // sliding midpoint kd-splitter 56 | ANNpointArray pa, // point array (unaltered) 57 | ANNidxArray pidx, // point indices (permuted on return) 58 | const ANNorthRect &bnds, // bounding rectangle for cell 59 | int n, // number of points 60 | int dim, // dimension of space 61 | int &cut_dim, // cutting dimension (returned) 62 | ANNcoord &cut_val, // cutting value (returned) 63 | int &n_lo); // num of points on low side (returned) 64 | 65 | void fair_split( // fair-split kd-splitter 66 | ANNpointArray pa, // point array (unaltered) 67 | ANNidxArray pidx, // point indices (permuted on return) 68 | const ANNorthRect &bnds, // bounding rectangle for cell 69 | int n, // number of points 70 | int dim, // dimension of space 71 | int &cut_dim, // cutting dimension (returned) 72 | ANNcoord &cut_val, // cutting value (returned) 73 | int &n_lo); // num of points on low side (returned) 74 | 75 | void sl_fair_split( // sliding fair-split kd-splitter 76 | ANNpointArray pa, // point array (unaltered) 77 | ANNidxArray pidx, // point indices (permuted on return) 78 | const ANNorthRect &bnds, // bounding rectangle for cell 79 | int n, // number of points 80 | int dim, // dimension of space 81 | int &cut_dim, // cutting dimension (returned) 82 | ANNcoord &cut_val, // cutting value (returned) 83 | int &n_lo); // num of points on low side (returned) 84 | 85 | #endif -------------------------------------------------------------------------------- /man/diagram_kkmeans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{diagram_kkmeans} 4 | \alias{diagram_kkmeans} 5 | \title{Cluster a group of persistence diagrams using kernel k-means.} 6 | \usage{ 7 | diagram_kkmeans( 8 | diagrams, 9 | K = NULL, 10 | centers, 11 | dim = 0, 12 | t = 1, 13 | sigma = 1, 14 | rho = NULL, 15 | num_workers = parallelly::availableCores(omit = 1), 16 | ... 
17 | ) 18 | } 19 | \arguments{ 20 | \item{diagrams}{a list of n>=2 persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or the \code{\link{diagram_to_df}} function.} 21 | 22 | \item{K}{an optional precomputed Gram matrix of persistence diagrams, default NULL.} 23 | 24 | \item{centers}{number of clusters to initialize, no more than the number of diagrams although smaller values are recommended.} 25 | 26 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 27 | 28 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 29 | 30 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 31 | 32 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, Gram matrix calculation is sequential.} 33 | 34 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 35 | 36 | \item{...}{additional parameters for the \code{\link[kernlab]{kkmeans}} kernlab function.} 37 | } 38 | \value{ 39 | a list of class 'diagram_kkmeans' containing the output of \code{\link[kernlab]{kkmeans}} on the Gram matrix, i.e. a list containing the elements 40 | 41 | \describe{ 42 | 43 | \item{clustering}{an S4 object of class specc, the output of a \code{\link[kernlab]{kkmeans}} function call. The `.Data` slot of this object contains cluster memberships, `withinss` contains the within-cluster sum of squares for each cluster, etc.} 44 | 45 | \item{diagrams}{the input `diagrams` argument.} 46 | 47 | \item{dim}{the input `dim` argument.} 48 | 49 | \item{t}{the input `t` argument.} 50 | 51 | \item{sigma}{the input `sigma` argument.} 52 | 53 | } 54 | } 55 | \description{ 56 | Finds latent cluster labels for a group of persistence diagrams, using a kernelized version 57 | of the popular k-means algorithm. An optimal number of clusters may be determined by analyzing 58 | the withinss field of the clustering object over several values of k. 59 | } 60 | \details{ 61 | Returns the output of \code{\link[kernlab]{kkmeans}} on the desired Gram matrix of a group of persistence diagrams 62 | in a particular dimension. The additional list elements stored in the output are needed 63 | to estimate cluster labels for new persistence diagrams in the `predict_diagram_kkmeans` 64 | function. 65 | } 66 | \examples{ 67 | 68 | if(require("TDAstats")) 69 | { 70 | # create two diagrams 71 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 72 | dim = 1,threshold = 2) 73 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 74 | dim = 1,threshold = 2) 75 | g <- list(D1,D1,D2,D2) 76 | 77 | # calculate kmeans clusters with centers = 2, and sigma = t = 2 in dimension 0 78 | clust <- diagram_kkmeans(diagrams = g,centers = 2,dim = 0,t = 2,sigma = 2,num_workers = 2) 79 | 80 | # repeat with precomputed Gram matrix, gives the same result just much faster 81 | K <- gram_matrix(diagrams = g,num_workers = 2,t = 2,sigma = 2) 82 | cluster <- diagram_kkmeans(diagrams = g,K = K,centers = 2,dim = 0,sigma = 2,t = 2) 83 | 84 | } 85 | } 86 | \references{ 87 | Dhillon, I and Guan, Y and Kulis, B (2004). "A Unified View of Kernel k-means , Spectral Clustering and Graph Cuts." 
\url{https://people.bu.edu/bkulis/pubs/spectral_techreport.pdf}. 88 | } 89 | \seealso{ 90 | \code{\link{predict_diagram_kkmeans}} for predicting cluster labels of new diagrams. 91 | } 92 | \author{ 93 | Shael Brown - \email{shaelebrown@gmail.com} 94 | } 95 | -------------------------------------------------------------------------------- /src/bd_fix_rad_search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: bd_fix_rad_search.cpp 6 | // Programmer: David Mount 7 | // Description: Standard bd-tree search 8 | // Last modified: 05/03/05 (Version 1.1) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). This software is provided under 15 | // the provisions of the Lesser GNU Public License (LGPL). See the 16 | // file ../ReadMe.txt for further information. 17 | // 18 | // The University of Maryland (U.M.) and the authors make no 19 | // representations about the suitability or fitness of this software for 20 | // any purpose. It is provided "as is" without express or implied 21 | // warranty. 22 | //---------------------------------------------------------------------- 23 | // History: 24 | // Revision 1.1 05/03/05 25 | // Initial release 26 | //---------------------------------------------------------------------- 27 | 28 | #include "bd_tree.h" // bd-tree declarations 29 | #include "kd_fix_rad_search.h" // kd-tree FR search declarations 30 | 31 | //---------------------------------------------------------------------- 32 | // Approximate searching for bd-trees. 33 | // See the file kd_FR_search.cpp for general information on the 34 | // approximate nearest neighbor search algorithm. Here we 35 | // include the extensions for shrinking nodes. 36 | //---------------------------------------------------------------------- 37 | 38 | //---------------------------------------------------------------------- 39 | // bd_shrink::ann_FR_search - search a shrinking node 40 | //---------------------------------------------------------------------- 41 | 42 | void ANNbd_shrink::ann_FR_search(ANNdist box_dist) 43 | { 44 | // check dist calc term cond. 45 | if (ANNmaxPtsVisited != 0 && ANNptsVisited > ANNmaxPtsVisited) return; 46 | 47 | ANNdist inner_dist = 0; // distance to inner box 48 | for (int i = 0; i < n_bnds; i++) { // is query point in the box? 49 | if (bnds[i].out(ANNkdFRQ)) { // outside this bounding side? 
50 | // add to inner distance 51 | inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNkdFRQ)); 52 | } 53 | } 54 | if (inner_dist <= box_dist) { // if inner box is closer 55 | child[ANN_IN]->ann_FR_search(inner_dist);// search inner child first 56 | child[ANN_OUT]->ann_FR_search(box_dist);// ...then outer child 57 | } 58 | else { // if outer box is closer 59 | child[ANN_OUT]->ann_FR_search(box_dist);// search outer child first 60 | child[ANN_IN]->ann_FR_search(inner_dist);// ...then outer child 61 | } 62 | ANN_FLOP(3*n_bnds) // increment floating ops 63 | ANN_SHR(1) // one more shrinking node 64 | } 65 | 66 | 67 | //---------------------------------------------------------------------- 68 | // bd_shrink::ann_FR_search - search a shrinking node 69 | //---------------------------------------------------------------------- 70 | 71 | void ANNbd_shrink::ann_FR_searchFlops(ANNdist box_dist) 72 | { 73 | // check dist calc term cond. 74 | if (ANNmaxPtsVisited != 0 && ANNptsVisited > ANNmaxPtsVisited) return; 75 | 76 | ANNdist inner_dist = 0; // distance to inner box 77 | for (int i = 0; i < n_bnds; i++) { // is query point in the box? 78 | if (bnds[i].out(ANNkdFRQ)) { // outside this bounding side? 79 | // add to inner distance 80 | inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNkdFRQ)); 81 | } 82 | } 83 | if (inner_dist <= box_dist) { // if inner box is closer 84 | child[ANN_IN]->ann_FR_searchFlops(inner_dist);// search inner child first 85 | child[ANN_OUT]->ann_FR_searchFlops(box_dist);// ...then outer child 86 | } 87 | else { // if outer box is closer 88 | child[ANN_OUT]->ann_FR_searchFlops(box_dist);// search outer child first 89 | child[ANN_IN]->ann_FR_searchFlops(inner_dist);// ...then outer child 90 | } 91 | ANN_FLOP_ALWAYS(3*n_bnds) // increment floating ops 92 | ANN_SHR(1) // one more shrinking node 93 | } -------------------------------------------------------------------------------- /man/diagram_distance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distance_calculations.R 3 | \name{diagram_distance} 4 | \alias{diagram_distance} 5 | \title{Calculate distance between a pair of persistence diagrams.} 6 | \usage{ 7 | diagram_distance( 8 | D1, 9 | D2, 10 | dim = 0, 11 | p = 2, 12 | distance = "wasserstein", 13 | sigma = NULL, 14 | rho = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{D1}{the first persistence diagram.} 19 | 20 | \item{D2}{the second persistence diagram.} 21 | 22 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 23 | 24 | \item{p}{a number representing the wasserstein power parameter, at least 1 and default 2.} 25 | 26 | \item{distance}{a string which determines which type of distance calculation to carry out, either "wasserstein" (default) or "fisher".} 27 | 28 | \item{sigma}{either NULL (default) or a positive number representing the bandwidth for the Fisher information metric.} 29 | 30 | \item{rho}{either NULL (default) or a positive number. If NULL then the exact calculation of the Fisher information metric is returned and otherwise a fast approximation, see details.} 31 | } 32 | \value{ 33 | the numeric value of the distance calculation. 
34 | } 35 | \description{ 36 | Calculates the distance between a pair of persistence diagrams, either the output from a \code{\link{diagram_to_df}} function call 37 | or from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, 38 | in a particular homological dimension. 39 | } 40 | \details{ 41 | The most common distance calculations between persistence diagrams 42 | are the wasserstein and bottleneck distances, both of which "match" points between 43 | their two input diagrams and compute the "loss" of the optimal matching 44 | (see \url{https://dl.acm.org/doi/10.1145/3064175} for details). Another 45 | method for computing distances, the Fisher information metric, 46 | converts the two diagrams into distributions 47 | defined on the plane, and calculates a distance between the resulting two distributions 48 | (\url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}). 49 | If the `distance` parameter is "fisher" then `sigma` must not be NULL. As noted in the Persistence Fisher paper, 50 | there is a fast speed-up approximation which has been implemented from \url{https://github.com/vmorariu/figtree} 51 | and can be accessed by setting the `rho` parameter. Smaller 52 | values of `rho` will result in tighter approximations at the expense of longer runtime, and vice versa. 53 | } 54 | \examples{ 55 | 56 | if(require("TDAstats")) 57 | { 58 | # create two diagrams 59 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,size = 20),], 60 | dim = 1,threshold = 2) 61 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,size = 20),], 62 | dim = 1,threshold = 2) 63 | 64 | # calculate 2-wasserstein distance between D1 and D2 in dimension 1 65 | diagram_distance(D1,D2,dim = 1,p = 2,distance = "wasserstein") 66 | 67 | # calculate bottleneck distance between D1 and D2 in dimension 0 68 | diagram_distance(D1,D2,dim = 0,p = Inf,distance = "wasserstein") 69 | 70 | # Fisher information metric calculation between D1 and D2 for sigma = 1 in dimension 1 71 | diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1) 72 | 73 | # repeat but with fast approximation 74 | \dontrun{ 75 | diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1,rho = 0.001) 76 | } 77 | } 78 | } 79 | \references{ 80 | Kerber M, Morozov D and Nigmetov A (2017). "Geometry Helps to Compare Persistence Diagrams." \url{https://dl.acm.org/doi/10.1145/3064175}. 81 | 82 | Le T, Yamada M (2018). "Persistence fisher kernel: a riemannian manifold kernel for persistence diagrams." \url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}. 83 | 84 | Vlad I. Morariu, Balaji Vasan Srinivasan, Vikas C. Raykar, Ramani Duraiswami, and Larry S. Davis. Automatic online tuning for fast Gaussian summation. Advances in Neural Information Processing Systems (NIPS), 2008. 85 | } 86 | \seealso{ 87 | \code{\link{distance_matrix}} for distance matrix calculations. 
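(An illustrative aside on the `rho` approximation discussed in the details above: tightening `rho` trades runtime for accuracy, which can be checked against the exact Fisher information metric. `D1` and `D2` are assumed to come from the examples above; the specific rho values are arbitrary choices for illustration.)

# hypothetical sketch, assuming D1 and D2 from the examples above
exact <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1)
rough <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1,rho = 0.1)
tight <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1,rho = 0.0001)
c(exact = exact,rough = rough,tight = tight) # tight should land closer to exact than rough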
88 | } 89 | \author{ 90 | Shael Brown - \email{shaelebrown@gmail.com} 91 | } 92 | -------------------------------------------------------------------------------- /man/plot_vr_graph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rips_complexes.R 3 | \name{plot_vr_graph} 4 | \alias{plot_vr_graph} 5 | \title{Plot a VR graph using the igraph package.} 6 | \usage{ 7 | plot_vr_graph( 8 | graphs, 9 | eps, 10 | cols = NULL, 11 | layout = NULL, 12 | title = NULL, 13 | component_of = NULL, 14 | plot_isolated_vertices = FALSE, 15 | return_layout = FALSE, 16 | vertex_labels = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{graphs}{the output of a `vr_graphs` function call.} 21 | 22 | \item{eps}{the numeric radius of the graph in `graphs` to plot.} 23 | 24 | \item{cols}{an optional character vector of vertex colors, default `NULL`.} 25 | 26 | \item{layout}{an optional 2D matrix of vertex coordinates, default `NULL`. If row names are supplied they can be used to subset a graph by those vertex names.} 27 | 28 | \item{title}{an optional str title for the plot, default `NULL`.} 29 | 30 | \item{component_of}{a vertex name (integer or character), only the component of the graph containing that vertex will be plotted (useful for identifying representative (co)cycles in graphs). Default `NULL` (plot the whole graph).} 31 | 32 | \item{plot_isolated_vertices}{a boolean representing whether or not to plot isolated vertices, default `FALSE`.} 33 | 34 | \item{return_layout}{a boolean representing whether or not to return the plotting layout (x-y coordinates of each vertex) and the vertex labels, default `FALSE`.} 35 | 36 | \item{vertex_labels}{a boolean representing whether or not to plot vertex labels, default `TRUE`.} 37 | } 38 | \value{ 39 | if `return_layout` is `TRUE` then a list with elements "layout" (the numeric matrix of vertex x-y coordinates) and "vertices" (character vertex labels), otherwise the function does not return anything. 40 | } 41 | \description{ 42 | This function will throw an error if the igraph package is not installed. 
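A minimal sketch, assuming only the documented `vr_graphs` and `plot_vr_graph` interfaces and two arbitrary radii, of guarding a call so a missing igraph installation is reported rather than triggering the error mentioned above:

if(requireNamespace("igraph",quietly = TRUE) && requireNamespace("TDAstats",quietly = TRUE))
{
  df <- TDAstats::circle2d[sample(1:100,25),]
  graphs <- vr_graphs(X = df,eps = c(0.5,1))
  plot_vr_graph(graphs = graphs,eps = 0.5)
}else
{
  message("igraph (or TDAstats) is not installed - skipping the VR graph plot.")
}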
43 | } 44 | \examples{ 45 | 46 | if(require("TDAstats") & require("igraph")) 47 | { 48 | # simulate data from the unit circle and calculate 49 | # its diagram 50 | df <- TDAstats::circle2d[sample(1:100,25),] 51 | diag <- TDAstats::calculate_homology(df, 52 | dim = 1, 53 | threshold = 2) 54 | 55 | # get minimum death radius of any data cluster 56 | min_death_H0 <- 57 | min(diag[which(diag[,1] == 0),3L]) 58 | 59 | # get birth and death radius of the loop 60 | loop_birth <- as.numeric(diag[nrow(diag),2L]) 61 | loop_death <- as.numeric(diag[nrow(diag),3L]) 62 | 63 | # compute VR graphs at radii half of 64 | # min_death_H0 and the mean of loop_birth and 65 | # loop_death, returning clusters 66 | graphs <- vr_graphs(X = df,eps = 67 | c(0.5*min_death_H0,(loop_birth + loop_death)/2)) 68 | 69 | # plot graph of smaller (first) radius 70 | plot_vr_graph(graphs = graphs,eps = 0.5*min_death_H0, 71 | plot_isolated_vertices = TRUE) 72 | 73 | # plot graph of larger (second) radius 74 | plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2) 75 | 76 | # repeat but with rownames for df, each vertex 77 | # will be plotted with its rownames 78 | rownames(df) <- paste0("V",1:25) 79 | graphs <- vr_graphs(X = df,eps = 80 | c(0.5*min_death_H0,(loop_birth + loop_death)/2)) 81 | plot_vr_graph(graphs = graphs,eps = 0.5*min_death_H0, 82 | plot_isolated_vertices = TRUE) 83 | 84 | # plot without vertex labels 85 | plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2, 86 | vertex_labels = FALSE) 87 | 88 | # plot only the graph component containing vertex "1" 89 | plot_vr_graph(graphs = graphs,eps = 0.5*min_death_H0, 90 | component_of = "V1",plot_isolated_vertices = TRUE) 91 | 92 | # save the layout of the graph for adding features to 93 | # the same graph layout, like color 94 | layout <- plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2, 95 | return_layout = TRUE,vertex_labels = TRUE) 96 | cols <- rep("blue",25) 97 | cols[1:5] <- "red" 98 | plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2,cols = cols, 99 | layout = layout) 100 | 101 | } 102 | } 103 | \seealso{ 104 | \code{\link{vr_graphs}} for computing VR graphs. 
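A small sketch, assuming the objects `df`, `graphs`, `loop_birth` and `loop_death` from the example above and only base R color utilities, of coloring vertices by a continuous per-point quantity via the documented `cols` argument:

vals <- atan2(df[,2],df[,1])                                  # a per-point quantity (here, the angle)
pal <- grDevices::colorRampPalette(c("blue","red"))(nrow(df))
cols <- pal[rank(vals,ties.method = "first")]                 # one color per data point
plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2,cols = cols)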
105 | } 106 | \author{ 107 | Shael Brown - \email{shaelebrown@gmail.com} 108 | } 109 | -------------------------------------------------------------------------------- /man/analyze_representatives.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyze_representatives.R 3 | \name{analyze_representatives} 4 | \alias{analyze_representatives} 5 | \title{Analyze the data point memberships of multiple representative (co)cycles.} 6 | \usage{ 7 | analyze_representatives( 8 | diagrams, 9 | dim, 10 | num_points, 11 | plot_heatmap = TRUE, 12 | return_contributions = FALSE, 13 | boxed_reps = NULL, 14 | d = NULL, 15 | lwd = NULL, 16 | title = NULL, 17 | return_clust = FALSE 18 | ) 19 | } 20 | \arguments{ 21 | \item{diagrams}{a list of persistence diagrams, either the output of persistent homology calculations like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, \code{\link{diagram_to_df}} or \code{\link{bootstrap_persistence_thresholds}}.} 22 | 23 | \item{dim}{the integer homological dimension of representatives to consider.} 24 | 25 | \item{num_points}{the integer number of data points in all the original datasets (from which the diagrams were calculated).} 26 | 27 | \item{plot_heatmap}{a boolean representing if a heatmap of data point membership similarity of the representatives should be plotted, default `TRUE`. A dendrogram of hierarchical clustering is plotted, and rows (representatives) are sorted according to this clustering.} 28 | 29 | \item{return_contributions}{a boolean indicating whether or not to return the membership contributions (i.e. percentages) of the data points (1:`num_points`) across all the representatives, default `FALSE`.} 30 | 31 | \item{boxed_reps}{a data frame specifying specific rows of the output heatmap which should have a box drawn around them (for highlighting), default NULL. See the details section for more information.} 32 | 33 | \item{d}{either NULL (default) or a "dist" object representing a distance matrix for the representatives, which must have the same number of rows and columns as cycles in the dimension `dim`.} 34 | 35 | \item{lwd}{a positive number width for the lines of drawn boxes, if boxed_reps is not null.} 36 | 37 | \item{title}{a character string title for the plotted heatmap, default NULL.} 38 | 39 | \item{return_clust}{a boolean determining whether or not to return the result of the `stats::hclust()` call when a heatmap is plotted, default `FALSE`.} 40 | } 41 | \value{ 42 | either a matrix of data point contributions to the representatives, or a list with elements "memberships" (the matrix) and some combination of elements "contributions" (a vector of membership percentages for each data point across representatives) and "clust" (the results of `stats::hclust()` on the membership matrix). 43 | } 44 | \description{ 45 | Multiple distance matrices with corresponding data points can contain the same topological features. 46 | Therefore we may wish to compare many representative (co)cycles across distance matrices to decide if their topological features are the same. 47 | The `analyze_representatives` function returns a matrix of binary datapoint memberships in an input list of representatives across distance matrices. 48 | Optionally this matrix can be plotted as a heatmap with columns as data points and rows (i.e. representatives) reordered by similarity, and the 49 | contributions (i.e. 
percentage membership) of each point in the representatives can also be returned. The heatmap has 50 | dark red squares representing membership - location [i,j] is dark red if data point j is in representative i. 51 | } 52 | \details{ 53 | The clustering dendrogram can be used to determine if there are any similar groups of representatives (i.e. 54 | shared topological features across datasets) and if so how many. The row labels of the heatmap are of the form 55 | 'DX[Y]', meaning the Yth representative of diagram X, and the column labels are the data point numbers. 56 | If diagrams are the output of the \code{\link{bootstrap_persistence_thresholds}} 57 | function, then the subsetted_representatives (if present) will be analyzed. Therefore, a row label like 'DX[Y]' in the 58 | plotted heatmap would mean the Yth representative of diagram X. If certain representatives should be highlighted (by drawing a box around their rows) 59 | in the heatmap, a dataframe `boxed_reps` can be supplied with two integer columns - 'diagram' and 'rep'. For example, if we wish to draw a box for DX[Y] then we 60 | add the row (diagram = X,rep = Y) to `boxed_reps`. If `d` is supplied then it will be used to cluster the representatives, based on the distances in `d`. 61 | } 62 | \author{ 63 | Shael Brown - \email{shaelebrown@gmail.com} 64 | } 65 | -------------------------------------------------------------------------------- /man/independence_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{independence_test} 4 | \alias{independence_test} 5 | \title{Independence test for two groups of persistence diagrams.} 6 | \usage{ 7 | independence_test( 8 | g1, 9 | g2, 10 | dims = c(0, 1), 11 | sigma = 1, 12 | rho = NULL, 13 | t = 1, 14 | num_workers = parallelly::availableCores(omit = 1), 15 | verbose = FALSE, 16 | Ks = NULL, 17 | Ls = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{g1}{the first group of persistence diagrams, where each diagram was either the output from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 22 | 23 | \item{g2}{the second group of persistence diagrams, where each diagram was either the output from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 24 | 25 | \item{dims}{a non-negative integer vector of the homological dimensions in which the test is to be carried out, default c(0,1).} 26 | 27 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 28 | 29 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, calculation of Gram matrices is sequential.} 30 | 31 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 32 | 33 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 34 | 35 | \item{verbose}{a boolean flag for if the time duration of the function call should be printed, default FALSE.} 36 | 37 | \item{Ks}{an optional list of precomputed Gram matrices for the first group of diagrams, with one element for each dimension.
If not NULL and `Ls` is not NULL then `g1` and `g2` do not need to be supplied.} 38 | 39 | \item{Ls}{an optional list of precomputed Gram matrices for the second group of diagrams, with one element for each dimension. If not NULL and `Ks` is not NULL then `g1` and `g2` do not need to be supplied.} 40 | } 41 | \value{ 42 | a list with the following elements: 43 | \describe{ 44 | 45 | \item{dimensions}{the input `dims` argument.} 46 | 47 | \item{test_statisics}{a numeric vector of the test statistic value in each dimension.} 48 | 49 | \item{p_values}{a numeric vector of the p-values in each dimension.} 50 | 51 | \item{run_time}{the run time of the function call, containing time units.} 52 | 53 | } 54 | } 55 | \description{ 56 | Carries out inference to determine if two groups of persistence diagrams are independent or not 57 | based on kernel calculations (see 58 | (\url{https://proceedings.neurips.cc/paper/2007/file/d5cfead94f5350c12c322b5b664544c1-Paper.pdf}) for details). 59 | A small p-value in a certain dimension suggests that the groups are not independent in that dimension. 60 | } 61 | \details{ 62 | The test is carried out with a parametric null distribution, making it much faster than non-parametric 63 | approaches. If all of the diagrams in either g1 or g2 are the same in some dimension, then some p-values may be NaN. 64 | } 65 | \examples{ 66 | 67 | if(require("TDAstats")) 68 | { 69 | # create two independent groups of diagrams of length 6, which 70 | # is the minimum length 71 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 72 | dim = 0,threshold = 2) 73 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 74 | dim = 0,threshold = 2) 75 | g1 <- list(D1,D2,D2,D2,D2,D2) 76 | g2 <- list(D2,D1,D1,D1,D1,D1) 77 | 78 | # do independence test with sigma = t = 1 in dimension 0, using 79 | # precomputed Gram matrices 80 | K = gram_matrix(diagrams = g1,dim = 0,t = 1,sigma = 1,num_workers = 2) 81 | L = gram_matrix(diagrams = g2,dim = 0,t = 1,sigma = 1,num_workers = 2) 82 | indep_test <- independence_test(Ks = list(K),Ls = list(L),dims = c(0)) 83 | 84 | } 85 | } 86 | \references{ 87 | Gretton A et al. (2007). "A Kernel Statistical Test of Independence." \url{https://proceedings.neurips.cc/paper/2007/file/d5cfead94f5350c12c322b5b664544c1-Paper.pdf}. 88 | } 89 | \seealso{ 90 | \code{\link{permutation_test}} for an inferential group difference test for groups of persistence diagrams. 91 | } 92 | \author{ 93 | Shael Brown - \email{shaelebrown@gmail.com} 94 | } 95 | -------------------------------------------------------------------------------- /src/bd_tree.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: bd_tree.h 3 | // Programmer: David Mount 4 | // Description: Declarations for standard bd-tree routines 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) 
and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | // Revision 1.0 04/01/05 24 | // Changed IN, OUT to ANN_IN, ANN_OUT 25 | //---------------------------------------------------------------------- 26 | 27 | #ifndef ANN_bd_tree_H 28 | #define ANN_bd_tree_H 29 | 30 | #include "ANNx.h" // all ANN includes 31 | #include "kd_tree.h" // kd-tree includes 32 | 33 | //---------------------------------------------------------------------- 34 | // bd-tree shrinking node. 35 | // The main addition in the bd-tree is the shrinking node, which 36 | // is declared here. 37 | // 38 | // Shrinking nodes are defined by list of orthogonal halfspaces. 39 | // These halfspaces define a (possibly unbounded) orthogonal 40 | // rectangle. There are two children, in and out. Points that 41 | // lie within this rectangle are stored in the in-child, and the 42 | // other points are stored in the out-child. 43 | // 44 | // We use a list of orthogonal halfspaces rather than an 45 | // orthogonal rectangle object because typically the number of 46 | // sides of the shrinking box will be much smaller than the 47 | // worst case bound of 2*dim. 48 | // 49 | // BEWARE: Note that constructor just copies the pointer to the 50 | // bounding array, but the destructor deallocates it. This is 51 | // rather poor practice, but happens to be convenient. The list 52 | // is allocated in the bd-tree building procedure rbd_tree() just 53 | // prior to construction, and is used for no other purposes. 54 | // 55 | // WARNING: In the near neighbor searching code it is assumed that 56 | // the list of bounding halfspaces is irredundant, meaning that there 57 | // are no two distinct halfspaces in the list with the same outward 58 | // pointing normals. 
59 | //---------------------------------------------------------------------- 60 | 61 | class ANNbd_shrink : public ANNkd_node // splitting node of a kd-tree 62 | { 63 | int n_bnds; // number of bounding halfspaces 64 | ANNorthHSArray bnds; // list of bounding halfspaces 65 | ANNkd_ptr child[2]; // in and out children 66 | public: 67 | ANNbd_shrink( // constructor 68 | int nb, // number of bounding halfspaces 69 | ANNorthHSArray bds, // list of bounding halfspaces 70 | ANNkd_ptr ic=NULL, ANNkd_ptr oc=NULL) // children 71 | { 72 | n_bnds = nb; // cutting dimension 73 | bnds = bds; // assign bounds 74 | child[ANN_IN] = ic; // set children 75 | child[ANN_OUT] = oc; 76 | } 77 | 78 | ~ANNbd_shrink() // destructor 79 | { 80 | if (child[ANN_IN]!= NULL && child[ANN_IN]!= KD_TRIVIAL) 81 | delete child[ANN_IN]; 82 | if (child[ANN_OUT]!= NULL&& child[ANN_OUT]!= KD_TRIVIAL) 83 | delete child[ANN_OUT]; 84 | if (bnds != NULL) 85 | delete [] bnds; // delete bounds 86 | } 87 | 88 | virtual void getStats( // get tree statistics 89 | int dim, // dimension of space 90 | ANNkdStats &st, // statistics 91 | ANNorthRect &bnd_box); // bounding box 92 | virtual void print(int level, ostream &out);// print node 93 | virtual void dump(ostream &out); // dump node 94 | 95 | virtual void ann_search(ANNdist); // standard search 96 | virtual void ann_pri_search(ANNdist); // priority search 97 | virtual void ann_FR_search(ANNdist); // fixed-radius search 98 | 99 | // added by Vlad 5-1-08 to update flops even when ANN_PERF is not defined 100 | virtual void ann_FR_searchFlops(ANNdist); // fixed-radius search 101 | }; 102 | 103 | #endif -------------------------------------------------------------------------------- /src/brute.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: brute.cpp 6 | // Programmer: Sunil Arya and David Mount 7 | // Description: Brute-force nearest neighbors 8 | // Last modified: 05/03/05 (Version 1.1) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). This software is provided under 15 | // the provisions of the Lesser GNU Public License (LGPL). See the 16 | // file ../ReadMe.txt for further information. 17 | // 18 | // The University of Maryland (U.M.) and the authors make no 19 | // representations about the suitability or fitness of this software for 20 | // any purpose. It is provided "as is" without express or implied 21 | // warranty. 22 | //---------------------------------------------------------------------- 23 | // History: 24 | // Revision 0.1 03/04/98 25 | // Initial release 26 | // Revision 1.1 05/03/05 27 | // Added fixed-radius kNN search 28 | //---------------------------------------------------------------------- 29 | 30 | #include "ANNx.h" // all ANN includes 31 | #include "pr_queue_k.h" // k element priority queue 32 | 33 | //---------------------------------------------------------------------- 34 | // Brute-force search simply stores a pointer to the list of 35 | // data points and searches linearly for the nearest neighbor. 36 | // The k nearest neighbors are stored in a k-element priority 37 | // queue (which is implemented in a pretty dumb way as well). 
38 | // 39 | // If ANN_ALLOW_SELF_MATCH is ANNfalse then data points at distance 40 | // zero are not considered. 41 | // 42 | // Note that the error bound eps is passed in, but it is ignored. 43 | // These routines compute exact nearest neighbors (which is needed 44 | // for validation purposes in ann_test.cpp). 45 | //---------------------------------------------------------------------- 46 | 47 | ANNbruteForce::ANNbruteForce( // constructor from point array 48 | ANNpointArray pa, // point array 49 | int n, // number of points 50 | int dd) // dimension 51 | { 52 | dim = dd; n_pts = n; pts = pa; 53 | } 54 | 55 | ANNbruteForce::~ANNbruteForce() { } // destructor (empty) 56 | 57 | void ANNbruteForce::annkSearch( // approx k near neighbor search 58 | ANNpoint q, // query point 59 | int k, // number of near neighbors to return 60 | ANNidxArray nn_idx, // nearest neighbor indices (returned) 61 | ANNdistArray dd, // dist to near neighbors (returned) 62 | double eps) // error bound (ignored) 63 | { 64 | ANNmin_k mk(k); // construct a k-limited priority queue 65 | int i; 66 | 67 | if (k > n_pts) { // too many near neighbors? 68 | annError((char *)"Requesting more near neighbors than data points", ANNabort); 69 | } 70 | // run every point through queue 71 | for (i = 0; i < n_pts; i++) { 72 | // compute distance to point 73 | ANNdist sqDist = annDist(dim, pts[i], q); 74 | if (ANN_ALLOW_SELF_MATCH || sqDist != 0) 75 | mk.insert(sqDist, i); 76 | } 77 | for (i = 0; i < k; i++) { // extract the k closest points 78 | dd[i] = mk.ith_smallest_key(i); 79 | nn_idx[i] = mk.ith_smallest_info(i); 80 | } 81 | } 82 | 83 | int ANNbruteForce::annkFRSearch( // approx fixed-radius kNN search 84 | ANNpoint q, // query point 85 | ANNdist sqRad, // squared radius 86 | int k, // number of near neighbors to return 87 | ANNidxArray nn_idx, // nearest neighbor array (returned) 88 | ANNdistArray dd, // dist to near neighbors (returned) 89 | double eps) // error bound 90 | { 91 | ANNmin_k mk(k); // construct a k-limited priority queue 92 | int i; 93 | int pts_in_range = 0; // number of points in query range 94 | // run every point through queue 95 | for (i = 0; i < n_pts; i++) { 96 | // compute distance to point 97 | ANNdist sqDist = annDist(dim, pts[i], q); 98 | if (sqDist <= sqRad && // within radius bound 99 | (ANN_ALLOW_SELF_MATCH || sqDist != 0)) { // ...and no self match 100 | mk.insert(sqDist, i); 101 | pts_in_range++; 102 | } 103 | } 104 | for (i = 0; i < k; i++) { // extract the k closest points 105 | if (dd != NULL) 106 | dd[i] = mk.ith_smallest_key(i); 107 | if (nn_idx != NULL) 108 | nn_idx[i] = mk.ith_smallest_info(i); 109 | } 110 | 111 | return pts_in_range; 112 | } -------------------------------------------------------------------------------- /man/diagram_kpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{diagram_kpca} 4 | \alias{diagram_kpca} 5 | \title{Calculate the kernel PCA embedding of a group of persistence diagrams.} 6 | \usage{ 7 | diagram_kpca( 8 | diagrams, 9 | K = NULL, 10 | dim = 0, 11 | t = 1, 12 | sigma = 1, 13 | rho = NULL, 14 | features = 1, 15 | num_workers = parallelly::availableCores(omit = 1), 16 | th = 1e-04 17 | ) 18 | } 19 | \arguments{ 20 | \item{diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or 
\code{\link{diagram_to_df}}.} 21 | 22 | \item{K}{an optional precomputed Gram matrix of the persistence diagrams in `diagrams`, default NULL.} 23 | 24 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 25 | 26 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 27 | 28 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 29 | 30 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, Gram matrix calculation is sequential.} 31 | 32 | \item{features}{number of features (principal components) to return, default 1.} 33 | 34 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 35 | 36 | \item{th}{the threshold value under which principal components are ignored (default 0.0001).} 37 | } 38 | \value{ 39 | a list of class 'diagram_kpca' containing the elements 40 | 41 | \describe{ 42 | 43 | \item{pca}{the output of kernlab's \code{\link[kernlab]{kpca}} function on the Gram matrix: an S4 object containing the slots `pcv` (a matrix containing the principal component vectors (column wise)), `eig` (the corresponding eigenvalues), `rotated` (the original data projected (rotated) on the principal components) and `xmatrix` (the original data matrix).} 44 | 45 | \item{diagrams}{the input `diagrams` argument.} 46 | 47 | \item{t}{the input `t` argument.} 48 | 49 | \item{sigma}{the input `sigma` argument.} 50 | 51 | \item{dim}{the input `dim` argument.} 52 | 53 | } 54 | } 55 | \description{ 56 | Project a group of persistence diagrams into a low-dimensional embedding space using 57 | a kernelized version of the popular PCA algorithm. 58 | } 59 | \details{ 60 | Returns the output of kernlab's \code{\link[kernlab]{kpca}} function on the desired Gram matrix of a group of persistence diagrams 61 | in a particular dimension. The prediction function \code{\link{predict_diagram_kpca}} can be used to 62 | project new persistence diagrams using an old embedding, and this could be one practical 63 | advantage of using \code{\link{diagram_kpca}} over \code{\link{diagram_mds}}. The embedding coordinates can also 64 | be used for further analysis, or simply as a data visualization tool for persistence diagrams. 
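A brief sketch, assuming the fitted object `pca` from the examples below (with `features = 2`) and only the documented `rotated` slot of the returned `pca` element, of plotting the embedding coordinates directly:

emb <- pca$pca@rotated    # one row per input diagram, one column per requested feature
plot(emb[,1],emb[,2],
     xlab = "Feature 1",ylab = "Feature 2",
     main = "Kernel PCA embedding of the input diagrams")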
65 | } 66 | \examples{ 67 | 68 | if(require("TDAstats")) 69 | { 70 | # create six diagrams 71 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 72 | dim = 1,threshold = 2) 73 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 74 | dim = 1,threshold = 2) 75 | D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 76 | dim = 1,threshold = 2) 77 | D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 78 | dim = 1,threshold = 2) 79 | D5 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 80 | dim = 1,threshold = 2) 81 | D6 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 82 | dim = 1,threshold = 2) 83 | g <- list(D1,D2,D3,D4,D5,D6) 84 | 85 | # calculate their 2D PCA embedding with sigma = t = 2 in dimension 1 86 | pca <- diagram_kpca(diagrams = g,dim = 1,t = 2,sigma = 2,features = 2,num_workers = 2,th = 1e-6) 87 | 88 | # repeat with precomputed Gram matrix, gives same result but much faster 89 | K <- gram_matrix(diagrams = g,dim = 1,t = 2,sigma = 2,num_workers = 2) 90 | pca <- diagram_kpca(diagrams = g,K = K,dim = 1,t = 2,sigma = 2,features = 2,th = 1e-6) 91 | 92 | } 93 | } 94 | \references{ 95 | Scholkopf, B and Smola, A and Muller, K (1998). "Nonlinear Component Analysis as a Kernel Eigenvalue Problem." \url{https://www.mlpack.org/papers/kpca.pdf}. 96 | } 97 | \seealso{ 98 | \code{\link{predict_diagram_kpca}} for predicting embedding coordinates of new diagrams. 99 | } 100 | \author{ 101 | Shael Brown - \email{shaelebrown@gmail.com} 102 | } 103 | -------------------------------------------------------------------------------- /man/diagram_mds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{diagram_mds} 4 | \alias{diagram_mds} 5 | \title{Dimension reduction of a group of persistence diagrams via metric multidimensional scaling.} 6 | \usage{ 7 | diagram_mds( 8 | diagrams, 9 | D = NULL, 10 | k = 2, 11 | distance = "wasserstein", 12 | dim = 0, 13 | p = 2, 14 | sigma = NULL, 15 | rho = NULL, 16 | eig = FALSE, 17 | add = FALSE, 18 | x.ret = FALSE, 19 | list. = eig || add || x.ret, 20 | num_workers = parallelly::availableCores(omit = 1) 21 | ) 22 | } 23 | \arguments{ 24 | \item{diagrams}{a list of n>=2 persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `diagrams` and `D` need to be supplied.} 25 | 26 | \item{D}{an optional precomputed distance matrix of persistence diagrams, default NULL. 
If not NULL then `diagrams` parameter does not need to be supplied.} 27 | 28 | \item{k}{the dimension of the space which the data are to be represented in; must be in \{1,2,...,n-1\}.} 29 | 30 | \item{distance}{a string representing the desired distance metric to be used, either 'wasserstein' (default) or 'fisher'.} 31 | 32 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 33 | 34 | \item{p}{a positive number representing the wasserstein power, a number at least 1 (infinity for the bottleneck distance), default 2.} 35 | 36 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default NULL.} 37 | 38 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, distance matrix calculation is sequential.} 39 | 40 | \item{eig}{a boolean indicating whether the eigenvalues should be returned.} 41 | 42 | \item{add}{a boolean indicating if an additive constant c* should be computed, and added to the non-diagonal dissimilarities such that the modified dissimilarities are Euclidean.} 43 | 44 | \item{x.ret}{a boolean indicating whether the doubly centered symmetric distance matrix should be returned.} 45 | 46 | \item{list.}{a boolean indicating if a list should be returned or just the n*k matrix.} 47 | 48 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 49 | } 50 | \value{ 51 | the output of \code{\link[stats]{cmdscale}} on the diagram distance matrix. If `list.` is false (as per default), 52 | a matrix with `k` columns whose rows give the coordinates of the points chosen to represent the dissimilarities. 53 | 54 | Otherwise, a list containing the following components. 55 | 56 | \describe{ 57 | 58 | \item{points}{a matrix with `k` columns whose rows give the coordinates of the points chosen to represent the dissimilarities.} 59 | 60 | \item{eig}{the \eqn{n} eigenvalues computed during the scaling process if `eig` is true.} 61 | 62 | \item{x}{the doubly centered distance matrix if `x.ret` is true.} 63 | 64 | \item{ac}{the additive constant \eqn{c*}, 0 if `add` = FALSE.} 65 | 66 | \item{GOF}{the numeric vector of length 2, representing the sum of all the eigenvalues divided by the sum of their absolute values (first vector element) or by the sum of the max of each eigenvalue and 0 (second vector element).} 67 | 68 | } 69 | } 70 | \description{ 71 | Projects a group of persistence diagrams (or a precomputed distance matrix of diagrams) into a low-dimensional 72 | embedding space via metric multidimensional scaling. Such a projection can be used for visualization of data, 73 | or a static analysis of the embedding dimensions. 74 | } 75 | \details{ 76 | Returns the output of \code{\link[stats]{cmdscale}} on the desired distance matrix of a group of persistence diagrams 77 | in a particular dimension. If `distance` is "fisher" then `sigma` must not be NULL. 
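A short sketch, assuming the precomputed distance matrix `Dmat` from the examples below and only the documented `points`, `eig` and `GOF` return components, of checking how well the embedding represents the diagram distances:

mds <- diagram_mds(D = Dmat,k = 1,eig = TRUE)
mds$points    # the 1D embedding coordinates
mds$GOF       # goodness-of-fit of the k-dimensional representation
mds$eig       # eigenvalues of the doubly centered distance matrix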
78 | } 79 | \examples{ 80 | 81 | if(require("TDAstats")) 82 | { 83 | # create two diagrams 84 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 85 | dim = 1,threshold = 2) 86 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 87 | dim = 1,threshold = 2) 88 | g <- list(D1,D2) 89 | 90 | # calculate their 1D MDS embedding in dimension 0 with the bottleneck distance 91 | mds <- diagram_mds(diagrams = g,k = 1,dim = 0,p = Inf,num_workers = 2) 92 | 93 | # repeat but with a precomputed distance matrix, gives same result just much faster 94 | Dmat <- distance_matrix(diagrams = list(D1,D2),dim = 0,p = Inf,num_workers = 2) 95 | mds <- diagram_mds(D = Dmat,k = 1) 96 | 97 | } 98 | } 99 | \references{ 100 | Cox M and Cox F (2008). "Multidimensional Scaling." \doi{10.1007/978-3-540-33037-0_14}. 101 | } 102 | \author{ 103 | Shael Brown - \email{shaelebrown@gmail.com} 104 | } 105 | -------------------------------------------------------------------------------- /src/pr_queue.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: pr_queue.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Include file for priority queue and related 5 | // structures. 6 | // Last modified: 01/04/05 (Version 1.0) 7 | //---------------------------------------------------------------------- 8 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 9 | // David Mount. All Rights Reserved. 10 | // 11 | // This software and related documentation is part of the Approximate 12 | // Nearest Neighbor Library (ANN). This software is provided under 13 | // the provisions of the Lesser GNU Public License (LGPL). See the 14 | // file ../ReadMe.txt for further information. 15 | // 16 | // The University of Maryland (U.M.) and the authors make no 17 | // representations about the suitability or fitness of this software for 18 | // any purpose. It is provided "as is" without express or implied 19 | // warranty. 20 | //---------------------------------------------------------------------- 21 | // History: 22 | // Revision 0.1 03/04/98 23 | // Initial release 24 | //---------------------------------------------------------------------- 25 | 26 | #ifndef PR_QUEUE_H 27 | #define PR_QUEUE_H 28 | 29 | #include "ANNx.h" // all ANN includes 30 | #include "ANNperf.h" // performance evaluation 31 | 32 | //---------------------------------------------------------------------- 33 | // Basic types. 34 | //---------------------------------------------------------------------- 35 | typedef void *PQinfo; // info field is generic pointer 36 | typedef ANNdist PQkey; // key field is distance 37 | 38 | //---------------------------------------------------------------------- 39 | // Priority queue 40 | // A priority queue is a list of items, along with associated 41 | // priorities. The basic operations are insert and extract_minimum. 42 | // 43 | // The priority queue is maintained using a standard binary heap. 44 | // (Implementation note: Indexing is performed from [1..max] rather 45 | // than the C standard of [0..max-1]. This simplifies parent/child 46 | // computations.) User information consists of a void pointer, 47 | // and the user is responsible for casting this quantity into whatever 48 | // useful form is desired. 49 | // 50 | // Because the priority queue is so central to the efficiency of 51 | // query processing, all the code is inline. 
52 | //---------------------------------------------------------------------- 53 | 54 | class ANNpr_queue { 55 | 56 | struct pq_node { // node in priority queue 57 | PQkey key; // key value 58 | PQinfo info; // info field 59 | }; 60 | int n; // number of items in queue 61 | int max_size; // maximum queue size 62 | pq_node *pq; // the priority queue (array of nodes) 63 | 64 | public: 65 | ANNpr_queue(int max) // constructor (given max size) 66 | { 67 | n = 0; // initially empty 68 | max_size = max; // maximum number of items 69 | pq = new pq_node[max+1]; // queue is array [1..max] of nodes 70 | } 71 | 72 | ~ANNpr_queue() // destructor 73 | { delete [] pq; } 74 | 75 | ANNbool empty() // is queue empty? 76 | { if (n==0) return ANNtrue; else return ANNfalse; } 77 | 78 | ANNbool non_empty() // is queue nonempty? 79 | { if (n==0) return ANNfalse; else return ANNtrue; } 80 | 81 | void reset() // make existing queue empty 82 | { n = 0; } 83 | 84 | inline void insert( // insert item (inlined for speed) 85 | PQkey kv, // key value 86 | PQinfo inf) // item info 87 | { 88 | if (++n > max_size) annError((char *)"Priority queue overflow.", ANNabort); 89 | int r = n; 90 | while (r > 1) { // sift up new item 91 | int p = r/2; 92 | ANN_FLOP(1) // increment floating ops 93 | if (pq[p].key <= kv) // in proper order 94 | break; 95 | pq[r] = pq[p]; // else swap with parent 96 | r = p; 97 | } 98 | pq[r].key = kv; // insert new item at final location 99 | pq[r].info = inf; 100 | } 101 | 102 | inline void extr_min( // extract minimum (inlined for speed) 103 | PQkey &kv, // key (returned) 104 | PQinfo &inf) // item info (returned) 105 | { 106 | kv = pq[1].key; // key of min item 107 | inf = pq[1].info; // information of min item 108 | PQkey kn = pq[n--].key;// last item in queue 109 | int p = 1; // p points to item out of position 110 | int r = p<<1; // left child of p 111 | while (r <= n) { // while r is still within the heap 112 | ANN_FLOP(2) // increment floating ops 113 | // set r to smaller child of p 114 | if (r < n && pq[r].key > pq[r+1].key) r++; 115 | if (kn <= pq[r].key) // in proper order 116 | break; 117 | pq[p] = pq[r]; // else swap with child 118 | p = r; // advance pointers 119 | r = p<<1; 120 | } 121 | pq[p] = pq[n+1]; // insert last item in proper place 122 | } 123 | }; 124 | 125 | #endif -------------------------------------------------------------------------------- /man/universal_null.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{universal_null} 4 | \alias{universal_null} 5 | \title{Filtering topological features with the universal null distribution.} 6 | \usage{ 7 | universal_null( 8 | X, 9 | FUN_diag = "calculate_homology", 10 | maxdim = 1, 11 | thresh, 12 | distance_mat = FALSE, 13 | ripser = NULL, 14 | ignore_infinite_cluster = TRUE, 15 | calculate_representatives = FALSE, 16 | alpha = 0.05, 17 | return_pvals = FALSE, 18 | infinite_cycle_inference = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{X}{the input dataset, must either be a matrix or data frame.} 23 | 24 | \item{FUN_diag}{a string representing the persistent homology function to use for calculating the full persistence diagram, either 25 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 26 | 27 | \item{maxdim}{the integer maximum homological dimension for persistent homology, default 0.} 28 | 29 | \item{thresh}{the positive numeric maximum radius of the Vietoris-Rips 
filtration.} 30 | 31 | \item{distance_mat}{a boolean representing if `X` is a distance matrix (TRUE) or not (FALSE, default).} 32 | 33 | 34 | \item{ripser}{the imported ripser module when `FUN_diag` is `PyH`.} 35 | 36 | \item{ignore_infinite_cluster}{a boolean indicating whether or not to ignore the infinitely lived cluster when `FUN_diag` is `PyH`. If infinite cycle inference is to be performed, 37 | this parameter should be set to FALSE.} 38 | 39 | \item{calculate_representatives}{a boolean representing whether to calculate representative (co)cycles, default FALSE. Note that representatives cannot be 40 | calculated when using the 'calculate_homology' function. Note that representatives cannot be computed for (significant) infinite cycles.} 41 | 42 | \item{alpha}{the type-1 error threshold, default 0.05.} 43 | 44 | \item{return_pvals}{a boolean representing whether or not to return p-values for features in the subsetted diagram as well as a list of p-value thresholds, default FALSE. 45 | Infinite cycles that are significant (see below) will have p-value NA in this list, as the true value is unknown but less than its dimension's p-value threshold.} 46 | 47 | \item{infinite_cycle_inference}{a boolean representing whether or not to perform inference for features with infinite (i.e. `thresh`) death values, default FALSE. If `FUN_diag` is `calculate_homology` (the 48 | default) then no infinite cycles will be returned by the persistent homology calculation at all.} 49 | } 50 | \value{ 51 | a list containing the full persistence diagram, the subsetted diagram, representatives and/or subsetted representatives if desired, the p-values of subsetted features and the Bonferroni p-value thresholds in each dimension if desired. 52 | } 53 | \description{ 54 | An inference procedure to determine which topological features (if any) of a dataset are likely signal (i.e. significant) 55 | vs noise (not). 56 | } 57 | \details{ 58 | For each feature in a diagram we compute its persistence ratio \eqn{\pi = death/birth}, and a 59 | test statistic \eqn{A log log \pi + B} (where \eqn{A} and \eqn{B} are constants). This statistic is compared to a left-skewed Gumbel distribution 60 | to get a p-value. A Bonferroni correction is applied to all the p-values across all features, so when `return_pvals` is TRUE a list of 61 | p-value thresholds is also returned, one for each dimension, which is `alpha` divided by the number of features in that dimension. 62 | If desired, infinite cycles (i.e. cycles whose death value is equal to the maximum distance threshold parameter for the persistent homology calculation) 63 | can be analyzed for significance by determining their minimum distance thresholds where they might be significant (using the Gumbel distribution again), 64 | calculating the persistence diagram up to those thresholds and seeing if they are still infinite (i.e. significant) or not. 65 | This function is significantly faster than the \code{\link{bootstrap_persistence_thresholds}} function. Note that the `calculate_homology` 66 | function does not seem to store infinite cycles (i.e. cycles that have death value equal to `thresh`).
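A minimal follow-up sketch, assuming the object `res` from the examples below (computed with `return_pvals = TRUE`) and only the return element names shown in those examples, for summarizing the surviving features:

sig <- res$subsetted_diag       # the features judged significant
table(sig[,1L])                 # number of significant features per homological dimension
unlist(res$pvals)               # their p-values (NA marks significant infinite cycles)
unlist(res$alpha_thresh)        # the Bonferroni cutoff applied in each dimension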
67 | } 68 | \examples{ 69 | 70 | if(require("TDA")) 71 | { 72 | # create dataset 73 | theta <- runif(n = 100,min = 0,max = 2*pi) 74 | x <- cos(theta) 75 | y <- sin(theta) 76 | circ <- data.frame(x = x,y = y) 77 | 78 | # add noise 79 | x_noise <- -0.1 + 0.2*stats::runif(n = 100) 80 | y_noise <- -0.1 + 0.2*stats::runif(n = 100) 81 | circ$x <- circ$x + x_noise 82 | circ$y <- circ$y + y_noise 83 | 84 | # determine significant topological features 85 | library(TDA) 86 | res <- universal_null(circ, thresh = 2,alpha = 0.1,return_pvals = TRUE,FUN_diag = "ripsDiag") 87 | res$subsetted_diag 88 | res$pvals 89 | res$alpha_thresh 90 | 91 | # at a lower threshold we can check for 92 | # infinite cycles 93 | res2 <- universal_null(circ, thresh = 1.1, 94 | infinite_cycle_inference = TRUE, 95 | alpha = 0.1, 96 | FUN_diag = "ripsDiag") 97 | res2$subsetted_diag 98 | } 99 | } 100 | \references{ 101 | Bobrowski O, Skraba P (2023). "A universal null-distribution for topological data analysis." \url{https://www.nature.com/articles/s41598-023-37842-2}. 102 | } 103 | \author{ 104 | Shael Brown - \email{shaelebrown@gmail.com} 105 | } 106 | -------------------------------------------------------------------------------- /src/pr_queue_k.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: pr_queue_k.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Include file for priority queue with k items. 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef PR_QUEUE_K_H 26 | #define PR_QUEUE_K_H 27 | 28 | #include "ANNx.h" // all ANN includes 29 | #include "ANNperf.h" // performance evaluation 30 | 31 | //---------------------------------------------------------------------- 32 | // Basic types 33 | //---------------------------------------------------------------------- 34 | typedef ANNdist PQKkey; // key field is distance 35 | typedef int PQKinfo; // info field is int 36 | 37 | //---------------------------------------------------------------------- 38 | // Constants 39 | // The NULL key value is used to initialize the priority queue, and 40 | // so it should be larger than any valid distance, so that it will 41 | // be replaced as legal distance values are inserted. The NULL 42 | // info value must be a nonvalid array index, we use ANN_NULL_IDX, 43 | // which is guaranteed to be negative. 
44 | //---------------------------------------------------------------------- 45 | 46 | const PQKkey PQ_NULL_KEY = ANN_DIST_INF; // nonexistent key value 47 | const PQKinfo PQ_NULL_INFO = ANN_NULL_IDX; // nonexistent info value 48 | 49 | //---------------------------------------------------------------------- 50 | // ANNmin_k 51 | // An ANNmin_k structure is one which maintains the smallest 52 | // k values (of type PQKkey) and associated information (of type 53 | // PQKinfo). The special info and key values PQ_NULL_INFO and 54 | // PQ_NULL_KEY means that thise entry is empty. 55 | // 56 | // It is currently implemented using an array with k items. 57 | // Items are stored in increasing sorted order, and insertions 58 | // are made through standard insertion sort. (This is quite 59 | // inefficient, but current applications call for small values 60 | // of k and relatively few insertions.) 61 | // 62 | // Note that the list contains k+1 entries, but the last entry 63 | // is used as a simple placeholder and is otherwise ignored. 64 | //---------------------------------------------------------------------- 65 | 66 | class ANNmin_k { 67 | struct mk_node { // node in min_k structure 68 | PQKkey key; // key value 69 | PQKinfo info; // info field (user defined) 70 | }; 71 | 72 | int k; // max number of keys to store 73 | int n; // number of keys currently active 74 | mk_node *mk; // the list itself 75 | 76 | public: 77 | ANNmin_k(int max) // constructor (given max size) 78 | { 79 | n = 0; // initially no items 80 | k = max; // maximum number of items 81 | mk = new mk_node[max+1]; // sorted array of keys 82 | } 83 | 84 | ~ANNmin_k() // destructor 85 | { delete [] mk; } 86 | 87 | PQKkey ANNmin_key() // return minimum key 88 | { return (n > 0 ? mk[0].key : PQ_NULL_KEY); } 89 | 90 | PQKkey max_key() // return maximum key 91 | { return (n == k ? mk[k-1].key : PQ_NULL_KEY); } 92 | 93 | PQKkey ith_smallest_key(int i) // ith smallest key (i in [0..n-1]) 94 | { return (i < n ? mk[i].key : PQ_NULL_KEY); } 95 | 96 | PQKinfo ith_smallest_info(int i) // info for ith smallest (i in [0..n-1]) 97 | { return (i < n ? 
mk[i].info : PQ_NULL_INFO); } 98 | 99 | inline void insert( // insert item (inlined for speed) 100 | PQKkey kv, // key value 101 | PQKinfo inf) // item info 102 | { 103 | int i; 104 | // slide larger values up 105 | for (i = n; i > 0; i--) { 106 | if (mk[i-1].key > kv) 107 | mk[i] = mk[i-1]; 108 | else 109 | break; 110 | } 111 | mk[i].key = kv; // store element here 112 | mk[i].info = inf; 113 | if (n < k) n++; // increment number of items 114 | ANN_FLOP(k-i+1) // increment floating ops 115 | } 116 | 117 | // added by Vlad 5-1-08 to allow user to update flops by calling this 118 | // function even when ANN_PERF is not defined 119 | inline void insertFlops( // insert item (inlined for speed) 120 | PQKkey kv, // key value 121 | PQKinfo inf) // item info 122 | { 123 | int i; 124 | // slide larger values up 125 | for (i = n; i > 0; i--) { 126 | if (mk[i-1].key > kv) 127 | mk[i] = mk[i-1]; 128 | else 129 | break; 130 | } 131 | mk[i].key = kv; // store element here 132 | mk[i].info = inf; 133 | if (n < k) n++; // increment number of items 134 | ANN_FLOP_ALWAYS(k-i+1) // increment floating ops 135 | } 136 | }; 137 | 138 | #endif -------------------------------------------------------------------------------- /tests/testthat/test-kernel.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("diagram_kernel detects incorrect parameters correctly",{ 3 | 4 | D <- data.frame(dimension = c(0),birth = c(0),death = c(1)) 5 | expect_error(diagram_kernel(D1 = NULL,D2 = D,dim = 1),"TDA/TDAstats") 6 | expect_error(diagram_kernel(D1 = D,D2 = NULL,dim = 1),"TDA/TDAstats") 7 | expect_error(diagram_kernel(D1 = D,D2 = D,dim = "2"),"numeric") 8 | expect_error(diagram_kernel(D1 = D,D2 = D,sigma = "2"),"numeric") 9 | expect_error(diagram_kernel(D1 = D,D2 = D,t = NA),"NA") 10 | expect_error(diagram_kernel(D1 = D,D2 = D,t = -1),"positive") 11 | 12 | }) 13 | 14 | # test_that("diagram_kernel can accept inputs from either TDA/TDAstats homology output or diagram_to_df function, with or without cycle location",{ 15 | # 16 | # skip_if_not_installed("TDA") 17 | # skip_if_not_installed("TDAstats") 18 | # D1 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1) 19 | # D2 = TDA::alphaComplexDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxdimension = 1) 20 | # D3 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1,library = "dionysus",location = T) 21 | # D4 = TDAstats::calculate_homology(data.frame(x = runif(50,0,1),y = runif(50,0,1)),threshold = 1) 22 | # expect_gte(diagram_kernel(D1 = D1,D2 = D2,dim = 1),0) 23 | # expect_gte(diagram_kernel(D1 = diagram_to_df(D1),D2 = D2,dim = 1),0) 24 | # expect_gte(diagram_kernel(D1 = D1,D2 = diagram_to_df(D2),dim = 1),0) 25 | # expect_gte(diagram_kernel(D1 = D3,D2 = diagram_to_df(D2),dim = 1),0) 26 | # expect_gte(diagram_kernel(D1 = D1,D2 = diagram_to_df(D3),dim = 1),0) 27 | # expect_gte(diagram_kernel(D1 = D1,D2 = D4,dim = 1),0) 28 | # expect_error(diagram_kernel(D1 = D1,D2 = D2,dim = 0),"Inf") 29 | # 30 | # }) 31 | 32 | test_that("diagram_kernel is computing correctly",{ 33 | 34 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 35 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 36 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 37 | sqrt_rho_1 <- function(sigma) 38 | { 39 | v <- (1/(2*pi*sigma^2))*c(exp(0)+exp(-(0.45^2+0.55^2)/(2*sigma^2)),exp(-(0.1^2)/(2*sigma^2))+exp(-(2*0.55^2)/(2*sigma^2)),exp(-(2*0.5^2)/(2*sigma^2)) + 
exp(-(2*0.05^2)/(2*sigma^2)),exp(-(0.45^2+0.55^2)/(2*sigma^2)) + exp(0)) 40 | v <- v/sum(v) 41 | return(sqrt(v)) 42 | } 43 | sqrt_rho_2 <- function(sigma) 44 | { 45 | v <- (1/(2*pi*sigma^2))*c(exp(-(0.1^2)/(2*sigma^2))+exp(-(2*0.5^2)/(2*sigma^2)),exp(0)+exp(-(0.5^2+0.6^2)/(2*sigma^2)),exp(-(0.5^2+0.6^2)/(2*sigma^2)) + exp(0),exp(-(2*0.55^2)/(2*sigma^2)) + exp(-(2*0.05^2)/(2*sigma^2))) 46 | v <- v/sum(v) 47 | return(sqrt(v)) 48 | } 49 | v11 <- sqrt_rho_1(1) 50 | v21 <- sqrt_rho_2(1) 51 | v12 <- sqrt_rho_1(2) 52 | v22 <- sqrt_rho_2(2) 53 | norm_11 <- as.numeric(v11 %*% v21) 54 | norm_22 <- as.numeric(v12 %*% v22) 55 | if(norm_11 > 1) 56 | { 57 | norm_11 <- 1 58 | } 59 | if(norm_11 < -1) 60 | { 61 | norm_11 <- -1 62 | } 63 | if(norm_22 > 1) 64 | { 65 | norm_22 <- 1 66 | } 67 | if(norm_22 < -1) 68 | { 69 | norm_22 <- -1 70 | } 71 | val_1 <- acos(norm_11) 72 | val_2 <- acos(norm_22) 73 | expect_equal(diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),exp(-1*val_1)) 74 | expect_equal(diagram_kernel(D2,D1,dim = 0,sigma = 1,t = 1),exp(-1*val_1)) 75 | expect_equal(diagram_kernel(D1,D2,dim = 0,sigma = 2,t = 1),exp(-1*val_2)) 76 | expect_equal(diagram_kernel(D1 = D1,D2 = D2,dim = 0,sigma = 1,t = 2),exp(-2*val_1)) 77 | expect_equal(diagram_kernel(D1 = D1,D2 = D2,sigma = 2,t = 2),exp(-2*val_2)) 78 | expect_equal(diagram_kernel(D1 = D2,D2 = D1,sigma = 2,t = 2),exp(-2*val_2)) 79 | expect_identical(diagram_kernel(D1,D1,sigma = 1,t = 1),1) 80 | 81 | }) 82 | 83 | test_that("gram_matrix detect incorrect parameters correctly",{ 84 | 85 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 86 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 87 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 88 | expect_error(gram_matrix(diagrams = list(D1,D2,D3),num_workers = NaN),"NaN") 89 | expect_error(gram_matrix(diagrams = list(D1,D2,D3),num_workers = "1"),"numeric") 90 | 91 | }) 92 | 93 | test_that("gram_matrix is computing correctly",{ 94 | 95 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 96 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 97 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 98 | m1 <- matrix(data = c(1,diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),1),byrow = T,nrow = 2,ncol = 2) 99 | class(m1) <- "kernelMatrix" 100 | m2 <- matrix(data = c(1,diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),diagram_kernel(D1,D3,dim = 0,sigma = 1,t = 1),diagram_kernel(D2,D1,dim = 0,sigma = 1,t = 1),1,diagram_kernel(D2,D3,dim = 0,sigma = 1,t = 1),diagram_kernel(D3,D1,dim = 0,sigma = 1,t = 1),diagram_kernel(D3,D2,dim = 0,sigma = 1,t = 1),1),byrow = T,nrow = 3,ncol = 3) 101 | class(m2) <- "kernelMatrix" 102 | m3 <- matrix(data = c(1,diagram_kernel(D1,D3,dim = 0,sigma = 1,t = 1),diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),diagram_kernel(D2,D3,dim = 0,sigma = 1,t = 1)),byrow = T,nrow = 2,ncol = 2) 103 | class(m3) <- "kernelMatrix" 104 | expect_identical(gram_matrix(diagrams = list(D1,D2),dim = 0,sigma = 1,t = 1,num_workers = 2),m1) 105 | expect_equal(gram_matrix(diagrams = list(D1,D2,D3),dim = 0,sigma = 1,t = 1,num_workers = 2),m2) 106 | expect_equal(gram_matrix(diagrams = list(D1,D2),other_diagrams = list(D1,D3),dim = 0,sigma = 1,t = 1,num_workers = 2),m3) 107 | 108 | }) 109 | 110 | 111 | -------------------------------------------------------------------------------- /src/kd_util.h: -------------------------------------------------------------------------------- 1 | 
//---------------------------------------------------------------------- 2 | // File: kd_util.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Common utilities for kd- trees 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_util_H 26 | #define ANN_kd_util_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | 30 | //---------------------------------------------------------------------- 31 | // externally accessible functions 32 | //---------------------------------------------------------------------- 33 | 34 | double annAspectRatio( // compute aspect ratio of box 35 | int dim, // dimension 36 | const ANNorthRect &bnd_box); // bounding cube 37 | 38 | void annEnclRect( // compute smallest enclosing rectangle 39 | ANNpointArray pa, // point array 40 | ANNidxArray pidx, // point indices 41 | int n, // number of points 42 | int dim, // dimension 43 | ANNorthRect &bnds); // bounding cube (returned) 44 | 45 | void annEnclCube( // compute smallest enclosing cube 46 | ANNpointArray pa, // point array 47 | ANNidxArray pidx, // point indices 48 | int n, // number of points 49 | int dim, // dimension 50 | ANNorthRect &bnds); // bounding cube (returned) 51 | 52 | ANNdist annBoxDistance( // compute distance from point to box 53 | const ANNpoint q, // the point 54 | const ANNpoint lo, // low point of box 55 | const ANNpoint hi, // high point of box 56 | int dim); // dimension of space 57 | 58 | // added by vlad 5-1-2008 to allow user to compute flops at runtime in release version 59 | // while keeping the version above fast 60 | ANNdist annBoxDistanceFlops( // compute distance from point to box 61 | const ANNpoint q, // the point 62 | const ANNpoint lo, // low point of box 63 | const ANNpoint hi, // high point of box 64 | int dim); // dimension of space 65 | 66 | 67 | ANNcoord annSpread( // compute point spread along dimension 68 | ANNpointArray pa, // point array 69 | ANNidxArray pidx, // point indices 70 | int n, // number of points 71 | int d); // dimension to check 72 | 73 | void annMinMax( // compute min and max coordinates along dim 74 | ANNpointArray pa, // point array 75 | ANNidxArray pidx, // point indices 76 | int n, // number of points 77 | int d, // dimension to check 78 | ANNcoord& min, // minimum value (returned) 79 | ANNcoord& max); // maximum value (returned) 80 | 81 | int annMaxSpread( // compute dimension of max spread 82 | ANNpointArray pa, // point array 83 | ANNidxArray pidx, // point indices 84 | int n, // number of points 85 | int dim); // dimension of space 86 | 87 | void annMedianSplit( // split points along median 
value 88 | ANNpointArray pa, // points to split 89 | ANNidxArray pidx, // point indices 90 | int n, // number of points 91 | int d, // dimension along which to split 92 | ANNcoord &cv, // cutting value 93 | int n_lo); // split into n_lo and n-n_lo 94 | 95 | void annPlaneSplit( // split points by a plane 96 | ANNpointArray pa, // points to split 97 | ANNidxArray pidx, // point indices 98 | int n, // number of points 99 | int d, // dimension along which to split 100 | ANNcoord cv, // cutting value 101 | int &br1, // first break (values < cv) 102 | int &br2); // second break (values == cv) 103 | 104 | void annBoxSplit( // split points by a box 105 | ANNpointArray pa, // points to split 106 | ANNidxArray pidx, // point indices 107 | int n, // number of points 108 | int dim, // dimension of space 109 | ANNorthRect &box, // the box 110 | int &n_in); // number of points inside (returned) 111 | 112 | int annSplitBalance( // determine balance factor of a split 113 | ANNpointArray pa, // points to split 114 | ANNidxArray pidx, // point indices 115 | int n, // number of points 116 | int d, // dimension along which to split 117 | ANNcoord cv); // cutting value 118 | 119 | void annBox2Bnds( // convert inner box to bounds 120 | const ANNorthRect &inner_box, // inner box 121 | const ANNorthRect &bnd_box, // enclosing box 122 | int dim, // dimension of space 123 | int &n_bnds, // number of bounds (returned) 124 | ANNorthHSArray &bnds); // bounds array (returned) 125 | 126 | void annBnds2Box( // convert bounds to inner box 127 | const ANNorthRect &bnd_box, // enclosing box 128 | int dim, // dimension of space 129 | int n_bnds, // number of bounds 130 | ANNorthHSArray bnds, // bounds array 131 | ANNorthRect &inner_box); // inner box (returned) 132 | 133 | #endif -------------------------------------------------------------------------------- /src/perf.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: perf.cpp 6 | // Programmer: Sunil Arya and David Mount 7 | // Description: Methods for performance stats 8 | // Last modified: 01/04/05 (Version 1.0) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). This software is provided under 15 | // the provisions of the Lesser GNU Public License (LGPL). See the 16 | // file ../ReadMe.txt for further information. 17 | // 18 | // The University of Maryland (U.M.) and the authors make no 19 | // representations about the suitability or fitness of this software for 20 | // any purpose. It is provided "as is" without express or implied 21 | // warranty. 22 | //---------------------------------------------------------------------- 23 | // History: 24 | // Revision 0.1 03/04/98 25 | // Initial release 26 | // Revision 1.0 04/01/05 27 | // Changed names to avoid namespace conflicts. 28 | // Added flush after printing performance stats to fix bug 29 | // in Microsoft Windows version.
30 | //---------------------------------------------------------------------- 31 | 32 | #include "ANN.h" // basic ANN includes 33 | #include "ANNperf.h" // performance includes 34 | 35 | using namespace std; // make std:: available 36 | 37 | //---------------------------------------------------------------------- 38 | // Performance statistics 39 | // The following data and routines are used for computing 40 | // performance statistics for nearest neighbor searching. 41 | // Because these routines can slow the code down, they can be 42 | // activated and deactiviated by defining the PERF variable, 43 | // by compiling with the option: -DPERF 44 | //---------------------------------------------------------------------- 45 | 46 | //---------------------------------------------------------------------- 47 | // Global counters for performance measurement 48 | //---------------------------------------------------------------------- 49 | 50 | int ann_Ndata_pts = 0; // number of data points 51 | int ann_Nvisit_lfs = 0; // number of leaf nodes visited 52 | int ann_Nvisit_spl = 0; // number of splitting nodes visited 53 | int ann_Nvisit_shr = 0; // number of shrinking nodes visited 54 | int ann_Nvisit_pts = 0; // visited points for one query 55 | int ann_Ncoord_hts = 0; // coordinate hits for one query 56 | int ann_Nfloat_ops = 0; // floating ops for one query 57 | ANNsampStat ann_visit_lfs; // stats on leaf nodes visits 58 | ANNsampStat ann_visit_spl; // stats on splitting nodes visits 59 | ANNsampStat ann_visit_shr; // stats on shrinking nodes visits 60 | ANNsampStat ann_visit_nds; // stats on total nodes visits 61 | ANNsampStat ann_visit_pts; // stats on points visited 62 | ANNsampStat ann_coord_hts; // stats on coordinate hits 63 | ANNsampStat ann_float_ops; // stats on floating ops 64 | // 65 | ANNsampStat ann_average_err; // average error 66 | ANNsampStat ann_rank_err; // rank error 67 | 68 | //---------------------------------------------------------------------- 69 | // Routines for statistics. 
70 | //---------------------------------------------------------------------- 71 | 72 | DLL_API void annResetStats(int data_size) // reset stats for a set of queries 73 | { 74 | ann_Ndata_pts = data_size; 75 | ann_visit_lfs.reset(); 76 | ann_visit_spl.reset(); 77 | ann_visit_shr.reset(); 78 | ann_visit_nds.reset(); 79 | ann_visit_pts.reset(); 80 | ann_coord_hts.reset(); 81 | ann_float_ops.reset(); 82 | ann_average_err.reset(); 83 | ann_rank_err.reset(); 84 | } 85 | 86 | DLL_API void annResetCounts() // reset counts for one query 87 | { 88 | ann_Nvisit_lfs = 0; 89 | ann_Nvisit_spl = 0; 90 | ann_Nvisit_shr = 0; 91 | ann_Nvisit_pts = 0; 92 | ann_Ncoord_hts = 0; 93 | ann_Nfloat_ops = 0; 94 | } 95 | 96 | DLL_API void annUpdateStats() // update stats with current counts 97 | { 98 | ann_visit_lfs += ann_Nvisit_lfs; 99 | ann_visit_nds += ann_Nvisit_spl + ann_Nvisit_lfs; 100 | ann_visit_spl += ann_Nvisit_spl; 101 | ann_visit_shr += ann_Nvisit_shr; 102 | ann_visit_pts += ann_Nvisit_pts; 103 | ann_coord_hts += ann_Ncoord_hts; 104 | ann_float_ops += ann_Nfloat_ops; 105 | } 106 | 107 | // print a single statistic 108 | void print_one_stat(char *title, ANNsampStat s, double div) 109 | { 110 | Rcout << title << "= [ "; 111 | Rcout.width(9); Rcout << s.mean()/div << " : "; 112 | Rcout.width(9); Rcout << s.stdDev()/div << " ]<"; 113 | Rcout.width(9); Rcout << s.min()/div << " , "; 114 | Rcout.width(9); Rcout << s.max()/div << " >\n"; 115 | } 116 | 117 | DLL_API void annPrintStats( // print statistics for a run 118 | ANNbool validate) // true if average errors desired 119 | { 120 | Rcout.precision(4); // set floating precision 121 | Rcout << " (Performance stats: " 122 | << " [ mean : stddev ]< min , max >\n"; 123 | print_one_stat((char*)" leaf_nodes ", ann_visit_lfs, 1); 124 | print_one_stat((char*)" splitting_nodes ", ann_visit_spl, 1); 125 | print_one_stat((char*)" shrinking_nodes ", ann_visit_shr, 1); 126 | print_one_stat((char*)" total_nodes ", ann_visit_nds, 1); 127 | print_one_stat((char*)" points_visited ", ann_visit_pts, 1); 128 | print_one_stat((char*)" coord_hits/pt ", ann_coord_hts, ann_Ndata_pts); 129 | print_one_stat((char*)" floating_ops_(K) ", ann_float_ops, 1000); 130 | if (validate) { 131 | print_one_stat((char*)" average_error ", ann_average_err, 1); 132 | print_one_stat((char*)" rank_error ", ann_rank_err, 1); 133 | } 134 | Rcout.precision(0); // restore the default 135 | Rcout << " )\n"; 136 | Rcout.flush(); 137 | } -------------------------------------------------------------------------------- /src/KCenterClustering.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------- 2 | // This code was modified by Vlad Morariu: 3 | // 11/03/06: 4 | // Removed references to Matlab to compile code into a library 5 | // 01/24/07: 6 | // KCenterClustering now has the ability to increase the number of 7 | // clusters incrementally, calculating the max cluster radius at each 8 | // iteration. 9 | // 02/07/07: 10 | // Clustering now stops when the max cluster radius 11 | // is zero (when number of clusters has reached the number of 12 | // unique points), and the number of ACTUAL clusters used is returned. 
13 | //------------------------------------------------------------------- 14 | 15 | //------------------------------------------------------------------- 16 | // The code was written by Changjiang Yang and Vikas Raykar 17 | // and is copyrighted under the Lesser GPL: 18 | // 19 | // Copyright (C) 2006 Changjiang Yang and Vikas Raykar 20 | // 21 | // This program is free software; you can redistribute it and/or modify 22 | // it under the terms of the GNU Lesser General Public License as 23 | // published by the Free Software Foundation; version 2.1 or later. 24 | // This program is distributed in the hope that it will be useful, 25 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 26 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 27 | // See the GNU Lesser General Public License for more details. 28 | // You should have received a copy of the GNU Lesser General Public 29 | // License along with this program; if not, write to the Free Software 30 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, 31 | // MA 02111-1307, USA. 32 | // 33 | // The author may be contacted via email at:cyang(at)sarnoff(.)com 34 | // vikas(at)umiacs(.)umd(.)edu 35 | //------------------------------------------------------------------- 36 | 37 | //---------------------------------------------------------------------------- 38 | // File : KCenterClustering.h 39 | // Purpose : Interface for the k-center clustering algorithm. 40 | // Author : Vikas C. Raykar (vikas@cs.umd.edu) 41 | // Date : April 25 2005, June 10 2005, August 23, 2005 42 | // 43 | //---------------------------------------------------------------------------- 44 | // Gonzalez's farthest-point clustering algorithm. 45 | // 46 | // June 10, 2005: 47 | // This version now returns the number points and the radius of each cluster. 48 | // 49 | // August 23, 2005: 50 | // Speed up using the doubly circular list. 51 | // The clusters far away are trimmed. The nodes inside the neighboring 52 | // clusters which are within half sphere are trimmed. 53 | // 54 | //---------------------------------------------------------------------------- 55 | // 56 | // INPUT 57 | // ---------------- 58 | // 59 | // Dim --> dimension of the points. 60 | // NSources --> number of sources. 61 | // pSources --> pointer to sources, (d*N). 62 | // NumClusters --> number of clusters. 63 | // 64 | // OUTPUT 65 | // ---------------- 66 | // 67 | // MaxClusterRadius --> maximum radius of the clusters, (rx). 68 | // pClusterIndex --> vector of length N where the i th element is the 69 | // cluster number to which the i th point belongs. 70 | // pClusterIndex[i] varies between 0 to K-1. 71 | // pClusterCenters --> pointer to the cluster centers, (d*K). 72 | // pNumPoints --> pointer to the number of points in each cluster, (K). 73 | // pClusterRadii --> pointer to the radius of each cluster, (K). 
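//
// EXAMPLE (editorial sketch, not part of the original library source; the
// std::vector usage is only illustrative and assumes #include <vector>):
//
//    int d = 3, N = 1000, K = 16;               // dimension, #sources, requested clusters
//    std::vector<double> sources(d * N);        // fill with the d*N source coordinates
//    std::vector<int> clusterIndex(N);          // output: cluster label in 0..K-1 per point
//    KCenterClustering kc(d, N, sources.data(), clusterIndex.data(), K);
//    int actualK = kc.Cluster();                // may stop early once the max radius is 0
//    std::vector<double> centers(d * actualK);  // output: cluster centers, d*actualK
//    std::vector<int> numPoints(actualK);       // output: number of points per cluster
//    std::vector<double> radii(actualK);        // output: radius of each cluster
//    kc.ComputeClusterCenters(actualK, centers.data(), numPoints.data(), radii.data());
//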
74 | //---------------------------------------------------------------------------- 75 | 76 | #ifndef K_CENTER_CLUSTERING_H 77 | #define K_CENTER_CLUSTERING_H 78 | 79 | class KCenterClustering{ 80 | public: 81 | 82 | //Output parameters 83 | 84 | double MaxClusterRadius; //maximum cluster radius 85 | 86 | //Functions 87 | 88 | //constructor 89 | KCenterClustering(int Dim, 90 | int NSources, 91 | double *pSources, 92 | int *pClusterIndex, 93 | int NumClusters 94 | ); 95 | 96 | //destructor 97 | ~KCenterClustering(); 98 | 99 | //K-center clustering 100 | //Returns the number of actual clusters (it might have stopped early if all clusters have 101 | // radius of 0 -- which means that the number of clusters has reached the number 102 | // of unique pts) 103 | int Cluster(); 104 | 105 | //Incremental k-center clustering 106 | // nClusters - if non-NULL, value is set to the # of clusters at end of call 107 | // maxRadius - if non-NULL, value is set to the max radius of all clusters 108 | void ClusterIncrement( int * nClusters, double * maxRadius ); 109 | 110 | //Compute cluster centers and the number of points in each cluster 111 | //and the radius of each cluster. 112 | void ComputeClusterCenters( int NumClusters, 113 | double *pClusterCenters, 114 | int *pNumPoints, 115 | double *pClusterRadii 116 | ); 117 | 118 | private: 119 | //Input Parameters 120 | 121 | int d; // dimension of the points. 122 | int N; // number of sources. 123 | double *px; // pointer to sources, (d*N). 124 | int K; // max number of clusters 125 | int *pci; // pointer to a vector of length N where the i th element is the 126 | // cluster number to which the i th point belongs. 127 | double *dist_C; // distances to the center. 128 | double *r; 129 | 130 | int *pCenters; // indices of the centers. 131 | int *cprev; // index to the previous node 132 | int *cnext; // index to the next node 133 | int *far2c; // farthest node to the center 134 | 135 | int numClusters; // added by Vlad to keep track of # of clusters 136 | 137 | //Functions 138 | double ddist(const int d, const double *x, const double *y); 139 | int idmax(int n, double *x); 140 | 141 | }; 142 | 143 | 144 | #endif -------------------------------------------------------------------------------- /man/permutation_model_inference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{permutation_model_inference} 4 | \alias{permutation_model_inference} 5 | \title{Model inference with permutation test.} 6 | \usage{ 7 | permutation_model_inference( 8 | D1, 9 | D2, 10 | iterations, 11 | num_samples, 12 | dims = c(0, 1), 13 | samp = NULL, 14 | paired = F, 15 | num_workers = parallelly::availableCores(omit = 1), 16 | verbose = F, 17 | FUN_boot = "calculate_homology", 18 | thresh, 19 | distance_mat = FALSE, 20 | ripser = NULL, 21 | return_diagrams = FALSE 22 | ) 23 | } 24 | \arguments{ 25 | \item{D1}{the first dataset (a data frame).} 26 | 27 | \item{D2}{the second dataset (a data frame).} 28 | 29 | \item{iterations}{the number of iterations for permuting group labels, default 20.} 30 | 31 | \item{num_samples}{the number of bootstrap iterations, default 30.} 32 | 33 | \item{dims}{a non-negative integer vector of the homological dimensions in which the test is to be carried out, default c(0,1).} 34 | 35 | \item{samp}{an optional list of row-number samples of `D1`, default NULL. See details and examples for more information. 
Ignored when `paired` is FALSE.} 36 | 37 | \item{paired}{a boolean flag for if there is a second-order pairing between diagrams at the same index in different groups, default FALSE.} 38 | 39 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 40 | 41 | \item{verbose}{a boolean flag for if the time duration of the function call should be printed, default FALSE.} 42 | 43 | \item{FUN_boot}{a string representing the persistent homology function to use for calculating the bootstrapped persistence diagrams, either 44 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 45 | 46 | \item{thresh}{the positive numeric maximum radius of the Vietoris-Rips filtration.} 47 | 48 | \item{distance_mat}{a boolean representing if `X` is a distance matrix (TRUE) or not (FALSE, default).} 49 | 50 | 51 | \item{ripser}{the imported ripser module when `FUN_boot` is `PyH`.} 52 | 53 | \item{return_diagrams}{whether or not to return the two lists of bootstrapped persistence diagrams, default FALSE.} 54 | } 55 | \value{ 56 | a list which contains the output of the call to \code{\link{permutation_test}} and the two groups of bootstrapped 57 | persistence diagrams if desired, in entries called `diagrams1` and `diagrams2`. 58 | } 59 | \description{ 60 | An inference procedure to determine if two datasets were unlikely to be generated by the same process (i.e. if 61 | the persistence diagram of one dataset is a good model of the persistence diagram of the other dataset). 62 | } 63 | \details{ 64 | Inference is carried out by generating bootstrap resampled persistence diagrams from the two datasets and carrying out a permutation test 65 | on the resulting two groups. A small p-value in a certain dimension suggests that the datasets are not good models of each other. `samp` should 66 | only be provided when `paired` is TRUE in order to generate the same row samplings of `D1` and `D2` for the bootstrapped persistence diagrams. 67 | This makes a paired permutation test more appropriate, which has higher statistical power for detecting topological differences. See the examples 68 | for how to properly supply `samp`.
69 | } 70 | \examples{ 71 | 72 | if(require("TDAstats")) 73 | { 74 | # create two datasets 75 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 76 | dim = 0,threshold = 2) 77 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 78 | dim = 0,threshold = 2) 79 | 80 | # do model inference test with 1 iteration (for speed, more 81 | # iterations should be used in practice) 82 | model_test <- permutation_model_inference(D1, D2, iterations = 1, 83 | thresh = 1.75,num_samples = 3, 84 | num_workers = 2L) 85 | 86 | # with more iterations, p-values show a difference in the 87 | # clustering of points but not in the arrangement of loops 88 | model_test$p_values 89 | 90 | # to supply samp, when we believe there is a correspondence between 91 | # the rows in D1 and the rows in D2 92 | # note that the number of entries of samp (3 in this case) must 93 | # match the num_samples parameter to the function call 94 | samp <- lapply(X = 1:3,FUN = function(X){ 95 | 96 | return(unique(sample(1:nrow(D1),size = nrow(D1),replace = TRUE))) 97 | 98 | }) 99 | 100 | # model inference will theoretically have higher power now for a 101 | # paired test 102 | model_test2 <- permutation_model_inference(D1, D2, iterations = 1, 103 | thresh = 1.75,num_samples = 3, 104 | paired = TRUE,samp = samp, 105 | num_workers = 2L) 106 | model_test2$p_values 107 | } 108 | } 109 | \references{ 110 | Robinson T, Turner K (2017). "Hypothesis testing for topological data analysis." \url{https://link.springer.com/article/10.1007/s41468-017-0008-7}. 111 | 112 | Chazal F et al (2017). "Robust Topological Inference: Distance to a Measure and Kernel Distance." \url{https://www.jmlr.org/papers/volume18/15-484/15-484.pdf}. 113 | 114 | Abdallah H et al. (2021). "Statistical Inference for Persistent Homology applied to fMRI." \url{https://github.com/hassan-abdallah/Statistical_Inference_PH_fMRI/blob/main/Abdallah_et_al_Statistical_Inference_PH_fMRI.pdf}. 115 | } 116 | \seealso{ 117 | \code{\link{permutation_test}} for an inferential group difference test for groups of persistence diagrams and \code{\link{bootstrap_persistence_thresholds}} for computing confidence sets for persistence diagrams. 118 | } 119 | \author{ 120 | Shael Brown - \email{shaelebrown@gmail.com} 121 | } 122 | -------------------------------------------------------------------------------- /man/bootstrap_persistence_thresholds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bootstrap.R 3 | \name{bootstrap_persistence_thresholds} 4 | \alias{bootstrap_persistence_thresholds} 5 | \title{Estimate persistence threshold(s) for topological features in a data set using bootstrapping.} 6 | \usage{ 7 | bootstrap_persistence_thresholds( 8 | X, 9 | FUN_diag = "calculate_homology", 10 | FUN_boot = "calculate_homology", 11 | maxdim = 0, 12 | thresh, 13 | distance_mat = FALSE, 14 | ripser = NULL, 15 | ignore_infinite_cluster = TRUE, 16 | calculate_representatives = FALSE, 17 | num_samples = 30, 18 | alpha = 0.05, 19 | return_subsetted = FALSE, 20 | return_pvals = FALSE, 21 | return_diag = TRUE, 22 | num_workers = parallelly::availableCores(omit = 1), 23 | p_less_than_alpha = FALSE, 24 | ... 
25 | ) 26 | } 27 | \arguments{ 28 | \item{X}{the input dataset, must either be a matrix or data frame.} 29 | 30 | \item{FUN_diag}{a string representing the persistent homology function to use for calculating the full persistence diagram, either 31 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 32 | 33 | \item{FUN_boot}{a string representing the persistent homology function to use for calculating the bootstrapped persistence diagrams, either 34 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 35 | 36 | \item{maxdim}{the integer maximum homological dimension for persistent homology, default 0.} 37 | 38 | \item{thresh}{the positive numeric maximum radius of the Vietoris-Rips filtration.} 39 | 40 | \item{distance_mat}{a boolean representing if `X` is a distance matrix (TRUE) or not (FALSE, default).} 41 | 42 | 43 | \item{ripser}{the imported ripser module when `FUN_diag` or `FUN_boot` is `PyH`.} 44 | 45 | \item{ignore_infinite_cluster}{a boolean indicating whether or not to ignore the infinitely lived cluster when `FUN_diag` or `FUN_boot` is `PyH`.} 46 | 47 | \item{calculate_representatives}{a boolean representing whether to calculate representative (co)cycles, default FALSE. Note that representatives can't be 48 | calculated when using the 'calculate_homology' function.} 49 | 50 | \item{num_samples}{the positive integer number of bootstrap samples, default 30.} 51 | 52 | \item{alpha}{the type-1 error threshold, default 0.05.} 53 | 54 | \item{return_subsetted}{a boolean representing whether or not to return the subsetted persistence diagram (with or without representatives), default FALSE.} 55 | 56 | \item{return_pvals}{a boolean representing whether or not to return p-values for features in the subsetted diagram, default FALSE.} 57 | 58 | \item{return_diag}{a boolean representing whether or not to return the calculated persistence diagram, default TRUE.} 59 | 60 | \item{num_workers}{the integer number of cores used for parallelizing (over bootstrap samples), default one less than the number of cores on the machine.} 61 | 62 | \item{p_less_than_alpha}{a boolean representing whether or not to subset further and return only features whose p-values are strictly less than `alpha`, default `FALSE`. Note that this is not part of the original bootstrap procedure.} 63 | 64 | \item{...}{additional parameters for internal methods.} 65 | } 66 | \value{ 67 | either a numeric vector of threshold values, with one for each dimension 0..`maxdim` (in that order), or a list containing those thresholds and any other requested elements (if desired) 68 | } 69 | \description{ 70 | Bootstrapping is used to find a conservative estimate of a 1-`alpha` percent "confidence interval" around 71 | each point in the persistence diagram of the data set, and points whose intervals do not 72 | touch the diagonal (birth == death) would be considered "significant" or "real". 73 | One threshold is computed for each dimension in the diagram. 74 | } 75 | \details{ 76 | The thresholds are then determined by calculating the 1-`alpha` percentile of the bottleneck 77 | distance values between the real persistence diagram and other diagrams obtained 78 | by bootstrap resampling the data.
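Schematically, and only as an illustration of the procedure just described (hypothetical helper code, not this function's internal implementation), the threshold in a single dimension `d` could be obtained as
\preformatted{
# diag <- FUN_diag(X)                        # persistence diagram of the full data
# boot_dists <- sapply(1:num_samples, function(i) {
#   Xi <- X[sample(1:nrow(X), replace = TRUE), ]            # bootstrap resample of rows
#   diagram_distance(diag, FUN_boot(Xi), dim = d, p = Inf)  # bottleneck distance
# })
# threshold <- stats::quantile(boot_dists, probs = 1 - alpha)
}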
Since `ripsDiag` is the slowest homology engine but is the 79 | only engine which calculates representative cycles (as opposed to co-cycles with `PyH`), two 80 | homology engines are input to this function - one to calculate the actual persistence diagram, `FUN_diag` 81 | (possibly with representative (co)cycles) and one to calculate the bootstrap diagrams, `FUN_boot` (this should be 82 | a faster engine, like `calculate_homology` or `PyH`). 83 | p-values can be calculated for any feature which survives the thresholding if both `return_subsetted` and `return_pvals` are `TRUE`, 84 | however these values may be larger than the original `alpha` value in some cases. Note that this is not part of the original bootstrap procedure. 85 | If stricter thresholding is desired, 86 | or the p-values must be less than `alpha`, set `p_less_than_alpha` to `TRUE`. The minimum 87 | possible p-value is always 1/(`num_samples` + 1). 88 | Note that since \code{\link[TDAstats]{calculate_homology}} 89 | can ignore the longest-lived cluster, fewer "real" clusters may be found. To avoid this possibility 90 | try setting `FUN_diag` equal to 'ripsDiag'. Please note that due to the TDA package no longer being available on CRAN, 91 | if `FUN_diag` or `FUN_boot` are 'ripsDiag' then `bootstrap_persistence_thresholds` will look for the ripsDiag function in the global environment, 92 | so the TDA package should be attached with `library("TDA")` prior to use. 93 | } 94 | \examples{ 95 | 96 | if(require("TDAstats")) 97 | { 98 | # create a persistence diagram from a sample of the unit circle 99 | df <- TDAstats::circle2d[sample(1:100,size = 50),] 100 | 101 | # calculate persistence thresholds for alpha = 0.05 102 | # and return the calculated diagram as well as the subsetted diagram 103 | bootstrapped_diagram <- bootstrap_persistence_thresholds(X = df, 104 | maxdim = 1,thresh = 2,num_workers = 2) 105 | } 106 | } 107 | \references{ 108 | Chazal F et al (2017). "Robust Topological Inference: Distance to a Measure and Kernel Distance." \url{https://www.jmlr.org/papers/volume18/15-484/15-484.pdf}. 109 | } 110 | \author{ 111 | Shael Brown - \email{shaelebrown@gmail.com} 112 | } 113 | -------------------------------------------------------------------------------- /R/kernel_calculations.R: -------------------------------------------------------------------------------- 1 | #### PERSISTENCE FISHER KERNEL #### 2 | #' Calculate persistence Fisher kernel value between a pair of persistence diagrams. 3 | #' 4 | #' Returns the persistence Fisher kernel value between a pair of persistence diagrams 5 | #' in a particular homological dimension, each of which is either the output from a \code{\link{diagram_to_df}} 6 | #' function call or from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}. 7 | #' 8 | #' The persistence Fisher kernel is calculated from the Fisher information metric according to the formula 9 | #' \eqn{k_{PF}(D_1,D_2) = exp(-t*d_{FIM}(D_1,D_2))}, resembling a radial basis kernel for standard 10 | #' Euclidean spaces. 11 | #' 12 | #' @param D1 the first persistence diagram. 13 | #' @param D2 the second persistence diagram. 14 | #' @param dim the non-negative integer homological dimension in which the distance is to be computed, default 0. 15 | #' @param sigma a positive number representing the bandwidth for the Fisher information metric, default 1. 
16 | #' @param rho an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. 17 | #' @param t a positive number representing the scale for the persistence Fisher kernel, default 1. 18 | #' 19 | #' @return the numeric kernel value. 20 | #' @export 21 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 22 | #' @seealso \code{\link{gram_matrix}} for Gram (i.e. kernel) matrix calculations. 23 | #' @references 24 | #' Le T, Yamada M (2018). "Persistence fisher kernel: a riemannian manifold kernel for persistence diagrams." \url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}. 25 | #' 26 | #' Murphy, K. "Machine learning: a probabilistic perspective", MIT press (2012). 27 | #' @examples 28 | #' 29 | #' if(require("TDAstats")) 30 | #' { 31 | #' # create two diagrams 32 | #' D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 33 | #' dim = 1,threshold = 2) 34 | #' D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 35 | #' dim = 1,threshold = 2) 36 | #' 37 | #' # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 1 38 | #' diagram_kernel(D1,D2,dim = 1,sigma = 2,t = 2) 39 | #' # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 0 40 | #' diagram_kernel(D1,D2,dim = 0,sigma = 2,t = 2) 41 | #' } 42 | 43 | diagram_kernel <- function(D1,D2,dim = 0,sigma = 1,t = 1,rho = NULL){ 44 | 45 | # function to compute the Persistence Fisher kernel of two persistence diagrams 46 | 47 | # check kernel-specific parameter, other inputs are checked in distance calculation 48 | check_param("t",t,positive = T,numeric = T,finite = T,multiple = F) 49 | 50 | # return kernel calculation 51 | return(exp(-1*t*diagram_distance(D1 = D1,D2 = D2,dim = dim,distance = "fisher",sigma = sigma,rho = rho))) 52 | 53 | } 54 | 55 | #### GRAM MATRIX #### 56 | #' Compute the gram matrix for a group of persistence diagrams. 57 | #' 58 | #' Calculate the Gram matrix \eqn{K} for either a single list of persistence diagrams \eqn{(D_1,D_2,\dots,D_n)}, i.e. \eqn{K[i,j] = k_{PF}(D_i,D_j)}, 59 | #' or between two lists of persistence diagrams, \eqn{(D_1,D_2,\dots,D_n)} and \eqn{(D'_1,D'_2,\dots,D'_n)}, \eqn{K[i,j] = k_{PF}(D_i,D'_j)}, in parallel. 60 | #' 61 | #' Gram matrices are used in downstream analyses, like in the `diagram_kkmeans`, `diagram_nearest_cluster`,`diagram_kpca`, 62 | #' `predict_diagram_kpca`, `predict_diagram_ksvm` and `independence_test` functions. 63 | #' 64 | #' @param diagrams a list of persistence diagrams, where each diagram is either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. 65 | #' @param other_diagrams either NULL (default) or another list of persistence diagrams to compute a cross-Gram matrix. 66 | #' @param dim the non-negative integer homological dimension in which the distance is to be computed, default 0. 67 | #' @param sigma a positive number representing the bandwidth for the Fisher information metric, default 1. 68 | #' @param t a positive number representing the scale for the kernel, default 1. 69 | #' @param rho an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. 
If supplied, code execution is sequential, but functions in the "exec" directory 70 | #' of the package can be loaded to calculate distance matrices in parallel with approximation. 71 | #' @param num_workers the number of cores used for parallel computation, default is one less than the number of cores on the machine. 72 | #' 73 | #' @return the numeric (cross) Gram matrix of class 'kernelMatrix'. 74 | #' @export 75 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 76 | #' @seealso \code{\link{diagram_kernel}} for individual persistence Fisher kernel calculations. 77 | #' @examples 78 | #' 79 | #' if(require("TDAstats")) 80 | #' { 81 | #' # create two diagrams 82 | #' D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 83 | #' dim = 1,threshold = 2) 84 | #' D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 85 | #' dim = 1,threshold = 2) 86 | #' g <- list(D1,D2) 87 | #' 88 | #' # calculate the Gram matrix in dimension 0 with sigma = 2, t = 2 89 | #' G <- gram_matrix(diagrams = g,dim = 0,sigma = 2,t = 2,num_workers = 2) 90 | #' 91 | #' # calculate cross-Gram matrix, which is the same as G 92 | #' G_cross <- gram_matrix(diagrams = g,other_diagrams = g,dim = 0,sigma = 2, 93 | #' t = 2,num_workers = 2) 94 | #' } 95 | 96 | gram_matrix <- function(diagrams,other_diagrams = NULL,dim = 0,sigma = 1,t = 1,rho = NULL,num_workers = parallelly::availableCores(omit = 1)){ 97 | 98 | # function to compute (cross) Gram matrix in parallel 99 | check_param(param_name = "t",param = t,numeric = T,positive = T,multiple = F,finite = T) 100 | 101 | # compute gram matrix from distance matrix 102 | K <- exp(-t*distance_matrix(diagrams = diagrams,other_diagrams = other_diagrams,dim = dim,distance = "fisher",sigma = sigma,rho = rho,num_workers = num_workers)) 103 | 104 | # update class for interfacing with kernlab package 105 | class(K) <- "kernelMatrix" 106 | 107 | return(K) 108 | 109 | } 110 | -------------------------------------------------------------------------------- /man/permutation_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{permutation_test} 4 | \alias{permutation_test} 5 | \title{Permutation test for finding group differences between persistence diagrams.} 6 | \usage{ 7 | permutation_test( 8 | ..., 9 | iterations = 20, 10 | p = 2, 11 | q = 2, 12 | dims = c(0, 1), 13 | dist_mats = NULL, 14 | group_sizes = NULL, 15 | paired = FALSE, 16 | distance = "wasserstein", 17 | sigma = NULL, 18 | rho = NULL, 19 | num_workers = parallelly::availableCores(omit = 1), 20 | verbose = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{...}{lists of persistence diagrams which are either the output of persistent homology calculations like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. 
Each list must contain at least 2 diagrams.} 25 | 26 | \item{iterations}{the number of iterations for permuting group labels, default 20.} 27 | 28 | \item{p}{a positive number representing the wasserstein power parameter, a number at least 1 (and Inf if using the bottleneck distance) and default 2.} 29 | 30 | \item{q}{a finite number at least 1 for exponentiation in the Turner loss function, default 2.} 31 | 32 | \item{dims}{a non-negative integer vector of the homological dimensions in which the test is to be carried out, default c(0,1).} 33 | 34 | \item{dist_mats}{an optional list of precomputed distances matrices, one for each dimension, where the rows and columns would correspond to the unlisted groups of diagrams (in order), default NULL. If not NULL then no lists of diagrams need to be supplied.} 35 | 36 | \item{group_sizes}{a vector of group sizes, one for each group, when `dist_mats` is not NULL.} 37 | 38 | \item{paired}{a boolean flag for if there is a second-order pairing between diagrams at the same index in different groups, default FALSE} 39 | 40 | \item{distance}{a string which determines which type of distance calculation to carry out, either "wasserstein" (default) or "fisher".} 41 | 42 | \item{sigma}{the positive bandwidth for the Fisher information metric, default NULL.} 43 | 44 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, code execution is sequential.} 45 | 46 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 47 | 48 | \item{verbose}{a boolean flag for if the time duration of the function call should be printed, default FALSE} 49 | } 50 | \value{ 51 | a list with the following elements: 52 | \describe{ 53 | 54 | \item{dimensions}{the input `dims` argument.} 55 | 56 | \item{permvals}{a numeric vector of length `iterations` with the permuted loss value for each iteration (permutation)} 57 | 58 | \item{test_statisics}{a numeric vector of the test statistic value in each dimension.} 59 | 60 | \item{p_values}{a numeric vector of the p-values in each dimension.} 61 | 62 | \item{run_time}{the run time of the function call, containing time units.} 63 | 64 | } 65 | } 66 | \description{ 67 | A non-parametric ANOVA-like test for persistence diagrams 68 | (see \url{https://link.springer.com/article/10.1007/s41468-017-0008-7} for details). In each 69 | desired dimension a test statistic (loss) is calculated, then the group labels are shuffled 70 | for some number of iterations and the loss is recomputed each time thereby generating a null 71 | distribution for the test statistic. This test generates a p-value in each desired dimension. 72 | } 73 | \details{ 74 | The test is carried out in parallel and optimized in order to not recompute already-calculated distances. As such, memory issues 75 | may occur when the number of persistence diagrams is very large. 76 | Like in (\url{https://github.com/hassan-abdallah/Statistical_Inference_PH_fMRI/blob/main/Abdallah_et_al_Statistical_Inference_PH_fMRI.pdf}) 77 | an option is provided for pairing diagrams between groups to reduce variance (in order to boost statistical power), and 78 | like it was suggested in the original paper functionality is provided for an arbitrary number of groups (not just 2). 79 | A small p-value in a dimension suggests that the groups are different (separated) in that dimension. 
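In pseudo-code (an editorial sketch of the procedure described above, not the package's internal implementation, with `loss` standing for the test statistic and `shuffle_groups` for a random relabelling of the diagrams):
\preformatted{
# obs <- loss(g1, g2)                              # within-group joint loss
# perm <- replicate(iterations, loss(shuffle_groups(g1, g2)))
# p_value <- (sum(perm <= obs) + 1)/(iterations + 1)
}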
80 | If `distance` is "fisher" then `sigma` must not be NULL. TDAstats also has a `permutation_test` function 81 | so care should be taken to use the desired function when using TDApplied with TDAstats. If `dist_mats` is supplied 82 | then the sum of the elements of `group_sizes` must equal the number of rows and columns of each of its elements. 83 | } 84 | \examples{ 85 | 86 | if(require("TDAstats")) 87 | { 88 | # create two groups of diagrams 89 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 90 | dim = 0,threshold = 2) 91 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 92 | dim = 0,threshold = 2) 93 | g1 <- list(D1,D2) 94 | g2 <- list(D1,D2) 95 | 96 | # run test in dimension 0 with 1 iteration, note that the TDA package function 97 | # "permutation_test" can mask TDApplied's function, so we will specify explicitly 98 | # which function we are using 99 | perm_test <- TDApplied::permutation_test(g1,g2,iterations = 1, 100 | num_workers = 2, 101 | dims = c(0)) 102 | 103 | # repeat with precomputed distance matrix, gives similar results 104 | # (same but the randomness of the permutations can give small differences) 105 | # just much faster 106 | D <- distance_matrix(diagrams = list(D1,D2,D1,D2),dim = 0, 107 | num_workers = 2) 108 | perm_test <- TDApplied::permutation_test(dist_mats = list(D),group_sizes = c(2,2), 109 | dims = c(0)) 110 | } 111 | } 112 | \references{ 113 | Robinson T, Turner K (2017). "Hypothesis testing for topological data analysis." \url{https://link.springer.com/article/10.1007/s41468-017-0008-7}. 114 | 115 | Abdallah H et al. (2021). "Statistical Inference for Persistent Homology applied to fMRI." \url{https://github.com/hassan-abdallah/Statistical_Inference_PH_fMRI/blob/main/Abdallah_et_al_Statistical_Inference_PH_fMRI.pdf}. 116 | } 117 | \seealso{ 118 | \code{\link{independence_test}} for an inferential test of independence for two groups of persistence diagrams. 119 | } 120 | \author{ 121 | Shael Brown - \email{shaelebrown@gmail.com} 122 | } 123 | -------------------------------------------------------------------------------- /src/ANNx.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: ANNx.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Last modified: 03/04/98 (Release 0.1) 5 | // Description: Internal include file for ANN 6 | // 7 | // These declarations are of use in manipulating some of 8 | // the internal data objects appearing in ANN, but are not 9 | // needed for applications just using the nearest neighbor 10 | // search. 11 | // 12 | // Typical users of ANN should not need to access this file. 13 | //---------------------------------------------------------------------- 14 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 15 | // David Mount. All Rights Reserved. 16 | // 17 | // This software and related documentation is part of the Approximate 18 | // Nearest Neighbor Library (ANN). This software is provided under 19 | // the provisions of the Lesser GNU Public License (LGPL). See the 20 | // file ../ReadMe.txt for further information. 21 | // 22 | // The University of Maryland (U.M.) and the authors make no 23 | // representations about the suitability or fitness of this software for 24 | // any purpose. It is provided "as is" without express or implied 25 | // warranty. 
26 | //---------------------------------------------------------------------- 27 | // History: 28 | // Revision 0.1 03/04/98 29 | // Initial release 30 | // Revision 1.0 04/01/05 31 | // Changed LO, HI, IN, OUT to ANN_LO, ANN_HI, etc. 32 | //---------------------------------------------------------------------- 33 | 34 | #ifndef ANNx_H 35 | #define ANNx_H 36 | 37 | #include <iomanip> // I/O manipulators 38 | #include "ANN.h" // ANN includes 39 | 40 | //---------------------------------------------------------------------- 41 | // Global constants and types 42 | //---------------------------------------------------------------------- 43 | enum {ANN_LO=0, ANN_HI=1}; // splitting indices 44 | enum {ANN_IN=0, ANN_OUT=1}; // shrinking indices 45 | // what to do in case of error 46 | enum ANNerr {ANNwarn = 0, ANNabort = 1}; 47 | 48 | //---------------------------------------------------------------------- 49 | // Maximum number of points to visit 50 | // We have an option for terminating the search early if the 51 | // number of points visited exceeds some threshold. If the 52 | // threshold is 0 (its default) this means there is no limit 53 | // and the algorithm applies its normal termination condition. 54 | //---------------------------------------------------------------------- 55 | 56 | extern int ANNmaxPtsVisited; // maximum number of pts visited 57 | extern int ANNptsVisited; // number of pts visited in search 58 | 59 | //---------------------------------------------------------------------- 60 | // Global function declarations 61 | //---------------------------------------------------------------------- 62 | 63 | void annError( // ANN error routine 64 | char *msg, // error message 65 | ANNerr level); // level of error 66 | 67 | void annPrintPt( // print a point 68 | ANNpoint pt, // the point 69 | int dim, // the dimension 70 | std::ostream &out); // output stream 71 | 72 | //---------------------------------------------------------------------- 73 | // Orthogonal (axis aligned) rectangle 74 | // Orthogonal rectangles are represented by two points, one 75 | // for the lower left corner (min coordinates) and the other 76 | // for the upper right corner (max coordinates). 77 | // 78 | // The constructor initializes from either a pair of coordinates, 79 | // pair of points, or another rectangle. Note that all constructors 80 | // allocate new point storage. The destructor deallocates this 81 | // storage. 82 | // 83 | // BEWARE: Orthogonal rectangles should be passed ONLY BY REFERENCE. 84 | // (C++'s default copy constructor will not allocate new point 85 | // storage, then on return the destructor frees storage, and then 86 | // you get into big trouble in the calling procedure.)
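//
//      For illustration (editorial sketch, not in the original ANN sources):
//
//          void fine(const ANNorthRect &r);   // OK: passed by reference
//          void risky(ANNorthRect r);         // BAD: the shallow copy shares lo/hi
//                                             // storage with the caller's rectangle;
//                                             // the copy's destructor frees that
//                                             // storage on return, leaving the
//                                             // original dangling and later freed twice.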
87 | //---------------------------------------------------------------------- 88 | 89 | class ANNorthRect { 90 | public: 91 | ANNpoint lo; // rectangle lower bounds 92 | ANNpoint hi; // rectangle upper bounds 93 | // 94 | ANNorthRect( // basic constructor 95 | int dd, // dimension of space 96 | ANNcoord l=0, // default is empty 97 | ANNcoord h=0) 98 | { lo = annAllocPt(dd, l); hi = annAllocPt(dd, h); } 99 | 100 | ANNorthRect( // (almost a) copy constructor 101 | int dd, // dimension 102 | const ANNorthRect &r) // rectangle to copy 103 | { lo = annCopyPt(dd, r.lo); hi = annCopyPt(dd, r.hi); } 104 | 105 | ANNorthRect( // construct from points 106 | int dd, // dimension 107 | ANNpoint l, // low point 108 | ANNpoint h) // hight point 109 | { lo = annCopyPt(dd, l); hi = annCopyPt(dd, h); } 110 | 111 | ~ANNorthRect() // destructor 112 | { annDeallocPt(lo); annDeallocPt(hi); } 113 | 114 | ANNbool inside(int dim, ANNpoint p);// is point p inside rectangle? 115 | }; 116 | 117 | void annAssignRect( // assign one rect to another 118 | int dim, // dimension (both must be same) 119 | ANNorthRect &dest, // destination (modified) 120 | const ANNorthRect &source); // source 121 | 122 | //---------------------------------------------------------------------- 123 | // Orthogonal (axis aligned) halfspace 124 | // An orthogonal halfspace is represented by an integer cutting 125 | // dimension cd, coordinate cutting value, cv, and side, sd, which is 126 | // either +1 or -1. Our convention is that point q lies in the (closed) 127 | // halfspace if (q[cd] - cv)*sd >= 0. 128 | //---------------------------------------------------------------------- 129 | 130 | class ANNorthHalfSpace { 131 | public: 132 | int cd; // cutting dimension 133 | ANNcoord cv; // cutting value 134 | int sd; // which side 135 | // 136 | ANNorthHalfSpace() // default constructor 137 | { cd = 0; cv = 0; sd = 0; } 138 | 139 | ANNorthHalfSpace( // basic constructor 140 | int cdd, // dimension of space 141 | ANNcoord cvv, // cutting value 142 | int sdd) // side 143 | { cd = cdd; cv = cvv; sd = sdd; } 144 | 145 | ANNbool in(ANNpoint q) const // is q inside halfspace? 146 | { return (ANNbool) ((q[cd] - cv)*sd >= 0); } 147 | 148 | ANNbool out(ANNpoint q) const // is q outside halfspace? 149 | { return (ANNbool) ((q[cd] - cv)*sd < 0); } 150 | 151 | ANNdist dist(ANNpoint q) const // (squared) distance from q 152 | { return (ANNdist) ANN_POW(q[cd] - cv); } 153 | 154 | void setLowerBound(int d, ANNpoint p)// set to lower bound at p[i] 155 | { cd = d; cv = p[d]; sd = +1; } 156 | 157 | void setUpperBound(int d, ANNpoint p)// set to upper bound at p[i] 158 | { cd = d; cv = p[d]; sd = -1; } 159 | 160 | void project(ANNpoint &q) // project q (modified) onto halfspace 161 | { if (out(q)) q[cd] = cv; } 162 | }; 163 | 164 | // array of halfspaces 165 | typedef ANNorthHalfSpace *ANNorthHSArray; 166 | 167 | #endif -------------------------------------------------------------------------------- /tests/testthat/test-python.R: -------------------------------------------------------------------------------- 1 | # all python tests are skipped to avoid build errors, even though they succeed locally 2 | # to run the tests the reticulate package must be installed, correctly hooked up to 3 | # python, and the ripser module must be downloaded. 
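# A rough local setup might look like the following (editorial sketch only; the
# exact steps depend on the machine and python installation):
#   install.packages("reticulate")
#   reticulate::py_install("ripser")   # or: pip install ripser
# and then change skip_if(T) to skip_if(F) in the tests below to run them.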
4 | 5 | test_that("ripser can be imported and verified.",{ 6 | 7 | skip_if(T) 8 | ripser <- import_ripser() 9 | expect_invisible(check_ripser(ripser)) 10 | expect_error(check_ripser(2),"ripser object") 11 | expect_error(check_ripser(NULL),"ripser object") 12 | np <- reticulate::import("numpy") 13 | expect_error(check_ripser(np),"ripser object") 14 | 15 | }) 16 | 17 | test_that("PyH can detect bad input parameters.",{ 18 | 19 | skip_if(T) 20 | ripser <- import_ripser() 21 | expect_error(PyH(X = data.frame(),maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"two rows") 22 | expect_error(PyH(X = NULL,maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"dataframe") 23 | expect_error(PyH(X = data.frame(x = 1:2,y = c("1","2")),maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"numeric") 24 | expect_error(PyH(X = data.frame(x = c(1,NA,2)),maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"missing") 25 | expect_error(PyH(X = data.frame(x = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser),"square") 26 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser),"matrix") 27 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = NA,thresh = 1,distance_mat = T,ripser = ripser),"maxdim") 28 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = -1,thresh = 1,distance_mat = T,ripser = ripser),"maxdim") 29 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = NULL,ripser = ripser),"NULL") 30 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = NA,ripser = ripser),"NA") 31 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = c(T,F),ripser = ripser),"logical") 32 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,ignore_infinite_cluster = NULL),"NULL") 33 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,ignore_infinite_cluster = c(T,F)),"single") 34 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,ignore_infinite_cluster = NA),"NA") 35 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,calculate_representatives = NULL),"NULL") 36 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,calculate_representatives = c(T,F)),"single") 37 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,calculate_representatives = NA),"NA") 38 | 39 | }) 40 | 41 | test_that("PyH is computing correctly.",{ 42 | 43 | skip_if(T) 44 | skip_if_not_installed("TDAstats") 45 | D1 <- data.frame(x = stats::rnorm(20),y = stats::rnorm(20)) 46 | D2 <- data.frame(x = stats::rnorm(20),y = stats::rnorm(20)) 47 | D3 <- data.frame(x = stats::rnorm(20),y = stats::rnorm(20)) 48 | 49 | phom_TDA_1 <- diagram_to_df(TDAstats::calculate_homology(D1,threshold = 5)) 50 | phom_TDA_2 <- diagram_to_df(TDAstats::calculate_homology(D2,threshold = 5)) 51 | phom_TDA_3 <- diagram_to_df(TDAstats::calculate_homology(D3,threshold = 5)) 52 | 53 | ripser <- import_ripser() 54 | 55 | phom_py_1 <- PyH(D1,thresh = 5,ripser = ripser) 56 | phom_py_2 <- PyH(D2,thresh = 5,ripser = ripser) 57 | phom_py_3 <- PyH(D3,thresh = 5,ripser = ripser) 58 | 59 | 
expect_equal(phom_TDA_1,phom_py_1,tolerance = 0.00001) 60 | expect_equal(phom_TDA_2,phom_py_2,tolerance = 0.00001) 61 | expect_equal(phom_TDA_3,phom_py_3,tolerance = 0.000001) 62 | 63 | phom_with_extra_cluster <- PyH(D1,thresh = 5,ripser = ripser,ignore_infinite_cluster = F) 64 | 65 | expect_length(which(phom_with_extra_cluster$dimension == 0),20) 66 | 67 | phom_with_reps <- PyH(D1,thresh = 5,ripser = ripser,calculate_representatives = T) 68 | expect_type(phom_with_reps,"list") 69 | 70 | circ <- TDAstats::circle2d[sample(1:100,10),] 71 | phom_with_empty_dim <- PyH(circ,thresh = 2,ripser = ripser,maxdim = 2) 72 | expect_s3_class(phom_with_empty_dim,"data.frame") 73 | 74 | }) 75 | 76 | test_that("bootstrap function can detect PyH errors correctly.",{ 77 | 78 | skip_if(T) 79 | skip_if_not_installed("TDAstats") 80 | ripser = import_ripser() 81 | D <- TDAstats::circle2d[sample(1:100,10),] 82 | expect_error(bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = NULL),"NULL") 83 | expect_error(bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = 2),"logical") 84 | expect_error(bootstrap_persistence_thresholds(X = D,FUN_boot = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = NA),"NA") 85 | 86 | }) 87 | 88 | test_that("PyH functionality works in bootstrap function.",{ 89 | 90 | skip_if(T) 91 | skip_if_not_installed("TDAstats") 92 | ripser = import_ripser() 93 | D <- TDAstats::circle2d[sample(1:100,10),] 94 | 95 | # PyH with multiple thresholds 96 | bs <- bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",FUN_boot = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = F) 97 | expect_length(bs$representatives[[2]],length(which(bs$diag$dimension == 1))) 98 | expect_length(bs$thresholds,2) 99 | expect_gt(bs$thresholds[[1]],0) 100 | expect_gt(bs$thresholds[[2]],0) 101 | expect_lte(length(bs$subsetted_representatives),nrow(bs$subsetted_diag) + 1) 102 | if(length(which(bs$subsetted_diag$dimension == 0)) > 0) 103 | { 104 | expect_true(min(bs$subsetted_diag[which(bs$subsetted_diag$dimension == 0),]$death - bs$subsetted_diag[which(bs$subsetted_diag$dimension == 0),]$birth) >= bs$thresholds[[1]]) 105 | } 106 | 107 | expect_true(min(bs$subsetted_diag[which(bs$subsetted_diag$dimension == 1),]$death - bs$subsetted_diag[which(bs$subsetted_diag$dimension == 1),]$birth) > bs$thresholds[[2]]) 108 | 109 | # check on circle 110 | bs <- bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3) 111 | expect_lte(length(bs$subsetted_diag$dimension),1) 112 | bs <- bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,return_diag = T,ripser = ripser,ignore_infinite_cluster = F,num_workers = 2,num_samples = 3) 113 | expect_lte(length(bs$subsetted_diag$dimension),2) 114 | 115 | }) 116 | -------------------------------------------------------------------------------- /man/diagram_ksvm.Rd: 
/man/diagram_ksvm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/machine_learning.R
 3 | \name{diagram_ksvm}
 4 | \alias{diagram_ksvm}
 5 | \title{Fit a support vector machine model where each training set instance is a persistence diagram.}
 6 | \usage{
 7 | diagram_ksvm(
 8 |   diagrams,
 9 |   cv = 1,
10 |   dim,
11 |   t = 1,
12 |   sigma = 1,
13 |   rho = NULL,
14 |   y,
15 |   type = NULL,
16 |   distance_matrices = NULL,
17 |   C = 1,
18 |   nu = 0.2,
19 |   epsilon = 0.1,
20 |   prob.model = FALSE,
21 |   class.weights = NULL,
22 |   fit = TRUE,
23 |   cache = 40,
24 |   tol = 0.001,
25 |   shrinking = TRUE,
26 |   num_workers = parallelly::availableCores(omit = 1)
27 | )
28 | }
29 | \arguments{
30 | \item{diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.}
31 | 
32 | \item{cv}{a positive number at most the length of `diagrams` which determines the number of cross-validation splits to be performed (default 1, i.e. no cross-validation). If `prob.model` is TRUE then cv is set to 1, since kernlab performs 3-fold CV internally in this case. When performing classification, classes are balanced within each cv fold.}
33 | 
34 | \item{dim}{a non-negative integer vector of homological dimensions in which the model is to be fit.}
35 | 
36 | \item{t}{either a vector of positive numbers representing the grid of values for the scale of the persistence Fisher kernel or NULL, default 1. If NULL then t is selected automatically, see details.}
37 | 
38 | \item{sigma}{a vector of positive numbers representing the grid of values for the bandwidth of the Fisher information metric, default 1.}
39 | 
40 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, distance matrix calculations are sequential.}
41 | 
42 | \item{y}{a response vector with one label for each persistence diagram. Must be either numeric or factor, but does not need to be supplied when `type` is "one-svc".}
43 | 
44 | \item{type}{a string representing the type of task to be performed. Can be any one of "C-svc","nu-svc","one-svc","eps-svr","nu-svr"; the default for regression is "eps-svr" and for classification is "C-svc". See \code{\link[kernlab]{ksvm}} for details.}
45 | 
46 | \item{distance_matrices}{an optional list of precomputed Fisher distance matrices, corresponding to the rows in `expand.grid(dim = dim,sigma = sigma)`, default NULL.}
47 | 
48 | \item{C}{a number representing the cost of constraint violation (default 1); this is the 'C'-constant of the regularization term in the Lagrange formulation.}
49 | 
50 | \item{nu}{numeric parameter needed for nu-svc, one-svc and nu-svr. The `nu` parameter sets the upper bound on the training error and the lower bound on the fraction of data points that become support vectors (default 0.2).}
51 | 
52 | \item{epsilon}{epsilon in the insensitive-loss function used for eps-svr, nu-svr and eps-bsvm (default 0.1).}
53 | 
54 | \item{prob.model}{if set to TRUE, builds a model for calculating class probabilities or, in the case of regression, calculates the scaling parameter of the Laplacian distribution fitted on the residuals. Fitting is done on output data created by performing a 3-fold cross-validation on the training data. For details see references (default FALSE).}
55 | 
56 | \item{class.weights}{a named vector of weights for the different classes, used for asymmetric class sizes. Not all factor levels have to be supplied (default weight: 1). All components have to be named.}
57 | 
58 | \item{fit}{indicates whether the fitted values should be computed and included in the model or not (default TRUE).}
59 | 
60 | \item{cache}{cache memory in MB (default 40).}
61 | 
62 | \item{tol}{tolerance of the termination criterion (default 0.001).}
63 | 
64 | \item{shrinking}{option whether to use the shrinking heuristics (default TRUE).}
65 | 
66 | \item{num_workers}{the number of cores used for parallel computation; the default is one less than the number of cores on the machine.}
67 | }
68 | \value{
69 | a list of class 'diagram_ksvm' containing the elements
70 | 
71 | \describe{
72 | 
73 | \item{cv_results}{the cross-validation results: a matrix storing the parameters for each model in the tuning grid and its mean cross-validation error over all splits.}
74 | 
75 | \item{best_model}{a list containing the output of \code{\link[kernlab]{ksvm}} run on the whole dataset with the optimal model parameters found during cross-validation, as well as the optimal kernel parameters for the model.}
76 | 
77 | \item{diagrams}{the diagrams which were supplied in the function call.}
78 | 
79 | }
80 | }
81 | \description{
82 | Returns the output of kernlab's \code{\link[kernlab]{ksvm}} function on the Gram matrix of the list of persistence diagrams
83 | in a particular dimension.
84 | }
85 | \details{
86 | Cross-validation is carried out in parallel, using a trick
87 | noted in \doi{10.1007/s41468-017-0008-7}: since the persistence Fisher kernel can be
88 | written as \eqn{k_{PF}(D_1,D_2)=exp(-t*d_{FIM}(D_1,D_2))=exp(-d_{FIM}(D_1,D_2))^t}, the
89 | Fisher information metric distance matrix for each sigma value in the parameter grid can be stored
90 | and reused across all values of t, avoiding redundant distance computations during parallel cross-validation.
91 | Note that the response parameter `y` must be a factor for classification -
92 | a character vector, for instance, will throw an error. If `t` is NULL then 1/`t` is selected as
93 | the 1, 2, 5, 10, 20 and 50 percentiles of the upper triangle of the distance matrix of the training sample (computed per fold in the case of cross-validation).
94 | This is the procedure suggested in the persistence Fisher kernel paper. If
95 | any of these percentiles would cause division by 0 (i.e. if the training set is small) then the minimum non-zero element
96 | is taken as the denominator instead (and hence the returned parameters may have duplicate rows except for differing error values). If
97 | cross-validation is performed then the mean error across folds is still recorded, but the best `t` parameter
98 | across all folds is recorded in the cv results table.
 99 | }
100 | \examples{
101 | 
102 | if(require("TDAstats"))
103 | {
104 |   # create four diagrams
105 |   D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
106 |                                      dim = 1,threshold = 2)
107 |   D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
108 |                                      dim = 1,threshold = 2)
109 |   D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),],
110 |                                      dim = 1,threshold = 2)
111 |   D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),],
112 |                                      dim = 1,threshold = 2)
113 |   g <- list(D1,D2,D3,D4)
114 | 
115 |   # create response vector
116 |   y <- as.factor(c("circle","circle","sphere","sphere"))
117 | 
118 |   # fit model without cross-validation
119 |   model_svm <- diagram_ksvm(diagrams = g,cv = 1,dim = c(0),
120 |                             y = y,sigma = c(1),t = c(1),
121 |                             num_workers = 2)
122 | }
123 | }
124 | \references{
125 | Murphy, K. "Machine Learning: A Probabilistic Perspective." MIT Press (2012).
126 | }
127 | \seealso{
128 | \code{\link{predict_diagram_ksvm}} for predicting labels of new diagrams.
129 | }
130 | \author{
131 | Shael Brown - \email{shaelebrown@gmail.com}
132 | }
133 | 
--------------------------------------------------------------------------------
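To make the distance-matrix reuse described in the details section above concrete, here is a small hedged sketch (not taken from the package sources, and not the function's internal code path). It assumes that distance_matrix() accepts distance = "fisher" and a sigma bandwidth analogous to diagram_distance(), and the sigma and t values are illustrative only:

# sketch of the identity k_PF = exp(-t * d_FIM): compute the Fisher
# information metric distance matrix once per sigma, then derive the
# Gram matrix for any t without recomputing distances
library(TDApplied)
library(TDAstats)

D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], dim = 1, threshold = 2)
D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], dim = 1, threshold = 2)
D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], dim = 1, threshold = 2)
D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], dim = 1, threshold = 2)
g <- list(D1, D2, D3, D4)

# Fisher information metric distances in dimension 1 for a single sigma value
d_fim <- distance_matrix(diagrams = g, dim = 1, distance = "fisher",
                         sigma = 1, num_workers = 2)

# Gram matrices for several t values, with no further distance computations
gram_list <- lapply(c(0.5, 1, 2), function(t) exp(-t * d_fim))

# automatic t grid when t = NULL: 1/t set to percentiles of the pairwise
# distances, falling back to the minimum non-zero distance for zero percentiles
ut <- d_fim[upper.tri(d_fim)]
q <- stats::quantile(ut, probs = c(0.01, 0.02, 0.05, 0.1, 0.2, 0.5))
q[q == 0] <- min(ut[ut > 0])
t_grid <- unique(1 / q)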