├── vignettes ├── .gitignore ├── cols.rds ├── emb.rds ├── r_face.rds ├── thresh.rds ├── r_nodes.png ├── r_shape.rds ├── rips_all.rds ├── shape_rt.rds ├── rips_cycle.rds ├── theta_nodes.png ├── rips_secondary.rds ├── cols_respiratory.rds └── cols_time_since_last_block.rds ├── src ├── Makevars.win ├── ANN.o ├── perf.o ├── bd_tree.o ├── brute.o ├── figtree.o ├── kd_dump.o ├── kd_tree.o ├── kd_util.o ├── bd_search.o ├── kd_search.o ├── kd_split.o ├── RcppExports.o ├── TDApplied.so ├── bd_pr_search.o ├── kd_pr_search.o ├── KCenterClustering.o ├── bd_fix_rad_search.o ├── kd_fix_rad_search.o ├── RcppExports.cpp ├── kd_fix_rad_search.h ├── kd_pr_search.h ├── kd_search.h ├── bd_search.cpp ├── bd_pr_search.cpp ├── kd_split.h ├── bd_fix_rad_search.cpp ├── bd_tree.h ├── brute.cpp ├── pr_queue.h ├── pr_queue_k.h ├── kd_util.h ├── perf.cpp ├── KCenterClustering.h └── ANNx.h ├── tests ├── testthat.R └── testthat │ ├── test-convert.R │ ├── test-enclosing.R │ ├── test-utilities.R │ ├── test-plot.R │ ├── test-MDS.R │ ├── test-kernel.R │ └── test-python.R ├── .gitignore ├── CRAN-SUBMISSION ├── R ├── zzz.R ├── TDApplied-package.R ├── RcppExports.R ├── convert.R ├── enclosing_rad.R └── kernel_calculations.R ├── .Rbuildignore ├── TDApplied.Rproj ├── man ├── check_ripser.Rd ├── import_ripser.Rd ├── check_PyH_setup.Rd ├── enclosing_radius.Rd ├── TDApplied-package.Rd ├── diagram_to_df.Rd ├── loss.Rd ├── diagram_kernel.Rd ├── predict_diagram_kkmeans.Rd ├── plot_diagram.Rd ├── gram_matrix.Rd ├── PyH.Rd ├── vr_graphs.Rd ├── predict_diagram_ksvm.Rd ├── predict_diagram_kpca.Rd ├── distance_matrix.Rd ├── diagram_kkmeans.Rd ├── diagram_distance.Rd ├── plot_vr_graph.Rd ├── analyze_representatives.Rd ├── independence_test.Rd ├── diagram_kpca.Rd ├── diagram_mds.Rd ├── universal_null.Rd ├── permutation_model_inference.Rd ├── bootstrap_persistence_thresholds.Rd ├── permutation_test.Rd └── diagram_ksvm.Rd ├── cran-comments.md ├── DESCRIPTION ├── NAMESPACE ├── exec └── parallel_with_approximation.R └── NEWS.md /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS += -DFIGTREE_DLL_EXPORTS -DDLL_EXPORTS -------------------------------------------------------------------------------- /src/ANN.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/ANN.o -------------------------------------------------------------------------------- /src/perf.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/perf.o -------------------------------------------------------------------------------- /src/bd_tree.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_tree.o -------------------------------------------------------------------------------- /src/brute.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/brute.o -------------------------------------------------------------------------------- /src/figtree.o: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/figtree.o -------------------------------------------------------------------------------- /src/kd_dump.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_dump.o -------------------------------------------------------------------------------- /src/kd_tree.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_tree.o -------------------------------------------------------------------------------- /src/kd_util.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_util.o -------------------------------------------------------------------------------- /src/bd_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_search.o -------------------------------------------------------------------------------- /src/kd_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_search.o -------------------------------------------------------------------------------- /src/kd_split.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_split.o -------------------------------------------------------------------------------- /src/RcppExports.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/RcppExports.o -------------------------------------------------------------------------------- /src/TDApplied.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/TDApplied.so -------------------------------------------------------------------------------- /src/bd_pr_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_pr_search.o -------------------------------------------------------------------------------- /src/kd_pr_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_pr_search.o -------------------------------------------------------------------------------- /vignettes/cols.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/cols.rds -------------------------------------------------------------------------------- /vignettes/emb.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/emb.rds -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(TDApplied) 3 | 4 | test_check("TDApplied") 5 | -------------------------------------------------------------------------------- /vignettes/r_face.rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/r_face.rds -------------------------------------------------------------------------------- /vignettes/thresh.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/thresh.rds -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | /doc/ 7 | /Meta/ 8 | -------------------------------------------------------------------------------- /src/KCenterClustering.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/KCenterClustering.o -------------------------------------------------------------------------------- /src/bd_fix_rad_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/bd_fix_rad_search.o -------------------------------------------------------------------------------- /src/kd_fix_rad_search.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/src/kd_fix_rad_search.o -------------------------------------------------------------------------------- /vignettes/r_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/r_nodes.png -------------------------------------------------------------------------------- /vignettes/r_shape.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/r_shape.rds -------------------------------------------------------------------------------- /vignettes/rips_all.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/rips_all.rds -------------------------------------------------------------------------------- /vignettes/shape_rt.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/shape_rt.rds -------------------------------------------------------------------------------- /vignettes/rips_cycle.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/rips_cycle.rds -------------------------------------------------------------------------------- /vignettes/theta_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/theta_nodes.png -------------------------------------------------------------------------------- /vignettes/rips_secondary.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/rips_secondary.rds -------------------------------------------------------------------------------- /CRAN-SUBMISSION: 
-------------------------------------------------------------------------------- 1 | Version: 3.0.4 2 | Date: 2024-10-27 22:04:37 UTC 3 | SHA: f72e303388b8467a96701d8746df308db4029c69 4 | -------------------------------------------------------------------------------- /vignettes/cols_respiratory.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/cols_respiratory.rds -------------------------------------------------------------------------------- /vignettes/cols_time_since_last_block.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shaelebrown/TDApplied/HEAD/vignettes/cols_time_since_last_block.rds -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | 2 | # unload C++ DLL for proper cleanup 3 | .onUnload <- function (libpath) { 4 | library.dynam.unload("TDApplied", libpath) 5 | } -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^cran-comments\.md$ 5 | ^CRAN-SUBMISSION$ 6 | ^doc$ 7 | ^Meta$ 8 | ^LICENSE\.md$ 9 | -------------------------------------------------------------------------------- /R/TDApplied-package.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib TDApplied, .registration = TRUE 2 | #' @docType package 3 | #' @keywords internal 4 | "_PACKAGE" 5 | 6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | figtree <- function(X, h, Q, Y, epsilon, G) { 5 | .Call(`_TDApplied_figtree`, X, h, Q, Y, epsilon, G) 6 | } 7 | 8 | -------------------------------------------------------------------------------- /TDApplied.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/check_ripser.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{check_ripser} 4 | \alias{check_ripser} 5 | \title{Verify an imported ripser module.} 6 | \usage{ 7 | check_ripser(ripser) 8 | } 9 | \arguments{ 10 | \item{ripser}{the ripser module object.} 11 | } 12 | \description{ 13 | Verify an imported ripser module. 
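Together with `check_PyH_setup` and `import_ripser` (documented just below), the intended setup flow can be sketched as follows. This is only a sketch: `check_PyH_setup` and `check_ripser` are documented but not listed among the NAMESPACE exports, so the `:::` access and the ordering of the calls are assumptions rather than package-prescribed usage.

library(TDApplied)

# confirm reticulate, python and the python module ripser are all available
TDApplied:::check_PyH_setup()

# import ripser (reticulate::import("ripser") plus additional checks)
ripser <- import_ripser()

# verify the imported module object before passing it to PyH()
TDApplied:::check_ripser(ripser)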
14 | } 15 | \author{ 16 | Shael Brown - \email{shaelebrown@gmail.com} 17 | } 18 | -------------------------------------------------------------------------------- /man/import_ripser.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{import_ripser} 4 | \alias{import_ripser} 5 | \title{Import the python module ripser.} 6 | \usage{ 7 | import_ripser() 8 | } 9 | \value{ 10 | the python ripser module. 11 | } 12 | \description{ 13 | The ripser module is needed for fast persistent cohomology calculations with the PyH function. 14 | } 15 | \details{ 16 | Same as "reticulate::import("ripser")", just with additional checks. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | # import ripser 21 | ripser <- import_ripser() 22 | } 23 | } 24 | \author{ 25 | Shael Brown - \email{shaelebrown@gmail.com} 26 | } 27 | -------------------------------------------------------------------------------- /man/check_PyH_setup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{check_PyH_setup} 4 | \alias{check_PyH_setup} 5 | \title{Make sure that python has been configured correctly for persistent homology calculations.} 6 | \usage{ 7 | check_PyH_setup() 8 | } 9 | \description{ 10 | Ensures that the reticulate package has been installed, that python is available to be used 11 | by reticulate functions, and that the python module "ripser" has been installed. 12 | } 13 | \details{ 14 | An error message will be thrown if any of the above conditions are not met. 15 | } 16 | \author{ 17 | Shael Brown - \email{shaelebrown@gmail.com} 18 | } 19 | -------------------------------------------------------------------------------- /tests/testthat/test-convert.R: -------------------------------------------------------------------------------- 1 | 2 | # test_that("diagram_to_df can accept the right kinds of input",{ 3 | # 4 | # skip_if_not_installed("TDA") 5 | # skip_if_not_installed("TDAstats") 6 | # D <- TDA::circleUnif(n = 20,r = 1) 7 | # phom_TDA <- TDA::ripsDiag(X = D,maxdimension = 1,maxscale = 2) 8 | # phom_TDAstats <- TDAstats::calculate_homology(mat = D,threshold = 2) 9 | # simulated_PyH_phom <- list(diagram = diagram_to_df(phom_TDA),representatives = list()) 10 | # expect_s3_class(diagram_to_df(phom_TDA),"data.frame") 11 | # expect_s3_class(diagram_to_df(phom_TDAstats),"data.frame") 12 | # expect_s3_class(diagram_to_df(diagram_to_df(phom_TDA)),"data.frame") 13 | # expect_s3_class(diagram_to_df(simulated_PyH_phom),"data.frame") 14 | # 15 | # }) 16 | 17 | test_that("diagram_to_df can detect incorrect parameters properly",{ 18 | 19 | expect_error(diagram_to_df(2),"computation") 20 | 21 | }) -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | 2 | ## Test environments 3 | * local Mac OS X install, R 4.1.2 4 | * win-builder (devel and release) 5 | * rhub windows virtual machine 6 | * rhub macos virtual machine 7 | * rhub linux virtual machine 8 | * rhub ubuntu-release, valgrind, ubuntu-clang, clang19 and atlas containers 9 | 10 | ## R CMD check results 11 | 12 | 0 errors | 0 warnings | 1 note 13 | 14 | ## NOTES 15 | 16 | * the note on R CMD check is for large sub directory size (necessary for the extensive 
documentation needed for journal publication). 17 | * on rhub there are build errors for gcc14 (Fedora Linux R devel) and macos-arm64, seemingly because some of the package dependencies are not available on those platforms. 18 | * some of the examples run for over 5s, however these examples have been made as small and fast as possible without throwing errors. 19 | * there are domain-specific words and author names in ML_and_Inference.Rmd which were flagged by devtools::check_spelling() but to the author's knowledge they are all spelled correctly. 20 | 21 | 22 | -------------------------------------------------------------------------------- /man/enclosing_radius.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/enclosing_rad.R 3 | \name{enclosing_radius} 4 | \alias{enclosing_radius} 5 | \title{Compute the enclosing radius for a dataset.} 6 | \usage{ 7 | enclosing_radius(X, distance_mat = FALSE) 8 | } 9 | \arguments{ 10 | \item{X}{the input dataset, must either be a matrix or data frame.} 11 | 12 | \item{distance_mat}{whether or not `X` is a distance matrix, default FALSE.} 13 | } 14 | \value{ 15 | the numeric enclosing radius. 16 | } 17 | \description{ 18 | The enclosing radius is the minimum (Euclidean distance) radius beyond which no topological changes will occur. 19 | } 20 | \examples{ 21 | 22 | # create a persistence diagram from a 2D Gaussian 23 | df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 24 | 25 | # compute the enclosing radius from the point cloud 26 | enc_rad <- enclosing_radius(df, distance_mat = FALSE) 27 | 28 | # compute the distance matrix manually, stored as a matrix 29 | dist_df <- as.matrix(dist(df)) 30 | 31 | # compute the enclosing radius from the distance matrix 32 | enc_rad <- enclosing_radius(dist_df, distance_mat = TRUE) 33 | } 34 | \author{ 35 | Shael Brown - \email{shaelebrown@gmail.com} 36 | } 37 | -------------------------------------------------------------------------------- /man/TDApplied-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TDApplied-package.R 3 | \docType{package} 4 | \name{TDApplied-package} 5 | \alias{TDApplied} 6 | \alias{TDApplied-package} 7 | \title{TDApplied: Machine Learning and Inference for Topological Data Analysis} 8 | \description{ 9 | Topological data analysis is a powerful tool for finding non-linear global structure in whole datasets. The main tool of topological data analysis is persistent homology, which computes a topological shape descriptor of a dataset called a persistence diagram. 'TDApplied' provides useful and efficient methods for analyzing groups of persistence diagrams with machine learning and statistical inference, and these functions can also interface with other data science packages to form flexible and integrated topological data analysis pipelines. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/shaelebrown/TDApplied} 15 | \item Report bugs at \url{https://github.com/shaelebrown/TDApplied/issues} 16 | } 17 | 18 | } 19 | \author{ 20 | \strong{Maintainer}: Shael Brown \email{shaelebrown@gmail.com} 21 | 22 | Authors: 23 | \itemize{ 24 | \item Dr. 
Reza Farivar \email{reza.farivar@mcgill.ca} [funder]
25 | }
26 | 
27 | }
28 | \keyword{internal}
29 | 
--------------------------------------------------------------------------------
/tests/testthat/test-enclosing.R:
--------------------------------------------------------------------------------
1 | 
2 | test_that("enclosing_radius can detect incorrect inputs",{
3 | 
4 |   expect_error(enclosing_radius(NULL, NULL), "distance_mat")
5 |   expect_error(enclosing_radius(NULL, c(T,F)), "single")
6 |   expect_error(enclosing_radius(NULL, NA), "NA")
7 |   expect_error(enclosing_radius(NULL, T), "X")
8 |   expect_error(enclosing_radius(data.frame(),T),"X")
9 |   expect_error(enclosing_radius(data.frame(x = 1),T),"X")
10 |   expect_error(enclosing_radius(data.frame(x = c(1,2)),T),"X")
11 |   expect_error(enclosing_radius(X = NULL,T),"X")
12 |   expect_error(enclosing_radius(X = data.frame(x = c(1,NA)),T),"missing")
13 |   expect_error(enclosing_radius(data.frame(x = c(1),y = c(2)),T),"two")
14 |   expect_error(enclosing_radius(data.frame(x = c(1,2,3),y = c(2,1,2)),T),"square")
15 | 
16 | })
17 | 
18 | test_that("enclosing_radius is computing properly",{
19 | 
20 |   X <- data.frame(x = c(1:10),y = c(1:10))
21 |   dist_X <- as.matrix(dist(X))
22 |   expect_equal(enclosing_radius(X, F), dist_X[1,6])
23 |   expect_equal(enclosing_radius(dist_X, T), dist_X[1,6])
24 | 
25 |   theta <- runif(n = 100,min = 0,max = 2*pi)
26 |   x <- cos(theta)
27 |   y <- sin(theta)
28 |   df <- data.frame(x = x,y = y)
29 |   dist_df <- as.matrix(dist(df))
30 |   expect_equal(enclosing_radius(df, F),enclosing_radius(dist_df, T))
31 | 
32 | })
33 | 
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: TDApplied
2 | Type: Package
3 | Title: Machine Learning and Inference for Topological Data Analysis
4 | Version: 3.0.4
5 | Authors@R: c(person("Shael", "Brown", email = "shaelebrown@gmail.com", role = c("aut","cre")),
6 |     person("Dr. Reza", "Farivar", email = "reza.farivar@mcgill.ca", role = c("aut","fnd")))
7 | Author: Shael Brown [aut, cre],
8 |     Dr. Reza Farivar [aut, fnd]
9 | Maintainer: Shael Brown <shaelebrown@gmail.com>
10 | Description: Topological data analysis is a powerful tool for finding non-linear global structure
11 |     in whole datasets. The main tool of topological data analysis is persistent homology, which computes
12 |     a topological shape descriptor of a dataset called a persistence diagram. 'TDApplied' provides
13 |     useful and efficient methods for analyzing groups of persistence diagrams with machine learning and statistical inference,
14 |     and these functions can also interface with other data science packages to form flexible and integrated
15 |     topological data analysis pipelines.
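Returning to `enclosing_radius()` and the expectations in test-enclosing.R above: the tested values are consistent with reading the enclosing radius as the smallest "farthest-point" distance in the dataset, i.e. the minimum over data points of the maximum distance to any other point. A minimal sketch of that reading, computed directly from a distance matrix (an illustration, not the package's internal implementation):

X <- data.frame(x = 1:10, y = 1:10)
dist_X <- as.matrix(dist(X))

# smallest row-wise maximum of the distance matrix: 5*sqrt(2), i.e. dist_X[1,6]
min(apply(dist_X, 1, max))

# should agree with the package function on the same input
enclosing_radius(X, distance_mat = FALSE)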
16 | Depends: R (>= 3.5.0)
17 | Imports: parallel, doParallel, foreach, clue, rdist, parallelly, kernlab, iterators, methods, stats, utils, Rcpp (>= 0.11.0)
18 | License: GPL (>= 3)
19 | URL: https://github.com/shaelebrown/TDApplied
20 | BugReports: https://github.com/shaelebrown/TDApplied/issues
21 | Encoding: UTF-8
22 | NeedsCompilation: yes
23 | RoxygenNote: 7.3.2
24 | Suggests: 
25 |     rmarkdown,
26 |     knitr,
27 |     testthat (>= 3.0.0),
28 |     TDAstats,
29 |     reticulate,
30 |     TDA,
31 |     igraph
32 | LinkingTo: Rcpp
33 | VignetteBuilder: knitr, rmarkdown
34 | Config/testthat/edition: 3
35 | 
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | #include <Rcpp.h>
5 | 
6 | using namespace Rcpp;
7 | 
8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
9 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 | 
13 | // figtree
14 | std::vector<double> figtree(std::vector<double> X, double h, std::vector<double> Q, std::vector<double> Y, double epsilon, std::vector<double> G);
15 | RcppExport SEXP _TDApplied_figtree(SEXP XSEXP, SEXP hSEXP, SEXP QSEXP, SEXP YSEXP, SEXP epsilonSEXP, SEXP GSEXP) {
16 | BEGIN_RCPP
17 |     Rcpp::RObject rcpp_result_gen;
18 |     Rcpp::RNGScope rcpp_rngScope_gen;
19 |     Rcpp::traits::input_parameter< std::vector<double> >::type X(XSEXP);
20 |     Rcpp::traits::input_parameter< double >::type h(hSEXP);
21 |     Rcpp::traits::input_parameter< std::vector<double> >::type Q(QSEXP);
22 |     Rcpp::traits::input_parameter< std::vector<double> >::type Y(YSEXP);
23 |     Rcpp::traits::input_parameter< double >::type epsilon(epsilonSEXP);
24 |     Rcpp::traits::input_parameter< std::vector<double> >::type G(GSEXP);
25 |     rcpp_result_gen = Rcpp::wrap(figtree(X, h, Q, Y, epsilon, G));
26 |     return rcpp_result_gen;
27 | END_RCPP
28 | }
29 | 
30 | static const R_CallMethodDef CallEntries[] = {
31 |     {"_TDApplied_figtree", (DL_FUNC) &_TDApplied_figtree, 6},
32 |     {NULL, NULL, 0}
33 | };
34 | 
35 | RcppExport void R_init_TDApplied(DllInfo *dll) {
36 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
37 |     R_useDynamicSymbols(dll, FALSE);
38 | }
39 | 
--------------------------------------------------------------------------------
/man/diagram_to_df.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/convert.R
3 | \name{diagram_to_df}
4 | \alias{diagram_to_df}
5 | \title{Convert a TDA/TDAstats persistence diagram to a data frame.}
6 | \usage{
7 | diagram_to_df(d)
8 | }
9 | \arguments{
10 | \item{d}{the output of a TDA/TDAstats homology calculation, like ripsDiag or \code{\link[TDAstats]{calculate_homology}}.}
11 | }
12 | \value{
13 | a 3-column data frame, with each row representing a topological feature. The first column is the feature dimension (a non-negative integer), the second column is the birth radius of the feature and the third column is the death radius.
14 | }
15 | \description{
16 | The outputs of homology calculations from the R packages TDA
17 | and TDAstats are not data frames. This function converts these
18 | outputs into a data frame either for further usage in this package or
19 | for personalized analyses.
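In addition to the TDAstats example further down in this help file, the commented-out test in tests/testthat/test-convert.R (earlier in this listing) suggests the analogous TDA workflow; a sketch along those lines, assuming the TDA package is installed:

if(require("TDA"))
{
  # sample 20 points from the unit circle
  D <- TDA::circleUnif(n = 20, r = 1)

  # persistent homology via TDA; any location information is dropped on conversion
  phom_TDA <- TDA::ripsDiag(X = D, maxdimension = 1, maxscale = 2)

  # convert to the standard dimension/birth/death data frame
  phom_TDA_df <- diagram_to_df(d = phom_TDA)
}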
20 | } 21 | \details{ 22 | If a diagram is constructed using a TDA function like ripsDiag 23 | with the `location` parameter set to true then the return value will ignore the location information. 24 | } 25 | \examples{ 26 | 27 | if(require("TDAstats")) 28 | { 29 | # create a persistence diagram from a 2D Gaussian 30 | df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 31 | 32 | # compute persistence diagram with calculate_homology from package TDAstats 33 | phom_TDAstats = TDAstats::calculate_homology(mat = df,dim = 0,threshold = 1) 34 | 35 | # convert to data frame 36 | phom_TDAstats_df = diagram_to_df(d = phom_TDAstats) 37 | } 38 | } 39 | \author{ 40 | Shael Brown - \email{shaelebrown@gmail.com} 41 | } 42 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(PyH) 4 | export(bootstrap_persistence_thresholds) 5 | export(diagram_distance) 6 | export(diagram_kernel) 7 | export(diagram_kkmeans) 8 | export(diagram_kpca) 9 | export(diagram_ksvm) 10 | export(diagram_mds) 11 | export(diagram_to_df) 12 | export(distance_matrix) 13 | export(enclosing_radius) 14 | export(gram_matrix) 15 | export(import_ripser) 16 | export(independence_test) 17 | export(permutation_model_inference) 18 | export(permutation_test) 19 | export(plot_diagram) 20 | export(plot_vr_graph) 21 | export(predict_diagram_kkmeans) 22 | export(predict_diagram_kpca) 23 | export(predict_diagram_ksvm) 24 | export(universal_null) 25 | export(vr_graphs) 26 | import(Rcpp) 27 | importFrom(clue,solve_LSAP) 28 | importFrom(doParallel,registerDoParallel) 29 | importFrom(doParallel,stopImplicitCluster) 30 | importFrom(foreach,"%:%") 31 | importFrom(foreach,"%do%") 32 | importFrom(foreach,"%dopar%") 33 | importFrom(foreach,foreach) 34 | importFrom(graphics,legend) 35 | importFrom(graphics,lines) 36 | importFrom(graphics,points) 37 | importFrom(graphics,rect) 38 | importFrom(iterators,iter) 39 | importFrom(kernlab,as.kernelMatrix) 40 | importFrom(kernlab,kkmeans) 41 | importFrom(kernlab,kpca) 42 | importFrom(kernlab,ksvm) 43 | importFrom(kernlab,predict) 44 | importFrom(methods,is) 45 | importFrom(parallel,clusterEvalQ) 46 | importFrom(parallel,clusterExport) 47 | importFrom(parallel,makeCluster) 48 | importFrom(parallel,stopCluster) 49 | importFrom(parallelly,availableCores) 50 | importFrom(rdist,cdist) 51 | importFrom(stats,as.dendrogram) 52 | importFrom(stats,as.dist) 53 | importFrom(stats,cmdscale) 54 | importFrom(stats,complete.cases) 55 | importFrom(stats,dist) 56 | importFrom(stats,hclust) 57 | importFrom(stats,heatmap) 58 | importFrom(stats,order.dendrogram) 59 | importFrom(stats,pgamma) 60 | importFrom(stats,quantile) 61 | importFrom(utils,combn) 62 | useDynLib(TDApplied, .registration = TRUE) 63 | -------------------------------------------------------------------------------- /src/kd_fix_rad_search.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_fix_rad_search.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Standard kd-tree fixed-radius kNN search 5 | // Last modified: 05/03/05 (Version 1.1) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. 
All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 1.1 05/03/05 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_fix_rad_search_H 26 | #define ANN_kd_fix_rad_search_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | #include "kd_util.h" // kd-tree utilities 30 | #include "pr_queue_k.h" // k-element priority queue 31 | 32 | #include "ANNperf.h" // performance evaluation 33 | 34 | //---------------------------------------------------------------------- 35 | // Global variables 36 | // These are active for the life of each call to 37 | // annRangeSearch(). They are set to save the number of 38 | // variables that need to be passed among the various search 39 | // procedures. 40 | //---------------------------------------------------------------------- 41 | 42 | extern ANNpoint ANNkdFRQ; // query point (static copy) 43 | 44 | #endif -------------------------------------------------------------------------------- /tests/testthat/test-utilities.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("utilities are working properly",{ 3 | 4 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c("1","2","3"),death = c(1,2,3))),"numeric") 5 | expect_error(check_diagram(data.frame(dimension = c(1.1,2,3),birth = c(1,2,3),death = c(1,2,3))),"whole") 6 | expect_error(check_diagram(data.frame(dimension = c(-1,2,3),birth = c(1,2,3),death = c(1,2,3))),">= 0") 7 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c(1,-2,3),death = c(1,2,3))),">= 0") 8 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c(1,2,3),death = c(1,2,NA))),"missing") 9 | expect_error(check_diagram(data.frame(dimension = c(1,2,3),birth = c(1,2,3),death = c(1,2,2.9))),"larger") 10 | expect_error(check_param(param_name = "test",param = "T",numeric = F),"T or F") 11 | 12 | }) 13 | 14 | test_that("check_matrix works",{ 15 | 16 | d1 = data.frame(dimension = rep(0,5),birth = 1:5,death = 1:5 + 0.1) 17 | d2 = data.frame(dimension = rep(0,5),birth = 1:5,death = 1:5 + 0.2) 18 | D = distance_matrix(list(d1,d2),dim = 0,num_workers = 2) 19 | K = gram_matrix(list(d1,d2),dim = 0,num_workers = 2) 20 | expect_error(check_matrix(D,"D"),"kernel") 21 | expect_error(check_matrix(K,"K","matrix"),"matrix") 22 | expect_error(check_matrix(rbind(D,c(1,2)),"D","matrix"),"rows") 23 | D[1,2] = NA 24 | D[2,1] = NaN 25 | expect_error(check_matrix(D,"D","matrix"),"missing") 26 | D = distance_matrix(list(d1,d2),dim = 0,num_workers = 2) 27 | D[1,1] = 1 28 | expect_error(check_matrix(D,"D","matrix"),"0's") 29 | D[1,1] = 0 30 | K[1,1] = 0 31 | expect_error(check_matrix(K,"K"),"1's") 32 | K[1,1] = 1 33 | K[1,2] = 1 34 | expect_error(check_matrix(K,"K"),"symmetric") 35 | D[1,2] = 0 36 | expect_error(check_matrix(D,"D","matrix"),"symmetric") 37 | expect_silent(check_matrix(D,"D",type = 
"matrix",symmetric = F)) 38 | expect_error(check_matrix(D[0,],"D",type = "matrix"),"at least") 39 | 40 | }) -------------------------------------------------------------------------------- /src/kd_pr_search.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_pr_search.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Priority kd-tree search 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_pr_search_H 26 | #define ANN_kd_pr_search_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | #include "kd_util.h" // kd-tree utilities 30 | #include "pr_queue.h" // priority queue declarations 31 | #include "pr_queue_k.h" // k-element priority queue 32 | 33 | #include "ANNperf.h" // performance evaluation 34 | 35 | //---------------------------------------------------------------------- 36 | // Global variables 37 | // Active for the life of each call to Appx_Near_Neigh() or 38 | // Appx_k_Near_Neigh(). 39 | //---------------------------------------------------------------------- 40 | 41 | extern double ANNprEps; // the error bound 42 | extern int ANNprDim; // dimension of space 43 | extern ANNpoint ANNprQ; // query point 44 | extern double ANNprMaxErr; // max tolerable squared error 45 | extern ANNpointArray ANNprPts; // the points 46 | extern ANNpr_queue *ANNprBoxPQ; // priority queue for boxes 47 | extern ANNmin_k *ANNprPointMK; // set of k closest points 48 | 49 | #endif -------------------------------------------------------------------------------- /src/kd_search.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_search.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Standard kd-tree search 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. 
It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_search_H 26 | #define ANN_kd_search_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | #include "kd_util.h" // kd-tree utilities 30 | #include "pr_queue_k.h" // k-element priority queue 31 | 32 | #include "ANNperf.h" // performance evaluation 33 | 34 | //---------------------------------------------------------------------- 35 | // More global variables 36 | // These are active for the life of each call to annkSearch(). They 37 | // are set to save the number of variables that need to be passed 38 | // among the various search procedures. 39 | //---------------------------------------------------------------------- 40 | 41 | extern int ANNkdDim; // dimension of space (static copy) 42 | extern ANNpoint ANNkdQ; // query point (static copy) 43 | extern double ANNkdMaxErr; // max tolerable squared error 44 | extern ANNpointArray ANNkdPts; // the points (static copy) 45 | extern ANNmin_k *ANNkdPointMK; // set of k closest points 46 | extern int ANNptsVisited; // number of points visited 47 | 48 | #endif -------------------------------------------------------------------------------- /man/loss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distance_calculations.R 3 | \name{loss} 4 | \alias{loss} 5 | \title{Turner loss function for a list of groups (lists) of persistence diagrams.} 6 | \usage{ 7 | loss( 8 | diagram_groups, 9 | dist_mats, 10 | dims, 11 | p, 12 | q, 13 | distance, 14 | sigma, 15 | rho, 16 | num_workers, 17 | group_sizes 18 | ) 19 | } 20 | \arguments{ 21 | \item{diagram_groups}{groups (lists/vectors) of persistence diagrams, stored as lists of a data frame and 22 | an index of the diagram in all the diagrams across all groups.} 23 | 24 | \item{dist_mats}{distance matrices between all possible pairs of persistence diagrams across and within groups 25 | storing the current distances which have been pre-computed.} 26 | 27 | \item{dims}{a numeric vector of which homological dimensions in which the loss function is to be computed.} 28 | 29 | \item{p}{a number representing the wasserstein parameter, at least 1, and if Inf then the bottleneck distance is calculated.} 30 | 31 | \item{q}{a finite number at least 1.} 32 | 33 | \item{distance}{a string which determines which type of distance calculation to carry out, either "wasserstein" (default) or "fisher".} 34 | 35 | \item{sigma}{the positive bandwidth for the persistence Fisher distance.} 36 | 37 | \item{rho}{the approximation heuristic for Fisher information metric, results in sequential computation.} 38 | 39 | \item{num_workers}{the number of cores used for parallel computation.} 40 | 41 | \item{group_sizes}{for when using precomputed distance matrices.} 42 | } 43 | \value{ 44 | the numeric value of the Turner loss function. 45 | } 46 | \description{ 47 | An internal function to calculate the normalized sum of within-group exponentiated distances 48 | between pairs of persistence diagrams (stored as data frames) 49 | for an arbitrary number of groups in parallel. Note that this function may run 50 | into memory issues for large numbers of diagrams. 
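As a rough illustration of "normalized sum of within-group exponentiated distances", the following sketch computes a Turner-style statistic from a precomputed distance matrix. The normalization shown (mean exponentiated distance over within-group pairs) is an assumption made for illustration; the internal `loss` function and the surrounding permutation machinery follow Robinson and Turner 2017 as cited below.

# illustrative only -- not the package's internal loss()
# d_mat: distance matrix over all diagrams; groups: list of index vectors; q >= 1
toy_loss <- function(d_mat, groups, q = 2) {
  sum(sapply(groups, function(idx) {
    pairs <- utils::combn(idx, 2)           # all within-group pairs
    sum(d_mat[t(pairs)]^q) / ncol(pairs)    # mean exponentiated within-group distance
  }))
}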
51 | } 52 | \details{ 53 | The Turner loss function is described in Robinson and Turner 2017 54 | (\url{https://link.springer.com/article/10.1007/s41468-017-0008-7}), and is used 55 | in the `permutation_test` function to describe how well-separated a particular 56 | grouping of persistence diagrams is. When the `distance` parameter is "fisher", 57 | `sigma` must not be NULL. 58 | } 59 | \references{ 60 | Robinson T, Turner K (2017). "Hypothesis testing for topological data analysis." \url{https://link.springer.com/article/10.1007/s41468-017-0008-7}. 61 | } 62 | \author{ 63 | Shael Brown - \email{shaelebrown@gmail.com} 64 | } 65 | \keyword{internal} 66 | -------------------------------------------------------------------------------- /man/diagram_kernel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernel_calculations.R 3 | \name{diagram_kernel} 4 | \alias{diagram_kernel} 5 | \title{Calculate persistence Fisher kernel value between a pair of persistence diagrams.} 6 | \usage{ 7 | diagram_kernel(D1, D2, dim = 0, sigma = 1, t = 1, rho = NULL) 8 | } 9 | \arguments{ 10 | \item{D1}{the first persistence diagram.} 11 | 12 | \item{D2}{the second persistence diagram.} 13 | 14 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 15 | 16 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 17 | 18 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 19 | 20 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL.} 21 | } 22 | \value{ 23 | the numeric kernel value. 24 | } 25 | \description{ 26 | Returns the persistence Fisher kernel value between a pair of persistence diagrams 27 | in a particular homological dimension, each of which is either the output from a \code{\link{diagram_to_df}} 28 | function call or from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}. 29 | } 30 | \details{ 31 | The persistence Fisher kernel is calculated from the Fisher information metric according to the formula 32 | \eqn{k_{PF}(D_1,D_2) = exp(-t*d_{FIM}(D_1,D_2))}, resembling a radial basis kernel for standard 33 | Euclidean spaces. 34 | } 35 | \examples{ 36 | 37 | if(require("TDAstats")) 38 | { 39 | # create two diagrams 40 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 41 | dim = 1,threshold = 2) 42 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 43 | dim = 1,threshold = 2) 44 | 45 | # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 1 46 | diagram_kernel(D1,D2,dim = 1,sigma = 2,t = 2) 47 | # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 0 48 | diagram_kernel(D1,D2,dim = 0,sigma = 2,t = 2) 49 | } 50 | } 51 | \references{ 52 | Le T, Yamada M (2018). "Persistence fisher kernel: a riemannian manifold kernel for persistence diagrams." \url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}. 53 | 54 | Murphy, K. "Machine learning: a probabilistic perspective", MIT press (2012). 55 | } 56 | \seealso{ 57 | \code{\link{gram_matrix}} for Gram (i.e. kernel) matrix calculations. 
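Given the formula in the details section above, the kernel value should also be recoverable from the Fisher information metric itself; a quick sanity-check sketch, assuming `diagram_distance` accepts `distance = "fisher"` and `sigma` arguments as the `loss` documentation above indicates:

if(require("TDAstats"))
{
  D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
                                     dim = 1,threshold = 2)
  D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
                                     dim = 1,threshold = 2)

  # k_PF(D1,D2) = exp(-t*d_FIM(D1,D2)) with sigma = 2, t = 2 in dimension 1
  d_fim <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 2)
  all.equal(diagram_kernel(D1,D2,dim = 1,sigma = 2,t = 2), exp(-2*d_fim))
}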
58 | } 59 | \author{ 60 | Shael Brown - \email{shaelebrown@gmail.com} 61 | } 62 | -------------------------------------------------------------------------------- /tests/testthat/test-plot.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("plot_diagram can detect incorrect parameters",{ 3 | 4 | expect_error(plot_diagram(D = data.frame(dimension = c(0:13),birth = rep(0,14),death = rep(1,14))),"12") 5 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,Inf))),"finite") 6 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),title = NA),"NA") 7 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),title = 2),"character") 8 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = NA),"numeric") 9 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = c(1,2)),"single") 10 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = Inf),"finite") 11 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),max_radius = -1),"positive") 12 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),legend = NULL),"NULL") 13 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),legend = c(T,F)),"single") 14 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(0,1,2,NA)),"NA") 15 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(thresholds = c(0,1,2,NA))),"NA") 16 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(foo = c(1,2,3))),"list element") 17 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(thresholds = c(1,2,3))),"element") 18 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(1,2,3)),"element") 19 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(1,2,3,"5")),"numeric") 20 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = list(thresholds = c(1,2,3,"5"))),"numeric") 21 | expect_error(plot_diagram(D = data.frame(dimension = c(0,1,2,3),birth = c(0,0,0,0),death = c(1,1,1,2)),thresholds = c(1,2,3,NA)),"NA") 22 | 23 | }) 24 | 25 | test_that("plot_diagram is working correctly",{ 26 | 27 | expect_identical(plot_diagram(D = data.frame(dimension = numeric(),birth = numeric(),death = numeric())),NULL) 28 | expect_identical(plot_diagram(D = data.frame(dimension = c(0),birth = c(0),death = c(1))),NULL) 29 | 30 | }) 31 | 32 | -------------------------------------------------------------------------------- /R/convert.R: -------------------------------------------------------------------------------- 1 | #### CONVERT PERSISTENCE DIAGRAMS INTO DATA FRAMES#### 2 | #' Convert a TDA/TDAstats persistence diagram to a data frame. 
3 | #' 4 | #' The output of homology calculations from the R packages TDA 5 | #' and TDAstats are not dataframes. This function converts these 6 | #' outputs into a data frame either for further usage in this package or 7 | #' for personalized analyses. 8 | #' 9 | #' If a diagram is constructed using a TDA function like ripsDiag 10 | #' with the `location` parameter set to true then the return value will ignore the location information. 11 | #' 12 | #' @param d the output of a TDA/TDAstats homology calculation, like ripsDiag or \code{\link[TDAstats]{calculate_homology}}. 13 | #' @return a 3-column data frame, with each row representing a topological feature. The first column is the feature dimension (a non-negative integer), the second column is the birth radius of the feature and the third column is the death radius. 14 | #' @export 15 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 16 | #' @examples 17 | #' 18 | #' if(require("TDAstats")) 19 | #' { 20 | #' # create a persistence diagram from a 2D Gaussian 21 | #' df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 22 | #' 23 | #' # compute persistence diagram with calculate_homology from package TDAstats 24 | #' phom_TDAstats = TDAstats::calculate_homology(mat = df,dim = 0,threshold = 1) 25 | #' 26 | #' # convert to data frame 27 | #' phom_TDAstats_df = diagram_to_df(d = phom_TDAstats) 28 | #' } 29 | 30 | diagram_to_df <- function(d){ 31 | 32 | # function to convert d to a data frame with standardized column names 33 | # d is a diagram from library TDA or TDAstats 34 | 35 | # preliminary check, mostly for internal methods 36 | if(inherits(d,"data.frame")) 37 | { 38 | return(d) 39 | } 40 | 41 | if((is.list(d) && ((length(d) == 1 && all(names(d) %in% "diagram") && (inherits(d$diagram,"diagram")) || inherits(d$diagram,"data.frame")) || ((length(d) == 4 && all(names(d) %in% c("diagram","birthLocation","deathLocation","cycleLocation")) && inherits(d$diagram,"diagram"))))) == F && (inherits(d,"matrix") && inherits(d,"array") & all(colnames(d) %in% c("dimension","birth","death"))) == F) 42 | { 43 | stop("Diagrams must either be the output of a TDA/TDAstats/PyH computation.") 44 | } 45 | 46 | if(inherits(d,"matrix") & inherits(d,"array")) 47 | { 48 | # diagram was the output of a TDAstats calculation 49 | return(as.data.frame(d)) 50 | } 51 | 52 | if("diagram" %in% names(d)) 53 | { 54 | if(inherits(d$diagram,"data.frame")) 55 | { 56 | # diagram was the output of a PyH calculation, with representatives 57 | return(d$diagram) 58 | } 59 | } 60 | 61 | # else d was the output of a TDA calculation 62 | d <- d[[1]] 63 | class(d) <- "matrix" 64 | d <- as.data.frame(d) 65 | colnames(d) <- c("dimension","birth","death") 66 | 67 | return(d) 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/bd_search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: bd_search.cpp 6 | // Programmer: David Mount 7 | // Description: Standard bd-tree search 8 | // Last modified: 01/04/05 (Version 1.0) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). 
This software is provided under
15 | // the provisions of the Lesser GNU Public License (LGPL). See the
16 | // file ../ReadMe.txt for further information.
17 | //
18 | // The University of Maryland (U.M.) and the authors make no
19 | // representations about the suitability or fitness of this software for
20 | // any purpose. It is provided "as is" without express or implied
21 | // warranty.
22 | //----------------------------------------------------------------------
23 | // History:
24 | //    Revision 0.1  03/04/98
25 | //    Initial release
26 | //----------------------------------------------------------------------
27 | 
28 | #include "bd_tree.h"    // bd-tree declarations
29 | #include "kd_search.h"    // kd-tree search declarations
30 | 
31 | //----------------------------------------------------------------------
32 | // Approximate searching for bd-trees.
33 | // See the file kd_search.cpp for general information on the
34 | // approximate nearest neighbor search algorithm. Here we
35 | // include the extensions for shrinking nodes.
36 | //----------------------------------------------------------------------
37 | 
38 | //----------------------------------------------------------------------
39 | // bd_shrink::ann_search - search a shrinking node
40 | //----------------------------------------------------------------------
41 | 
42 | void ANNbd_shrink::ann_search(ANNdist box_dist)
43 | {
44 |     // check dist calc term cond.
45 |     if (ANNmaxPtsVisited != 0 && ANNptsVisited > ANNmaxPtsVisited) return;
46 | 
47 |     ANNdist inner_dist = 0;    // distance to inner box
48 |     for (int i = 0; i < n_bnds; i++) {    // is query point in the box?
49 |         if (bnds[i].out(ANNkdQ)) {    // outside this bounding side?
50 |             // add to inner distance
51 |             inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNkdQ));
52 |         }
53 |     }
54 |     if (inner_dist <= box_dist) {    // if inner box is closer
55 |         child[ANN_IN]->ann_search(inner_dist);    // search inner child first
56 |         child[ANN_OUT]->ann_search(box_dist);    // ...then outer child
57 |     }
58 |     else {    // if outer box is closer
59 |         child[ANN_OUT]->ann_search(box_dist);    // search outer child first
60 |         child[ANN_IN]->ann_search(inner_dist);    // ...then inner child
61 |     }
62 |     ANN_FLOP(3*n_bnds)    // increment floating ops
63 |     ANN_SHR(1)    // one more shrinking node
64 | }
--------------------------------------------------------------------------------
/src/bd_pr_search.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 | //----------------------------------------------------------------------
4 | // File: bd_pr_search.cpp
5 | // Programmer: David Mount
6 | // Description: Priority search for bd-trees
7 | // Last modified: 01/04/05 (Version 1.0)
8 | //----------------------------------------------------------------------
9 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
10 | // David Mount. All Rights Reserved.
11 | //
12 | // This software and related documentation is part of the Approximate
13 | // Nearest Neighbor Library (ANN). This software is provided under
14 | // the provisions of the Lesser GNU Public License (LGPL). See the
15 | // file ../ReadMe.txt for further information.
16 | //
17 | // The University of Maryland (U.M.) and the authors make no
18 | // representations about the suitability or fitness of this software for
19 | // any purpose. It is provided "as is" without express or implied
20 | // warranty.
21 | //---------------------------------------------------------------------- 22 | //History: 23 | // Revision 0.1 03/04/98 24 | // Initial release 25 | //---------------------------------------------------------------------- 26 | 27 | #include "bd_tree.h" // bd-tree declarations 28 | #include "kd_pr_search.h" // kd priority search declarations 29 | 30 | //---------------------------------------------------------------------- 31 | // Approximate priority searching for bd-trees. 32 | // See the file kd_pr_search.cc for general information on the 33 | // approximate nearest neighbor priority search algorithm. Here 34 | // we include the extensions for shrinking nodes. 35 | //---------------------------------------------------------------------- 36 | 37 | //---------------------------------------------------------------------- 38 | // bd_shrink::ann_search - search a shrinking node 39 | //---------------------------------------------------------------------- 40 | 41 | void ANNbd_shrink::ann_pri_search(ANNdist box_dist) 42 | { 43 | ANNdist inner_dist = 0; // distance to inner box 44 | for (int i = 0; i < n_bnds; i++) { // is query point in the box? 45 | if (bnds[i].out(ANNprQ)) { // outside this bounding side? 46 | // add to inner distance 47 | inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNprQ)); 48 | } 49 | } 50 | if (inner_dist <= box_dist) { // if inner box is closer 51 | if (child[ANN_OUT] != KD_TRIVIAL) // enqueue outer if not trivial 52 | ANNprBoxPQ->insert(box_dist,child[ANN_OUT]); 53 | // continue with inner child 54 | child[ANN_IN]->ann_pri_search(inner_dist); 55 | } 56 | else { // if outer box is closer 57 | if (child[ANN_IN] != KD_TRIVIAL) // enqueue inner if not trivial 58 | ANNprBoxPQ->insert(inner_dist,child[ANN_IN]); 59 | // continue with outer child 60 | child[ANN_OUT]->ann_pri_search(box_dist); 61 | } 62 | ANN_FLOP(3*n_bnds) // increment floating ops 63 | ANN_SHR(1) // one more shrinking node 64 | } -------------------------------------------------------------------------------- /man/predict_diagram_kkmeans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{predict_diagram_kkmeans} 4 | \alias{predict_diagram_kkmeans} 5 | \title{Predict the cluster labels for new persistence diagrams using a pre-computed clustering.} 6 | \usage{ 7 | predict_diagram_kkmeans( 8 | new_diagrams, 9 | K = NULL, 10 | clustering, 11 | num_workers = parallelly::availableCores(omit = 1) 12 | ) 13 | } 14 | \arguments{ 15 | \item{new_diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `new_diagrams` and `K` need to be supplied.} 16 | 17 | \item{K}{an optional precomputed cross Gram matrix of the new diagrams and the diagrams used in `clustering`, default NULL. If not NULL then `new_diagrams` does not need to be supplied.} 18 | 19 | \item{clustering}{the output of a \code{\link{diagram_kkmeans}} function call, of class 'diagram_kkmeans'.} 20 | 21 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 22 | } 23 | \value{ 24 | a vector of the predicted cluster labels for the new diagrams. 
25 | } 26 | \description{ 27 | Returns the nearest (highest kernel value) \code{\link[kernlab]{kkmeans}} cluster center label for new persistence diagrams. 28 | This allows for reusing old cluster models for new tasks, or for performing cross validation. 29 | } 30 | \examples{ 31 | 32 | if(require("TDAstats")) 33 | { 34 | # create two diagrams 35 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 36 | dim = 1,threshold = 2) 37 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 38 | dim = 1,threshold = 2) 39 | g <- list(D1,D1,D2,D2) 40 | 41 | # calculate kmeans clusters with centers = 2, and sigma = t = 2 in dimension 0 42 | clust <- diagram_kkmeans(diagrams = g,centers = 2,dim = 0,t = 2,sigma = 2,num_workers = 2) 43 | 44 | # create two new diagrams 45 | D3 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 46 | dim = 1,threshold = 2) 47 | D4 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 48 | dim = 1,threshold = 2) 49 | g_new <- list(D3,D4) 50 | 51 | # predict cluster labels 52 | predict_diagram_kkmeans(new_diagrams = g_new,clustering = clust,num_workers = 2) 53 | 54 | # predict cluster labels with precomputed Gram matrix, gives same result but 55 | # much faster 56 | K <- gram_matrix(diagrams = g_new,other_diagrams = clust$diagrams, 57 | dim = clust$dim,t = clust$t,sigma = clust$sigma, 58 | num_workers = 2) 59 | predict_diagram_kkmeans(K = K,clustering = clust) 60 | 61 | } 62 | } 63 | \seealso{ 64 | \code{\link{diagram_kkmeans}} for clustering persistence diagrams. 65 | } 66 | \author{ 67 | Shael Brown - \email{shaelebrown@gmail.com} 68 | } 69 | -------------------------------------------------------------------------------- /man/plot_diagram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{plot_diagram} 4 | \alias{plot_diagram} 5 | \title{Plot persistence diagrams} 6 | \usage{ 7 | plot_diagram( 8 | D, 9 | title = NULL, 10 | max_radius = NULL, 11 | legend = TRUE, 12 | thresholds = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{D}{a persistence diagram, either outputted from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}} or from \code{\link{diagram_to_df}}, with 17 | maximum dimension at most 12.} 18 | 19 | \item{title}{the character string plot title, default NULL.} 20 | 21 | \item{max_radius}{the x and y limits of the plot are defined as `c(0,max_radius)`, and the default value of `max_radius` is the maximum death value in `D`.} 22 | 23 | \item{legend}{a logical indicating whether to include a legend of feature dimensions, default TRUE.} 24 | 25 | \item{thresholds}{either a numeric vector with one persistence threshold for each dimension in `D` or the output of a \code{\link{bootstrap_persistence_thresholds}} function call, default NULL.} 26 | } 27 | \description{ 28 | Plots a persistence diagram outputted from either a persistent homology calculation or from diagram_to_df, with 29 | maximum homological dimension no more than 12 (otherwise the legend doesn't fit in the plot). 30 | Each homological dimension has its own color (the rcartocolor color-blind safe color palette) and point type, 31 | and the main plot title can be altered via the `title` parameter.
Each feature is plotted with 32 | a black point at its center in order to distinguish between overlapping features and easily compare 33 | features to their persistence thresholds. 34 | } 35 | \details{ 36 | The `thresholds` parameter, if not NULL, can either be a user-defined numeric vector, with 37 | one entry (persistence threshold) for each dimension in `D`, or the output of 38 | \code{\link{bootstrap_persistence_thresholds}}. Points whose persistence are greater than or equal to their dimension's 39 | threshold will be plotted in their dimension's color, and in gray otherwise. 40 | } 41 | \examples{ 42 | 43 | if(require("TDAstats")) 44 | { 45 | # create a sample diagram from the unit circle 46 | df <- TDAstats::circle2d[sample(1:100,50),] 47 | diag <- TDAstats::calculate_homology(df,threshold = 2) 48 | 49 | # plot without title 50 | plot_diagram(diag) 51 | 52 | # plot with title 53 | plot_diagram(diag,title = "Example diagram") 54 | 55 | # determine persistence thresholds 56 | thresholds <- bootstrap_persistence_thresholds(X = df,maxdim = 1, 57 | thresh = 2,num_samples = 3, 58 | num_workers = 2) 59 | 60 | # plot with bootstrap persistence thresholds 61 | plot_diagram(diag,title = "Example diagram with thresholds",thresholds = thresholds) 62 | 63 | #' # plot with personalized persistence thresholds 64 | plot_diagram(diag,title = "Example diagram with personalized thresholds",thresholds = c(0.5,1)) 65 | } 66 | } 67 | \author{ 68 | Shael Brown - \email{shaelebrown@gmail.com} 69 | } 70 | -------------------------------------------------------------------------------- /man/gram_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernel_calculations.R 3 | \name{gram_matrix} 4 | \alias{gram_matrix} 5 | \title{Compute the gram matrix for a group of persistence diagrams.} 6 | \usage{ 7 | gram_matrix( 8 | diagrams, 9 | other_diagrams = NULL, 10 | dim = 0, 11 | sigma = 1, 12 | t = 1, 13 | rho = NULL, 14 | num_workers = parallelly::availableCores(omit = 1) 15 | ) 16 | } 17 | \arguments{ 18 | \item{diagrams}{a list of persistence diagrams, where each diagram is either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 19 | 20 | \item{other_diagrams}{either NULL (default) or another list of persistence diagrams to compute a cross-Gram matrix.} 21 | 22 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 23 | 24 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 25 | 26 | \item{t}{a positive number representing the scale for the kernel, default 1.} 27 | 28 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, code execution is sequential, but functions in the "exec" directory 29 | of the package can be loaded to calculate distance matrices in parallel with approximation.} 30 | 31 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 32 | } 33 | \value{ 34 | the numeric (cross) Gram matrix of class 'kernelMatrix'. 35 | } 36 | \description{ 37 | Calculate the Gram matrix \eqn{K} for either a single list of persistence diagrams \eqn{(D_1,D_2,\dots,D_n)}, i.e. 
\eqn{K[i,j] = k_{PF}(D_i,D_j)}, 38 | or between two lists of persistence diagrams, \eqn{(D_1,D_2,\dots,D_n)} and \eqn{(D'_1,D'_2,\dots,D'_n)}, \eqn{K[i,j] = k_{PF}(D_i,D'_j)}, in parallel. 39 | } 40 | \details{ 41 | Gram matrices are used in downstream analyses, like in the `diagram_kkmeans`, `diagram_nearest_cluster`,`diagram_kpca`, 42 | `predict_diagram_kpca`, `predict_diagram_ksvm` and `independence_test` functions. 43 | } 44 | \examples{ 45 | 46 | if(require("TDAstats")) 47 | { 48 | # create two diagrams 49 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 50 | dim = 1,threshold = 2) 51 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 52 | dim = 1,threshold = 2) 53 | g <- list(D1,D2) 54 | 55 | # calculate the Gram matrix in dimension 0 with sigma = 2, t = 2 56 | G <- gram_matrix(diagrams = g,dim = 0,sigma = 2,t = 2,num_workers = 2) 57 | 58 | # calculate cross-Gram matrix, which is the same as G 59 | G_cross <- gram_matrix(diagrams = g,other_diagrams = g,dim = 0,sigma = 2, 60 | t = 2,num_workers = 2) 61 | } 62 | } 63 | \seealso{ 64 | \code{\link{diagram_kernel}} for individual persistence Fisher kernel calculations. 65 | } 66 | \author{ 67 | Shael Brown - \email{shaelebrown@gmail.com} 68 | } 69 | -------------------------------------------------------------------------------- /man/PyH.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/python_functions.R 3 | \name{PyH} 4 | \alias{PyH} 5 | \title{Fast persistent homology calculations with python.} 6 | \usage{ 7 | PyH( 8 | X, 9 | maxdim = 1, 10 | thresh, 11 | distance_mat = FALSE, 12 | ripser, 13 | ignore_infinite_cluster = TRUE, 14 | calculate_representatives = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{X}{either a matrix or dataframe, representing either point cloud data or a distance matrix. In either case there 19 | must be at least two rows and 1 column.} 20 | 21 | \item{maxdim}{the non-negative integer maximum dimension for persistent homology, default 1.} 22 | 23 | \item{thresh}{the non-negative numeric radius threshold for the Vietoris-Rips filtration.} 24 | 25 | \item{distance_mat}{a boolean representing whether the input X is a distance matrix or not, default FALSE.} 26 | 27 | \item{ripser}{the ripser python module.} 28 | 29 | \item{ignore_infinite_cluster}{a boolean representing whether to remove clusters (0 dimensional cycles) which 30 | die at the threshold value. Default is TRUE as this is the default for TDAstats homology calculations, but can be set to 31 | FALSE which is the default for python ripser.} 32 | 33 | \item{calculate_representatives}{a boolean representing whether to return a list of representative cocycles for the 34 | topological features found in the persistence diagram, default FALSE.} 35 | } 36 | \value{ 37 | Either a dataframe containing the persistence diagram if `calculate_representatives` is `FALSE` (the default), otherwise a list with two elements: 38 | diagram of class diagram, containing the persistence diagram, 39 | and representatives, a list containing the edges, triangles etc. contained in each representative cocycle. 40 | } 41 | \description{ 42 | This function is a wrapper of the python wrapper of the ripser engine for persistent cohomology, 43 | but is still faster than using the R package TDAstats (see the TDApplied package vignette for details). 
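(A minimal sketch of the `calculate_representatives` option documented above, assuming a working python setup so that the ripser module imports correctly; the `diagram` and `representatives` list elements are those described in the value section.)

# sketch: request representative cocycles alongside the diagram
ripser <- import_ripser()
ang <- seq(0,2*pi,length.out = 25)
df <- data.frame(x = cos(ang),y = sin(ang))
phom <- PyH(X = df,maxdim = 1,thresh = 2,ripser = ripser,calculate_representatives = TRUE)
phom$diagram          # the persistence diagram
phom$representatives  # representative cocycles, ignoring dimension 0 points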
44 | } 45 | \details{ 46 | If `distance_mat` is `TRUE` then `X` must be a square matrix. The `ripser` parameter should be the 47 | result of an `import_ripser` function call, but since that function is slow the ripser object should 48 | be explicitly created before a PyH function call (see examples). Cohomology is computed over Z2, 49 | as is the case for the TDAstats function \code{\link[TDAstats]{calculate_homology}} (this is also the 50 | default for ripser in C++). If representative cocycles are returned, then they are stored in a list with 51 | one element for each point in the persistence diagram, ignoring dimension 0 points. Each representative of 52 | a dimension d cocycle (1 for loops, 2 for voids, etc.) is a kxd dimension matrix/array containing the row number-labelled 53 | edges, triangles etc. in the cocycle. 54 | } 55 | \examples{ 56 | \dontrun{ 57 | # create sample data 58 | df <- data.frame(x = 1:10,y = 1:10) 59 | 60 | # import the ripser module 61 | ripser <- import_ripser() 62 | 63 | # calculate persistence diagram up to dimension 1 with a maximum 64 | # radius of 5 65 | phom <- PyH(X = df,thresh = 5,ripser = ripser) 66 | } 67 | } 68 | \author{ 69 | Shael Brown - \email{shaelebrown@gmail.com} 70 | } 71 | -------------------------------------------------------------------------------- /exec/parallel_with_approximation.R: -------------------------------------------------------------------------------- 1 | 2 | # functions to calculate Fisher information distance matrices and Gram matrices 3 | # in parallel with a fast approximation 4 | 5 | # these matrices can then be input into TDApplied functions directly 6 | 7 | parallel_approx_distance_matrix <- function(diagrams,other_diagrams = NULL,dim = 0,sigma = 1,rho = 1e-3,num_workers = parallelly::availableCores(omit = 1)){ 8 | 9 | # create cluster 10 | cl <- parallel::makeCluster(num_workers) 11 | doParallel::registerDoParallel(cl) 12 | 13 | # calculate distances in parallel 14 | # clusters are closed if there is an error 15 | tryCatch(expr = { 16 | 17 | if(is.null(other_diagrams)) 18 | { 19 | # not cross distance matrix, only need to compute the upper diagonal 20 | # since the matrix is symmetric 21 | d <- matrix(data = 0,nrow = length(diagrams),ncol = length(diagrams)) 22 | u <- which(upper.tri(d),arr.ind = T) 23 | R <- lapply(X = 1:nrow(u),FUN = function(X){ 24 | 25 | return(list(diagrams[[u[[X,1]]]],diagrams[[u[[X,2]]]])) 26 | 27 | }) 28 | 29 | # remove diagrams to preserve memory 30 | rm(diagrams) 31 | 32 | # calculate distances in parallel, export TDApplied to nodes 33 | d_off_diag <- foreach::`%dopar%`(obj = foreach::foreach(r = R,.combine = c,.packages = c("TDApplied")),ex = {TDApplied::diagram_distance(D1 = r[[1]],D2 = r[[2]],dim = dim,distance = "fisher",sigma = sigma,rho = rho)}) 34 | 35 | # store results in matrix 36 | d[upper.tri(d)] <- d_off_diag 37 | d[which(upper.tri(d),arr.ind = T)[,c("col","row")]] <- d_off_diag 38 | diag(d) <- rep(0,nrow(d)) 39 | }else 40 | { 41 | # cross distance matrix, need to compute all entries 42 | u <- expand.grid(1:length(other_diagrams),1:length(diagrams)) 43 | R <- lapply(X = 1:nrow(u),FUN = function(X){ 44 | 45 | return(list(other_diagrams[[u[X,1]]],diagrams[[u[X,2]]])) 46 | 47 | }) 48 | d <- matrix(data = 0,nrow = length(other_diagrams),ncol = length(diagrams)) # allocate the cross distance matrix before the diagram lists are removed 49 | # remove diagrams and other_diagrams to preserve memory 50 | rm(list = c("diagrams","other_diagrams")) 51 | 52 | # store distance calculations in matrix 53 | d[as.matrix(u)] <- foreach::`%dopar%`(foreach::foreach(r = R,.combine = cbind,.packages = c("TDApplied")),ex =
{TDApplied::diagram_distance(D1 = r[[1]],D2 = r[[2]],dim = dim,distance = "fisher",sigma = sigma,rho = rho)}) 54 | 55 | } 56 | 57 | }, warning = function(w){warning(w)}, 58 | error = function(e){stop(e)}, 59 | finally = { 60 | # close cluster 61 | doParallel::stopImplicitCluster() 62 | parallel::stopCluster(cl) 63 | 64 | }) 65 | 66 | return(d) 67 | 68 | } 69 | 70 | parallel_approx_gram_matrix <- function(diagrams,other_diagrams = NULL,dim = 0,sigma = 1,t = 1,rho = 1e-3,num_workers = parallelly::availableCores(omit = 1)){ 71 | 72 | # compute gram matrix from distance matrix 73 | K <- exp(-t*parallel_approx_distance_matrix(diagrams = diagrams,other_diagrams = other_diagrams,dim = dim,sigma = sigma,rho = rho,num_workers = num_workers)) 74 | 75 | # update class for interfacing with kernlab package 76 | class(K) <- "kernelMatrix" 77 | 78 | return(K) 79 | 80 | } 81 | -------------------------------------------------------------------------------- /man/vr_graphs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rips_complexes.R 3 | \name{vr_graphs} 4 | \alias{vr_graphs} 5 | \title{Compute Vietoris-Rips graphs of a dataset at particular epsilon radius values.} 6 | \usage{ 7 | vr_graphs(X, distance_mat = FALSE, eps, return_clusters = TRUE) 8 | } 9 | \arguments{ 10 | \item{X}{either a point cloud data frame/matrix, or a distance matrix.} 11 | 12 | \item{distance_mat}{a boolean representing if the input `X` is a distance matrix, default value is `FALSE`.} 13 | 14 | \item{eps}{a numeric vector of the positive scales at which to compute the Rips-Vietoris complexes, i.e. including all edges of length at most the specified values.} 15 | 16 | \item{return_clusters}{a boolean determining if the connected components (i.e. data clusters) of the complex should be explicitly returned, default is `TRUE`.} 17 | } 18 | \value{ 19 | A list with a `vertices` field, containing the rownames of `X`, and then a list `graphs` with one (named) entry for each value in `eps`. Each entry is a list with a `graph` field, storing the (undirected) edges in the Rips-Vietoris complex in matrix format, and a `clusters` field, containing vectors of the data indices (or row names) in each connected component of the Rips graph. 20 | } 21 | \description{ 22 | Persistence diagrams computed from Rips-Vietoris filtrations contain information about 23 | distance radius scales at which topological features of a dataset exist, but the features 24 | can be challenging to visualize, analyze and interpret. In order to help solve this problem the `vr_graphs` 25 | function computes the 1-skeleton (i.e. graph) of Rips complexes at particular radii, called "Vietoris-Rips graphs" (VR graphs) in the literature. 26 | } 27 | \details{ 28 | This function may be used in conjunction with the igraph package to visualize the graphs (see \code{\link{plot_vr_graph}}).
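(A minimal usage sketch for the exec/parallel_with_approximation.R script above, whose header comments state that its output can be passed directly into TDApplied functions. The source() path is an assumption, written relative to the package source tree, and the sketch assumes TDAstats, foreach, doParallel and parallelly are installed; the diagram construction and the K argument of diagram_kkmeans follow the package documentation elsewhere in this dump.)

if(require("TDAstats"))
{
  # load the helper functions (path relative to the package source tree, adjust as needed)
  source("exec/parallel_with_approximation.R")

  # small example diagrams, as in the package documentation
  D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],dim = 1,threshold = 2)
  D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],dim = 1,threshold = 2)
  g <- list(D1,D1,D2,D2)

  # approximate Gram matrix computed in parallel...
  K <- parallel_approx_gram_matrix(diagrams = g,dim = 0,sigma = 2,t = 2,rho = 1e-3,num_workers = 2)

  # ...then reused directly via the K argument of a downstream function
  clust <- diagram_kkmeans(diagrams = g,K = K,centers = 2,dim = 0,sigma = 2,t = 2)
}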
29 | } 30 | \examples{ 31 | 32 | if(require("TDAstats") & require("igraph")) 33 | { 34 | # simulate data from the unit circle and calculate 35 | # its diagram 36 | df <- TDAstats::circle2d[sample(1:100,25),] 37 | diag <- TDAstats::calculate_homology(df, 38 | dim = 1, 39 | threshold = 2) 40 | 41 | # get minimum death radius of any data cluster 42 | min_death_H0 <- 43 | min(diag[which(diag[,1] == 0),3L]) 44 | 45 | # get birth and death radius of the loop 46 | loop_birth <- as.numeric(diag[nrow(diag),2L]) 47 | loop_death <- as.numeric(diag[nrow(diag),3L]) 48 | 49 | # compute VR graphs at radii half of 50 | # min_death_H0 and the mean of loop_birth and 51 | # loop_death, returning clusters 52 | graphs <- vr_graphs(X = df,eps = 53 | c(0.5*min_death_H0,(loop_birth + loop_death)/2)) 54 | 55 | # verify that there are 25 clusters for the smaller radius 56 | length(graphs$graphs[[1]]$clusters) 57 | 58 | } 59 | } 60 | \references{ 61 | A Zomorodian, The tidy set: A minimal simplicial set for computing homology of clique complexes in Proceedings of the Twenty-Sixth Annual Symposium on Computational Geometry, SoCG ’10. (Association for Computing Machinery, New York, NY, USA), p. 257–266 (2010). 62 | } 63 | \seealso{ 64 | \code{\link{plot_vr_graph}} for plotting VR graphs. 65 | } 66 | \author{ 67 | Shael Brown - \email{shaelebrown@gmail.com} 68 | } 69 | -------------------------------------------------------------------------------- /man/predict_diagram_ksvm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{predict_diagram_ksvm} 4 | \alias{predict_diagram_ksvm} 5 | \title{Predict the outcome labels for a list of persistence diagrams using a pre-trained diagram ksvm model.} 6 | \usage{ 7 | predict_diagram_ksvm( 8 | new_diagrams, 9 | model, 10 | K = NULL, 11 | num_workers = parallelly::availableCores(omit = 1) 12 | ) 13 | } 14 | \arguments{ 15 | \item{new_diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `new_diagrams` and `K` need to be supplied.} 16 | 17 | \item{model}{the output of a \code{\link{diagram_ksvm}} function call, of class 'diagram_ksvm'.} 18 | 19 | \item{K}{an optional cross-Gram matrix of the new diagrams and the diagrams in `model`, default NULL. If not NULL then `new_diagrams` does not need to be supplied.} 20 | 21 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 22 | } 23 | \value{ 24 | a vector containing the output of \code{\link[kernlab]{predict.ksvm}} on the cross Gram matrix of the new diagrams and the support vector diagrams stored in the model. 25 | } 26 | \description{ 27 | Returns the predicted response vector of the model on the new diagrams. 28 | } 29 | \details{ 30 | This function is a wrapper of the kernlab \code{\link{predict}} function. 
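(A short illustrative complement to the example below, which predicts from a precomputed cross-Gram matrix: prediction can also be run directly from the new diagrams, at the cost of recomputing the kernel internally. `model_svm` and `g_new` are assumed to be the objects built in that example.)

# hypothetical sketch, assuming model_svm and g_new from the example below
predict_diagram_ksvm(new_diagrams = g_new,model = model_svm,num_workers = 2)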
31 | } 32 | \examples{ 33 | 34 | if(require("TDAstats")) 35 | { 36 | # create four diagrams 37 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 38 | dim = 1,threshold = 2) 39 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 40 | dim = 1,threshold = 2) 41 | D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 42 | dim = 1,threshold = 2) 43 | D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 44 | dim = 1,threshold = 2) 45 | g <- list(D1,D2,D3,D4) 46 | 47 | # create response vector 48 | y <- as.factor(c("circle","circle","sphere","sphere")) 49 | 50 | # fit model without cross validation 51 | model_svm <- diagram_ksvm(diagrams = g,cv = 1,dim = c(0), 52 | y = y,sigma = c(1),t = c(1), 53 | num_workers = 2) 54 | 55 | # create two new diagrams 56 | D5 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 57 | dim = 1,threshold = 2) 58 | D6 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 59 | dim = 1,threshold = 2) 60 | g_new <- list(D5,D6) 61 | 62 | # predict with precomputed Gram matrix 63 | K <- gram_matrix(diagrams = g_new,other_diagrams = model_svm$diagrams, 64 | dim = model_svm$best_model$dim,sigma = model_svm$best_model$sigma, 65 | t = model_svm$best_model$t,num_workers = 2) 66 | predict_diagram_ksvm(K = K,model = model_svm,num_workers = 2) 67 | } 68 | } 69 | \seealso{ 70 | \code{\link{diagram_ksvm}} for training a SVM model on a training set of persistence diagrams and labels. 71 | } 72 | \author{ 73 | Shael Brown - \email{shaelebrown@gmail.com} 74 | } 75 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | > All changes to TDApplied are documented here. 2 | 3 | > Additions referenced with relevant [GitHub Issue](https://github.com/shaelebrown/TDApplied/issues) or 4 | [Pull Request](https://github.com/shaelebrown/TDApplied/pulls) number. 5 | Please see those for more details. 6 | 7 | # 3.0.4 8 | - fixed distance exponentiation and group permuting in permutation test 9 | - added permutation model inference procedure 10 | - added universal null and enclosing radius functions 11 | 12 | # 3.0.3 13 | - Updated documentation for JOSS paper release 14 | 15 | # 3.0.2 16 | - all CRAN issues for this update were caused by the rho parameter, which invokes external C++ code. We therefore fixed these issues by removing the rho parameter from the predict_diagram_kpca and diagram_distance examples, from all tests and in the ML_and_inference.Rmd file. 
This parameter has been kept and is still tested, just not tested on CRAN 17 | - sped up the independence_test example by only showing the Gram-matrix approach 18 | - removed warnings from benchmarking plots in Speed.Rmd 19 | - removed dependency on package TDA which is currently unavailable on CRAN 20 | 21 | # 3.0.1 22 | - same updates as 3.0.0 but with more efficient vignette building 23 | 24 | # 3.0.0 25 | - added ability to precompute distance/Gram matrices for ML and inference functions 26 | - added fast approximation to Fisher information metric 27 | - added vignettes for speedups, HCP analysis, personalized analyses and distance calculation comparisons (and removed those parts from the main vignette) 28 | - fixed issues with cv model fitting in diagram_ksvm 29 | - added automatic calculation of t parameters in diagram_ksvm 30 | - decreased memory load on parallel functions (except for permutation test loss function) 31 | - added checks for 0 variance distance matrices in diagram_ksvm 32 | - added comparisons against package rgudhi 33 | - updated DESCRIPTION 34 | - added interpretations tools for vr graphs and multiple representative (co) cycles 35 | - improved HCP analysis 36 | - resolved some distance 0 cases in diagram_distance 37 | 38 | # 2.0.4 39 | - fixed build issues related to use of suggested packages in tests, examples and vignettes 40 | 41 | # 2.0.3 42 | - fixed bootstrap reference in vignette 43 | 44 | # 2.0.2 45 | - set seed in vignette for reproducibility (which is reset at the end) 46 | - added more examples of TDA applications in publications 47 | 48 | # 2.0.1 49 | - increased testing coverage 50 | - fixed issue with th parameter in diagram_kpca 51 | - fixed issue with gamma distribution in independence_test 52 | - added applied analysis of TDApplied on HCP data to package vignette 53 | 54 | # 2.0.0 55 | 56 | - added PyH function for fast persistence diagram calculations with python 57 | - added bootstrap_persistence_thresholds for finding "real" topological features in a data set 58 | - added plot_diagram function for plotting persistence diagrams, with or without persistence thresholds 59 | - fixed problem with diagram_distance in which one of the two diagrams was empty in the 60 | desired dimension 61 | 62 | # 0.1.3 63 | 64 | - fixed small bug with computing mean cv model error for svm 65 | - added tryCatch's around parallelized code to ensure that clusters are closed even when errors occur 66 | 67 | # 0.1.2 68 | 69 | - fixed bug with mds test and properly cleaned up parallelization clusters 70 | 71 | # 0.1.1 72 | 73 | - Fixed bug with one diagram_mds test, although code was working properly 74 | 75 | # 0.1.0 76 | 77 | - Initial version -------------------------------------------------------------------------------- /R/enclosing_rad.R: -------------------------------------------------------------------------------- 1 | #### COMPUTE enclosing RADIUS #### 2 | #' Compute the enclosing radius for a dataset. 3 | #' 4 | #' The enclosing radius is the minimum (Euclidean distance) radius beyond which no topological changes will occur. 5 | #' 6 | #' @param X the input dataset, must either be a matrix or data frame. 7 | #' @param distance_mat whether or not `X` is a distance matrix, default FALSE. 8 | #' @return the numeric enclosing radius. 
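# Illustrative aside, not taken from the package source: since the enclosing radius
# described above is the radius beyond which no topological changes occur, it is
# typically passed as the threshold of a persistent homology calculation.
# A minimal sketch, assuming TDAstats is installed:
if(require("TDAstats"))
{
  df <- data.frame(x = rnorm(20),y = rnorm(20))
  enc_rad <- enclosing_radius(df,distance_mat = FALSE)
  diag <- TDAstats::calculate_homology(df,dim = 1,threshold = enc_rad)
}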
9 | #' @export 10 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 11 | #' @examples 12 | #' 13 | #' # create a persistence diagram from a 2D Gaussian 14 | #' df = data.frame(x = rnorm(n = 20,mean = 0,sd = 1),y = rnorm(n = 20,mean = 0,sd = 1)) 15 | #' 16 | #' # compute the enclosing radius from the point cloud 17 | #' enc_rad <- enclosing_radius(df, distance_mat = FALSE) 18 | #' 19 | #' # compute the distance matrix manually, stored as a matrix 20 | #' dist_df <- as.matrix(dist(df)) 21 | #' 22 | #' # compute the enclosing radius from the distance matrix 23 | #' enc_rad <- enclosing_radius(dist_df, distance_mat = TRUE) 24 | enclosing_radius <- function(X, distance_mat = FALSE){ 25 | 26 | # error check parameters 27 | if(is.null(distance_mat)) 28 | { 29 | stop("distance_mat must not be NULL.") 30 | } 31 | if(length(distance_mat) > 1 | !inherits(distance_mat,"logical")) 32 | { 33 | stop("distance_mat must be a single logical (i.e. T or F).") 34 | } 35 | if(is.na(distance_mat) | is.nan(distance_mat) ) 36 | { 37 | stop("distance_mat must not be NA/NAN.") 38 | } 39 | 40 | if(!inherits(X,"data.frame") & !inherits(X,"matrix")) 41 | { 42 | stop("X must either be a dataframe or a matrix.") 43 | } 44 | if(nrow(X) < 2 | ncol(X) < 1) 45 | { 46 | stop("X must have at least two rows and one column.") 47 | } 48 | if(length(which(stats::complete.cases(X) == F)) > 0) 49 | { 50 | stop("X must not contain any missing values.") 51 | } 52 | if(distance_mat == T & (ncol(X) != nrow(X) | !inherits(X,"matrix"))) 53 | { 54 | stop("if distance_mat is TRUE then X must be a square matrix.") 55 | } 56 | if((inherits(X,"matrix") & !inherits(X[1,1],"numeric")) | (inherits(X,"data.frame") & length(which(unlist(lapply(X,is.numeric)))) < ncol(X))) 57 | { 58 | stop("X must have only numeric entries.") 59 | } 60 | 61 | # if X is not a distance matrix, compute distance mat 62 | if(!distance_mat) 63 | { 64 | X <- as.matrix(dist(X)) 65 | # dist_X <- dist(X) 66 | # n <- nrow(X) 67 | # return(min(sapply(1:n,FUN = function(X){ 68 | # 69 | # col_inds <- c() 70 | # if(X > 1) 71 | # { 72 | # num_cols <- X - 1 73 | # col <- 1 74 | # pos <- X - 1 75 | # while(col < num_cols) 76 | # { 77 | # col_inds <- c(col_inds, pos) 78 | # col <- col + 1 79 | # pos <- pos + n - col 80 | # } 81 | # } 82 | # 83 | # row_inds <- c() 84 | # if(X < n) 85 | # { 86 | # lower_bound <- n*(X - 1) - X*(X - 1)/2 + 1 87 | # upper_bound <- lower_bound + n - X 88 | # if(X == n - 1) 89 | # { 90 | # upper_bound <- upper_bound - 1 91 | # } 92 | # row_inds <- c(lower_bound:upper_bound) 93 | # } 94 | # inds <- c(row_inds, col_inds) 95 | # 96 | # return(max(dist_X[inds])) 97 | # 98 | # }))) 99 | } 100 | 101 | enc_rad <- min(apply(X, MARGIN = 1L, max)) 102 | return(enc_rad) 103 | 104 | } 105 | -------------------------------------------------------------------------------- /man/predict_diagram_kpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{predict_diagram_kpca} 4 | \alias{predict_diagram_kpca} 5 | \title{Project persistence diagrams into a low-dimensional space via a pre-computed kernel PCA embedding.} 6 | \usage{ 7 | predict_diagram_kpca( 8 | new_diagrams, 9 | K = NULL, 10 | embedding, 11 | num_workers = parallelly::availableCores(omit = 1) 12 | ) 13 | } 14 | \arguments{ 15 | \item{new_diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like 
ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `new_diagrams` and `K` need to be supplied.} 16 | 17 | \item{K}{an optional precomputed cross-Gram matrix of the new diagrams and the ones used in `embedding`, default NULL. If not NULL then `new_diagrams` does not need to be supplied.} 18 | 19 | \item{embedding}{the output of a \code{\link{diagram_kpca}} function call, of class 'diagram_kpca'.} 20 | 21 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 22 | } 23 | \value{ 24 | the data projection (rotation), stored as a numeric matrix. Each row corresponds to the same-index diagram in `new_diagrams`. 25 | } 26 | \description{ 27 | Compute the location in low-dimensional space of each element of a list of new persistence diagrams using a 28 | previously-computed kernel PCA embedding (from the \code{\link{diagram_kpca}} function). 29 | } 30 | \examples{ 31 | 32 | if(require("TDAstats")) 33 | { 34 | # create six diagrams 35 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 36 | dim = 1,threshold = 2) 37 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 38 | dim = 1,threshold = 2) 39 | D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 40 | dim = 1,threshold = 2) 41 | D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 42 | dim = 1,threshold = 2) 43 | D5 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 44 | dim = 1,threshold = 2) 45 | D6 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 46 | dim = 1,threshold = 2) 47 | g <- list(D1,D2,D3,D4,D5,D6) 48 | 49 | # calculate their 2D PCA embedding with sigma = t = 2 in dimension 0 50 | pca <- diagram_kpca(diagrams = g,dim = 1,t = 2,sigma = 2, 51 | features = 2,num_workers = 2,th = 1e-6) 52 | 53 | # project two new diagrams onto old model 54 | D7 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,50),], 55 | dim = 0,threshold = 2) 56 | D8 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,50),], 57 | dim = 0,threshold = 2) 58 | g_new <- list(D7,D8) 59 | 60 | # calculate new embedding coordinates 61 | new_pca <- predict_diagram_kpca(new_diagrams = g_new,embedding = pca,num_workers = 2) 62 | 63 | # repeat with precomputed Gram matrix, gives same result but much faster 64 | K <- gram_matrix(diagrams = g_new,other_diagrams = pca$diagrams,dim = pca$dim, 65 | t = pca$t,sigma = pca$sigma,num_workers = 2) 66 | new_pca <- predict_diagram_kpca(K = K,embedding = pca,num_workers = 2) 67 | } 68 | } 69 | \seealso{ 70 | \code{\link{diagram_kpca}} for embedding persistence diagrams into a low-dimensional space. 
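(A brief illustrative follow-on: because the return value is a plain numeric matrix with one row per new diagram, the projected coordinates can be plotted directly. `new_pca` is assumed to be the object computed in the example above with `features = 2`.)

# hypothetical sketch, assuming new_pca from the example above
plot(new_pca[,1],new_pca[,2],xlab = "embedding coordinate 1",ylab = "embedding coordinate 2")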
71 | } 72 | \author{ 73 | Shael Brown - \email{shaelebrown@gmail.com} 74 | } 75 | -------------------------------------------------------------------------------- /tests/testthat/test-MDS.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("diagram_mds detects incorrect parameters correctly",{ 3 | 4 | D <- data.frame(dimension = c(0),birth = c(0),death = c(1)) 5 | expect_error(diagram_mds(diagrams = list(D,D,"D"),num_workers = 2),"Diagrams") 6 | expect_error(diagram_mds(diagrams = list(),num_workers = 2),"2") 7 | expect_error(diagram_mds(diagrams = list(D,D,D),distance = NaN,num_workers = 2),"distance") 8 | expect_error(diagram_mds(diagrams = list(D,D,D),distance = "fisher",sigma = NULL,num_workers = 2),"sigma") 9 | expect_error(diagram_mds(diagrams = list(D,D,D),p = NaN,num_workers = 2),"p") 10 | expect_error(diagram_mds(diagrams = list(D,D,D),k = -1,num_workers = 2),"k") 11 | 12 | }) 13 | 14 | test_that("diagram_mds is computing correctly",{ 15 | 16 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 17 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 18 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 19 | d12 <- diagram_distance(D1,D2,dim = 0) # 2-wasserstein 20 | d13 <- diagram_distance(D1,D3,dim = 0) 21 | d23 <- diagram_distance(D2,D3,dim = 0) 22 | D <- matrix(data = c(0,d12,d13,d12,0,d23,d13,d23,0),byrow = T,nrow = 3,ncol = 3)^2 23 | D <- scale(D,center = T,scale = F) 24 | D <- t(scale(t(D),center = T,scale = F)) 25 | S <- -D/2 26 | ev <- eigen(S) 27 | embedding <- -1*t(diag(sqrt(ev$values[1:2])) %*% t(ev$vectors[,1:2])) 28 | dimnames(embedding) <- list(NULL,NULL) 29 | dmds <- diagram_mds(diagrams = list(D1,D2,D3),num_workers = 2) 30 | if(embedding[1,1] < 0) 31 | { 32 | embedding[,1] <- embedding[,1]/-1 33 | } 34 | if(dmds[1,1] < 0) 35 | { 36 | dmds[,1] <- dmds[,1]/-1 37 | } 38 | if(embedding[1,2] < 0) 39 | { 40 | embedding[,2] <- embedding[,2]/-1 41 | } 42 | if(dmds[1,2] < 0) 43 | { 44 | dmds[,2] <- dmds[,2]/-1 45 | } 46 | expect_equal((abs(dmds[1,1])-abs(embedding[1,1]))+(abs(dmds[2,1])-abs(embedding[2,1]))+(abs(dmds[3,1])-abs(embedding[3,1])) + (abs(dmds[1,2])-abs(embedding[1,2]))+(abs(dmds[2,2])-abs(embedding[2,2]))+(abs(dmds[3,2])-abs(embedding[3,2])),0) 47 | 48 | }) 49 | 50 | # test_that("diagram_mds can accept inputs from TDA, TDAstats and diagram_to_df",{ 51 | # 52 | # skip_if_not_installed("TDA") 53 | # skip_if_not_installed("TDAstats") 54 | # 55 | # D1 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1) 56 | # D2 = TDA::alphaComplexDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxdimension = 1) 57 | # D3 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1,library = "dionysus",location = T) 58 | # D4 = TDAstats::calculate_homology(data.frame(x = runif(50,0,1),y = runif(50,0,1)),threshold = 1) 59 | # expect_type(diagram_mds(diagrams = list(D1,D2,D3,D4),dim = 1,num_workers = 2),"double") 60 | # expect_error(diagram_mds(diagrams = list(D1,D2,D3,D4),dim = 0,num_workers = 2),"Inf") 61 | # 62 | # }) 63 | 64 | # test_that("diagram_mds can take distance matrix input",{ 65 | # 66 | # skip_if_not_installed("TDA") 67 | # skip_if_not_installed("TDAstats") 68 | # 69 | # D1 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1) 70 | # D2 = TDA::alphaComplexDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxdimension = 1) 71 | # D3 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = 
runif(50,0,1)),maxscale = 1,maxdimension = 1,library = "dionysus",location = T) 72 | # D4 = TDAstats::calculate_homology(data.frame(x = runif(50,0,1),y = runif(50,0,1)),threshold = 1) 73 | # D = distance_matrix(list(D1,D2,D3,D4),dim = 1,num_workers = 2) 74 | # expect_type(diagram_mds(D = D,dim = 1,num_workers = 2),"double") 75 | # 76 | # }) 77 | 78 | -------------------------------------------------------------------------------- /man/distance_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distance_calculations.R 3 | \name{distance_matrix} 4 | \alias{distance_matrix} 5 | \title{Compute a distance matrix from a list of persistence diagrams.} 6 | \usage{ 7 | distance_matrix( 8 | diagrams, 9 | other_diagrams = NULL, 10 | dim = 0, 11 | distance = "wasserstein", 12 | p = 2, 13 | sigma = NULL, 14 | rho = NULL, 15 | num_workers = parallelly::availableCores(omit = 1) 16 | ) 17 | } 18 | \arguments{ 19 | \item{diagrams}{a list of persistence diagrams, either the output of persistent homology calculations like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 20 | 21 | \item{other_diagrams}{either NULL (default) or another list of persistence diagrams to compute a cross-distance matrix.} 22 | 23 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 24 | 25 | \item{distance}{a character determining which metric to use, either "wasserstein" (default) or "fisher".} 26 | 27 | \item{p}{a number representing the wasserstein power parameter, at least 1 and default 2.} 28 | 29 | \item{sigma}{a positive number representing the bandwidth of the Fisher information metric, default NULL.} 30 | 31 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If not NULL then matrix is calculated sequentially, but functions in the "exec" directory 32 | of the package can be loaded to calculate distance matrices in parallel with approximation.} 33 | 34 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 35 | } 36 | \value{ 37 | the numeric distance matrix. 38 | } 39 | \description{ 40 | Calculate the distance matrix \eqn{d} for either a single list of persistence diagrams \eqn{(D_1,D_2,\dots,D_n)}, i.e. \eqn{d[i,j] = d(D_i,D_j)}, 41 | or between two lists, \eqn{(D_1,D_2,\dots,D_n)} and \eqn{(D'_1,D'_2,\dots,D'_n)}, \eqn{d[i,j] = d(D_i,D'_j)}, in parallel. 42 | } 43 | \details{ 44 | Distance matrices of persistence diagrams are used in downstream analyses, like in the 45 | \code{\link{diagram_mds}}, \code{\link{permutation_test}} and \code{\link{diagram_ksvm}} functions. 46 | If `distance` is "fisher" then `sigma` must not be NULL. Since the matrix is computed sequentially when 47 | approximating the Fisher information metric this is only recommended when the persistence diagrams 48 | contain many points and when the number of available cores is small. 
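(An illustrative sketch of the downstream reuse mentioned in the details above: a distance matrix is computed once and then fed into diagram_mds. Passing the matrix via a `D` argument mirrors the commented-out test in tests/testthat/test-MDS.R earlier in this dump, so treat that argument name as an assumption rather than settled API; the distance_matrix call itself follows the examples below.)

if(require("TDAstats"))
{
  D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),],dim = 0,threshold = 2)
  D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),],dim = 0,threshold = 2)
  g <- list(D1,D2,D1,D2)

  # compute the 2-wasserstein distance matrix once...
  D <- distance_matrix(diagrams = g,dim = 0,distance = "wasserstein",p = 2,num_workers = 2)

  # ...then reuse it for multidimensional scaling (D argument assumed, see note above)
  emb <- diagram_mds(D = D,k = 2,num_workers = 2)
}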
49 | } 50 | \examples{ 51 | 52 | if(require("TDAstats")) 53 | { 54 | # create two diagrams 55 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 56 | dim = 0,threshold = 2) 57 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 58 | dim = 0,threshold = 2) 59 | g <- list(D1,D2) 60 | 61 | # calculate their distance matrix in dimension 0 with the persistence Fisher metric 62 | # using 2 cores 63 | D <- distance_matrix(diagrams = g,dim = 0,distance = "fisher",sigma = 1,num_workers = 2) 64 | 65 | # calculate their distance matrix in dimension 0 with the 2-wasserstein metric 66 | # using 2 cores 67 | D <- distance_matrix(diagrams = g,dim = 0,distance = "wasserstein",p = 2,num_workers = 2) 68 | 69 | # now do the cross distance matrix, which is the same as the previous 70 | D_cross <- distance_matrix(diagrams = g,other_diagrams = g, 71 | dim = 0,distance = "wasserstein", 72 | p = 2,num_workers = 2) 73 | } 74 | } 75 | \seealso{ 76 | \code{\link{diagram_distance}} for individual distance calculations. 77 | } 78 | \author{ 79 | Shael Brown - \email{shaelebrown@gmail.com} 80 | } 81 | -------------------------------------------------------------------------------- /src/kd_split.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: kd_split.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Methods for splitting kd-trees 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_KD_SPLIT_H 26 | #define ANN_KD_SPLIT_H 27 | 28 | #include "kd_tree.h" // kd-tree definitions 29 | 30 | //---------------------------------------------------------------------- 31 | // External entry points 32 | // These are all splitting procedures for kd-trees. 
33 | //---------------------------------------------------------------------- 34 | 35 | void kd_split( // standard (optimized) kd-splitter 36 | ANNpointArray pa, // point array (unaltered) 37 | ANNidxArray pidx, // point indices (permuted on return) 38 | const ANNorthRect &bnds, // bounding rectangle for cell 39 | int n, // number of points 40 | int dim, // dimension of space 41 | int &cut_dim, // cutting dimension (returned) 42 | ANNcoord &cut_val, // cutting value (returned) 43 | int &n_lo); // num of points on low side (returned) 44 | 45 | void midpt_split( // midpoint kd-splitter 46 | ANNpointArray pa, // point array (unaltered) 47 | ANNidxArray pidx, // point indices (permuted on return) 48 | const ANNorthRect &bnds, // bounding rectangle for cell 49 | int n, // number of points 50 | int dim, // dimension of space 51 | int &cut_dim, // cutting dimension (returned) 52 | ANNcoord &cut_val, // cutting value (returned) 53 | int &n_lo); // num of points on low side (returned) 54 | 55 | void sl_midpt_split( // sliding midpoint kd-splitter 56 | ANNpointArray pa, // point array (unaltered) 57 | ANNidxArray pidx, // point indices (permuted on return) 58 | const ANNorthRect &bnds, // bounding rectangle for cell 59 | int n, // number of points 60 | int dim, // dimension of space 61 | int &cut_dim, // cutting dimension (returned) 62 | ANNcoord &cut_val, // cutting value (returned) 63 | int &n_lo); // num of points on low side (returned) 64 | 65 | void fair_split( // fair-split kd-splitter 66 | ANNpointArray pa, // point array (unaltered) 67 | ANNidxArray pidx, // point indices (permuted on return) 68 | const ANNorthRect &bnds, // bounding rectangle for cell 69 | int n, // number of points 70 | int dim, // dimension of space 71 | int &cut_dim, // cutting dimension (returned) 72 | ANNcoord &cut_val, // cutting value (returned) 73 | int &n_lo); // num of points on low side (returned) 74 | 75 | void sl_fair_split( // sliding fair-split kd-splitter 76 | ANNpointArray pa, // point array (unaltered) 77 | ANNidxArray pidx, // point indices (permuted on return) 78 | const ANNorthRect &bnds, // bounding rectangle for cell 79 | int n, // number of points 80 | int dim, // dimension of space 81 | int &cut_dim, // cutting dimension (returned) 82 | ANNcoord &cut_val, // cutting value (returned) 83 | int &n_lo); // num of points on low side (returned) 84 | 85 | #endif -------------------------------------------------------------------------------- /man/diagram_kkmeans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{diagram_kkmeans} 4 | \alias{diagram_kkmeans} 5 | \title{Cluster a group of persistence diagrams using kernel k-means.} 6 | \usage{ 7 | diagram_kkmeans( 8 | diagrams, 9 | K = NULL, 10 | centers, 11 | dim = 0, 12 | t = 1, 13 | sigma = 1, 14 | rho = NULL, 15 | num_workers = parallelly::availableCores(omit = 1), 16 | ... 
17 | ) 18 | } 19 | \arguments{ 20 | \item{diagrams}{a list of n>=2 persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or the \code{\link{diagram_to_df}} function.} 21 | 22 | \item{K}{an optional precomputed Gram matrix of persistence diagrams, default NULL.} 23 | 24 | \item{centers}{number of clusters to initialize, no more than the number of diagrams although smaller values are recommended.} 25 | 26 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 27 | 28 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 29 | 30 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 31 | 32 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, Gram matrix calculation is sequential.} 33 | 34 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 35 | 36 | \item{...}{additional parameters for the \code{\link[kernlab]{kkmeans}} kernlab function.} 37 | } 38 | \value{ 39 | a list of class 'diagram_kkmeans' containing the output of \code{\link[kernlab]{kkmeans}} on the Gram matrix, i.e. a list containing the elements 40 | 41 | \describe{ 42 | 43 | \item{clustering}{an S4 object of class specc, the output of a \code{\link[kernlab]{kkmeans}} function call. The `.Data` slot of this object contains cluster memberships, `withinss` contains the within-cluster sum of squares for each cluster, etc.} 44 | 45 | \item{diagrams}{the input `diagrams` argument.} 46 | 47 | \item{dim}{the input `dim` argument.} 48 | 49 | \item{t}{the input `t` argument.} 50 | 51 | \item{sigma}{the input `sigma` argument.} 52 | 53 | } 54 | } 55 | \description{ 56 | Finds latent cluster labels for a group of persistence diagrams, using a kernelized version 57 | of the popular k-means algorithm. An optimal number of clusters may be determined by analyzing 58 | the withinss field of the clustering object over several values of k. 59 | } 60 | \details{ 61 | Returns the output of \code{\link[kernlab]{kkmeans}} on the desired Gram matrix of a group of persistence diagrams 62 | in a particular dimension. The additional list elements stored in the output are needed 63 | to estimate cluster labels for new persistence diagrams in the `predict_diagram_kkmeans` 64 | function. 65 | } 66 | \examples{ 67 | 68 | if(require("TDAstats")) 69 | { 70 | # create two diagrams 71 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 72 | dim = 1,threshold = 2) 73 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 74 | dim = 1,threshold = 2) 75 | g <- list(D1,D1,D2,D2) 76 | 77 | # calculate kmeans clusters with centers = 2, and sigma = t = 2 in dimension 0 78 | clust <- diagram_kkmeans(diagrams = g,centers = 2,dim = 0,t = 2,sigma = 2,num_workers = 2) 79 | 80 | # repeat with precomputed Gram matrix, gives the same result just much faster 81 | K <- gram_matrix(diagrams = g,num_workers = 2,t = 2,sigma = 2) 82 | cluster <- diagram_kkmeans(diagrams = g,K = K,centers = 2,dim = 0,sigma = 2,t = 2) 83 | 84 | } 85 | } 86 | \references{ 87 | Dhillon, I and Guan, Y and Kulis, B (2004). "A Unified View of Kernel k-means , Spectral Clustering and Graph Cuts." 
\url{https://people.bu.edu/bkulis/pubs/spectral_techreport.pdf}. 88 | } 89 | \seealso{ 90 | \code{\link{predict_diagram_kkmeans}} for predicting cluster labels of new diagrams. 91 | } 92 | \author{ 93 | Shael Brown - \email{shaelebrown@gmail.com} 94 | } 95 | -------------------------------------------------------------------------------- /src/bd_fix_rad_search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: bd_fix_rad_search.cpp 6 | // Programmer: David Mount 7 | // Description: Standard bd-tree search 8 | // Last modified: 05/03/05 (Version 1.1) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). This software is provided under 15 | // the provisions of the Lesser GNU Public License (LGPL). See the 16 | // file ../ReadMe.txt for further information. 17 | // 18 | // The University of Maryland (U.M.) and the authors make no 19 | // representations about the suitability or fitness of this software for 20 | // any purpose. It is provided "as is" without express or implied 21 | // warranty. 22 | //---------------------------------------------------------------------- 23 | // History: 24 | // Revision 1.1 05/03/05 25 | // Initial release 26 | //---------------------------------------------------------------------- 27 | 28 | #include "bd_tree.h" // bd-tree declarations 29 | #include "kd_fix_rad_search.h" // kd-tree FR search declarations 30 | 31 | //---------------------------------------------------------------------- 32 | // Approximate searching for bd-trees. 33 | // See the file kd_FR_search.cpp for general information on the 34 | // approximate nearest neighbor search algorithm. Here we 35 | // include the extensions for shrinking nodes. 36 | //---------------------------------------------------------------------- 37 | 38 | //---------------------------------------------------------------------- 39 | // bd_shrink::ann_FR_search - search a shrinking node 40 | //---------------------------------------------------------------------- 41 | 42 | void ANNbd_shrink::ann_FR_search(ANNdist box_dist) 43 | { 44 | // check dist calc term cond. 45 | if (ANNmaxPtsVisited != 0 && ANNptsVisited > ANNmaxPtsVisited) return; 46 | 47 | ANNdist inner_dist = 0; // distance to inner box 48 | for (int i = 0; i < n_bnds; i++) { // is query point in the box? 49 | if (bnds[i].out(ANNkdFRQ)) { // outside this bounding side? 
50 | // add to inner distance 51 | inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNkdFRQ)); 52 | } 53 | } 54 | if (inner_dist <= box_dist) { // if inner box is closer 55 | child[ANN_IN]->ann_FR_search(inner_dist);// search inner child first 56 | child[ANN_OUT]->ann_FR_search(box_dist);// ...then outer child 57 | } 58 | else { // if outer box is closer 59 | child[ANN_OUT]->ann_FR_search(box_dist);// search outer child first 60 | child[ANN_IN]->ann_FR_search(inner_dist);// ...then outer child 61 | } 62 | ANN_FLOP(3*n_bnds) // increment floating ops 63 | ANN_SHR(1) // one more shrinking node 64 | } 65 | 66 | 67 | //---------------------------------------------------------------------- 68 | // bd_shrink::ann_FR_search - search a shrinking node 69 | //---------------------------------------------------------------------- 70 | 71 | void ANNbd_shrink::ann_FR_searchFlops(ANNdist box_dist) 72 | { 73 | // check dist calc term cond. 74 | if (ANNmaxPtsVisited != 0 && ANNptsVisited > ANNmaxPtsVisited) return; 75 | 76 | ANNdist inner_dist = 0; // distance to inner box 77 | for (int i = 0; i < n_bnds; i++) { // is query point in the box? 78 | if (bnds[i].out(ANNkdFRQ)) { // outside this bounding side? 79 | // add to inner distance 80 | inner_dist = (ANNdist) ANN_SUM(inner_dist, bnds[i].dist(ANNkdFRQ)); 81 | } 82 | } 83 | if (inner_dist <= box_dist) { // if inner box is closer 84 | child[ANN_IN]->ann_FR_searchFlops(inner_dist);// search inner child first 85 | child[ANN_OUT]->ann_FR_searchFlops(box_dist);// ...then outer child 86 | } 87 | else { // if outer box is closer 88 | child[ANN_OUT]->ann_FR_searchFlops(box_dist);// search outer child first 89 | child[ANN_IN]->ann_FR_searchFlops(inner_dist);// ...then outer child 90 | } 91 | ANN_FLOP_ALWAYS(3*n_bnds) // increment floating ops 92 | ANN_SHR(1) // one more shrinking node 93 | } -------------------------------------------------------------------------------- /man/diagram_distance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distance_calculations.R 3 | \name{diagram_distance} 4 | \alias{diagram_distance} 5 | \title{Calculate distance between a pair of persistence diagrams.} 6 | \usage{ 7 | diagram_distance( 8 | D1, 9 | D2, 10 | dim = 0, 11 | p = 2, 12 | distance = "wasserstein", 13 | sigma = NULL, 14 | rho = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{D1}{the first persistence diagram.} 19 | 20 | \item{D2}{the second persistence diagram.} 21 | 22 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 23 | 24 | \item{p}{a number representing the wasserstein power parameter, at least 1 and default 2.} 25 | 26 | \item{distance}{a string which determines which type of distance calculation to carry out, either "wasserstein" (default) or "fisher".} 27 | 28 | \item{sigma}{either NULL (default) or a positive number representing the bandwidth for the Fisher information metric.} 29 | 30 | \item{rho}{either NULL (default) or a positive number. If NULL then the exact calculation of the Fisher information metric is returned and otherwise a fast approximation, see details.} 31 | } 32 | \value{ 33 | the numeric value of the distance calculation. 
34 | } 35 | \description{ 36 | Calculates the distance between a pair of persistence diagrams, either the output from a \code{\link{diagram_to_df}} function call 37 | or from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, 38 | in a particular homological dimension. 39 | } 40 | \details{ 41 | The most common distance calculations between persistence diagrams 42 | are the wasserstein and bottleneck distances, both of which "match" points between 43 | their two input diagrams and compute the "loss" of the optimal matching 44 | (see \url{https://dl.acm.org/doi/10.1145/3064175} for details). Another 45 | method for computing distances, the Fisher information metric, 46 | converts the two diagrams into distributions 47 | defined on the plane, and calculates a distance between the resulting two distributions 48 | (\url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}). 49 | If the `distance` parameter is "fisher" then `sigma` must not be NULL. As noted in the Persistence Fisher paper, 50 | there is a fast speed-up approximation which has been implemented from \url{https://github.com/vmorariu/figtree} 51 | and can be accessed by setting the `rho` parameter. Smaller 52 | values of `rho` will result in tighter approximations at the expense of longer runtime, and vice versa. 53 | } 54 | \examples{ 55 | 56 | if(require("TDAstats")) 57 | { 58 | # create two diagrams 59 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,size = 20),], 60 | dim = 1,threshold = 2) 61 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,size = 20),], 62 | dim = 1,threshold = 2) 63 | 64 | # calculate 2-wasserstein distance between D1 and D2 in dimension 1 65 | diagram_distance(D1,D2,dim = 1,p = 2,distance = "wasserstein") 66 | 67 | # calculate bottleneck distance between D1 and D2 in dimension 0 68 | diagram_distance(D1,D2,dim = 0,p = Inf,distance = "wasserstein") 69 | 70 | # Fisher information metric calculation between D1 and D2 for sigma = 1 in dimension 1 71 | diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1) 72 | 73 | # repeat but with fast approximation 74 | \dontrun{ 75 | diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1,rho = 0.001) 76 | } 77 | } 78 | } 79 | \references{ 80 | Kerber M, Morozov D and Nigmetov A (2017). "Geometry Helps to Compare Persistence Diagrams." \url{https://dl.acm.org/doi/10.1145/3064175}. 81 | 82 | Le T, Yamada M (2018). "Persistence fisher kernel: a riemannian manifold kernel for persistence diagrams." \url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}. 83 | 84 | Vlad I. Morariu, Balaji Vasan Srinivasan, Vikas C. Raykar, Ramani Duraiswami, and Larry S. Davis. Automatic online tuning for fast Gaussian summation. Advances in Neural Information Processing Systems (NIPS), 2008. 85 | } 86 | \seealso{ 87 | \code{\link{distance_matrix}} for distance matrix calculations. 
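(An illustrative aside on the `rho` approximation discussed in the details above: tightening `rho` trades runtime for accuracy, which can be checked against the exact Fisher information metric. `D1` and `D2` are assumed to come from the examples above; the specific rho values are arbitrary choices for illustration.)

# hypothetical sketch, assuming D1 and D2 from the examples above
exact <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1)
rough <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1,rho = 0.1)
tight <- diagram_distance(D1,D2,dim = 1,distance = "fisher",sigma = 1,rho = 0.0001)
c(exact = exact,rough = rough,tight = tight) # tight should land closer to exact than rough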
88 | } 89 | \author{ 90 | Shael Brown - \email{shaelebrown@gmail.com} 91 | } 92 | -------------------------------------------------------------------------------- /man/plot_vr_graph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rips_complexes.R 3 | \name{plot_vr_graph} 4 | \alias{plot_vr_graph} 5 | \title{Plot a VR graph using the igraph package.} 6 | \usage{ 7 | plot_vr_graph( 8 | graphs, 9 | eps, 10 | cols = NULL, 11 | layout = NULL, 12 | title = NULL, 13 | component_of = NULL, 14 | plot_isolated_vertices = FALSE, 15 | return_layout = FALSE, 16 | vertex_labels = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{graphs}{the output of a `vr_graphs` function call.} 21 | 22 | \item{eps}{the numeric radius of the graph in `graphs` to plot.} 23 | 24 | \item{cols}{an optional character vector of vertex colors, default `NULL`.} 25 | 26 | \item{layout}{an optional 2D matrix of vertex coordinates, default `NULL`. If row names are supplied they can be used to subset a graph by those vertex names.} 27 | 28 | \item{title}{an optional str title for the plot, default `NULL`.} 29 | 30 | \item{component_of}{a vertex name (integer or character), only the component of the graph containing that vertex will be plotted (useful for identifying representative (co)cycles in graphs). Default `NULL` (plot the whole graph).} 31 | 32 | \item{plot_isolated_vertices}{a boolean representing whether or not to plot isolated vertices, default `FALSE`.} 33 | 34 | \item{return_layout}{a boolean representing whether or not to return the plotting layout (x-y coordinates of each vertex) and the vertex labels, default `FALSE`.} 35 | 36 | \item{vertex_labels}{a boolean representing whether or not to plot vertex labels, default `TRUE`.} 37 | } 38 | \value{ 39 | if `return_layout` is `TRUE` then a list with elements "layout" (the numeric matrix of vertex x-y coordinates) and "vertices" (character vertex labels), otherwise the function does not return anything. 40 | } 41 | \description{ 42 | This function will throw an error if the igraph package is not installed. 
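A minimal sketch, assuming only the documented `vr_graphs` and `plot_vr_graph` interfaces and two arbitrary radii, of guarding a call so a missing igraph installation is reported rather than triggering the error mentioned above:

if(requireNamespace("igraph",quietly = TRUE) && requireNamespace("TDAstats",quietly = TRUE))
{
  df <- TDAstats::circle2d[sample(1:100,25),]
  graphs <- vr_graphs(X = df,eps = c(0.5,1))
  plot_vr_graph(graphs = graphs,eps = 0.5)
}else
{
  message("igraph (or TDAstats) is not installed - skipping the VR graph plot.")
}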
43 | } 44 | \examples{ 45 | 46 | if(require("TDAstats") & require("igraph")) 47 | { 48 | # simulate data from the unit circle and calculate 49 | # its diagram 50 | df <- TDAstats::circle2d[sample(1:100,25),] 51 | diag <- TDAstats::calculate_homology(df, 52 | dim = 1, 53 | threshold = 2) 54 | 55 | # get minimum death radius of any data cluster 56 | min_death_H0 <- 57 | min(diag[which(diag[,1] == 0),3L]) 58 | 59 | # get birth and death radius of the loop 60 | loop_birth <- as.numeric(diag[nrow(diag),2L]) 61 | loop_death <- as.numeric(diag[nrow(diag),3L]) 62 | 63 | # compute VR graphs at radii half of 64 | # min_death_H0 and the mean of loop_birth and 65 | # loop_death, returning clusters 66 | graphs <- vr_graphs(X = df,eps = 67 | c(0.5*min_death_H0,(loop_birth + loop_death)/2)) 68 | 69 | # plot graph of smaller (first) radius 70 | plot_vr_graph(graphs = graphs,eps = 0.5*min_death_H0, 71 | plot_isolated_vertices = TRUE) 72 | 73 | # plot graph of larger (second) radius 74 | plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2) 75 | 76 | # repeat but with rownames for df, each vertex 77 | # will be plotted with its rownames 78 | rownames(df) <- paste0("V",1:25) 79 | graphs <- vr_graphs(X = df,eps = 80 | c(0.5*min_death_H0,(loop_birth + loop_death)/2)) 81 | plot_vr_graph(graphs = graphs,eps = 0.5*min_death_H0, 82 | plot_isolated_vertices = TRUE) 83 | 84 | # plot without vertex labels 85 | plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2, 86 | vertex_labels = FALSE) 87 | 88 | # plot only the graph component containing vertex "1" 89 | plot_vr_graph(graphs = graphs,eps = 0.5*min_death_H0, 90 | component_of = "V1",plot_isolated_vertices = TRUE) 91 | 92 | # save the layout of the graph for adding features to 93 | # the same graph layout, like color 94 | layout <- plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2, 95 | return_layout = TRUE,vertex_labels = TRUE) 96 | cols <- rep("blue",25) 97 | cols[1:5] <- "red" 98 | plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2,cols = cols, 99 | layout = layout) 100 | 101 | } 102 | } 103 | \seealso{ 104 | \code{\link{vr_graphs}} for computing VR graphs. 
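A small sketch, assuming the objects `df`, `graphs`, `loop_birth` and `loop_death` from the example above and only base R color utilities, of coloring vertices by a continuous per-point quantity via the documented `cols` argument:

vals <- atan2(df[,2],df[,1])                                  # a per-point quantity (here, the angle)
pal <- grDevices::colorRampPalette(c("blue","red"))(nrow(df))
cols <- pal[rank(vals,ties.method = "first")]                 # one color per data point
plot_vr_graph(graphs = graphs,eps = (loop_birth + loop_death)/2,cols = cols)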
105 | } 106 | \author{ 107 | Shael Brown - \email{shaelebrown@gmail.com} 108 | } 109 | -------------------------------------------------------------------------------- /man/analyze_representatives.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyze_representatives.R 3 | \name{analyze_representatives} 4 | \alias{analyze_representatives} 5 | \title{Analyze the data point memberships of multiple representative (co)cycles.} 6 | \usage{ 7 | analyze_representatives( 8 | diagrams, 9 | dim, 10 | num_points, 11 | plot_heatmap = TRUE, 12 | return_contributions = FALSE, 13 | boxed_reps = NULL, 14 | d = NULL, 15 | lwd = NULL, 16 | title = NULL, 17 | return_clust = FALSE 18 | ) 19 | } 20 | \arguments{ 21 | \item{diagrams}{a list of persistence diagrams, either the output of persistent homology calculations like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, \code{\link{diagram_to_df}} or \code{\link{bootstrap_persistence_thresholds}}.} 22 | 23 | \item{dim}{the integer homological dimension of representatives to consider.} 24 | 25 | \item{num_points}{the integer number of data points in all the original datasets (from which the diagrams were calculated).} 26 | 27 | \item{plot_heatmap}{a boolean representing if a heatmap of data point membership similarity of the representatives should be plotted, default `TRUE`. A dendrogram of hierarchical clustering is plotted, and rows (representatives) are sorted according to this clustering.} 28 | 29 | \item{return_contributions}{a boolean indicating whether or not to return the membership contributions (i.e. percentages) of the data points (1:`num_points`) across all the representatives, default `FALSE`.} 30 | 31 | \item{boxed_reps}{a data frame specifying specific rows of the output heatmap which should have a box drawn around them (for highlighting), default NULL. See the details section for more information.} 32 | 33 | \item{d}{either NULL (default) or a "dist" object representing a distance matrix for the representatives, which must have the same number of rows and columns as cycles in the dimension `dim`.} 34 | 35 | \item{lwd}{a positive number width for the lines of drawn boxes, if boxed_reps is not null.} 36 | 37 | \item{title}{a character string title for the plotted heatmap, default NULL.} 38 | 39 | \item{return_clust}{a boolean determining whether or not to return the result of the `stats::hclust()` call when a heatmap is plotted, default `FALSE`.} 40 | } 41 | \value{ 42 | either a matrix of data point contributions to the representatives, or a list with elements "memberships" (the matrix) and some combination of elements "contributions" (a vector of membership percentages for each data point across representatives) and "clust" (the results of `stats::hclust()` on the membership matrix). 43 | } 44 | \description{ 45 | Multiple distance matrices with corresponding data points can contain the same topological features. 46 | Therefore we may wish to compare many representative (co)cycles across distance matrices to decide if their topological features are the same. 47 | The `analyze_representatives` function returns a matrix of binary datapoint memberships in an input list of representatives across distance matrices. 48 | Optionally this matrix can be plotted as a heatmap with columns as data points and rows (i.e. representatives) reordered by similarity, and the 49 | contributions (i.e. 
percentage membership) of each point in the representatives can also be returned. The heatmap has 50 | dark red squares representing membership - location [i,j] is dark red if data point j is in representative i. 51 | } 52 | \details{ 53 | The clustering dendrogram can be used to determine if there are any similar groups of representatives (i.e. 54 | shared topological features across datasets) and if so how many. The row labels of the heatmap are of the form 55 | 'DX[Y]', meaning the Yth representative of diagram X, and the column labels are the data point numbers. 56 | If diagrams are the output of the \code{\link{bootstrap_persistence_thresholds}} 57 | function, then the subsetted_representatives (if present) will be analyzed. Therefore, a row label like 'DX[Y]' in the 58 | plotted heatmap would mean the Yth representative of diagram X. If certain representatives should be highlighted (by drawing a box around their rows) 59 | in the heatmap, a dataframe `boxed_reps` can be supplied with two integer columns - 'diagram' and 'rep'. For example, if we wish to draw a box for DX[Y] then we 60 | add the row (diagram = X,rep = Y) to `boxed_reps`. If `d` is supplied then it will be used to cluster the representatives, based on the distances in `d`. 61 | } 62 | \author{ 63 | Shael Brown - \email{shaelebrown@gmail.com} 64 | } 65 | -------------------------------------------------------------------------------- /man/independence_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{independence_test} 4 | \alias{independence_test} 5 | \title{Independence test for two groups of persistence diagrams.} 6 | \usage{ 7 | independence_test( 8 | g1, 9 | g2, 10 | dims = c(0, 1), 11 | sigma = 1, 12 | rho = NULL, 13 | t = 1, 14 | num_workers = parallelly::availableCores(omit = 1), 15 | verbose = FALSE, 16 | Ks = NULL, 17 | Ls = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{g1}{the first group of persistence diagrams, where each diagram was either the output from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 22 | 23 | \item{g2}{the second group of persistence diagrams, where each diagram was either the output from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.} 24 | 25 | \item{dims}{a non-negative integer vector of the homological dimensions in which the test is to be carried out, default c(0,1).} 26 | 27 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 28 | 29 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, calculation of Gram matrices is sequential.} 30 | 31 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 32 | 33 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 34 | 35 | \item{verbose}{a boolean flag for if the time duration of the function call should be printed, default FALSE.} 36 | 37 | \item{Ks}{an optional list of precomputed Gram matrices for the first group of diagrams, with one element for each dimension.
If not NULL and `Ls` is not NULL then `g1` and `g2` do not need to be supplied.} 38 | 39 | \item{Ls}{an optional list of precomputed Gram matrices for the second group of diagrams, with one element for each dimension. If not NULL and `Ks` is not NULL then `g1` and `g2` do not need to be supplied.} 40 | } 41 | \value{ 42 | a list with the following elements: 43 | \describe{ 44 | 45 | \item{dimensions}{the input `dims` argument.} 46 | 47 | \item{test_statisics}{a numeric vector of the test statistic value in each dimension.} 48 | 49 | \item{p_values}{a numeric vector of the p-values in each dimension.} 50 | 51 | \item{run_time}{the run time of the function call, containing time units.} 52 | 53 | } 54 | } 55 | \description{ 56 | Carries out inference to determine if two groups of persistence diagrams are independent or not 57 | based on kernel calculations (see 58 | (\url{https://proceedings.neurips.cc/paper/2007/file/d5cfead94f5350c12c322b5b664544c1-Paper.pdf}) for details). 59 | A small p-value in a certain dimension suggests that the groups are not independent in that dimension. 60 | } 61 | \details{ 62 | The test is carried out with a parametric null distribution, making it much faster than non-parametric 63 | approaches. If all of the diagrams in either g1 or g2 are the same in some dimension, then some p-values may be NaN. 64 | } 65 | \examples{ 66 | 67 | if(require("TDAstats")) 68 | { 69 | # create two independent groups of diagrams of length 6, which 70 | # is the minimum length 71 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 72 | dim = 0,threshold = 2) 73 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 74 | dim = 0,threshold = 2) 75 | g1 <- list(D1,D2,D2,D2,D2,D2) 76 | g2 <- list(D2,D1,D1,D1,D1,D1) 77 | 78 | # do independence test with sigma = t = 1 in dimension 0, using 79 | # precomputed Gram matrices 80 | K = gram_matrix(diagrams = g1,dim = 0,t = 1,sigma = 1,num_workers = 2) 81 | L = gram_matrix(diagrams = g2,dim = 0,t = 1,sigma = 1,num_workers = 2) 82 | indep_test <- independence_test(Ks = list(K),Ls = list(L),dims = c(0)) 83 | 84 | } 85 | } 86 | \references{ 87 | Gretton A et al. (2007). "A Kernel Statistical Test of Independence." \url{https://proceedings.neurips.cc/paper/2007/file/d5cfead94f5350c12c322b5b664544c1-Paper.pdf}. 88 | } 89 | \seealso{ 90 | \code{\link{permutation_test}} for an inferential group difference test for groups of persistence diagrams. 91 | } 92 | \author{ 93 | Shael Brown - \email{shaelebrown@gmail.com} 94 | } 95 | -------------------------------------------------------------------------------- /src/bd_tree.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: bd_tree.h 3 | // Programmer: David Mount 4 | // Description: Declarations for standard bd-tree routines 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) 
and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | // Revision 1.0 04/01/05 24 | // Changed IN, OUT to ANN_IN, ANN_OUT 25 | //---------------------------------------------------------------------- 26 | 27 | #ifndef ANN_bd_tree_H 28 | #define ANN_bd_tree_H 29 | 30 | #include "ANNx.h" // all ANN includes 31 | #include "kd_tree.h" // kd-tree includes 32 | 33 | //---------------------------------------------------------------------- 34 | // bd-tree shrinking node. 35 | // The main addition in the bd-tree is the shrinking node, which 36 | // is declared here. 37 | // 38 | // Shrinking nodes are defined by list of orthogonal halfspaces. 39 | // These halfspaces define a (possibly unbounded) orthogonal 40 | // rectangle. There are two children, in and out. Points that 41 | // lie within this rectangle are stored in the in-child, and the 42 | // other points are stored in the out-child. 43 | // 44 | // We use a list of orthogonal halfspaces rather than an 45 | // orthogonal rectangle object because typically the number of 46 | // sides of the shrinking box will be much smaller than the 47 | // worst case bound of 2*dim. 48 | // 49 | // BEWARE: Note that constructor just copies the pointer to the 50 | // bounding array, but the destructor deallocates it. This is 51 | // rather poor practice, but happens to be convenient. The list 52 | // is allocated in the bd-tree building procedure rbd_tree() just 53 | // prior to construction, and is used for no other purposes. 54 | // 55 | // WARNING: In the near neighbor searching code it is assumed that 56 | // the list of bounding halfspaces is irredundant, meaning that there 57 | // are no two distinct halfspaces in the list with the same outward 58 | // pointing normals. 
59 | //---------------------------------------------------------------------- 60 | 61 | class ANNbd_shrink : public ANNkd_node // splitting node of a kd-tree 62 | { 63 | int n_bnds; // number of bounding halfspaces 64 | ANNorthHSArray bnds; // list of bounding halfspaces 65 | ANNkd_ptr child[2]; // in and out children 66 | public: 67 | ANNbd_shrink( // constructor 68 | int nb, // number of bounding halfspaces 69 | ANNorthHSArray bds, // list of bounding halfspaces 70 | ANNkd_ptr ic=NULL, ANNkd_ptr oc=NULL) // children 71 | { 72 | n_bnds = nb; // cutting dimension 73 | bnds = bds; // assign bounds 74 | child[ANN_IN] = ic; // set children 75 | child[ANN_OUT] = oc; 76 | } 77 | 78 | ~ANNbd_shrink() // destructor 79 | { 80 | if (child[ANN_IN]!= NULL && child[ANN_IN]!= KD_TRIVIAL) 81 | delete child[ANN_IN]; 82 | if (child[ANN_OUT]!= NULL&& child[ANN_OUT]!= KD_TRIVIAL) 83 | delete child[ANN_OUT]; 84 | if (bnds != NULL) 85 | delete [] bnds; // delete bounds 86 | } 87 | 88 | virtual void getStats( // get tree statistics 89 | int dim, // dimension of space 90 | ANNkdStats &st, // statistics 91 | ANNorthRect &bnd_box); // bounding box 92 | virtual void print(int level, ostream &out);// print node 93 | virtual void dump(ostream &out); // dump node 94 | 95 | virtual void ann_search(ANNdist); // standard search 96 | virtual void ann_pri_search(ANNdist); // priority search 97 | virtual void ann_FR_search(ANNdist); // fixed-radius search 98 | 99 | // added by Vlad 5-1-08 to update flops even when ANN_PERF is not defined 100 | virtual void ann_FR_searchFlops(ANNdist); // fixed-radius search 101 | }; 102 | 103 | #endif -------------------------------------------------------------------------------- /src/brute.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: brute.cpp 6 | // Programmer: Sunil Arya and David Mount 7 | // Description: Brute-force nearest neighbors 8 | // Last modified: 05/03/05 (Version 1.1) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). This software is provided under 15 | // the provisions of the Lesser GNU Public License (LGPL). See the 16 | // file ../ReadMe.txt for further information. 17 | // 18 | // The University of Maryland (U.M.) and the authors make no 19 | // representations about the suitability or fitness of this software for 20 | // any purpose. It is provided "as is" without express or implied 21 | // warranty. 22 | //---------------------------------------------------------------------- 23 | // History: 24 | // Revision 0.1 03/04/98 25 | // Initial release 26 | // Revision 1.1 05/03/05 27 | // Added fixed-radius kNN search 28 | //---------------------------------------------------------------------- 29 | 30 | #include "ANNx.h" // all ANN includes 31 | #include "pr_queue_k.h" // k element priority queue 32 | 33 | //---------------------------------------------------------------------- 34 | // Brute-force search simply stores a pointer to the list of 35 | // data points and searches linearly for the nearest neighbor. 36 | // The k nearest neighbors are stored in a k-element priority 37 | // queue (which is implemented in a pretty dumb way as well). 
38 | // 39 | // If ANN_ALLOW_SELF_MATCH is ANNfalse then data points at distance 40 | // zero are not considered. 41 | // 42 | // Note that the error bound eps is passed in, but it is ignored. 43 | // These routines compute exact nearest neighbors (which is needed 44 | // for validation purposes in ann_test.cpp). 45 | //---------------------------------------------------------------------- 46 | 47 | ANNbruteForce::ANNbruteForce( // constructor from point array 48 | ANNpointArray pa, // point array 49 | int n, // number of points 50 | int dd) // dimension 51 | { 52 | dim = dd; n_pts = n; pts = pa; 53 | } 54 | 55 | ANNbruteForce::~ANNbruteForce() { } // destructor (empty) 56 | 57 | void ANNbruteForce::annkSearch( // approx k near neighbor search 58 | ANNpoint q, // query point 59 | int k, // number of near neighbors to return 60 | ANNidxArray nn_idx, // nearest neighbor indices (returned) 61 | ANNdistArray dd, // dist to near neighbors (returned) 62 | double eps) // error bound (ignored) 63 | { 64 | ANNmin_k mk(k); // construct a k-limited priority queue 65 | int i; 66 | 67 | if (k > n_pts) { // too many near neighbors? 68 | annError((char *)"Requesting more near neighbors than data points", ANNabort); 69 | } 70 | // run every point through queue 71 | for (i = 0; i < n_pts; i++) { 72 | // compute distance to point 73 | ANNdist sqDist = annDist(dim, pts[i], q); 74 | if (ANN_ALLOW_SELF_MATCH || sqDist != 0) 75 | mk.insert(sqDist, i); 76 | } 77 | for (i = 0; i < k; i++) { // extract the k closest points 78 | dd[i] = mk.ith_smallest_key(i); 79 | nn_idx[i] = mk.ith_smallest_info(i); 80 | } 81 | } 82 | 83 | int ANNbruteForce::annkFRSearch( // approx fixed-radius kNN search 84 | ANNpoint q, // query point 85 | ANNdist sqRad, // squared radius 86 | int k, // number of near neighbors to return 87 | ANNidxArray nn_idx, // nearest neighbor array (returned) 88 | ANNdistArray dd, // dist to near neighbors (returned) 89 | double eps) // error bound 90 | { 91 | ANNmin_k mk(k); // construct a k-limited priority queue 92 | int i; 93 | int pts_in_range = 0; // number of points in query range 94 | // run every point through queue 95 | for (i = 0; i < n_pts; i++) { 96 | // compute distance to point 97 | ANNdist sqDist = annDist(dim, pts[i], q); 98 | if (sqDist <= sqRad && // within radius bound 99 | (ANN_ALLOW_SELF_MATCH || sqDist != 0)) { // ...and no self match 100 | mk.insert(sqDist, i); 101 | pts_in_range++; 102 | } 103 | } 104 | for (i = 0; i < k; i++) { // extract the k closest points 105 | if (dd != NULL) 106 | dd[i] = mk.ith_smallest_key(i); 107 | if (nn_idx != NULL) 108 | nn_idx[i] = mk.ith_smallest_info(i); 109 | } 110 | 111 | return pts_in_range; 112 | } -------------------------------------------------------------------------------- /man/diagram_kpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{diagram_kpca} 4 | \alias{diagram_kpca} 5 | \title{Calculate the kernel PCA embedding of a group of persistence diagrams.} 6 | \usage{ 7 | diagram_kpca( 8 | diagrams, 9 | K = NULL, 10 | dim = 0, 11 | t = 1, 12 | sigma = 1, 13 | rho = NULL, 14 | features = 1, 15 | num_workers = parallelly::availableCores(omit = 1), 16 | th = 1e-04 17 | ) 18 | } 19 | \arguments{ 20 | \item{diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or 
\code{\link{diagram_to_df}}.} 21 | 22 | \item{K}{an optional precomputed Gram matrix of the persistence diagrams in `diagrams`, default NULL.} 23 | 24 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 25 | 26 | \item{t}{a positive number representing the scale for the persistence Fisher kernel, default 1.} 27 | 28 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default 1.} 29 | 30 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, Gram matrix calculation is sequential.} 31 | 32 | \item{features}{number of features (principal components) to return, default 1.} 33 | 34 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 35 | 36 | \item{th}{the threshold value under which principal components are ignored (default 0.0001).} 37 | } 38 | \value{ 39 | a list of class 'diagram_kpca' containing the elements 40 | 41 | \describe{ 42 | 43 | \item{pca}{the output of kernlab's \code{\link[kernlab]{kpca}} function on the Gram matrix: an S4 object containing the slots `pcv` (a matrix containing the principal component vectors (column wise)), `eig` (the corresponding eigenvalues), `rotated` (the original data projected (rotated) on the principal components) and `xmatrix` (the original data matrix).} 44 | 45 | \item{diagrams}{the input `diagrams` argument.} 46 | 47 | \item{t}{the input `t` argument.} 48 | 49 | \item{sigma}{the input `sigma` argument.} 50 | 51 | \item{dim}{the input `dim` argument.} 52 | 53 | } 54 | } 55 | \description{ 56 | Project a group of persistence diagrams into a low-dimensional embedding space using 57 | a kernelized version of the popular PCA algorithm. 58 | } 59 | \details{ 60 | Returns the output of kernlab's \code{\link[kernlab]{kpca}} function on the desired Gram matrix of a group of persistence diagrams 61 | in a particular dimension. The prediction function \code{\link{predict_diagram_kpca}} can be used to 62 | project new persistence diagrams using an old embedding, and this could be one practical 63 | advantage of using \code{\link{diagram_kpca}} over \code{\link{diagram_mds}}. The embedding coordinates can also 64 | be used for further analysis, or simply as a data visualization tool for persistence diagrams. 
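A brief sketch, assuming the fitted object `pca` from the examples below (with `features = 2`) and only the documented `rotated` slot of the returned `pca` element, of plotting the embedding coordinates directly:

emb <- pca$pca@rotated    # one row per input diagram, one column per requested feature
plot(emb[,1],emb[,2],
     xlab = "Feature 1",ylab = "Feature 2",
     main = "Kernel PCA embedding of the input diagrams")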
65 | } 66 | \examples{ 67 | 68 | if(require("TDAstats")) 69 | { 70 | # create six diagrams 71 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 72 | dim = 1,threshold = 2) 73 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 74 | dim = 1,threshold = 2) 75 | D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 76 | dim = 1,threshold = 2) 77 | D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 78 | dim = 1,threshold = 2) 79 | D5 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 80 | dim = 1,threshold = 2) 81 | D6 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], 82 | dim = 1,threshold = 2) 83 | g <- list(D1,D2,D3,D4,D5,D6) 84 | 85 | # calculate their 2D PCA embedding with sigma = t = 2 in dimension 1 86 | pca <- diagram_kpca(diagrams = g,dim = 1,t = 2,sigma = 2,features = 2,num_workers = 2,th = 1e-6) 87 | 88 | # repeat with precomputed Gram matrix, gives same result but much faster 89 | K <- gram_matrix(diagrams = g,dim = 1,t = 2,sigma = 2,num_workers = 2) 90 | pca <- diagram_kpca(diagrams = g,K = K,dim = 1,t = 2,sigma = 2,features = 2,th = 1e-6) 91 | 92 | } 93 | } 94 | \references{ 95 | Scholkopf, B and Smola, A and Muller, K (1998). "Nonlinear Component Analysis as a Kernel Eigenvalue Problem." \url{https://www.mlpack.org/papers/kpca.pdf}. 96 | } 97 | \seealso{ 98 | \code{\link{predict_diagram_kpca}} for predicting embedding coordinates of new diagrams. 99 | } 100 | \author{ 101 | Shael Brown - \email{shaelebrown@gmail.com} 102 | } 103 | -------------------------------------------------------------------------------- /man/diagram_mds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/machine_learning.R 3 | \name{diagram_mds} 4 | \alias{diagram_mds} 5 | \title{Dimension reduction of a group of persistence diagrams via metric multidimensional scaling.} 6 | \usage{ 7 | diagram_mds( 8 | diagrams, 9 | D = NULL, 10 | k = 2, 11 | distance = "wasserstein", 12 | dim = 0, 13 | p = 2, 14 | sigma = NULL, 15 | rho = NULL, 16 | eig = FALSE, 17 | add = FALSE, 18 | x.ret = FALSE, 19 | list. = eig || add || x.ret, 20 | num_workers = parallelly::availableCores(omit = 1) 21 | ) 22 | } 23 | \arguments{ 24 | \item{diagrams}{a list of n>=2 persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. Only one of `diagrams` and `D` need to be supplied.} 25 | 26 | \item{D}{an optional precomputed distance matrix of persistence diagrams, default NULL. 
If not NULL then `diagrams` parameter does not need to be supplied.} 27 | 28 | \item{k}{the dimension of the space which the data are to be represented in; must be in \{1,2,...,n-1\}.} 29 | 30 | \item{distance}{a string representing the desired distance metric to be used, either 'wasserstein' (default) or 'fisher'.} 31 | 32 | \item{dim}{the non-negative integer homological dimension in which the distance is to be computed, default 0.} 33 | 34 | \item{p}{a positive number representing the wasserstein power, a number at least 1 (infinity for the bottleneck distance), default 2.} 35 | 36 | \item{sigma}{a positive number representing the bandwidth for the Fisher information metric, default NULL.} 37 | 38 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, distance matrix calculation is sequential.} 39 | 40 | \item{eig}{a boolean indicating whether the eigenvalues should be returned.} 41 | 42 | \item{add}{a boolean indicating if an additive constant c* should be computed, and added to the non-diagonal dissimilarities such that the modified dissimilarities are Euclidean.} 43 | 44 | \item{x.ret}{a boolean indicating whether the doubly centered symmetric distance matrix should be returned.} 45 | 46 | \item{list.}{a boolean indicating if a list should be returned or just the n*k matrix.} 47 | 48 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 49 | } 50 | \value{ 51 | the output of \code{\link[stats]{cmdscale}} on the diagram distance matrix. If `list.` is false (as per default), 52 | a matrix with `k` columns whose rows give the coordinates of the points chosen to represent the dissimilarities. 53 | 54 | Otherwise, a list containing the following components. 55 | 56 | \describe{ 57 | 58 | \item{points}{a matrix with `k` columns whose rows give the coordinates of the points chosen to represent the dissimilarities.} 59 | 60 | \item{eig}{the \eqn{n} eigenvalues computed during the scaling process if `eig` is true.} 61 | 62 | \item{x}{the doubly centered distance matrix if `x.ret` is true.} 63 | 64 | \item{ac}{the additive constant \eqn{c*}, 0 if `add` = FALSE.} 65 | 66 | \item{GOF}{the numeric vector of length 2, representing the sum of all the eigenvalues divided by the sum of their absolute values (first vector element) or by the sum of the max of each eigenvalue and 0 (second vector element).} 67 | 68 | } 69 | } 70 | \description{ 71 | Projects a group of persistence diagrams (or a precomputed distance matrix of diagrams) into a low-dimensional 72 | embedding space via metric multidimensional scaling. Such a projection can be used for visualization of data, 73 | or a static analysis of the embedding dimensions. 74 | } 75 | \details{ 76 | Returns the output of \code{\link[stats]{cmdscale}} on the desired distance matrix of a group of persistence diagrams 77 | in a particular dimension. If `distance` is "fisher" then `sigma` must not be NULL. 
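A short sketch, assuming the precomputed distance matrix `Dmat` from the examples below and only the documented `points`, `eig` and `GOF` return components, of checking how well the embedding represents the diagram distances:

mds <- diagram_mds(D = Dmat,k = 1,eig = TRUE)
mds$points    # the 1D embedding coordinates
mds$GOF       # goodness-of-fit of the k-dimensional representation
mds$eig       # eigenvalues of the doubly centered distance matrix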
78 | } 79 | \examples{ 80 | 81 | if(require("TDAstats")) 82 | { 83 | # create two diagrams 84 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 85 | dim = 1,threshold = 2) 86 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 87 | dim = 1,threshold = 2) 88 | g <- list(D1,D2) 89 | 90 | # calculate their 1D MDS embedding in dimension 0 with the bottleneck distance 91 | mds <- diagram_mds(diagrams = g,k = 1,dim = 0,p = Inf,num_workers = 2) 92 | 93 | # repeat but with a precomputed distance matrix, gives same result just much faster 94 | Dmat <- distance_matrix(diagrams = list(D1,D2),dim = 0,p = Inf,num_workers = 2) 95 | mds <- diagram_mds(D = Dmat,k = 1) 96 | 97 | } 98 | } 99 | \references{ 100 | Cox M and Cox F (2008). "Multidimensional Scaling." \doi{10.1007/978-3-540-33037-0_14}. 101 | } 102 | \author{ 103 | Shael Brown - \email{shaelebrown@gmail.com} 104 | } 105 | -------------------------------------------------------------------------------- /src/pr_queue.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: pr_queue.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Include file for priority queue and related 5 | // structures. 6 | // Last modified: 01/04/05 (Version 1.0) 7 | //---------------------------------------------------------------------- 8 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 9 | // David Mount. All Rights Reserved. 10 | // 11 | // This software and related documentation is part of the Approximate 12 | // Nearest Neighbor Library (ANN). This software is provided under 13 | // the provisions of the Lesser GNU Public License (LGPL). See the 14 | // file ../ReadMe.txt for further information. 15 | // 16 | // The University of Maryland (U.M.) and the authors make no 17 | // representations about the suitability or fitness of this software for 18 | // any purpose. It is provided "as is" without express or implied 19 | // warranty. 20 | //---------------------------------------------------------------------- 21 | // History: 22 | // Revision 0.1 03/04/98 23 | // Initial release 24 | //---------------------------------------------------------------------- 25 | 26 | #ifndef PR_QUEUE_H 27 | #define PR_QUEUE_H 28 | 29 | #include "ANNx.h" // all ANN includes 30 | #include "ANNperf.h" // performance evaluation 31 | 32 | //---------------------------------------------------------------------- 33 | // Basic types. 34 | //---------------------------------------------------------------------- 35 | typedef void *PQinfo; // info field is generic pointer 36 | typedef ANNdist PQkey; // key field is distance 37 | 38 | //---------------------------------------------------------------------- 39 | // Priority queue 40 | // A priority queue is a list of items, along with associated 41 | // priorities. The basic operations are insert and extract_minimum. 42 | // 43 | // The priority queue is maintained using a standard binary heap. 44 | // (Implementation note: Indexing is performed from [1..max] rather 45 | // than the C standard of [0..max-1]. This simplifies parent/child 46 | // computations.) User information consists of a void pointer, 47 | // and the user is responsible for casting this quantity into whatever 48 | // useful form is desired. 49 | // 50 | // Because the priority queue is so central to the efficiency of 51 | // query processing, all the code is inline. 
52 | //---------------------------------------------------------------------- 53 | 54 | class ANNpr_queue { 55 | 56 | struct pq_node { // node in priority queue 57 | PQkey key; // key value 58 | PQinfo info; // info field 59 | }; 60 | int n; // number of items in queue 61 | int max_size; // maximum queue size 62 | pq_node *pq; // the priority queue (array of nodes) 63 | 64 | public: 65 | ANNpr_queue(int max) // constructor (given max size) 66 | { 67 | n = 0; // initially empty 68 | max_size = max; // maximum number of items 69 | pq = new pq_node[max+1]; // queue is array [1..max] of nodes 70 | } 71 | 72 | ~ANNpr_queue() // destructor 73 | { delete [] pq; } 74 | 75 | ANNbool empty() // is queue empty? 76 | { if (n==0) return ANNtrue; else return ANNfalse; } 77 | 78 | ANNbool non_empty() // is queue nonempty? 79 | { if (n==0) return ANNfalse; else return ANNtrue; } 80 | 81 | void reset() // make existing queue empty 82 | { n = 0; } 83 | 84 | inline void insert( // insert item (inlined for speed) 85 | PQkey kv, // key value 86 | PQinfo inf) // item info 87 | { 88 | if (++n > max_size) annError((char *)"Priority queue overflow.", ANNabort); 89 | int r = n; 90 | while (r > 1) { // sift up new item 91 | int p = r/2; 92 | ANN_FLOP(1) // increment floating ops 93 | if (pq[p].key <= kv) // in proper order 94 | break; 95 | pq[r] = pq[p]; // else swap with parent 96 | r = p; 97 | } 98 | pq[r].key = kv; // insert new item at final location 99 | pq[r].info = inf; 100 | } 101 | 102 | inline void extr_min( // extract minimum (inlined for speed) 103 | PQkey &kv, // key (returned) 104 | PQinfo &inf) // item info (returned) 105 | { 106 | kv = pq[1].key; // key of min item 107 | inf = pq[1].info; // information of min item 108 | PQkey kn = pq[n--].key;// last item in queue 109 | int p = 1; // p points to item out of position 110 | int r = p<<1; // left child of p 111 | while (r <= n) { // while r is still within the heap 112 | ANN_FLOP(2) // increment floating ops 113 | // set r to smaller child of p 114 | if (r < n && pq[r].key > pq[r+1].key) r++; 115 | if (kn <= pq[r].key) // in proper order 116 | break; 117 | pq[p] = pq[r]; // else swap with child 118 | p = r; // advance pointers 119 | r = p<<1; 120 | } 121 | pq[p] = pq[n+1]; // insert last item in proper place 122 | } 123 | }; 124 | 125 | #endif -------------------------------------------------------------------------------- /man/universal_null.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{universal_null} 4 | \alias{universal_null} 5 | \title{Filtering topological features with the universal null distribution.} 6 | \usage{ 7 | universal_null( 8 | X, 9 | FUN_diag = "calculate_homology", 10 | maxdim = 1, 11 | thresh, 12 | distance_mat = FALSE, 13 | ripser = NULL, 14 | ignore_infinite_cluster = TRUE, 15 | calculate_representatives = FALSE, 16 | alpha = 0.05, 17 | return_pvals = FALSE, 18 | infinite_cycle_inference = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{X}{the input dataset, must either be a matrix or data frame.} 23 | 24 | \item{FUN_diag}{a string representing the persistent homology function to use for calculating the full persistence diagram, either 25 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 26 | 27 | \item{maxdim}{the integer maximum homological dimension for persistent homology, default 0.} 28 | 29 | \item{thresh}{the positive numeric maximum radius of the Vietoris-Rips 
filtration.} 30 | 31 | \item{distance_mat}{a boolean representing if `X` is a distance matrix (TRUE) or not (FALSE, default).} 32 | 33 | 34 | \item{ripser}{the imported ripser module when `FUN_diag` is `PyH`.} 35 | 36 | \item{ignore_infinite_cluster}{a boolean indicating whether or not to ignore the infinitely lived cluster when `FUN_diag` is `PyH`. If infinite cycle inference is to be performed, 37 | this parameter should be set to FALSE.} 38 | 39 | \item{calculate_representatives}{a boolean representing whether to calculate representative (co)cycles, default FALSE. Note that representatives cannot be 40 | calculated when using the 'calculate_homology' function. Note that representatives cannot be computed for (significant) infinite cycles.} 41 | 42 | \item{alpha}{the type-1 error threshold, default 0.05.} 43 | 44 | \item{return_pvals}{a boolean representing whether or not to return p-values for features in the subsetted diagram as well as a list of p-value thresholds, default FALSE. 45 | Infinite cycles that are significant (see below) will have p-value NA in this list, as the true value is unknown but less than its dimension's p-value threshold.} 46 | 47 | \item{infinite_cycle_inference}{a boolean representing whether or not to perform inference for features with infinite (i.e. `thresh`) death values, default FALSE. If `FUN_diag` is `calculate_homology` (the 48 | default) then no infinite cycles will be returned by the persistent homology calculation at all.} 49 | } 50 | \value{ 51 | a list containing the full persistence diagram, the subsetted diagram, representatives and/or subsetted representatives if desired, the p-values of subsetted features and the Bonferroni p-value thresholds in each dimension if desired. 52 | } 53 | \description{ 54 | An inference procedure to determine which topological features (if any) of a dataset are likely signal (i.e. significant) 55 | vs noise (not). 56 | } 57 | \details{ 58 | For each feature in a diagram we compute its persistence ratio \eqn{\pi = death/birth}, and a 59 | test statistic \eqn{A log log \pi + B} (where \eqn{A} and \eqn{B} are constants). This statistic is compared to a left-skewed Gumbel distribution 60 | to get a p-value. A Bonferroni correction is applied to all the p-values across all features, so when `return_pvals` is TRUE a list of 61 | p-value thresholds is also returned, one for each dimension, which is `alpha` divided by the number of features in that dimension. 62 | If desired, infinite cycles (i.e. cycles whose death value is equal to the maximum distance threshold parameter for the persistent homology calculation) 63 | can be analyzed for significance by determining their minimum distance thresholds where they might be significant (using the Gumbel distribution again), 64 | calculating the persistence diagram up to those thresholds and seeing if they are still infinite (i.e. significant) or not. 65 | This function is significantly faster than the \code{\link{bootstrap_persistence_thresholds}} function. Note that the `calculate_homology` 66 | function does not seem to store infinite cycles (i.e. cycles that have death value equal to `thresh`).
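A minimal follow-up sketch, assuming the object `res` from the examples below (computed with `return_pvals = TRUE`) and only the return element names shown in those examples, for summarizing the surviving features:

sig <- res$subsetted_diag       # the features judged significant
table(sig[,1L])                 # number of significant features per homological dimension
unlist(res$pvals)               # their p-values (NA marks significant infinite cycles)
unlist(res$alpha_thresh)        # the Bonferroni cutoff applied in each dimension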
67 | } 68 | \examples{ 69 | 70 | if(require("TDA")) 71 | { 72 | # create dataset 73 | theta <- runif(n = 100,min = 0,max = 2*pi) 74 | x <- cos(theta) 75 | y <- sin(theta) 76 | circ <- data.frame(x = x,y = y) 77 | 78 | # add noise 79 | x_noise <- -0.1 + 0.2*stats::runif(n = 100) 80 | y_noise <- -0.1 + 0.2*stats::runif(n = 100) 81 | circ$x <- circ$x + x_noise 82 | circ$y <- circ$y + y_noise 83 | 84 | # determine significant topological features 85 | library(TDA) 86 | res <- universal_null(circ, thresh = 2,alpha = 0.1,return_pvals = TRUE,FUN_diag = "ripsDiag") 87 | res$subsetted_diag 88 | res$pvals 89 | res$alpha_thresh 90 | 91 | # at a lower threshold we can check for 92 | # infinite cycles 93 | res2 <- universal_null(circ, thresh = 1.1, 94 | infinite_cycle_inference = TRUE, 95 | alpha = 0.1, 96 | FUN_diag = "ripsDiag") 97 | res2$subsetted_diag 98 | } 99 | } 100 | \references{ 101 | Bobrowski O, Skraba P (2023). "A universal null-distribution for topological data analysis." \url{https://www.nature.com/articles/s41598-023-37842-2}. 102 | } 103 | \author{ 104 | Shael Brown - \email{shaelebrown@gmail.com} 105 | } 106 | -------------------------------------------------------------------------------- /src/pr_queue_k.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: pr_queue_k.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Include file for priority queue with k items. 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef PR_QUEUE_K_H 26 | #define PR_QUEUE_K_H 27 | 28 | #include "ANNx.h" // all ANN includes 29 | #include "ANNperf.h" // performance evaluation 30 | 31 | //---------------------------------------------------------------------- 32 | // Basic types 33 | //---------------------------------------------------------------------- 34 | typedef ANNdist PQKkey; // key field is distance 35 | typedef int PQKinfo; // info field is int 36 | 37 | //---------------------------------------------------------------------- 38 | // Constants 39 | // The NULL key value is used to initialize the priority queue, and 40 | // so it should be larger than any valid distance, so that it will 41 | // be replaced as legal distance values are inserted. The NULL 42 | // info value must be a nonvalid array index, we use ANN_NULL_IDX, 43 | // which is guaranteed to be negative. 
44 | //---------------------------------------------------------------------- 45 | 46 | const PQKkey PQ_NULL_KEY = ANN_DIST_INF; // nonexistent key value 47 | const PQKinfo PQ_NULL_INFO = ANN_NULL_IDX; // nonexistent info value 48 | 49 | //---------------------------------------------------------------------- 50 | // ANNmin_k 51 | // An ANNmin_k structure is one which maintains the smallest 52 | // k values (of type PQKkey) and associated information (of type 53 | // PQKinfo). The special info and key values PQ_NULL_INFO and 54 | // PQ_NULL_KEY means that thise entry is empty. 55 | // 56 | // It is currently implemented using an array with k items. 57 | // Items are stored in increasing sorted order, and insertions 58 | // are made through standard insertion sort. (This is quite 59 | // inefficient, but current applications call for small values 60 | // of k and relatively few insertions.) 61 | // 62 | // Note that the list contains k+1 entries, but the last entry 63 | // is used as a simple placeholder and is otherwise ignored. 64 | //---------------------------------------------------------------------- 65 | 66 | class ANNmin_k { 67 | struct mk_node { // node in min_k structure 68 | PQKkey key; // key value 69 | PQKinfo info; // info field (user defined) 70 | }; 71 | 72 | int k; // max number of keys to store 73 | int n; // number of keys currently active 74 | mk_node *mk; // the list itself 75 | 76 | public: 77 | ANNmin_k(int max) // constructor (given max size) 78 | { 79 | n = 0; // initially no items 80 | k = max; // maximum number of items 81 | mk = new mk_node[max+1]; // sorted array of keys 82 | } 83 | 84 | ~ANNmin_k() // destructor 85 | { delete [] mk; } 86 | 87 | PQKkey ANNmin_key() // return minimum key 88 | { return (n > 0 ? mk[0].key : PQ_NULL_KEY); } 89 | 90 | PQKkey max_key() // return maximum key 91 | { return (n == k ? mk[k-1].key : PQ_NULL_KEY); } 92 | 93 | PQKkey ith_smallest_key(int i) // ith smallest key (i in [0..n-1]) 94 | { return (i < n ? mk[i].key : PQ_NULL_KEY); } 95 | 96 | PQKinfo ith_smallest_info(int i) // info for ith smallest (i in [0..n-1]) 97 | { return (i < n ? 
mk[i].info : PQ_NULL_INFO); } 98 | 99 | inline void insert( // insert item (inlined for speed) 100 | PQKkey kv, // key value 101 | PQKinfo inf) // item info 102 | { 103 | int i; 104 | // slide larger values up 105 | for (i = n; i > 0; i--) { 106 | if (mk[i-1].key > kv) 107 | mk[i] = mk[i-1]; 108 | else 109 | break; 110 | } 111 | mk[i].key = kv; // store element here 112 | mk[i].info = inf; 113 | if (n < k) n++; // increment number of items 114 | ANN_FLOP(k-i+1) // increment floating ops 115 | } 116 | 117 | // added by Vlad 5-1-08 to allow user to update flops by calling this 118 | // function even when ANN_PERF is not defined 119 | inline void insertFlops( // insert item (inlined for speed) 120 | PQKkey kv, // key value 121 | PQKinfo inf) // item info 122 | { 123 | int i; 124 | // slide larger values up 125 | for (i = n; i > 0; i--) { 126 | if (mk[i-1].key > kv) 127 | mk[i] = mk[i-1]; 128 | else 129 | break; 130 | } 131 | mk[i].key = kv; // store element here 132 | mk[i].info = inf; 133 | if (n < k) n++; // increment number of items 134 | ANN_FLOP_ALWAYS(k-i+1) // increment floating ops 135 | } 136 | }; 137 | 138 | #endif -------------------------------------------------------------------------------- /tests/testthat/test-kernel.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("diagram_kernel detects incorrect parameters correctly",{ 3 | 4 | D <- data.frame(dimension = c(0),birth = c(0),death = c(1)) 5 | expect_error(diagram_kernel(D1 = NULL,D2 = D,dim = 1),"TDA/TDAstats") 6 | expect_error(diagram_kernel(D1 = D,D2 = NULL,dim = 1),"TDA/TDAstats") 7 | expect_error(diagram_kernel(D1 = D,D2 = D,dim = "2"),"numeric") 8 | expect_error(diagram_kernel(D1 = D,D2 = D,sigma = "2"),"numeric") 9 | expect_error(diagram_kernel(D1 = D,D2 = D,t = NA),"NA") 10 | expect_error(diagram_kernel(D1 = D,D2 = D,t = -1),"positive") 11 | 12 | }) 13 | 14 | # test_that("diagram_kernel can accept inputs from either TDA/TDAstats homology output or diagram_to_df function, with or without cycle location",{ 15 | # 16 | # skip_if_not_installed("TDA") 17 | # skip_if_not_installed("TDAstats") 18 | # D1 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1) 19 | # D2 = TDA::alphaComplexDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxdimension = 1) 20 | # D3 = TDA::ripsDiag(data.frame(x = runif(50,0,1),y = runif(50,0,1)),maxscale = 1,maxdimension = 1,library = "dionysus",location = T) 21 | # D4 = TDAstats::calculate_homology(data.frame(x = runif(50,0,1),y = runif(50,0,1)),threshold = 1) 22 | # expect_gte(diagram_kernel(D1 = D1,D2 = D2,dim = 1),0) 23 | # expect_gte(diagram_kernel(D1 = diagram_to_df(D1),D2 = D2,dim = 1),0) 24 | # expect_gte(diagram_kernel(D1 = D1,D2 = diagram_to_df(D2),dim = 1),0) 25 | # expect_gte(diagram_kernel(D1 = D3,D2 = diagram_to_df(D2),dim = 1),0) 26 | # expect_gte(diagram_kernel(D1 = D1,D2 = diagram_to_df(D3),dim = 1),0) 27 | # expect_gte(diagram_kernel(D1 = D1,D2 = D4,dim = 1),0) 28 | # expect_error(diagram_kernel(D1 = D1,D2 = D2,dim = 0),"Inf") 29 | # 30 | # }) 31 | 32 | test_that("diagram_kernel is computing correctly",{ 33 | 34 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 35 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 36 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 37 | sqrt_rho_1 <- function(sigma) 38 | { 39 | v <- (1/(2*pi*sigma^2))*c(exp(0)+exp(-(0.45^2+0.55^2)/(2*sigma^2)),exp(-(0.1^2)/(2*sigma^2))+exp(-(2*0.55^2)/(2*sigma^2)),exp(-(2*0.5^2)/(2*sigma^2)) + 
exp(-(2*0.05^2)/(2*sigma^2)),exp(-(0.45^2+0.55^2)/(2*sigma^2)) + exp(0)) 40 | v <- v/sum(v) 41 | return(sqrt(v)) 42 | } 43 | sqrt_rho_2 <- function(sigma) 44 | { 45 | v <- (1/(2*pi*sigma^2))*c(exp(-(0.1^2)/(2*sigma^2))+exp(-(2*0.5^2)/(2*sigma^2)),exp(0)+exp(-(0.5^2+0.6^2)/(2*sigma^2)),exp(-(0.5^2+0.6^2)/(2*sigma^2)) + exp(0),exp(-(2*0.55^2)/(2*sigma^2)) + exp(-(2*0.05^2)/(2*sigma^2))) 46 | v <- v/sum(v) 47 | return(sqrt(v)) 48 | } 49 | v11 <- sqrt_rho_1(1) 50 | v21 <- sqrt_rho_2(1) 51 | v12 <- sqrt_rho_1(2) 52 | v22 <- sqrt_rho_2(2) 53 | norm_11 <- as.numeric(v11 %*% v21) 54 | norm_22 <- as.numeric(v12 %*% v22) 55 | if(norm_11 > 1) 56 | { 57 | norm_11 <- 1 58 | } 59 | if(norm_11 < -1) 60 | { 61 | norm_11 <- -1 62 | } 63 | if(norm_22 > 1) 64 | { 65 | norm_22 <- 1 66 | } 67 | if(norm_22 < -1) 68 | { 69 | norm_22 <- -1 70 | } 71 | val_1 <- acos(norm_11) 72 | val_2 <- acos(norm_22) 73 | expect_equal(diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),exp(-1*val_1)) 74 | expect_equal(diagram_kernel(D2,D1,dim = 0,sigma = 1,t = 1),exp(-1*val_1)) 75 | expect_equal(diagram_kernel(D1,D2,dim = 0,sigma = 2,t = 1),exp(-1*val_2)) 76 | expect_equal(diagram_kernel(D1 = D1,D2 = D2,dim = 0,sigma = 1,t = 2),exp(-2*val_1)) 77 | expect_equal(diagram_kernel(D1 = D1,D2 = D2,sigma = 2,t = 2),exp(-2*val_2)) 78 | expect_equal(diagram_kernel(D1 = D2,D2 = D1,sigma = 2,t = 2),exp(-2*val_2)) 79 | expect_identical(diagram_kernel(D1,D1,sigma = 1,t = 1),1) 80 | 81 | }) 82 | 83 | test_that("gram_matrix detect incorrect parameters correctly",{ 84 | 85 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 86 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 87 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 88 | expect_error(gram_matrix(diagrams = list(D1,D2,D3),num_workers = NaN),"NaN") 89 | expect_error(gram_matrix(diagrams = list(D1,D2,D3),num_workers = "1"),"numeric") 90 | 91 | }) 92 | 93 | test_that("gram_matrix is computing correctly",{ 94 | 95 | D1 <- data.frame(dimension = 0,birth = 2,death = 3) 96 | D2 <- data.frame(dimension = 0,birth = 2,death = 3.1) 97 | D3 <- data.frame(dimension = 0,birth = c(2,5),death = c(3.1,6)) 98 | m1 <- matrix(data = c(1,diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),1),byrow = T,nrow = 2,ncol = 2) 99 | class(m1) <- "kernelMatrix" 100 | m2 <- matrix(data = c(1,diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),diagram_kernel(D1,D3,dim = 0,sigma = 1,t = 1),diagram_kernel(D2,D1,dim = 0,sigma = 1,t = 1),1,diagram_kernel(D2,D3,dim = 0,sigma = 1,t = 1),diagram_kernel(D3,D1,dim = 0,sigma = 1,t = 1),diagram_kernel(D3,D2,dim = 0,sigma = 1,t = 1),1),byrow = T,nrow = 3,ncol = 3) 101 | class(m2) <- "kernelMatrix" 102 | m3 <- matrix(data = c(1,diagram_kernel(D1,D3,dim = 0,sigma = 1,t = 1),diagram_kernel(D1,D2,dim = 0,sigma = 1,t = 1),diagram_kernel(D2,D3,dim = 0,sigma = 1,t = 1)),byrow = T,nrow = 2,ncol = 2) 103 | class(m3) <- "kernelMatrix" 104 | expect_identical(gram_matrix(diagrams = list(D1,D2),dim = 0,sigma = 1,t = 1,num_workers = 2),m1) 105 | expect_equal(gram_matrix(diagrams = list(D1,D2,D3),dim = 0,sigma = 1,t = 1,num_workers = 2),m2) 106 | expect_equal(gram_matrix(diagrams = list(D1,D2),other_diagrams = list(D1,D3),dim = 0,sigma = 1,t = 1,num_workers = 2),m3) 107 | 108 | }) 109 | 110 | 111 | -------------------------------------------------------------------------------- /src/kd_util.h: -------------------------------------------------------------------------------- 1 | 
//---------------------------------------------------------------------- 2 | // File: kd_util.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Description: Common utilities for kd- trees 5 | // Last modified: 01/04/05 (Version 1.0) 6 | //---------------------------------------------------------------------- 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 8 | // David Mount. All Rights Reserved. 9 | // 10 | // This software and related documentation is part of the Approximate 11 | // Nearest Neighbor Library (ANN). This software is provided under 12 | // the provisions of the Lesser GNU Public License (LGPL). See the 13 | // file ../ReadMe.txt for further information. 14 | // 15 | // The University of Maryland (U.M.) and the authors make no 16 | // representations about the suitability or fitness of this software for 17 | // any purpose. It is provided "as is" without express or implied 18 | // warranty. 19 | //---------------------------------------------------------------------- 20 | // History: 21 | // Revision 0.1 03/04/98 22 | // Initial release 23 | //---------------------------------------------------------------------- 24 | 25 | #ifndef ANN_kd_util_H 26 | #define ANN_kd_util_H 27 | 28 | #include "kd_tree.h" // kd-tree declarations 29 | 30 | //---------------------------------------------------------------------- 31 | // externally accessible functions 32 | //---------------------------------------------------------------------- 33 | 34 | double annAspectRatio( // compute aspect ratio of box 35 | int dim, // dimension 36 | const ANNorthRect &bnd_box); // bounding cube 37 | 38 | void annEnclRect( // compute smallest enclosing rectangle 39 | ANNpointArray pa, // point array 40 | ANNidxArray pidx, // point indices 41 | int n, // number of points 42 | int dim, // dimension 43 | ANNorthRect &bnds); // bounding cube (returned) 44 | 45 | void annEnclCube( // compute smallest enclosing cube 46 | ANNpointArray pa, // point array 47 | ANNidxArray pidx, // point indices 48 | int n, // number of points 49 | int dim, // dimension 50 | ANNorthRect &bnds); // bounding cube (returned) 51 | 52 | ANNdist annBoxDistance( // compute distance from point to box 53 | const ANNpoint q, // the point 54 | const ANNpoint lo, // low point of box 55 | const ANNpoint hi, // high point of box 56 | int dim); // dimension of space 57 | 58 | // added by vlad 5-1-2008 to allow user to compute flops at runtime in release version 59 | // while keeping the version above fast 60 | ANNdist annBoxDistanceFlops( // compute distance from point to box 61 | const ANNpoint q, // the point 62 | const ANNpoint lo, // low point of box 63 | const ANNpoint hi, // high point of box 64 | int dim); // dimension of space 65 | 66 | 67 | ANNcoord annSpread( // compute point spread along dimension 68 | ANNpointArray pa, // point array 69 | ANNidxArray pidx, // point indices 70 | int n, // number of points 71 | int d); // dimension to check 72 | 73 | void annMinMax( // compute min and max coordinates along dim 74 | ANNpointArray pa, // point array 75 | ANNidxArray pidx, // point indices 76 | int n, // number of points 77 | int d, // dimension to check 78 | ANNcoord& min, // minimum value (returned) 79 | ANNcoord& max); // maximum value (returned) 80 | 81 | int annMaxSpread( // compute dimension of max spread 82 | ANNpointArray pa, // point array 83 | ANNidxArray pidx, // point indices 84 | int n, // number of points 85 | int dim); // dimension of space 86 | 87 | void annMedianSplit( // split points along median 
value 88 | ANNpointArray pa, // points to split 89 | ANNidxArray pidx, // point indices 90 | int n, // number of points 91 | int d, // dimension along which to split 92 | ANNcoord &cv, // cutting value 93 | int n_lo); // split into n_lo and n-n_lo 94 | 95 | void annPlaneSplit( // split points by a plane 96 | ANNpointArray pa, // points to split 97 | ANNidxArray pidx, // point indices 98 | int n, // number of points 99 | int d, // dimension along which to split 100 | ANNcoord cv, // cutting value 101 | int &br1, // first break (values < cv) 102 | int &br2); // second break (values == cv) 103 | 104 | void annBoxSplit( // split points by a box 105 | ANNpointArray pa, // points to split 106 | ANNidxArray pidx, // point indices 107 | int n, // number of points 108 | int dim, // dimension of space 109 | ANNorthRect &box, // the box 110 | int &n_in); // number of points inside (returned) 111 | 112 | int annSplitBalance( // determine balance factor of a split 113 | ANNpointArray pa, // points to split 114 | ANNidxArray pidx, // point indices 115 | int n, // number of points 116 | int d, // dimension along which to split 117 | ANNcoord cv); // cutting value 118 | 119 | void annBox2Bnds( // convert inner box to bounds 120 | const ANNorthRect &inner_box, // inner box 121 | const ANNorthRect &bnd_box, // enclosing box 122 | int dim, // dimension of space 123 | int &n_bnds, // number of bounds (returned) 124 | ANNorthHSArray &bnds); // bounds array (returned) 125 | 126 | void annBnds2Box( // convert bounds to inner box 127 | const ANNorthRect &bnd_box, // enclosing box 128 | int dim, // dimension of space 129 | int n_bnds, // number of bounds 130 | ANNorthHSArray bnds, // bounds array 131 | ANNorthRect &inner_box); // inner box (returned) 132 | 133 | #endif -------------------------------------------------------------------------------- /src/perf.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | using namespace Rcpp; 3 | 4 | //---------------------------------------------------------------------- 5 | // File: perf.cpp 6 | // Programmer: Sunil Arya and David Mount 7 | // Description: Methods for performance stats 8 | // Last modified: 01/04/05 (Version 1.0) 9 | //---------------------------------------------------------------------- 10 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 11 | // David Mount. All Rights Reserved. 12 | // 13 | // This software and related documentation is part of the Approximate 14 | // Nearest Neighbor Library (ANN). This software is provided under 15 | // the provisions of the Lesser GNU Public License (LGPL). See the 16 | // file ../ReadMe.txt for further information. 17 | // 18 | // The University of Maryland (U.M.) and the authors make no 19 | // representations about the suitability or fitness of this software for 20 | // any purpose. It is provided "as is" without express or implied 21 | // warranty. 22 | //---------------------------------------------------------------------- 23 | // History: 24 | // Revision 0.1 03/04/98 25 | // Initial release 26 | // Revision 1.0 04/01/05 27 | // Changed names to avoid namespace conflicts. 28 | // Added flush after printing performance stats to fix bug 29 | // in Microsoft Windows version.
30 | //---------------------------------------------------------------------- 31 | 32 | #include "ANN.h" // basic ANN includes 33 | #include "ANNperf.h" // performance includes 34 | 35 | using namespace std; // make std:: available 36 | 37 | //---------------------------------------------------------------------- 38 | // Performance statistics 39 | // The following data and routines are used for computing 40 | // performance statistics for nearest neighbor searching. 41 | // Because these routines can slow the code down, they can be 42 | // activated and deactiviated by defining the PERF variable, 43 | // by compiling with the option: -DPERF 44 | //---------------------------------------------------------------------- 45 | 46 | //---------------------------------------------------------------------- 47 | // Global counters for performance measurement 48 | //---------------------------------------------------------------------- 49 | 50 | int ann_Ndata_pts = 0; // number of data points 51 | int ann_Nvisit_lfs = 0; // number of leaf nodes visited 52 | int ann_Nvisit_spl = 0; // number of splitting nodes visited 53 | int ann_Nvisit_shr = 0; // number of shrinking nodes visited 54 | int ann_Nvisit_pts = 0; // visited points for one query 55 | int ann_Ncoord_hts = 0; // coordinate hits for one query 56 | int ann_Nfloat_ops = 0; // floating ops for one query 57 | ANNsampStat ann_visit_lfs; // stats on leaf nodes visits 58 | ANNsampStat ann_visit_spl; // stats on splitting nodes visits 59 | ANNsampStat ann_visit_shr; // stats on shrinking nodes visits 60 | ANNsampStat ann_visit_nds; // stats on total nodes visits 61 | ANNsampStat ann_visit_pts; // stats on points visited 62 | ANNsampStat ann_coord_hts; // stats on coordinate hits 63 | ANNsampStat ann_float_ops; // stats on floating ops 64 | // 65 | ANNsampStat ann_average_err; // average error 66 | ANNsampStat ann_rank_err; // rank error 67 | 68 | //---------------------------------------------------------------------- 69 | // Routines for statistics. 
70 | //---------------------------------------------------------------------- 71 | 72 | DLL_API void annResetStats(int data_size) // reset stats for a set of queries 73 | { 74 | ann_Ndata_pts = data_size; 75 | ann_visit_lfs.reset(); 76 | ann_visit_spl.reset(); 77 | ann_visit_shr.reset(); 78 | ann_visit_nds.reset(); 79 | ann_visit_pts.reset(); 80 | ann_coord_hts.reset(); 81 | ann_float_ops.reset(); 82 | ann_average_err.reset(); 83 | ann_rank_err.reset(); 84 | } 85 | 86 | DLL_API void annResetCounts() // reset counts for one query 87 | { 88 | ann_Nvisit_lfs = 0; 89 | ann_Nvisit_spl = 0; 90 | ann_Nvisit_shr = 0; 91 | ann_Nvisit_pts = 0; 92 | ann_Ncoord_hts = 0; 93 | ann_Nfloat_ops = 0; 94 | } 95 | 96 | DLL_API void annUpdateStats() // update stats with current counts 97 | { 98 | ann_visit_lfs += ann_Nvisit_lfs; 99 | ann_visit_nds += ann_Nvisit_spl + ann_Nvisit_lfs; 100 | ann_visit_spl += ann_Nvisit_spl; 101 | ann_visit_shr += ann_Nvisit_shr; 102 | ann_visit_pts += ann_Nvisit_pts; 103 | ann_coord_hts += ann_Ncoord_hts; 104 | ann_float_ops += ann_Nfloat_ops; 105 | } 106 | 107 | // print a single statistic 108 | void print_one_stat(char *title, ANNsampStat s, double div) 109 | { 110 | Rcout << title << "= [ "; 111 | Rcout.width(9); Rcout << s.mean()/div << " : "; 112 | Rcout.width(9); Rcout << s.stdDev()/div << " ]<"; 113 | Rcout.width(9); Rcout << s.min()/div << " , "; 114 | Rcout.width(9); Rcout << s.max()/div << " >\n"; 115 | } 116 | 117 | DLL_API void annPrintStats( // print statistics for a run 118 | ANNbool validate) // true if average errors desired 119 | { 120 | Rcout.precision(4); // set floating precision 121 | Rcout << " (Performance stats: " 122 | << " [ mean : stddev ]< min , max >\n"; 123 | print_one_stat((char*)" leaf_nodes ", ann_visit_lfs, 1); 124 | print_one_stat((char*)" splitting_nodes ", ann_visit_spl, 1); 125 | print_one_stat((char*)" shrinking_nodes ", ann_visit_shr, 1); 126 | print_one_stat((char*)" total_nodes ", ann_visit_nds, 1); 127 | print_one_stat((char*)" points_visited ", ann_visit_pts, 1); 128 | print_one_stat((char*)" coord_hits/pt ", ann_coord_hts, ann_Ndata_pts); 129 | print_one_stat((char*)" floating_ops_(K) ", ann_float_ops, 1000); 130 | if (validate) { 131 | print_one_stat((char*)" average_error ", ann_average_err, 1); 132 | print_one_stat((char*)" rank_error ", ann_rank_err, 1); 133 | } 134 | Rcout.precision(0); // restore the default 135 | Rcout << " )\n"; 136 | Rcout.flush(); 137 | } -------------------------------------------------------------------------------- /src/KCenterClustering.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------- 2 | // This code was modified by Vlad Morariu: 3 | // 11/03/06: 4 | // Removed references to Matlab to compile code into a library 5 | // 01/24/07: 6 | // KCenterClustering now has the ability to increase the number of 7 | // clusters incrementally, calculating the max cluster radius at each 8 | // iteration. 9 | // 02/07/07: 10 | // Clustering now stops when the max cluster radius 11 | // is zero (when number of clusters has reached the number of 12 | // unique points), and the number of ACTUAL clusters used is returned. 
13 | //------------------------------------------------------------------- 14 | 15 | //------------------------------------------------------------------- 16 | // The code was written by Changjiang Yang and Vikas Raykar 17 | // and is copyrighted under the Lesser GPL: 18 | // 19 | // Copyright (C) 2006 Changjiang Yang and Vikas Raykar 20 | // 21 | // This program is free software; you can redistribute it and/or modify 22 | // it under the terms of the GNU Lesser General Public License as 23 | // published by the Free Software Foundation; version 2.1 or later. 24 | // This program is distributed in the hope that it will be useful, 25 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 26 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 27 | // See the GNU Lesser General Public License for more details. 28 | // You should have received a copy of the GNU Lesser General Public 29 | // License along with this program; if not, write to the Free Software 30 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, 31 | // MA 02111-1307, USA. 32 | // 33 | // The author may be contacted via email at:cyang(at)sarnoff(.)com 34 | // vikas(at)umiacs(.)umd(.)edu 35 | //------------------------------------------------------------------- 36 | 37 | //---------------------------------------------------------------------------- 38 | // File : KCenterClustering.h 39 | // Purpose : Interface for the k-center clustering algorithm. 40 | // Author : Vikas C. Raykar (vikas@cs.umd.edu) 41 | // Date : April 25 2005, June 10 2005, August 23, 2005 42 | // 43 | //---------------------------------------------------------------------------- 44 | // Gonzalez's farthest-point clustering algorithm. 45 | // 46 | // June 10, 2005: 47 | // This version now returns the number points and the radius of each cluster. 48 | // 49 | // August 23, 2005: 50 | // Speed up using the doubly circular list. 51 | // The clusters far away are trimmed. The nodes inside the neighboring 52 | // clusters which are within half sphere are trimmed. 53 | // 54 | //---------------------------------------------------------------------------- 55 | // 56 | // INPUT 57 | // ---------------- 58 | // 59 | // Dim --> dimension of the points. 60 | // NSources --> number of sources. 61 | // pSources --> pointer to sources, (d*N). 62 | // NumClusters --> number of clusters. 63 | // 64 | // OUTPUT 65 | // ---------------- 66 | // 67 | // MaxClusterRadius --> maximum radius of the clusters, (rx). 68 | // pClusterIndex --> vector of length N where the i th element is the 69 | // cluster number to which the i th point belongs. 70 | // pClusterIndex[i] varies between 0 to K-1. 71 | // pClusterCenters --> pointer to the cluster centers, (d*K). 72 | // pNumPoints --> pointer to the number of points in each cluster, (K). 73 | // pClusterRadii --> pointer to the radius of each cluster, (K). 
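//
// EXAMPLE (editorial sketch, not part of the original library source; the
// std::vector usage is only illustrative and assumes #include <vector>):
//
//    int d = 3, N = 1000, K = 16;               // dimension, #sources, requested clusters
//    std::vector<double> sources(d * N);        // fill with the d*N source coordinates
//    std::vector<int> clusterIndex(N);          // output: cluster label in 0..K-1 per point
//    KCenterClustering kc(d, N, sources.data(), clusterIndex.data(), K);
//    int actualK = kc.Cluster();                // may stop early once the max radius is 0
//    std::vector<double> centers(d * actualK);  // output: cluster centers, d*actualK
//    std::vector<int> numPoints(actualK);       // output: number of points per cluster
//    std::vector<double> radii(actualK);        // output: radius of each cluster
//    kc.ComputeClusterCenters(actualK, centers.data(), numPoints.data(), radii.data());
//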
74 | //---------------------------------------------------------------------------- 75 | 76 | #ifndef K_CENTER_CLUSTERING_H 77 | #define K_CENTER_CLUSTERING_H 78 | 79 | class KCenterClustering{ 80 | public: 81 | 82 | //Output parameters 83 | 84 | double MaxClusterRadius; //maximum cluster radius 85 | 86 | //Functions 87 | 88 | //constructor 89 | KCenterClustering(int Dim, 90 | int NSources, 91 | double *pSources, 92 | int *pClusterIndex, 93 | int NumClusters 94 | ); 95 | 96 | //destructor 97 | ~KCenterClustering(); 98 | 99 | //K-center clustering 100 | //Returns the number of actual clusters (it might have stopped early if all clusters have 101 | // radius of 0 -- which means that the number of clusters has reached the number 102 | // of unique pts) 103 | int Cluster(); 104 | 105 | //Incremental k-center clustering 106 | // nClusters - if non-NULL, value is set to the # of clusters at end of call 107 | // maxRadius - if non-NULL, value is set to the max radius of all clusters 108 | void ClusterIncrement( int * nClusters, double * maxRadius ); 109 | 110 | //Compute cluster centers and the number of points in each cluster 111 | //and the radius of each cluster. 112 | void ComputeClusterCenters( int NumClusters, 113 | double *pClusterCenters, 114 | int *pNumPoints, 115 | double *pClusterRadii 116 | ); 117 | 118 | private: 119 | //Input Parameters 120 | 121 | int d; // dimension of the points. 122 | int N; // number of sources. 123 | double *px; // pointer to sources, (d*N). 124 | int K; // max number of clusters 125 | int *pci; // pointer to a vector of length N where the i th element is the 126 | // cluster number to which the i th point belongs. 127 | double *dist_C; // distances to the center. 128 | double *r; 129 | 130 | int *pCenters; // indices of the centers. 131 | int *cprev; // index to the previous node 132 | int *cnext; // index to the next node 133 | int *far2c; // farthest node to the center 134 | 135 | int numClusters; // added by Vlad to keep track of # of clusters 136 | 137 | //Functions 138 | double ddist(const int d, const double *x, const double *y); 139 | int idmax(int n, double *x); 140 | 141 | }; 142 | 143 | 144 | #endif -------------------------------------------------------------------------------- /man/permutation_model_inference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{permutation_model_inference} 4 | \alias{permutation_model_inference} 5 | \title{Model inference with permutation test.} 6 | \usage{ 7 | permutation_model_inference( 8 | D1, 9 | D2, 10 | iterations, 11 | num_samples, 12 | dims = c(0, 1), 13 | samp = NULL, 14 | paired = F, 15 | num_workers = parallelly::availableCores(omit = 1), 16 | verbose = F, 17 | FUN_boot = "calculate_homology", 18 | thresh, 19 | distance_mat = FALSE, 20 | ripser = NULL, 21 | return_diagrams = FALSE 22 | ) 23 | } 24 | \arguments{ 25 | \item{D1}{the first dataset (a data frame).} 26 | 27 | \item{D2}{the second dataset (a data frame).} 28 | 29 | \item{iterations}{the number of iterations for permuting group labels, default 20.} 30 | 31 | \item{num_samples}{the number of bootstrap iterations, default 30.} 32 | 33 | \item{dims}{a non-negative integer vector of the homological dimensions in which the test is to be carried out, default c(0,1).} 34 | 35 | \item{samp}{an optional list of row-number samples of `D1`, default NULL. See details and examples for more information. 
Ignored when `paired` is FALSE.} 36 | 37 | \item{paired}{a boolean flag for if there is a second-order pairing between diagrams at the same index in different groups, default FALSE.} 38 | 39 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 40 | 41 | \item{verbose}{a boolean flag for if the time duration of the function call should be printed, default FALSE.} 42 | 43 | \item{FUN_boot}{a string representing the persistent homology function to use for calculating the bootstrapped persistence diagrams, either 44 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 45 | 46 | \item{thresh}{the positive numeric maximum radius of the Vietoris-Rips filtration.} 47 | 48 | \item{distance_mat}{a boolean representing if `X` is a distance matrix (TRUE) or not (FALSE, default).} 49 | 50 | 51 | \item{ripser}{the imported ripser module when `FUN_boot` is `PyH`.} 52 | 53 | \item{return_diagrams}{whether or not to return the two lists of bootstrapped persistence diagrams, default FALSE.} 54 | } 55 | \value{ 56 | a list which contains the output of the call to \code{\link{permutation_test}} and the two groups of bootstrapped 57 | persistence diagrams if desired, in entries called `diagrams1` and `diagrams2`. 58 | } 59 | \description{ 60 | An inference procedure to determine if two datasets were unlikely to be generated by the same process (i.e. if 61 | the persistence diagram of one dataset is a good model of the persistence diagram of the other dataset). 62 | } 63 | \details{ 64 | Inference is carried out by generating bootstrap resampled persistence diagrams from the two datasets and carrying out a permutation test 65 | on the resulting two groups. A small p-value in a certain dimension suggests that the datasets are not good models of each other. `samp` should 66 | only be provided when `paired` is TRUE in order to generate the same row samplings of `D1` and `D2` for the bootstrapped persistence diagrams. 67 | This makes a paired permutation test more appropriate, which has higher statistical power for detecting topological differences. See the examples 68 | for how to properly supply `samp`.
69 | } 70 | \examples{ 71 | 72 | if(require("TDAstats")) 73 | { 74 | # create two datasets 75 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 76 | dim = 0,threshold = 2) 77 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 78 | dim = 0,threshold = 2) 79 | 80 | # do model inference test with 1 iteration (for speed, more 81 | # iterations should be used in practice) 82 | model_test <- permutation_model_inference(D1, D2, iterations = 1, 83 | thresh = 1.75,num_samples = 3, 84 | num_workers = 2L) 85 | 86 | # with more iterations, p-values show a difference in the 87 | # clustering of points but not in the arrangement of loops 88 | model_test$p_values 89 | 90 | # to supply samp, when we believe there is a correspondence between 91 | # the rows in D1 and the rows in D2 92 | # note that the number of entries of samp (3 in this case) must 93 | # match the num_samples parameter to the function call 94 | samp <- lapply(X = 1:3,FUN = function(X){ 95 | 96 | return(unique(sample(1:nrow(D1),size = nrow(D1),replace = TRUE))) 97 | 98 | }) 99 | 100 | # model inference will theoretically have higher power now for a 101 | # paired test 102 | model_test2 <- permutation_model_inference(D1, D2, iterations = 1, 103 | thresh = 1.75,num_samples = 3, 104 | paired = TRUE,samp = samp, 105 | num_workers = 2L) 106 | model_test2$p_values 107 | } 108 | } 109 | \references{ 110 | Robinson T, Turner K (2017). "Hypothesis testing for topological data analysis." \url{https://link.springer.com/article/10.1007/s41468-017-0008-7}. 111 | 112 | Chazal F et al (2017). "Robust Topological Inference: Distance to a Measure and Kernel Distance." \url{https://www.jmlr.org/papers/volume18/15-484/15-484.pdf}. 113 | 114 | Abdallah H et al. (2021). "Statistical Inference for Persistent Homology applied to fMRI." \url{https://github.com/hassan-abdallah/Statistical_Inference_PH_fMRI/blob/main/Abdallah_et_al_Statistical_Inference_PH_fMRI.pdf}. 115 | } 116 | \seealso{ 117 | \code{\link{permutation_test}} for an inferential group difference test for groups of persistence diagrams and \code{\link{bootstrap_persistence_thresholds}} for computing confidence sets for persistence diagrams. 118 | } 119 | \author{ 120 | Shael Brown - \email{shaelebrown@gmail.com} 121 | } 122 | -------------------------------------------------------------------------------- /man/bootstrap_persistence_thresholds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bootstrap.R 3 | \name{bootstrap_persistence_thresholds} 4 | \alias{bootstrap_persistence_thresholds} 5 | \title{Estimate persistence threshold(s) for topological features in a data set using bootstrapping.} 6 | \usage{ 7 | bootstrap_persistence_thresholds( 8 | X, 9 | FUN_diag = "calculate_homology", 10 | FUN_boot = "calculate_homology", 11 | maxdim = 0, 12 | thresh, 13 | distance_mat = FALSE, 14 | ripser = NULL, 15 | ignore_infinite_cluster = TRUE, 16 | calculate_representatives = FALSE, 17 | num_samples = 30, 18 | alpha = 0.05, 19 | return_subsetted = FALSE, 20 | return_pvals = FALSE, 21 | return_diag = TRUE, 22 | num_workers = parallelly::availableCores(omit = 1), 23 | p_less_than_alpha = FALSE, 24 | ... 
25 | ) 26 | } 27 | \arguments{ 28 | \item{X}{the input dataset, must either be a matrix or data frame.} 29 | 30 | \item{FUN_diag}{a string representing the persistent homology function to use for calculating the full persistence diagram, either 31 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 32 | 33 | \item{FUN_boot}{a string representing the persistent homology function to use for calculating the bootstrapped persistence diagrams, either 34 | 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.} 35 | 36 | \item{maxdim}{the integer maximum homological dimension for persistent homology, default 0.} 37 | 38 | \item{thresh}{the positive numeric maximum radius of the Vietoris-Rips filtration.} 39 | 40 | \item{distance_mat}{a boolean representing if `X` is a distance matrix (TRUE) or not (FALSE, default).} 41 | 42 | 43 | \item{ripser}{the imported ripser module when `FUN_diag` or `FUN_boot` is `PyH`.} 44 | 45 | \item{ignore_infinite_cluster}{a boolean indicating whether or not to ignore the infinitely lived cluster when `FUN_diag` or `FUN_boot` is `PyH`.} 46 | 47 | \item{calculate_representatives}{a boolean representing whether to calculate representative (co)cycles, default FALSE. Note that representatives can't be 48 | calculated when using the 'calculate_homology' function.} 49 | 50 | \item{num_samples}{the positive integer number of bootstrap samples, default 30.} 51 | 52 | \item{alpha}{the type-1 error threshold, default 0.05.} 53 | 54 | \item{return_subsetted}{a boolean representing whether or not to return the subsetted persistence diagram (with or without representatives), default FALSE.} 55 | 56 | \item{return_pvals}{a boolean representing whether or not to return p-values for features in the subsetted diagram, default FALSE.} 57 | 58 | \item{return_diag}{a boolean representing whether or not to return the calculated persistence diagram, default TRUE.} 59 | 60 | \item{num_workers}{the integer number of cores used for parallelizing (over bootstrap samples), default one less than the number of cores on the machine.} 61 | 62 | \item{p_less_than_alpha}{a boolean representing whether or not to subset further and return only features whose p-values are strictly less than `alpha`, default `FALSE`. Note that this is not part of the original bootstrap procedure.} 63 | 64 | \item{...}{additional parameters for internal methods.} 65 | } 66 | \value{ 67 | either a numeric vector of threshold values, with one for each dimension 0..`maxdim` (in that order), or a list containing those thresholds and any other requested elements (if desired) 68 | } 69 | \description{ 70 | Bootstrapping is used to find a conservative estimate of a 1-`alpha` percent "confidence interval" around 71 | each point in the persistence diagram of the data set, and points whose intervals do not 72 | touch the diagonal (birth == death) would be considered "significant" or "real". 73 | One threshold is computed for each dimension in the diagram. 74 | } 75 | \details{ 76 | The thresholds are then determined by calculating the 1-`alpha` percentile of the bottleneck 77 | distance values between the real persistence diagram and other diagrams obtained 78 | by bootstrap resampling the data.
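Schematically, and only as an illustration of the procedure just described (hypothetical helper code, not this function's internal implementation), the threshold in a single dimension `d` could be obtained as
\preformatted{
# diag <- FUN_diag(X)                        # persistence diagram of the full data
# boot_dists <- sapply(1:num_samples, function(i) {
#   Xi <- X[sample(1:nrow(X), replace = TRUE), ]            # bootstrap resample of rows
#   diagram_distance(diag, FUN_boot(Xi), dim = d, p = Inf)  # bottleneck distance
# })
# threshold <- stats::quantile(boot_dists, probs = 1 - alpha)
}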
Since `ripsDiag` is the slowest homology engine but is the 79 | only engine which calculates representative cycles (as opposed to co-cycles with `PyH`), two 80 | homology engines are input to this function - one to calculate the actual persistence diagram, `FUN_diag` 81 | (possibly with representative (co)cycles) and one to calculate the bootstrap diagrams, `FUN_boot` (this should be 82 | a faster engine, like `calculate_homology` or `PyH`). 83 | p-values can be calculated for any feature which survives the thresholding if both `return_subsetted` and `return_pvals` are `TRUE`, 84 | however these values may be larger than the original `alpha` value in some cases. Note that this is not part of the original bootstrap procedure. 85 | If stricter thresholding is desired, 86 | or the p-values must be less than `alpha`, set `p_less_than_alpha` to `TRUE`. The minimum 87 | possible p-value is always 1/(`num_samples` + 1). 88 | Note that since \code{\link[TDAstats]{calculate_homology}} 89 | can ignore the longest-lived cluster, fewer "real" clusters may be found. To avoid this possibility 90 | try setting `FUN_diag` equal to 'ripsDiag'. Please note that due to the TDA package no longer being available on CRAN, 91 | if `FUN_diag` or `FUN_boot` are 'ripsDiag' then `bootstrap_persistence_thresholds` will look for the ripsDiag function in the global environment, 92 | so the TDA package should be attached with `library("TDA")` prior to use. 93 | } 94 | \examples{ 95 | 96 | if(require("TDAstats")) 97 | { 98 | # create a persistence diagram from a sample of the unit circle 99 | df <- TDAstats::circle2d[sample(1:100,size = 50),] 100 | 101 | # calculate persistence thresholds for alpha = 0.05 102 | # and return the calculated diagram as well as the subsetted diagram 103 | bootstrapped_diagram <- bootstrap_persistence_thresholds(X = df, 104 | maxdim = 1,thresh = 2,num_workers = 2) 105 | } 106 | } 107 | \references{ 108 | Chazal F et al (2017). "Robust Topological Inference: Distance to a Measure and Kernel Distance." \url{https://www.jmlr.org/papers/volume18/15-484/15-484.pdf}. 109 | } 110 | \author{ 111 | Shael Brown - \email{shaelebrown@gmail.com} 112 | } 113 | -------------------------------------------------------------------------------- /R/kernel_calculations.R: -------------------------------------------------------------------------------- 1 | #### PERSISTENCE FISHER KERNEL #### 2 | #' Calculate persistence Fisher kernel value between a pair of persistence diagrams. 3 | #' 4 | #' Returns the persistence Fisher kernel value between a pair of persistence diagrams 5 | #' in a particular homological dimension, each of which is either the output from a \code{\link{diagram_to_df}} 6 | #' function call or from a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}. 7 | #' 8 | #' The persistence Fisher kernel is calculated from the Fisher information metric according to the formula 9 | #' \eqn{k_{PF}(D_1,D_2) = exp(-t*d_{FIM}(D_1,D_2))}, resembling a radial basis kernel for standard 10 | #' Euclidean spaces. 11 | #' 12 | #' @param D1 the first persistence diagram. 13 | #' @param D2 the second persistence diagram. 14 | #' @param dim the non-negative integer homological dimension in which the distance is to be computed, default 0. 15 | #' @param sigma a positive number representing the bandwidth for the Fisher information metric, default 1. 
16 | #' @param rho an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. 17 | #' @param t a positive number representing the scale for the persistence Fisher kernel, default 1. 18 | #' 19 | #' @return the numeric kernel value. 20 | #' @export 21 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 22 | #' @seealso \code{\link{gram_matrix}} for Gram (i.e. kernel) matrix calculations. 23 | #' @references 24 | #' Le T, Yamada M (2018). "Persistence fisher kernel: a riemannian manifold kernel for persistence diagrams." \url{https://proceedings.neurips.cc/paper/2018/file/959ab9a0695c467e7caf75431a872e5c-Paper.pdf}. 25 | #' 26 | #' Murphy, K. "Machine learning: a probabilistic perspective", MIT press (2012). 27 | #' @examples 28 | #' 29 | #' if(require("TDAstats")) 30 | #' { 31 | #' # create two diagrams 32 | #' D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 33 | #' dim = 1,threshold = 2) 34 | #' D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 35 | #' dim = 1,threshold = 2) 36 | #' 37 | #' # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 1 38 | #' diagram_kernel(D1,D2,dim = 1,sigma = 2,t = 2) 39 | #' # calculate the kernel value between D1 and D2 with sigma = 2, t = 2 in dimension 0 40 | #' diagram_kernel(D1,D2,dim = 0,sigma = 2,t = 2) 41 | #' } 42 | 43 | diagram_kernel <- function(D1,D2,dim = 0,sigma = 1,t = 1,rho = NULL){ 44 | 45 | # function to compute the Persistence Fisher kernel of two persistence diagrams 46 | 47 | # check kernel-specific parameter, other inputs are checked in distance calculation 48 | check_param("t",t,positive = T,numeric = T,finite = T,multiple = F) 49 | 50 | # return kernel calculation 51 | return(exp(-1*t*diagram_distance(D1 = D1,D2 = D2,dim = dim,distance = "fisher",sigma = sigma,rho = rho))) 52 | 53 | } 54 | 55 | #### GRAM MATRIX #### 56 | #' Compute the gram matrix for a group of persistence diagrams. 57 | #' 58 | #' Calculate the Gram matrix \eqn{K} for either a single list of persistence diagrams \eqn{(D_1,D_2,\dots,D_n)}, i.e. \eqn{K[i,j] = k_{PF}(D_i,D_j)}, 59 | #' or between two lists of persistence diagrams, \eqn{(D_1,D_2,\dots,D_n)} and \eqn{(D'_1,D'_2,\dots,D'_n)}, \eqn{K[i,j] = k_{PF}(D_i,D'_j)}, in parallel. 60 | #' 61 | #' Gram matrices are used in downstream analyses, like in the `diagram_kkmeans`, `diagram_nearest_cluster`,`diagram_kpca`, 62 | #' `predict_diagram_kpca`, `predict_diagram_ksvm` and `independence_test` functions. 63 | #' 64 | #' @param diagrams a list of persistence diagrams, where each diagram is either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. 65 | #' @param other_diagrams either NULL (default) or another list of persistence diagrams to compute a cross-Gram matrix. 66 | #' @param dim the non-negative integer homological dimension in which the distance is to be computed, default 0. 67 | #' @param sigma a positive number representing the bandwidth for the Fisher information metric, default 1. 68 | #' @param t a positive number representing the scale for the kernel, default 1. 69 | #' @param rho an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. 
If supplied, code execution is sequential, but functions in the "exec" directory 70 | #' of the package can be loaded to calculate distance matrices in parallel with approximation. 71 | #' @param num_workers the number of cores used for parallel computation, default is one less than the number of cores on the machine. 72 | #' 73 | #' @return the numeric (cross) Gram matrix of class 'kernelMatrix'. 74 | #' @export 75 | #' @author Shael Brown - \email{shaelebrown@@gmail.com} 76 | #' @seealso \code{\link{diagram_kernel}} for individual persistence Fisher kernel calculations. 77 | #' @examples 78 | #' 79 | #' if(require("TDAstats")) 80 | #' { 81 | #' # create two diagrams 82 | #' D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 83 | #' dim = 1,threshold = 2) 84 | #' D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], 85 | #' dim = 1,threshold = 2) 86 | #' g <- list(D1,D2) 87 | #' 88 | #' # calculate the Gram matrix in dimension 0 with sigma = 2, t = 2 89 | #' G <- gram_matrix(diagrams = g,dim = 0,sigma = 2,t = 2,num_workers = 2) 90 | #' 91 | #' # calculate cross-Gram matrix, which is the same as G 92 | #' G_cross <- gram_matrix(diagrams = g,other_diagrams = g,dim = 0,sigma = 2, 93 | #' t = 2,num_workers = 2) 94 | #' } 95 | 96 | gram_matrix <- function(diagrams,other_diagrams = NULL,dim = 0,sigma = 1,t = 1,rho = NULL,num_workers = parallelly::availableCores(omit = 1)){ 97 | 98 | # function to compute (cross) Gram matrix in parallel 99 | check_param(param_name = "t",param = t,numeric = T,positive = T,multiple = F,finite = T) 100 | 101 | # compute gram matrix from distance matrix 102 | K <- exp(-t*distance_matrix(diagrams = diagrams,other_diagrams = other_diagrams,dim = dim,distance = "fisher",sigma = sigma,rho = rho,num_workers = num_workers)) 103 | 104 | # update class for interfacing with kernlab package 105 | class(K) <- "kernelMatrix" 106 | 107 | return(K) 108 | 109 | } 110 | -------------------------------------------------------------------------------- /man/permutation_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{permutation_test} 4 | \alias{permutation_test} 5 | \title{Permutation test for finding group differences between persistence diagrams.} 6 | \usage{ 7 | permutation_test( 8 | ..., 9 | iterations = 20, 10 | p = 2, 11 | q = 2, 12 | dims = c(0, 1), 13 | dist_mats = NULL, 14 | group_sizes = NULL, 15 | paired = FALSE, 16 | distance = "wasserstein", 17 | sigma = NULL, 18 | rho = NULL, 19 | num_workers = parallelly::availableCores(omit = 1), 20 | verbose = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{...}{lists of persistence diagrams which are either the output of persistent homology calculations like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}. 
Each list must contain at least 2 diagrams.} 25 | 26 | \item{iterations}{the number of iterations for permuting group labels, default 20.} 27 | 28 | \item{p}{a positive number representing the wasserstein power parameter, a number at least 1 (and Inf if using the bottleneck distance) and default 2.} 29 | 30 | \item{q}{a finite number at least 1 for exponentiation in the Turner loss function, default 2.} 31 | 32 | \item{dims}{a non-negative integer vector of the homological dimensions in which the test is to be carried out, default c(0,1).} 33 | 34 | \item{dist_mats}{an optional list of precomputed distances matrices, one for each dimension, where the rows and columns would correspond to the unlisted groups of diagrams (in order), default NULL. If not NULL then no lists of diagrams need to be supplied.} 35 | 36 | \item{group_sizes}{a vector of group sizes, one for each group, when `dist_mats` is not NULL.} 37 | 38 | \item{paired}{a boolean flag for if there is a second-order pairing between diagrams at the same index in different groups, default FALSE} 39 | 40 | \item{distance}{a string which determines which type of distance calculation to carry out, either "wasserstein" (default) or "fisher".} 41 | 42 | \item{sigma}{the positive bandwidth for the Fisher information metric, default NULL.} 43 | 44 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, code execution is sequential.} 45 | 46 | \item{num_workers}{the number of cores used for parallel computation, default is one less than the number of cores on the machine.} 47 | 48 | \item{verbose}{a boolean flag for if the time duration of the function call should be printed, default FALSE} 49 | } 50 | \value{ 51 | a list with the following elements: 52 | \describe{ 53 | 54 | \item{dimensions}{the input `dims` argument.} 55 | 56 | \item{permvals}{a numeric vector of length `iterations` with the permuted loss value for each iteration (permutation)} 57 | 58 | \item{test_statisics}{a numeric vector of the test statistic value in each dimension.} 59 | 60 | \item{p_values}{a numeric vector of the p-values in each dimension.} 61 | 62 | \item{run_time}{the run time of the function call, containing time units.} 63 | 64 | } 65 | } 66 | \description{ 67 | A non-parametric ANOVA-like test for persistence diagrams 68 | (see \url{https://link.springer.com/article/10.1007/s41468-017-0008-7} for details). In each 69 | desired dimension a test statistic (loss) is calculated, then the group labels are shuffled 70 | for some number of iterations and the loss is recomputed each time thereby generating a null 71 | distribution for the test statistic. This test generates a p-value in each desired dimension. 72 | } 73 | \details{ 74 | The test is carried out in parallel and optimized in order to not recompute already-calculated distances. As such, memory issues 75 | may occur when the number of persistence diagrams is very large. 76 | Like in (\url{https://github.com/hassan-abdallah/Statistical_Inference_PH_fMRI/blob/main/Abdallah_et_al_Statistical_Inference_PH_fMRI.pdf}) 77 | an option is provided for pairing diagrams between groups to reduce variance (in order to boost statistical power), and 78 | like it was suggested in the original paper functionality is provided for an arbitrary number of groups (not just 2). 79 | A small p-value in a dimension suggests that the groups are different (separated) in that dimension. 
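In pseudo-code (an editorial sketch of the procedure described above, not the package's internal implementation, with `loss` standing for the test statistic and `shuffle_groups` for a random relabelling of the diagrams):
\preformatted{
# obs <- loss(g1, g2)                              # within-group joint loss
# perm <- replicate(iterations, loss(shuffle_groups(g1, g2)))
# p_value <- (sum(perm <= obs) + 1)/(iterations + 1)
}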
80 | If `distance` is "fisher" then `sigma` must not be NULL. TDAstats also has a `permutation_test` function 81 | so care should be taken to use the desired function when using TDApplied with TDAstats. If `dist_mats` is supplied 82 | then the sum of the elements of `group_sizes` must equal the number of rows and columns of each of its elements. 83 | } 84 | \examples{ 85 | 86 | if(require("TDAstats")) 87 | { 88 | # create two groups of diagrams 89 | D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 90 | dim = 0,threshold = 2) 91 | D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,10),], 92 | dim = 0,threshold = 2) 93 | g1 <- list(D1,D2) 94 | g2 <- list(D1,D2) 95 | 96 | # run test in dimension 0 with 1 iteration, note that the TDA package function 97 | # "permutation_test" can mask TDApplied's function, so we will specify explicitly 98 | # which function we are using 99 | perm_test <- TDApplied::permutation_test(g1,g2,iterations = 1, 100 | num_workers = 2, 101 | dims = c(0)) 102 | 103 | # repeat with precomputed distance matrix, gives similar results 104 | # (same but the randomness of the permutations can give small differences) 105 | # just much faster 106 | D <- distance_matrix(diagrams = list(D1,D2,D1,D2),dim = 0, 107 | num_workers = 2) 108 | perm_test <- TDApplied::permutation_test(dist_mats = list(D),group_sizes = c(2,2), 109 | dims = c(0)) 110 | } 111 | } 112 | \references{ 113 | Robinson T, Turner K (2017). "Hypothesis testing for topological data analysis." \url{https://link.springer.com/article/10.1007/s41468-017-0008-7}. 114 | 115 | Abdallah H et al. (2021). "Statistical Inference for Persistent Homology applied to fMRI." \url{https://github.com/hassan-abdallah/Statistical_Inference_PH_fMRI/blob/main/Abdallah_et_al_Statistical_Inference_PH_fMRI.pdf}. 116 | } 117 | \seealso{ 118 | \code{\link{independence_test}} for an inferential test of independence for two groups of persistence diagrams. 119 | } 120 | \author{ 121 | Shael Brown - \email{shaelebrown@gmail.com} 122 | } 123 | -------------------------------------------------------------------------------- /src/ANNx.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------- 2 | // File: ANNx.h 3 | // Programmer: Sunil Arya and David Mount 4 | // Last modified: 03/04/98 (Release 0.1) 5 | // Description: Internal include file for ANN 6 | // 7 | // These declarations are of use in manipulating some of 8 | // the internal data objects appearing in ANN, but are not 9 | // needed for applications just using the nearest neighbor 10 | // search. 11 | // 12 | // Typical users of ANN should not need to access this file. 13 | //---------------------------------------------------------------------- 14 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 15 | // David Mount. All Rights Reserved. 16 | // 17 | // This software and related documentation is part of the Approximate 18 | // Nearest Neighbor Library (ANN). This software is provided under 19 | // the provisions of the Lesser GNU Public License (LGPL). See the 20 | // file ../ReadMe.txt for further information. 21 | // 22 | // The University of Maryland (U.M.) and the authors make no 23 | // representations about the suitability or fitness of this software for 24 | // any purpose. It is provided "as is" without express or implied 25 | // warranty. 
26 | //---------------------------------------------------------------------- 27 | // History: 28 | // Revision 0.1 03/04/98 29 | // Initial release 30 | // Revision 1.0 04/01/05 31 | // Changed LO, HI, IN, OUT to ANN_LO, ANN_HI, etc. 32 | //---------------------------------------------------------------------- 33 | 34 | #ifndef ANNx_H 35 | #define ANNx_H 36 | 37 | #include <iomanip> // I/O manipulators 38 | #include "ANN.h" // ANN includes 39 | 40 | //---------------------------------------------------------------------- 41 | // Global constants and types 42 | //---------------------------------------------------------------------- 43 | enum {ANN_LO=0, ANN_HI=1}; // splitting indices 44 | enum {ANN_IN=0, ANN_OUT=1}; // shrinking indices 45 | // what to do in case of error 46 | enum ANNerr {ANNwarn = 0, ANNabort = 1}; 47 | 48 | //---------------------------------------------------------------------- 49 | // Maximum number of points to visit 50 | // We have an option for terminating the search early if the 51 | // number of points visited exceeds some threshold. If the 52 | // threshold is 0 (its default) this means there is no limit 53 | // and the algorithm applies its normal termination condition. 54 | //---------------------------------------------------------------------- 55 | 56 | extern int ANNmaxPtsVisited; // maximum number of pts visited 57 | extern int ANNptsVisited; // number of pts visited in search 58 | 59 | //---------------------------------------------------------------------- 60 | // Global function declarations 61 | //---------------------------------------------------------------------- 62 | 63 | void annError( // ANN error routine 64 | char *msg, // error message 65 | ANNerr level); // level of error 66 | 67 | void annPrintPt( // print a point 68 | ANNpoint pt, // the point 69 | int dim, // the dimension 70 | std::ostream &out); // output stream 71 | 72 | //---------------------------------------------------------------------- 73 | // Orthogonal (axis aligned) rectangle 74 | // Orthogonal rectangles are represented by two points, one 75 | // for the lower left corner (min coordinates) and the other 76 | // for the upper right corner (max coordinates). 77 | // 78 | // The constructor initializes from either a pair of coordinates, 79 | // pair of points, or another rectangle. Note that all constructors 80 | // allocate new point storage. The destructor deallocates this 81 | // storage. 82 | // 83 | // BEWARE: Orthogonal rectangles should be passed ONLY BY REFERENCE. 84 | // (C++'s default copy constructor will not allocate new point 85 | // storage, then on return the destructor frees storage, and then 86 | // you get into big trouble in the calling procedure.)
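//
//      For illustration (editorial sketch, not in the original ANN sources):
//
//          void fine(const ANNorthRect &r);   // OK: passed by reference
//          void risky(ANNorthRect r);         // BAD: the shallow copy shares lo/hi
//                                             // storage with the caller's rectangle;
//                                             // the copy's destructor frees that
//                                             // storage on return, leaving the
//                                             // original dangling and later freed twice.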
87 | //---------------------------------------------------------------------- 88 | 89 | class ANNorthRect { 90 | public: 91 | ANNpoint lo; // rectangle lower bounds 92 | ANNpoint hi; // rectangle upper bounds 93 | // 94 | ANNorthRect( // basic constructor 95 | int dd, // dimension of space 96 | ANNcoord l=0, // default is empty 97 | ANNcoord h=0) 98 | { lo = annAllocPt(dd, l); hi = annAllocPt(dd, h); } 99 | 100 | ANNorthRect( // (almost a) copy constructor 101 | int dd, // dimension 102 | const ANNorthRect &r) // rectangle to copy 103 | { lo = annCopyPt(dd, r.lo); hi = annCopyPt(dd, r.hi); } 104 | 105 | ANNorthRect( // construct from points 106 | int dd, // dimension 107 | ANNpoint l, // low point 108 | ANNpoint h) // hight point 109 | { lo = annCopyPt(dd, l); hi = annCopyPt(dd, h); } 110 | 111 | ~ANNorthRect() // destructor 112 | { annDeallocPt(lo); annDeallocPt(hi); } 113 | 114 | ANNbool inside(int dim, ANNpoint p);// is point p inside rectangle? 115 | }; 116 | 117 | void annAssignRect( // assign one rect to another 118 | int dim, // dimension (both must be same) 119 | ANNorthRect &dest, // destination (modified) 120 | const ANNorthRect &source); // source 121 | 122 | //---------------------------------------------------------------------- 123 | // Orthogonal (axis aligned) halfspace 124 | // An orthogonal halfspace is represented by an integer cutting 125 | // dimension cd, coordinate cutting value, cv, and side, sd, which is 126 | // either +1 or -1. Our convention is that point q lies in the (closed) 127 | // halfspace if (q[cd] - cv)*sd >= 0. 128 | //---------------------------------------------------------------------- 129 | 130 | class ANNorthHalfSpace { 131 | public: 132 | int cd; // cutting dimension 133 | ANNcoord cv; // cutting value 134 | int sd; // which side 135 | // 136 | ANNorthHalfSpace() // default constructor 137 | { cd = 0; cv = 0; sd = 0; } 138 | 139 | ANNorthHalfSpace( // basic constructor 140 | int cdd, // dimension of space 141 | ANNcoord cvv, // cutting value 142 | int sdd) // side 143 | { cd = cdd; cv = cvv; sd = sdd; } 144 | 145 | ANNbool in(ANNpoint q) const // is q inside halfspace? 146 | { return (ANNbool) ((q[cd] - cv)*sd >= 0); } 147 | 148 | ANNbool out(ANNpoint q) const // is q outside halfspace? 149 | { return (ANNbool) ((q[cd] - cv)*sd < 0); } 150 | 151 | ANNdist dist(ANNpoint q) const // (squared) distance from q 152 | { return (ANNdist) ANN_POW(q[cd] - cv); } 153 | 154 | void setLowerBound(int d, ANNpoint p)// set to lower bound at p[i] 155 | { cd = d; cv = p[d]; sd = +1; } 156 | 157 | void setUpperBound(int d, ANNpoint p)// set to upper bound at p[i] 158 | { cd = d; cv = p[d]; sd = -1; } 159 | 160 | void project(ANNpoint &q) // project q (modified) onto halfspace 161 | { if (out(q)) q[cd] = cv; } 162 | }; 163 | 164 | // array of halfspaces 165 | typedef ANNorthHalfSpace *ANNorthHSArray; 166 | 167 | #endif -------------------------------------------------------------------------------- /tests/testthat/test-python.R: -------------------------------------------------------------------------------- 1 | # all python tests are skipped to avoid build errors, even though they succeed locally 2 | # to run the tests the reticulate package must be installed, correctly hooked up to 3 | # python, and the ripser module must be downloaded. 
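# A rough local setup might look like the following (editorial sketch only; the
# exact steps depend on the machine and python installation):
#   install.packages("reticulate")
#   reticulate::py_install("ripser")   # or: pip install ripser
# and then change skip_if(T) to skip_if(F) in the tests below to run them.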
4 | 5 | test_that("ripser can be imported and verified.",{ 6 | 7 | skip_if(T) 8 | ripser <- import_ripser() 9 | expect_invisible(check_ripser(ripser)) 10 | expect_error(check_ripser(2),"ripser object") 11 | expect_error(check_ripser(NULL),"ripser object") 12 | np <- reticulate::import("numpy") 13 | expect_error(check_ripser(np),"ripser object") 14 | 15 | }) 16 | 17 | test_that("PyH can detect bad input parameters.",{ 18 | 19 | skip_if(T) 20 | ripser <- import_ripser() 21 | expect_error(PyH(X = data.frame(),maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"two rows") 22 | expect_error(PyH(X = NULL,maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"dataframe") 23 | expect_error(PyH(X = data.frame(x = 1:2,y = c("1","2")),maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"numeric") 24 | expect_error(PyH(X = data.frame(x = c(1,NA,2)),maxdim = 1,thresh = 1,distance_mat = F,ripser = ripser),"missing") 25 | expect_error(PyH(X = data.frame(x = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser),"square") 26 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser),"matrix") 27 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = NA,thresh = 1,distance_mat = T,ripser = ripser),"maxdim") 28 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = -1,thresh = 1,distance_mat = T,ripser = ripser),"maxdim") 29 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = NULL,ripser = ripser),"NULL") 30 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = NA,ripser = ripser),"NA") 31 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = c(T,F),ripser = ripser),"logical") 32 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,ignore_infinite_cluster = NULL),"NULL") 33 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,ignore_infinite_cluster = c(T,F)),"single") 34 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,ignore_infinite_cluster = NA),"NA") 35 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,calculate_representatives = NULL),"NULL") 36 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,calculate_representatives = c(T,F)),"single") 37 | expect_error(PyH(X = data.frame(x = 1:3,y = 1:3,z = 1:3),maxdim = 1,thresh = 1,distance_mat = T,ripser = ripser,calculate_representatives = NA),"NA") 38 | 39 | }) 40 | 41 | test_that("PyH is computing correctly.",{ 42 | 43 | skip_if(T) 44 | skip_if_not_installed("TDAstats") 45 | D1 <- data.frame(x = stats::rnorm(20),y = stats::rnorm(20)) 46 | D2 <- data.frame(x = stats::rnorm(20),y = stats::rnorm(20)) 47 | D3 <- data.frame(x = stats::rnorm(20),y = stats::rnorm(20)) 48 | 49 | phom_TDA_1 <- diagram_to_df(TDAstats::calculate_homology(D1,threshold = 5)) 50 | phom_TDA_2 <- diagram_to_df(TDAstats::calculate_homology(D2,threshold = 5)) 51 | phom_TDA_3 <- diagram_to_df(TDAstats::calculate_homology(D3,threshold = 5)) 52 | 53 | ripser <- import_ripser() 54 | 55 | phom_py_1 <- PyH(D1,thresh = 5,ripser = ripser) 56 | phom_py_2 <- PyH(D2,thresh = 5,ripser = ripser) 57 | phom_py_3 <- PyH(D3,thresh = 5,ripser = ripser) 58 | 59 | 
expect_equal(phom_TDA_1,phom_py_1,tolerance = 0.00001) 60 | expect_equal(phom_TDA_2,phom_py_2,tolerance = 0.00001) 61 | expect_equal(phom_TDA_3,phom_py_3,tolerance = 0.000001) 62 | 63 | phom_with_extra_cluster <- PyH(D1,thresh = 5,ripser = ripser,ignore_infinite_cluster = F) 64 | 65 | expect_length(which(phom_with_extra_cluster$dimension == 0),20) 66 | 67 | phom_with_reps <- PyH(D1,thresh = 5,ripser = ripser,calculate_representatives = T) 68 | expect_type(phom_with_reps,"list") 69 | 70 | circ <- TDAstats::circle2d[sample(1:100,10),] 71 | phom_with_empty_dim <- PyH(circ,thresh = 2,ripser = ripser,maxdim = 2) 72 | expect_s3_class(phom_with_empty_dim,"data.frame") 73 | 74 | }) 75 | 76 | test_that("bootstrap function can detect PyH errors correctly.",{ 77 | 78 | skip_if(T) 79 | skip_if_not_installed("TDAstats") 80 | ripser = import_ripser() 81 | D <- TDAstats::circle2d[sample(1:100,10),] 82 | expect_error(bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = NULL),"NULL") 83 | expect_error(bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = 2),"logical") 84 | expect_error(bootstrap_persistence_thresholds(X = D,FUN_boot = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = NA),"NA") 85 | 86 | }) 87 | 88 | test_that("PyH functionality works in bootstrap function.",{ 89 | 90 | skip_if(T) 91 | skip_if_not_installed("TDAstats") 92 | ripser = import_ripser() 93 | D <- TDAstats::circle2d[sample(1:100,10),] 94 | 95 | # PyH with multiple thresholds 96 | bs <- bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",FUN_boot = "PyH",maxdim = 1,thresh = 2,calculate_representatives = T,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3,return_subsetted = T,ignore_infinite_cluster = F) 97 | expect_length(bs$representatives[[2]],length(which(bs$diag$dimension == 1))) 98 | expect_length(bs$thresholds,2) 99 | expect_gt(bs$thresholds[[1]],0) 100 | expect_gt(bs$thresholds[[2]],0) 101 | expect_lte(length(bs$subsetted_representatives),nrow(bs$subsetted_diag) + 1) 102 | if(length(which(bs$subsetted_diag$dimension == 0)) > 0) 103 | { 104 | expect_true(min(bs$subsetted_diag[which(bs$subsetted_diag$dimension == 0),]$death - bs$subsetted_diag[which(bs$subsetted_diag$dimension == 0),]$birth) >= bs$thresholds[[1]]) 105 | } 106 | 107 | expect_true(min(bs$subsetted_diag[which(bs$subsetted_diag$dimension == 1),]$death - bs$subsetted_diag[which(bs$subsetted_diag$dimension == 1),]$birth) > bs$thresholds[[2]]) 108 | 109 | # check on circle 110 | bs <- bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,return_diag = T,ripser = ripser,num_workers = 2,num_samples = 3) 111 | expect_lte(length(bs$subsetted_diag$dimension),1) 112 | bs <- bootstrap_persistence_thresholds(X = D,FUN_diag = "PyH",maxdim = 1,thresh = 2,return_diag = T,ripser = ripser,ignore_infinite_cluster = F,num_workers = 2,num_samples = 3) 113 | expect_lte(length(bs$subsetted_diag$dimension),2) 114 | 115 | }) 116 | -------------------------------------------------------------------------------- /man/diagram_ksvm.Rd: 
/man/diagram_ksvm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/machine_learning.R
 3 | \name{diagram_ksvm}
 4 | \alias{diagram_ksvm}
 5 | \title{Fit a support vector machine model where each training set instance is a persistence diagram.}
 6 | \usage{
 7 | diagram_ksvm(
 8 |   diagrams,
 9 |   cv = 1,
10 |   dim,
11 |   t = 1,
12 |   sigma = 1,
13 |   rho = NULL,
14 |   y,
15 |   type = NULL,
16 |   distance_matrices = NULL,
17 |   C = 1,
18 |   nu = 0.2,
19 |   epsilon = 0.1,
20 |   prob.model = FALSE,
21 |   class.weights = NULL,
22 |   fit = TRUE,
23 |   cache = 40,
24 |   tol = 0.001,
25 |   shrinking = TRUE,
26 |   num_workers = parallelly::availableCores(omit = 1)
27 | )
28 | }
29 | \arguments{
30 | \item{diagrams}{a list of persistence diagrams which are either the output of a persistent homology calculation like ripsDiag/\code{\link[TDAstats]{calculate_homology}}/\code{\link{PyH}}, or \code{\link{diagram_to_df}}.}
31 | 
32 | \item{cv}{a positive number at most the length of `diagrams` which determines the number of cross-validation splits to be performed (default 1, i.e. no cross-validation). If `prob.model` is TRUE then cv is set to 1, since kernlab performs 3-fold CV internally in this case. When performing classification, classes are balanced within each cv fold.}
33 | 
34 | \item{dim}{a non-negative integer vector of homological dimensions in which the model is to be fit.}
35 | 
36 | \item{t}{either a vector of positive numbers representing the grid of values for the scale of the persistence Fisher kernel or NULL, default 1. If NULL then t is selected automatically, see details.}
37 | 
38 | \item{sigma}{a vector of positive numbers representing the grid of values for the bandwidth of the Fisher information metric, default 1.}
39 | 
40 | \item{rho}{an optional positive number representing the heuristic for Fisher information metric approximation, see \code{\link{diagram_distance}}. Default NULL. If supplied, distance matrix calculations are sequential.}
41 | 
42 | \item{y}{a response vector with one label for each persistence diagram. Must be either numeric or factor, but does not need to be supplied when `type` is "one-svc".}
43 | 
44 | \item{type}{a string representing the type of task to be performed. Can be any one of "C-svc","nu-svc","one-svc","eps-svr","nu-svr"; the default for regression is "eps-svr" and for classification is "C-svc". See \code{\link[kernlab]{ksvm}} for details.}
45 | 
46 | \item{distance_matrices}{an optional list of precomputed Fisher distance matrices, corresponding to the rows in `expand.grid(dim = dim,sigma = sigma)`, default NULL.}
47 | 
48 | \item{C}{a number representing the cost of constraint violation (default 1); this is the 'C'-constant of the regularization term in the Lagrange formulation.}
49 | 
50 | \item{nu}{numeric parameter needed for nu-svc, one-svc and nu-svr. The `nu` parameter sets the upper bound on the training error and the lower bound on the fraction of data points that become support vectors (default 0.2).}
51 | 
52 | \item{epsilon}{epsilon in the insensitive-loss function used for eps-svr, nu-svr and eps-bsvm (default 0.1).}
53 | 
54 | \item{prob.model}{if set to TRUE, builds a model for calculating class probabilities or, in the case of regression, calculates the scaling parameter of the Laplacian distribution fitted on the residuals. Fitting is done on output data created by performing a 3-fold cross-validation on the training data. For details see references (default FALSE).}
55 | 
56 | \item{class.weights}{a named vector of weights for the different classes, used for asymmetric class sizes. Not all factor levels have to be supplied (default weight: 1). All components have to be named.}
57 | 
58 | \item{fit}{indicates whether the fitted values should be computed and included in the model or not (default TRUE).}
59 | 
60 | \item{cache}{cache memory in MB (default 40).}
61 | 
62 | \item{tol}{tolerance of the termination criterion (default 0.001).}
63 | 
64 | \item{shrinking}{option whether to use the shrinking heuristics (default TRUE).}
65 | 
66 | \item{num_workers}{the number of cores used for parallel computation; the default is one less than the number of cores on the machine.}
67 | }
68 | \value{
69 | a list of class 'diagram_ksvm' containing the elements
70 | 
71 | \describe{
72 | 
73 | \item{cv_results}{the cross-validation results: a matrix storing the parameters for each model in the tuning grid and its mean cross-validation error over all splits.}
74 | 
75 | \item{best_model}{a list containing the output of \code{\link[kernlab]{ksvm}} run on the whole dataset with the optimal model parameters found during cross-validation, as well as the optimal kernel parameters for the model.}
76 | 
77 | \item{diagrams}{the diagrams which were supplied in the function call.}
78 | 
79 | }
80 | }
81 | \description{
82 | Returns the output of kernlab's \code{\link[kernlab]{ksvm}} function on the Gram matrix of the list of persistence diagrams
83 | in a particular dimension.
84 | }
85 | \details{
86 | Cross-validation is carried out in parallel, using a trick
87 | noted in \doi{10.1007/s41468-017-0008-7}: since the persistence Fisher kernel can be
88 | written as \eqn{k_{PF}(D_1,D_2)=exp(-t*d_{FIM}(D_1,D_2))=exp(-d_{FIM}(D_1,D_2))^t}, the
89 | Fisher information metric distance matrix for each sigma value in the parameter grid can be stored
90 | and reused across all values of t, avoiding redundant distance computations during parallel cross-validation.
91 | Note that the response parameter `y` must be a factor for classification -
92 | a character vector, for instance, will throw an error. If `t` is NULL then 1/`t` is selected as
93 | the 1, 2, 5, 10, 20 and 50 percentiles of the upper triangle of the distance matrix of the training sample (computed per fold in the case of cross-validation).
94 | This is the procedure suggested in the persistence Fisher kernel paper. If
95 | any of these percentiles would cause division by 0 (i.e. if the training set is small) then the minimum non-zero element
96 | is taken as the denominator instead (and hence the returned parameters may have duplicate rows except for differing error values). If
97 | cross-validation is performed then the mean error across folds is still recorded, but the best `t` parameter
98 | across all folds is recorded in the cv results table.
 99 | }
100 | \examples{
101 | 
102 | if(require("TDAstats"))
103 | {
104 |   # create four diagrams
105 |   D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
106 |                                      dim = 1,threshold = 2)
107 |   D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),],
108 |                                      dim = 1,threshold = 2)
109 |   D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),],
110 |                                      dim = 1,threshold = 2)
111 |   D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),],
112 |                                      dim = 1,threshold = 2)
113 |   g <- list(D1,D2,D3,D4)
114 | 
115 |   # create response vector
116 |   y <- as.factor(c("circle","circle","sphere","sphere"))
117 | 
118 |   # fit model without cross-validation
119 |   model_svm <- diagram_ksvm(diagrams = g,cv = 1,dim = c(0),
120 |                             y = y,sigma = c(1),t = c(1),
121 |                             num_workers = 2)
122 | }
123 | }
124 | \references{
125 | Murphy, K. "Machine Learning: A Probabilistic Perspective." MIT Press (2012).
126 | }
127 | \seealso{
128 | \code{\link{predict_diagram_ksvm}} for predicting labels of new diagrams.
129 | }
130 | \author{
131 | Shael Brown - \email{shaelebrown@gmail.com}
132 | }
133 | 
--------------------------------------------------------------------------------
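To make the distance-matrix reuse described in the details section above concrete, here is a small hedged sketch (not taken from the package sources, and not the function's internal code path). It assumes that distance_matrix() accepts distance = "fisher" and a sigma bandwidth analogous to diagram_distance(), and the sigma and t values are illustrative only:

# sketch of the identity k_PF = exp(-t * d_FIM): compute the Fisher
# information metric distance matrix once per sigma, then derive the
# Gram matrix for any t without recomputing distances
library(TDApplied)
library(TDAstats)

D1 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], dim = 1, threshold = 2)
D2 <- TDAstats::calculate_homology(TDAstats::circle2d[sample(1:100,20),], dim = 1, threshold = 2)
D3 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], dim = 1, threshold = 2)
D4 <- TDAstats::calculate_homology(TDAstats::sphere3d[sample(1:100,20),], dim = 1, threshold = 2)
g <- list(D1, D2, D3, D4)

# Fisher information metric distances in dimension 1 for a single sigma value
d_fim <- distance_matrix(diagrams = g, dim = 1, distance = "fisher",
                         sigma = 1, num_workers = 2)

# Gram matrices for several t values, with no further distance computations
gram_list <- lapply(c(0.5, 1, 2), function(t) exp(-t * d_fim))

# automatic t grid when t = NULL: 1/t set to percentiles of the pairwise
# distances, falling back to the minimum non-zero distance for zero percentiles
ut <- d_fim[upper.tri(d_fim)]
q <- stats::quantile(ut, probs = c(0.01, 0.02, 0.05, 0.1, 0.2, 0.5))
q[q == 0] <- min(ut[ut > 0])
t_grid <- unique(1 / q)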