├── .github ├── .gitignore └── workflows │ └── rhub.yaml ├── src ├── .gitignore ├── Makevars ├── Makevars.win ├── C_GAMMA.h ├── A_AUX.h ├── B_FULLCOND.h ├── G_GRIDE.h ├── C_GAMMA.cpp ├── Z_unused.cpp ├── A_AUX.cpp ├── B_FULLCOND.cpp ├── G_GRIDE.cpp └── RcppExports.cpp ├── LICENSE ├── man ├── figures │ ├── logo.png │ ├── intLogo.png │ └── README-pressure-1.png ├── reexports.Rd ├── Swissroll.Rd ├── autoplot.gride_evolution.Rd ├── autoplot.gride_mle.Rd ├── autoplot.twonn_linfit.Rd ├── autoplot.twonn_mle.Rd ├── auxHidalgo.Rd ├── id_by_class.Rd ├── generalized_ratios_distribution.Rd ├── intRinsic-package.Rd ├── autoplot.twonn_bayes.Rd ├── autoplot.gride_bayes.Rd ├── gride_evolution.Rd ├── twonn_decimated.Rd ├── compute_mus.Rd ├── clustering.Rd ├── twonn_decimation.Rd ├── autoplot.Hidalgo.Rd ├── gride.Rd ├── twonn.Rd └── Hidalgo.Rd ├── CRAN-SUBMISSION ├── .gitignore ├── R ├── zzz.R ├── rdir_exmcmcPack.R ├── intRinsic-package.R ├── general_auxiliary.R ├── RcppExports.R ├── hidalgo_Aux.R ├── gera_distribution.R ├── twonn_dec_depr.R ├── twonn_dec.R ├── gride_mle_aux.R ├── twonn_linfit.R ├── autoplot_hidalgo.R ├── twoNN.R ├── twonn_mle.R ├── gride_mle.R ├── twonn_dec_steps.R ├── autoplot_gride.R ├── gride.R ├── twonn_dec_prop.R ├── twonn_bayes.R ├── hidalgo_postproc.R ├── autoplot_twonn.R ├── gride_evolution.R ├── gride_bayes.R ├── compute_mus.R └── autoplot_hidalgo_aux.R ├── .Rbuildignore ├── intRinsic.Rproj ├── inst └── CITATION ├── cran-comments.md ├── DESCRIPTION ├── README.md ├── README.Rmd ├── NAMESPACE └── NEWS.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Francesco Denti 3 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fradenti/intRinsic/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/intLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fradenti/intRinsic/HEAD/man/figures/intLogo.png -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 1.1.2 2 | Date: 2025-12-17 14:16:39 UTC 3 | SHA: 8e0028cb9ad281128165c453e9219c8f4cec78a4 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fradenti/intRinsic/HEAD/man/figures/README-pressure-1.png 
-------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # Clean after unload 2 | .onUnload <- function(libpath) { 3 | library.dynam.unload("intRinsic", libpath) 4 | } 5 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^\.github$ 5 | ^cran-comments\.md$ 6 | ^\.travis\.yml$ 7 | ^CRAN-SUBMISSION$ 8 | ^codecov\.yml$ 9 | ^README\.Rmd$ 10 | -------------------------------------------------------------------------------- /R/rdir_exmcmcPack.R: -------------------------------------------------------------------------------- 1 | #' #' @keywords internal 2 | #' .mcmc_pack_rdirichlet <- function (n, alpha) 3 | #' { 4 | #' l <- length(alpha) 5 | #' x <- matrix(stats::rgamma(l * n, alpha), ncol = l, byrow = TRUE) 6 | #' sm <- x %*% rep(1, l) 7 | #' return(x/as.vector(sm)) 8 | #' } 9 | -------------------------------------------------------------------------------- /R/intRinsic-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | # The following block is used by usethis to automatically manage 5 | # roxygen namespace tags. Modify with care! 6 | ## usethis namespace: start 7 | #' @importFrom ggplot2 autoplot 8 | #' @importFrom ggplot2 ggplot 9 | #' @importFrom Rcpp sourceCpp 10 | #' @useDynLib intRinsic, .registration = TRUE 11 | ## usethis namespace: end 12 | NULL 13 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_hidalgo.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{autoplot} 7 | \title{Objects exported from other packages} 8 | \keyword{internal} 9 | \description{ 10 | These objects are imported from other packages. Follow the links 11 | below to see their documentation. 
12 | 13 | \describe{ 14 | \item{ggplot2}{\code{\link[ggplot2]{autoplot}}} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /intRinsic.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran --no-manual 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry(bibtype = "Article", 2 | title = "{intRinsic}: An {R} Package for Model-Based Estimation of the Intrinsic Dimension of a Dataset", 3 | author = person(given = "Francesco", family = "Denti", email = "francesco.denti@unicatt.it"), 4 | journal = "Journal of Statistical Software", 5 | year = "2023", 6 | volume = "106", 7 | number = "9", 8 | pages = "1--45", 9 | doi = "10.18637/jss.v106.i09", 10 | header = "To cite intRinsic in publications use:" 11 | ) 12 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Version 1.1.2 2 | 3 | With this version, we: 4 | 5 | - removed the strong dependency of `intRinsic` on the `latex2exp` package, which has been scheduled for archival. 6 | 7 | ## R CMD check results 8 | 9 | Running `devtools::check(args = c('--as-cran','--no-manual'))` locally, together with the functions `devtools::check_win_devel()`, 10 | `devtools::check_win_release()`, and `devtools::check_mac_release()`, does not produce any ERRORs, WARNINGs, or NOTEs. 11 | 12 | ## Downstream dependencies 13 | 14 | There are currently no downstream dependencies for this package.
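For reference, the check routine described above can be reproduced with the calls below. This is only a convenience sketch: it assumes `devtools` is installed and that the commands are run from the package root.

```r
# Local check with the same flags reported above
devtools::check(args = c("--as-cran", "--no-manual"))

# Remote checks on the win-builder and macOS builder services
devtools::check_win_devel()
devtools::check_win_release()
devtools::check_mac_release()
```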
15 | -------------------------------------------------------------------------------- /src/C_GAMMA.h: -------------------------------------------------------------------------------- 1 | #ifndef C_GAMMA 2 | #define C_GAMMA 3 | 4 | #include 5 | using namespace Rcpp; 6 | 7 | arma::colvec gam_trunc(int D, int K, 8 | double a0_d, double b0_d, 9 | arma::colvec n_l, 10 | arma::colvec sLog); 11 | 12 | arma::colvec gam_trunc_pmass(int D, int K, 13 | double a0_d, double b0_d, 14 | arma::colvec n_l, 15 | arma::colvec sLog, 16 | double pi_mass); 17 | #endif 18 | -------------------------------------------------------------------------------- /src/A_AUX.h: -------------------------------------------------------------------------------- 1 | #ifndef A_AUX 2 | #define A_AUX 3 | 4 | #include 5 | 6 | double log_Likelihood_double(double mu_obs, double d); 7 | arma::rowvec Stratified_operations_0(arma::vec x, 8 | arma::vec col1, 9 | int val1); 10 | arma::mat Groups_quantities(arma::colvec mu_obser, 11 | arma::vec Ci, 12 | int K); 13 | Rcpp::List index_row_col(arma::mat Nq, 14 | int q, 15 | int N); 16 | 17 | double Norm_Constant_Z_l2( int Nzi_l, int N, double xi, int q); 18 | 19 | arma::colvec rdir_cpp(arma::colvec alpha); 20 | 21 | #endif 22 | 23 | -------------------------------------------------------------------------------- /man/Swissroll.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/general_auxiliary.R 3 | \name{Swissroll} 4 | \alias{Swissroll} 5 | \title{Generates a noise-free Swiss roll dataset} 6 | \usage{ 7 | Swissroll(n) 8 | } 9 | \arguments{ 10 | \item{n}{number of observations contained in the output dataset.} 11 | } 12 | \value{ 13 | a three-dimensional \code{data.frame} containing the coordinates of 14 | the points generated via the Swiss roll mapping. 15 | } 16 | \description{ 17 | The function creates a three-dimensional dataset with coordinates 18 | following the Swiss roll mapping, transforming random uniform data points 19 | sampled on the interval \code{(0,10)}. 20 | } 21 | \examples{ 22 | Data <- Swissroll(1000) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /R/general_auxiliary.R: -------------------------------------------------------------------------------- 1 | #' Generates a noise-free Swiss roll dataset 2 | #' 3 | #' The function creates a three-dimensional dataset with coordinates 4 | #' following the Swiss roll mapping, transforming random uniform data points 5 | #' sampled on the interval \code{(0,10)}. 6 | #' 7 | #' @param n number of observations contained in the output dataset. 8 | #' 9 | #' @return a three-dimensional \code{data.frame} containing the coordinates of 10 | #' the points generated via the Swiss roll mapping. 
11 | #' 12 | #' @export 13 | #' 14 | #' @examples 15 | #' Data <- Swissroll(1000) 16 | #' 17 | Swissroll <- function(n) { 18 | X <- stats::runif(n, 0, 10) 19 | Y <- stats::runif(n, 0, 10) 20 | return(SR = data.frame( 21 | x = X * cos(X), 22 | y = Y, 23 | z = X * sin(X) 24 | )) 25 | } 26 | 27 | -------------------------------------------------------------------------------- /src/B_FULLCOND.h: -------------------------------------------------------------------------------- 1 | #ifndef B_FULLCOND 2 | #define B_FULLCOND 3 | 4 | #include 5 | #include "A_AUX.h" 6 | //[[Rcpp::depends(RcppArmadillo)]] 7 | #include 8 | using namespace Rcpp; 9 | 10 | arma::colvec Update_memberships_faster(arma::colvec mu_obser, 11 | arma::colvec dl, 12 | arma::colvec pl, 13 | int K, int N,int q, 14 | arma::colvec possible_label, 15 | arma::colvec Ci, double QQ, 16 | arma::umat index_row, 17 | Rcpp::List index_col, 18 | arma::colvec log_Precomp_Z, 19 | arma::colvec log_Precomp_ratios); 20 | #endif 21 | -------------------------------------------------------------------------------- /src/G_GRIDE.h: -------------------------------------------------------------------------------- 1 | #ifndef G_GRIDE 2 | #define G_GRIDE 3 | 4 | 5 | double gride_log_likelihood(double d, 6 | int n1, 7 | int n2, 8 | arma::colvec mus_n1_n2); 9 | 10 | double gride_log_posterior(double z, 11 | int n1, 12 | int n2, 13 | double a_d, 14 | double b_d, 15 | arma::colvec mus_n1_n2); 16 | 17 | arma::colvec gride_mh_sampler(double start_d, 18 | int n1, 19 | int n2, 20 | double a_d, 21 | double b_d, 22 | arma::colvec mus_n1_n2, 23 | int nsim, 24 | int burn_in, 25 | double sigma); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /man/autoplot.gride_evolution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gride_evolution.R 3 | \name{autoplot.gride_evolution} 4 | \alias{autoplot.gride_evolution} 5 | \title{Plot the evolution of \code{Gride} estimates} 6 | \usage{ 7 | \method{autoplot}{gride_evolution}(object, title = "Gride Evolution", ...) 8 | } 9 | \arguments{ 10 | \item{object}{an object of class \code{gride_evolution}.} 11 | 12 | \item{title}{an optional string to customize the title of the plot.} 13 | 14 | \item{...}{other arguments passed to specific methods.} 15 | } 16 | \value{ 17 | object of class \code{\link[ggplot2]{ggplot}}. It displays the 18 | the evolution of the Gride maximum likelihood estimates as a function 19 | of the average distance from \code{n2}. 20 | } 21 | \description{ 22 | Use this method without the \code{.gride_evolution} suffix. 23 | It plots the evolution of the \code{id} 24 | estimates as a function of the average distance from the furthest NN of 25 | each point. 26 | } 27 | -------------------------------------------------------------------------------- /man/autoplot.gride_mle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_gride.R 3 | \name{autoplot.gride_mle} 4 | \alias{autoplot.gride_mle} 5 | \title{Plot the simulated bootstrap sample for the MLE \code{Gride}} 6 | \usage{ 7 | \method{autoplot}{gride_mle}(object, title = "MLE Gride - Bootstrap sample", ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{gride_mle}. 
11 | It is obtained using the output of the \code{gride} function when 12 | \code{method = "mle"}.} 13 | 14 | \item{title}{title for the plot.} 15 | 16 | \item{...}{other arguments passed to specific methods.} 17 | } 18 | \value{ 19 | object of class \code{\link[ggplot2]{ggplot}}. It displays the 20 | density plot of the sample generated via parametric bootstrap to help the 21 | visual assessment of the uncertainty of the \code{id} estimates. 22 | } 23 | \description{ 24 | Use this method without the \code{.gride_mle} suffix. 25 | It displays the density plot of sample obtained via 26 | parametric bootstrap for the \code{Gride} model. 27 | } 28 | \seealso{ 29 | \code{\link{gride}} 30 | } 31 | -------------------------------------------------------------------------------- /man/autoplot.twonn_linfit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_twonn.R 3 | \name{autoplot.twonn_linfit} 4 | \alias{autoplot.twonn_linfit} 5 | \title{Plot the output of the \code{TWO-NN} model estimated via least squares} 6 | \usage{ 7 | \method{autoplot}{twonn_linfit}(object, title = "TWO-NN Linear Fit", ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{twonn_linfit}, the output of the 11 | \code{twonn} function when \code{method = "linfit"}.} 12 | 13 | \item{title}{string used as title of the plot.} 14 | 15 | \item{...}{other arguments passed to specific methods.} 16 | } 17 | \value{ 18 | a \code{\link[ggplot2]{ggplot2}} object displaying the goodness of 19 | the linear fit of the TWO-NN model. 20 | } 21 | \description{ 22 | Use this method without the \code{.twonn_linfit} suffix. 23 | The function returns the representation of the linear 24 | regression that is fitted with the \code{linfit} method. 25 | } 26 | \seealso{ 27 | \code{\link{twonn}} 28 | 29 | Other autoplot methods: 30 | \code{\link{autoplot.Hidalgo}()}, 31 | \code{\link{autoplot.gride_bayes}()}, 32 | \code{\link{autoplot.twonn_bayes}()}, 33 | \code{\link{autoplot.twonn_mle}()} 34 | } 35 | \concept{autoplot methods} 36 | -------------------------------------------------------------------------------- /man/autoplot.twonn_mle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_twonn.R 3 | \name{autoplot.twonn_mle} 4 | \alias{autoplot.twonn_mle} 5 | \title{Plot the output of the \code{TWO-NN} model estimated via the Maximum 6 | Likelihood approach} 7 | \usage{ 8 | \method{autoplot}{twonn_mle}(object, title = "MLE TWO-NN", ...) 9 | } 10 | \arguments{ 11 | \item{object}{object of class \code{twonn_mle}, the output of the 12 | \code{twonn} function when \code{method = "mle"}.} 13 | 14 | \item{title}{character string used as title of the plot.} 15 | 16 | \item{...}{other arguments passed to specific methods.} 17 | } 18 | \value{ 19 | \code{\link[ggplot2]{ggplot2}} object displaying the point estimate 20 | and confidence interval obtained via the maximum likelihood approach of the 21 | \code{id} parameter. 22 | } 23 | \description{ 24 | Use this method without the \code{.twonn_mle} suffix. 25 | The function returns the point estimate along with the confidence bands 26 | computed via the \code{mle} method. 
27 | } 28 | \seealso{ 29 | \code{\link{twonn}} 30 | 31 | Other autoplot methods: 32 | \code{\link{autoplot.Hidalgo}()}, 33 | \code{\link{autoplot.gride_bayes}()}, 34 | \code{\link{autoplot.twonn_bayes}()}, 35 | \code{\link{autoplot.twonn_linfit}()} 36 | } 37 | \concept{autoplot methods} 38 | -------------------------------------------------------------------------------- /man/auxHidalgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo_Aux.R 3 | \name{auxHidalgo} 4 | \alias{auxHidalgo} 5 | \alias{posterior_means} 6 | \alias{initial_values} 7 | \alias{posterior_medians} 8 | \alias{credible_intervals} 9 | \title{Auxiliary functions for the \code{Hidalgo} model} 10 | \usage{ 11 | posterior_means(x) 12 | 13 | initial_values(x) 14 | 15 | posterior_medians(x) 16 | 17 | credible_intervals(x, alpha = 0.95) 18 | } 19 | \arguments{ 20 | \item{x}{object of class \code{Hidalgo}, the output of the 21 | \code{Hidalgo()} function.} 22 | 23 | \item{alpha}{posterior probability contained in the computed credible 24 | interval.} 25 | } 26 | \value{ 27 | \code{posterior_mean} returns the observation-specific \code{id} posterior means estimated with \code{Hidalgo}. 28 | 29 | \code{initial_values} returns a list with the parameter specification 30 | passed to the model. 31 | 32 | \code{posterior_median} returns the observation-specific \code{id} posterior medians estimated with \code{Hidalgo}. 33 | 34 | \code{credible_interval} returns the observation-specific credible intervals for a specific 35 | probability \code{alpha}. 36 | } 37 | \description{ 38 | Collection of functions used to extract meaningful information from the object returned 39 | by the function \code{Hidalgo} 40 | } 41 | -------------------------------------------------------------------------------- /man/id_by_class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo_postproc.R 3 | \name{id_by_class} 4 | \alias{id_by_class} 5 | \alias{print.hidalgo_class} 6 | \title{Stratification of the \code{id} by an external categorical variable} 7 | \usage{ 8 | id_by_class(object, class) 9 | 10 | \method{print}{hidalgo_class}(x, ...) 11 | } 12 | \arguments{ 13 | \item{object}{object of class \code{Hidalgo}, the output of the 14 | \code{Hidalgo()} function.} 15 | 16 | \item{class}{factor according to the observations should be stratified by.} 17 | 18 | \item{x}{object of class \code{hidalgo_class}, the output of the \code{id_by_class()} function.} 19 | 20 | \item{...}{other arguments passed to specific methods.} 21 | } 22 | \value{ 23 | a \code{data.frame} containing the posterior \code{id} means, 24 | medians, and standard deviations stratified by the levels of the variable 25 | \code{class}. 26 | } 27 | \description{ 28 | The function computes summary statistics (mean, median, and standard 29 | deviation) of the post-processed chains of the intrinsic dimension stratified 30 | by an external categorical variable. 
31 | } 32 | \examples{ 33 | \donttest{ 34 | X <- replicate(5,rnorm(500)) 35 | X[1:250,1:2] <- 0 36 | oracle <- rep(1:2,rep(250,2)) 37 | h_out <- Hidalgo(X) 38 | id_by_class(h_out,oracle) 39 | } 40 | 41 | } 42 | \seealso{ 43 | \code{\link{Hidalgo}} 44 | } 45 | -------------------------------------------------------------------------------- /man/generalized_ratios_distribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gera_distribution.R 3 | \name{generalized_ratios_distribution} 4 | \alias{generalized_ratios_distribution} 5 | \alias{dgera} 6 | \alias{rgera} 7 | \title{The Generalized Ratio distribution} 8 | \usage{ 9 | rgera(nsim, n1 = 1, n2 = 2, d) 10 | 11 | dgera(x, n1 = 1, n2 = 2, d, log = FALSE) 12 | } 13 | \arguments{ 14 | \item{nsim}{integer, the number of observations to generate.} 15 | 16 | \item{n1}{order of the first NN considered. Default is 1.} 17 | 18 | \item{n2}{order of the second NN considered. Default is 2.} 19 | 20 | \item{d}{value of the intrinsic dimension.} 21 | 22 | \item{x}{vector of quantiles.} 23 | 24 | \item{log}{logical, if \code{TRUE}, it returns the log-density} 25 | } 26 | \value{ 27 | \code{dgera} gives the density. \code{rgera} returns a vector of 28 | random observations sampled from the generalized ratio distribution. 29 | } 30 | \description{ 31 | Density function and random number generator for the Generalized Ratio 32 | distribution with NN orders equal to \code{n1} and \code{n2}. 33 | See \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 34 | for more details. 35 | } 36 | \examples{ 37 | draws <- rgera(100,3,5,2) 38 | density <- dgera(3,3,5,2) 39 | 40 | } 41 | \references{ 42 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic dimension estimator." 43 | Scientific Reports, 12(20005). 44 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 45 | } 46 | -------------------------------------------------------------------------------- /man/intRinsic-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/intRinsic-package.R 3 | \docType{package} 4 | \name{intRinsic-package} 5 | \alias{intRinsic} 6 | \alias{intRinsic-package} 7 | \title{intRinsic: Likelihood-Based Intrinsic Dimension Estimators} 8 | \description{ 9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 10 | 11 | Provides functions to estimate the intrinsic dimension of a dataset via likelihood-based approaches. Specifically, the package implements the 'TWO-NN' and 'Gride' estimators and the 'Hidalgo' Bayesian mixture model. In addition, the first reference contains an extended vignette on the usage of the 'TWO-NN' and 'Hidalgo' models. References: Denti (2023, \doi{10.18637/jss.v106.i09}); Allegra et al. (2020, \doi{10.1038/s41598-020-72222-0}); Denti et al. (2022, \doi{10.1038/s41598-022-20991-1}); Facco et al. (2017, \doi{10.1038/s41598-017-11873-y}); Santos-Fernandez et al. (2021, \doi{10.1038/s41598-022-20991-1}). 
12 | } 13 | \seealso{ 14 | Useful links: 15 | \itemize{ 16 | \item \url{https://github.com/Fradenti/intRinsic} 17 | \item Report bugs at \url{https://github.com/fradenti/intRinsic/issues} 18 | } 19 | 20 | } 21 | \author{ 22 | \strong{Maintainer}: Francesco Denti \email{francescodenti.personal@gmail.com} (\href{https://orcid.org/0000-0003-2978-4702}{ORCID}) [copyright holder] 23 | 24 | Authors: 25 | \itemize{ 26 | \item Andrea Gilardi \email{andrea.gilardi@unimib.it} (\href{https://orcid.org/0000-0002-9424-7439}{ORCID}) 27 | } 28 | 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/autoplot.twonn_bayes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_twonn.R 3 | \name{autoplot.twonn_bayes} 4 | \alias{autoplot.twonn_bayes} 5 | \title{Plot the output of the \code{TWO-NN} model estimated via the Bayesian 6 | approach} 7 | \usage{ 8 | \method{autoplot}{twonn_bayes}( 9 | object, 10 | plot_low = 0, 11 | plot_upp = NULL, 12 | by = 0.05, 13 | title = "Bayesian TWO-NN", 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{object of class \code{twonn_bayes}, the output of the 19 | \code{twonn} function when \code{method = "bayes"}.} 20 | 21 | \item{plot_low}{lower bound of the interval on which the posterior density 22 | is plotted.} 23 | 24 | \item{plot_upp}{upper bound of the interval on which the posterior density 25 | is plotted.} 26 | 27 | \item{by}{step-size at which the sequence spanning the interval is 28 | incremented.} 29 | 30 | \item{title}{character string used as title of the plot.} 31 | 32 | \item{...}{other arguments passed to specific methods.} 33 | } 34 | \value{ 35 | \code{\link[ggplot2]{ggplot2}} object displaying the posterior 36 | distribution of the intrinsic dimension parameter. 37 | } 38 | \description{ 39 | Use this method without the \code{.twonn_bayes} suffix. 40 | The function returns the density plot of the 41 | posterior distribution computed with the \code{bayes} method. 42 | } 43 | \seealso{ 44 | \code{\link{twonn}} 45 | 46 | Other autoplot methods: 47 | \code{\link{autoplot.Hidalgo}()}, 48 | \code{\link{autoplot.gride_bayes}()}, 49 | \code{\link{autoplot.twonn_linfit}()}, 50 | \code{\link{autoplot.twonn_mle}()} 51 | } 52 | \concept{autoplot methods} 53 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: intRinsic 2 | Title: Likelihood-Based Intrinsic Dimension Estimators 3 | Version: 1.1.2 4 | Authors@R: c( 5 | person("Francesco", "Denti", ,"francescodenti.personal@gmail.com", 6 | role = c("aut", "cre", "cph"), 7 | comment = c(ORCID = "0000-0003-2978-4702")), 8 | person("Andrea", "Gilardi", ,"andrea.gilardi@unimib.it", 9 | role = "aut", 10 | comment = c(ORCID = "0000-0002-9424-7439")) 11 | ) 12 | Maintainer: Francesco Denti 13 | Description: Provides functions to estimate the intrinsic dimension of a dataset 14 | via likelihood-based approaches. Specifically, the package implements the 15 | 'TWO-NN' and 'Gride' estimators and the 'Hidalgo' Bayesian mixture model. 16 | In addition, the first reference contains an extended vignette on the usage of 17 | the 'TWO-NN' and 'Hidalgo' models. References: 18 | Denti (2023, ); 19 | Allegra et al. (2020, ); 20 | Denti et al. (2022, ); 21 | Facco et al. 
(2017, ); 22 | Santos-Fernandez et al. (2021, ). 23 | License: MIT + file LICENSE 24 | URL: https://github.com/Fradenti/intRinsic 25 | BugReports: https://github.com/fradenti/intRinsic/issues 26 | Depends: R (>= 4.2.0) 27 | Imports: 28 | dplyr, 29 | FNN, 30 | ggplot2, 31 | knitr, 32 | Rcpp, 33 | reshape2, 34 | rlang, 35 | stats, 36 | utils, 37 | salso 38 | LinkingTo: 39 | Rcpp, 40 | RcppArmadillo 41 | NeedsCompilation: yes 42 | ByteCompile: true 43 | Encoding: UTF-8 44 | RoxygenNote: 7.3.3 45 | -------------------------------------------------------------------------------- /man/autoplot.gride_bayes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_gride.R 3 | \name{autoplot.gride_bayes} 4 | \alias{autoplot.gride_bayes} 5 | \title{Plot the simulated MCMC chains for the Bayesian \code{Gride}} 6 | \usage{ 7 | \method{autoplot}{gride_bayes}( 8 | object, 9 | traceplot = FALSE, 10 | title = "Bayesian Gride - Posterior distribution", 11 | ... 12 | ) 13 | } 14 | \arguments{ 15 | \item{object}{object of class \code{gride_bayes}. 16 | It is obtained using the output of the \code{gride} function when 17 | \code{method = "bayes"}.} 18 | 19 | \item{traceplot}{logical. If \code{FALSE}, the function returns a plot of the 20 | posterior density. If \code{TRUE}, the function returns the traceplots of the 21 | MCMC used to simulate from the posterior distribution.} 22 | 23 | \item{title}{optional string to display as title.} 24 | 25 | \item{...}{other arguments passed to specific methods.} 26 | } 27 | \value{ 28 | object of class \code{\link[ggplot2]{ggplot}}. 29 | It could represent the traceplot of the posterior simulations for the 30 | Bayesian \code{Gride} model (\code{traceplot = TRUE}) or a density plot 31 | of the simulated posterior distribution (\code{traceplot = FALSE}). 32 | } 33 | \description{ 34 | Use this method without the \code{.gride_bayes} suffix. 35 | It displays the traceplot of the chain generated 36 | with Metropolis-Hasting updates to visually assess mixing and convergence. 37 | Alternatively, it is possible to plot the posterior density. 
38 | } 39 | \seealso{ 40 | \code{\link{gride}} 41 | 42 | Other autoplot methods: 43 | \code{\link{autoplot.Hidalgo}()}, 44 | \code{\link{autoplot.twonn_bayes}()}, 45 | \code{\link{autoplot.twonn_linfit}()}, 46 | \code{\link{autoplot.twonn_mle}()} 47 | } 48 | \concept{autoplot methods} 49 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | log_Likelihood_double <- function(mu_obs, d) { 5 | .Call(`_intRinsic_log_Likelihood_double`, mu_obs, d) 6 | } 7 | 8 | Groups_quantities <- function(mu_obser, Ci, K) { 9 | .Call(`_intRinsic_Groups_quantities`, mu_obser, Ci, K) 10 | } 11 | 12 | Norm_Constant_Z_l2 <- function(Nzi_l, N, xi, q) { 13 | .Call(`_intRinsic_Norm_Constant_Z_l2`, Nzi_l, N, xi, q) 14 | } 15 | 16 | log_Zeta_maker <- function(N, xi, q) { 17 | .Call(`_intRinsic_log_Zeta_maker`, N, xi, q) 18 | } 19 | 20 | index_row_col <- function(Nq, q, N) { 21 | .Call(`_intRinsic_index_row_col`, Nq, q, N) 22 | } 23 | 24 | rdir_cpp <- function(alpha) { 25 | .Call(`_intRinsic_rdir_cpp`, alpha) 26 | } 27 | 28 | Update_memberships_faster <- function(mu_obser, dl, pl, K, N, q, possible_label, Ci, QQ, index_row, index_col, log_Precomp_Z, log_Precomp_ratios) { 29 | .Call(`_intRinsic_Update_memberships_faster`, mu_obser, dl, pl, K, N, q, possible_label, Ci, QQ, index_row, index_col, log_Precomp_Z, log_Precomp_ratios) 30 | } 31 | 32 | gam_trunc <- function(D, K, a0_d, b0_d, n_l, sLog) { 33 | .Call(`_intRinsic_gam_trunc`, D, K, a0_d, b0_d, n_l, sLog) 34 | } 35 | 36 | gam_trunc_pmass <- function(D, K, a0_d, b0_d, n_l, sLog, pi_mass) { 37 | .Call(`_intRinsic_gam_trunc_pmass`, D, K, a0_d, b0_d, n_l, sLog, pi_mass) 38 | } 39 | 40 | gride_log_likelihood <- function(d, n1, n2, mus_n1_n2) { 41 | .Call(`_intRinsic_gride_log_likelihood`, d, n1, n2, mus_n1_n2) 42 | } 43 | 44 | gride_log_posterior <- function(z, n1, n2, a_d, b_d, mus_n1_n2) { 45 | .Call(`_intRinsic_gride_log_posterior`, z, n1, n2, a_d, b_d, mus_n1_n2) 46 | } 47 | 48 | gride_mh_sampler <- function(start_d, n1, n2, a_d, b_d, mus_n1_n2, nsim, burn_in, sigma) { 49 | .Call(`_intRinsic_gride_mh_sampler`, start_d, n1, n2, a_d, b_d, mus_n1_n2, nsim, burn_in, sigma) 50 | } 51 | 52 | -------------------------------------------------------------------------------- /src/C_GAMMA.cpp: -------------------------------------------------------------------------------- 1 | #include "C_GAMMA.h" 2 | 3 | // [[Rcpp::export]] 4 | arma::colvec gam_trunc(int D, int K, 5 | double a0_d, double b0_d, 6 | arma::colvec n_l, 7 | arma::colvec sLog){ 8 | arma::colvec d(K); 9 | arma::colvec p_upper(K); 10 | arma::colvec p_unif(K); 11 | 12 | arma::colvec a_star = n_l + a0_d; 13 | arma::colvec b_star = sLog + b0_d; 14 | 15 | for(int k=0; k(AIFD["SL0"]); 20 | arma::colvec n10 = as(AIFD["nl0"]); 21 | arma::colvec a_star = a0 + n10; 22 | arma::colvec b_star = b0 + LOGSUM; 23 | for( int l=0; l(AIFD["SL0"]); 33 | arma::colvec n10 = as(AIFD["nl0"]); 34 | arma::colvec a_star = a0 + n10; 35 | arma::colvec b_star = b0 + LOGSUM; 36 | arma::colvec lp(2); 37 | for( int l=0; l 3 | 4 | # intRinsic v1.1.2 5 | 6 | 7 | 8 | [![CRAN](https://www.r-pkg.org/badges/version/intRinsic)](https://cran.r-project.org/package=intRinsic) 9 | [![Last 10 | 
Commit](https://img.shields.io/github/last-commit/fradenti/intRinsic)](https://github.com/fradenti/intRinsic) 11 | [![Downloads 12 | (monthly)](https://cranlogs.r-pkg.org/badges/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 13 | [![Downloads 14 | (total)](https://cranlogs.r-pkg.org/badges/grand-total/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 15 | [![JSS](https://img.shields.io/badge/JSS-10.18637%2Fjss.v040.i08-brightgreen)](https://www.jstatsoft.org/article/view/v106i09) 16 | 17 | 18 | A package with functions to estimate the intrinsic dimension of a 19 | dataset via likelihood-based approaches. Specifically, the package 20 | implements the `TWO-NN` and `Gride` estimators and the `Hidalgo` 21 | Bayesian mixture model. 22 | 23 | To install the package from CRAN, run 24 | 25 | ``` r 26 | install.packages("intRinsic") 27 | ``` 28 | 29 | To install the package from this GitHub repository, run 30 | 31 | ``` r 32 | # install.packages("remotes") 33 | 34 | #Turn off warning-error-conversion regarding package versions 35 | Sys.setenv("R_REMOTES_NO_ERRORS_FROM_WARNINGS" = "true") 36 | 37 | #install from github 38 | remotes::install_github("Fradenti/intRinsic") 39 | ``` 40 | 41 | Simple example on Swissroll dataset 42 | 43 | ``` r 44 | library(intRinsic) 45 | X <- Swissroll(2000) 46 | twonn(X) 47 | ``` 48 | 49 | The vignette for this package has been published in the 50 | `Journal of Statistical Software`. The article can be found at [this 51 | link](https://doi.org/10.18637/jss.v106.i09). 52 | 53 | Please help me improve this package by reporting suggestions, typos, and 54 | issues at [this link](https://github.com/Fradenti/intRinsic/issues). 55 | 56 | Please note that the previous versions of the package (from `v0.1.0` to 57 | `v1.1.1`) are still available as GitHub Releases at [this 58 | page](https://github.com/Fradenti/intRinsic/releases). 59 | -------------------------------------------------------------------------------- /man/gride_evolution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gride_evolution.R 3 | \name{gride_evolution} 4 | \alias{gride_evolution} 5 | \alias{print.gride_evolution} 6 | \alias{plot.gride_evolution} 7 | \title{\code{Gride} evolution based on Maximum Likelihood Estimation} 8 | \usage{ 9 | gride_evolution(X, vec_n1, vec_n2, upp_bound = 50) 10 | 11 | \method{print}{gride_evolution}(x, ...) 12 | 13 | \method{plot}{gride_evolution}(x, ...) 14 | } 15 | \arguments{ 16 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 17 | 18 | \item{vec_n1}{vector of integers, containing the smaller NN orders considered 19 | in the evolution.} 20 | 21 | \item{vec_n2}{vector of integers, containing the larger NN orders considered 22 | in the evolution.} 23 | 24 | \item{upp_bound}{upper bound for the interval used in the numerical 25 | optimization (via \code{optimize}). Default is set to 50.} 26 | 27 | \item{x}{an object of class \code{gride_evolution}.} 28 | 29 | \item{...}{other arguments passed to specific methods.} 30 | } 31 | \value{ 32 | list containing the Gride evolution, the corresponding NN distance 33 | ratios, the average n2-th NN order distances, and the NN orders considered. 34 | 35 | the function prints a summary of the Gride evolution to 36 | console. 
37 | } 38 | \description{ 39 | The function allows the study of the evolution of the \code{id} estimates 40 | as a function of the scale of a dataset. A scale-dependent analysis 41 | is essential to identify the correct number of relevant directions in noisy 42 | data. To increase the average distance from the second NN (and thus the 43 | average neighborhood size) involved in the estimation, the function computes 44 | a sequence of \code{Gride} models with increasing NN orders, \code{n1} and 45 | \code{n2}. 46 | See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 47 | for more details. 48 | } 49 | \examples{ 50 | \donttest{ 51 | X <- replicate(5,rnorm(10000,0,.1)) 52 | gride_evolution(X = X,vec_n1 = 2^(0:5),vec_n2 = 2^(1:6)) 53 | } 54 | 55 | } 56 | \references{ 57 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 58 | dimension estimator." 59 | Scientific Reports, 12(20005). 60 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 61 | } 62 | -------------------------------------------------------------------------------- /R/hidalgo_Aux.R: -------------------------------------------------------------------------------- 1 | #' Auxiliary functions for the \code{Hidalgo} model 2 | #' 3 | #' Collection of functions used to extract meaningful information from the object returned 4 | #' by the function \code{Hidalgo} 5 | #' 6 | #' @name auxHidalgo 7 | #' 8 | #' @param x object of class \code{Hidalgo}, the output of the 9 | #' \code{Hidalgo()} function. 10 | #' 11 | #' @return \code{posterior_mean} returns the observation-specific \code{id} posterior means estimated with \code{Hidalgo}. 12 | #' 13 | #' @export 14 | posterior_means <- function(x){ 15 | if (class(x)[1] != "Hidalgo") { 16 | stop("object is not of class 'Hidalgo'", call. = FALSE) 17 | } 18 | return(x$id_summary$MEAN) 19 | 20 | } 21 | 22 | #' @name auxHidalgo 23 | #' 24 | #' @param x object of class \code{Hidalgo}, the output of the 25 | #' \code{Hidalgo()} function. 26 | #' 27 | #' @return \code{initial_values} returns a list with the parameter specification 28 | #' passed to the model. 29 | #' 30 | #' @export 31 | initial_values <- function(x){ 32 | if (class(x)[1] != "Hidalgo") { 33 | stop("object is not of class 'Hidalgo'", call. = FALSE) 34 | } 35 | return(x$recap) 36 | 37 | } 38 | 39 | 40 | #' @name auxHidalgo 41 | #' 42 | #' @param x object of class \code{Hidalgo}, the output of the 43 | #' \code{Hidalgo()} function. 44 | #' 45 | #' @return \code{posterior_median} returns the observation-specific \code{id} posterior medians estimated with \code{Hidalgo}. 46 | #' 47 | #' @export 48 | posterior_medians <- function(x){ 49 | if (class(x)[1] != "Hidalgo") { 50 | stop("object is not of class 'Hidalgo'", call. = FALSE) 51 | } 52 | out <- x$id_summary$MEDIAN 53 | return(out) 54 | 55 | } 56 | 57 | 58 | #' @name auxHidalgo 59 | #' 60 | #' @param x object of class \code{Hidalgo}, the output of the 61 | #' \code{Hidalgo()} function. 62 | #' @param alpha posterior probability contained in the computed credible 63 | #' interval. 64 | #' 65 | #' @return \code{credible_interval} returns the observation-specific credible intervals for a specific 66 | #' probability \code{alpha}. 67 | #' 68 | #' @export 69 | credible_intervals <- function(x, alpha = .95){ 70 | if (class(x)[1] != "Hidalgo") { 71 | stop("object is not of class 'Hidalgo'", call. 
= FALSE) 72 | } 73 | out <- t(apply(x$id_postpr,2,function(x) stats::quantile(x, probs = c((1 - alpha) / 2, (1 + alpha) / 2)))) 74 | return(out) 75 | } 76 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # intRinsic v1.1.2 17 | 18 | 19 | 20 | [![CRAN](https://www.r-pkg.org/badges/version/intRinsic)](https://cran.r-project.org/package=intRinsic) 21 | [![Last Commit](https://img.shields.io/github/last-commit/fradenti/intRinsic)](https://github.com/fradenti/intRinsic) 22 | [![Downloads (monthly)](https://cranlogs.r-pkg.org/badges/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 23 | [![Downloads (total)](https://cranlogs.r-pkg.org/badges/grand-total/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 24 | [![JSS](https://img.shields.io/badge/JSS-10.18637%2Fjss.v040.i08-brightgreen)](https://www.jstatsoft.org/article/view/v106i09) 25 | 26 | 27 | A package with functions to estimate the intrinsic dimension of a dataset via likelihood-based approaches. 28 | Specifically, the package implements the `TWO-NN` and `Gride` estimators and the `Hidalgo` Bayesian mixture model. 29 | 30 | To install the package from CRAN, run 31 | ```r 32 | install.packages("intRinsic") 33 | ``` 34 | 35 | To install the package from this GitHub repository, run 36 | ```r 37 | # install.packages("remotes") 38 | 39 | #Turn off warning-error-conversion regarding package versions 40 | Sys.setenv("R_REMOTES_NO_ERRORS_FROM_WARNINGS" = "true") 41 | 42 | #install from github 43 | remotes::install_github("Fradenti/intRinsic") 44 | ``` 45 | 46 | Simple example on Swissroll dataset 47 | 48 | ```r 49 | library(intRinsic) 50 | X <- Swissroll(2000) 51 | twonn(X) 52 | ``` 53 | 54 | The vignette for this package has been published in the `Journal of Statistical Software`. The article can be found at [this link](https://doi.org/10.18637/jss.v106.i09). 55 | 56 | Please help me improve this package by reporting suggestions, typos, and issues at [this link](https://github.com/Fradenti/intRinsic/issues). 57 | 58 | Please note that the previous versions of the package (from `v0.1.0` to `v1.1.1`) are still available as GitHub Releases at [this page](https://github.com/Fradenti/intRinsic/releases). 
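Beyond `twonn()`, the other estimators mentioned above are called in the same way. The snippet below is only an illustrative sketch that relies on the default arguments of `gride()` and `Hidalgo()`; see the corresponding help pages for the available options (for example, the `method` argument of `gride()`).

```r
library(intRinsic)
X <- Swissroll(2000)

# Gride estimate of the intrinsic dimension with the default NN orders
gride(X)

# Hidalgo: Bayesian mixture model for heterogeneous intrinsic dimensions
# (this runs an MCMC, so it may take a moment)
h_out <- Hidalgo(X)
autoplot(h_out)
```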
59 | 60 | -------------------------------------------------------------------------------- /man/twonn_decimated.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/twonn_dec_depr.R 3 | \name{twonn_decimated} 4 | \alias{twonn_decimated} 5 | \title{Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 6 | proportions} 7 | \usage{ 8 | twonn_decimated( 9 | X, 10 | method = c("steps", "proportions"), 11 | steps = 0, 12 | proportions = 1, 13 | seed = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 18 | 19 | \item{method}{method to use for decimation: 20 | \describe{ 21 | \item{\code{"steps"}}{the number of times the dataset is halved;} 22 | \item{\code{"proportion"}}{the dataset is subsampled according to a vector 23 | of proportions.} 24 | }} 25 | 26 | \item{steps}{number of times the dataset is halved.} 27 | 28 | \item{proportions}{vector containing the fractions of the dataset to be 29 | considered.} 30 | 31 | \item{seed}{random seed controlling the sequence of sub-sampled observations.} 32 | } 33 | \value{ 34 | list containing the \code{TWO-NN} evolution 35 | (maximum likelihood estimation and confidence intervals), the average 36 | distance from the second NN, and the vector of proportions that were 37 | considered. According to the chosen estimation method, it is accompanied with 38 | the vector of proportions or halving steps considered. 39 | } 40 | \description{ 41 | The estimation of the \code{id} is related to the scale of the 42 | dataset. To escape the local reach of the \code{TWO-NN} estimator, 43 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 44 | proposed to subsample the original dataset in order to induce greater 45 | distances between the data points. By investigating the estimates' evolution 46 | as a function of the size of the neighborhood, it is possible to obtain 47 | information about the validity of the modeling assumptions and the robustness 48 | of the model in the presence of noise. 49 | } 50 | \references{ 51 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 52 | dimension of datasets by a minimal neighborhood information." 53 | Scientific Reports, 7(1). 54 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 55 | 56 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 57 | dimension estimator." 58 | Scientific Reports, 12(20005). 59 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 
60 | } 61 | \seealso{ 62 | \code{\link{twonn}} 63 | } 64 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(autoplot,Hidalgo) 4 | S3method(autoplot,gride_bayes) 5 | S3method(autoplot,gride_evolution) 6 | S3method(autoplot,gride_mle) 7 | S3method(autoplot,twonn_bayes) 8 | S3method(autoplot,twonn_linfit) 9 | S3method(autoplot,twonn_mle) 10 | S3method(plot,Hidalgo) 11 | S3method(plot,gride_bayes) 12 | S3method(plot,gride_evolution) 13 | S3method(plot,gride_mle) 14 | S3method(plot,hidalgo_psm) 15 | S3method(plot,mus) 16 | S3method(plot,twonn_bayes) 17 | S3method(plot,twonn_dec_by) 18 | S3method(plot,twonn_dec_prop) 19 | S3method(plot,twonn_linfit) 20 | S3method(plot,twonn_mle) 21 | S3method(print,Hidalgo) 22 | S3method(print,gride_bayes) 23 | S3method(print,gride_evolution) 24 | S3method(print,gride_mle) 25 | S3method(print,hidalgo_class) 26 | S3method(print,hidalgo_psm) 27 | S3method(print,mus) 28 | S3method(print,mus_Nq) 29 | S3method(print,summary.Hidalgo) 30 | S3method(print,summary.gride_bayes) 31 | S3method(print,summary.gride_mle) 32 | S3method(print,summary.twonn_bayes) 33 | S3method(print,summary.twonn_linfit) 34 | S3method(print,summary.twonn_mle) 35 | S3method(print,twonn_bayes) 36 | S3method(print,twonn_dec_by) 37 | S3method(print,twonn_dec_prop) 38 | S3method(print,twonn_linfit) 39 | S3method(print,twonn_mle) 40 | S3method(summary,Hidalgo) 41 | S3method(summary,gride_bayes) 42 | S3method(summary,gride_mle) 43 | S3method(summary,twonn_bayes) 44 | S3method(summary,twonn_linfit) 45 | S3method(summary,twonn_mle) 46 | export(Hidalgo) 47 | export(Swissroll) 48 | export(autoplot) 49 | export(clustering) 50 | export(compute_mus) 51 | export(credible_intervals) 52 | export(dgera) 53 | export(gride) 54 | export(gride_evolution) 55 | export(id_by_class) 56 | export(initial_values) 57 | export(posterior_means) 58 | export(posterior_medians) 59 | export(rgera) 60 | export(twonn) 61 | export(twonn_decimated) 62 | export(twonn_decimation) 63 | importFrom(Rcpp,sourceCpp) 64 | importFrom(ggplot2,autoplot) 65 | importFrom(ggplot2,ggplot) 66 | importFrom(graphics,abline) 67 | importFrom(graphics,boxplot) 68 | importFrom(graphics,curve) 69 | importFrom(graphics,hist) 70 | importFrom(graphics,legend) 71 | importFrom(graphics,lines) 72 | importFrom(graphics,matplot) 73 | importFrom(graphics,par) 74 | importFrom(graphics,plot) 75 | importFrom(graphics,points) 76 | importFrom(graphics,polygon) 77 | importFrom(graphics,segments) 78 | importFrom(rlang,.data) 79 | importFrom(stats,density) 80 | importFrom(stats,ts) 81 | useDynLib(intRinsic, .registration = TRUE) 82 | -------------------------------------------------------------------------------- /src/A_AUX.cpp: -------------------------------------------------------------------------------- 1 | #include "A_AUX.h" 2 | 3 | // [[Rcpp::export]] 4 | double log_Likelihood_double(double mu_obs, double d){ 5 | return log(d) - (d+1.0) * log(mu_obs) ; 6 | } 7 | 8 | arma::rowvec Stratified_operations_0(arma::vec x, 9 | arma::vec col1, 10 | int val1){ 11 | 12 | arma::uvec inds1 = find( col1 == val1 ) ; 13 | arma::vec x10 = x.elem(inds1); 14 | arma::rowvec Res(2); Res.zeros(); 15 | Res(0) = x10.size(); 16 | if(Res(0) > 0){ 17 | Res(1) = accu(log(x10)); 18 | } 19 | return Res; 20 | } 21 | 22 | // [[Rcpp::export]] 23 | arma::mat Groups_quantities(arma::colvec mu_obser, 24 | arma::vec 
Ci, 25 | int K){ // performs stratified operations for every k=1,..,K 26 | arma::mat A(K,2); 27 | for(int ls = 0; ls index_col(N); 74 | 75 | for(int i=0; i 1)) 72 | } else{ 73 | logB <- sum(log(1:(n2 - n1 - 1))) + sum(log(1:(n1-1))) - 74 | sum(log(1:(n2-1))) 75 | lognum <- log(d) + (d_n12 - 1) * (log(x ^ d - 1)) 76 | logden <- ((n2 - 1) * d + 1) * (log(x)) 77 | log_dens <- lognum - logden + sum(log(x > 1)) - logB 78 | } 79 | if (!log) { 80 | log_dens <- exp(log_dens) 81 | } 82 | 83 | structure(log_dens, class = c("dgera", class(log_dens))) 84 | 85 | return(log_dens) 86 | } 87 | -------------------------------------------------------------------------------- /man/clustering.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo_postproc.R 3 | \name{clustering} 4 | \alias{clustering} 5 | \alias{print.hidalgo_psm} 6 | \alias{plot.hidalgo_psm} 7 | \title{Posterior similarity matrix and partition estimation} 8 | \usage{ 9 | clustering( 10 | object, 11 | clustering_method = c("dendrogram", "salso"), 12 | K = 2, 13 | nCores = 1, 14 | ... 15 | ) 16 | 17 | \method{print}{hidalgo_psm}(x, ...) 18 | 19 | \method{plot}{hidalgo_psm}(x, ...) 20 | } 21 | \arguments{ 22 | \item{object}{object of class \code{Hidalgo}, the output of the 23 | \code{Hidalgo} function.} 24 | 25 | \item{clustering_method}{character indicating the method to use to perform 26 | clustering. It can be 27 | \describe{ 28 | \item{"dendrogram"}{thresholding the adjacency dendrogram with a given 29 | number (\code{K});} 30 | \item{"salso"}{estimation via minimization of several partition 31 | estimation criteria. 32 | The default loss function is the variation of information.} 33 | }} 34 | 35 | \item{K}{number of clusters to recover by thresholding the 36 | dendrogram obtained from the psm.} 37 | 38 | \item{nCores}{parameter for the \code{salso} function: the number of CPU 39 | cores to use. A value of zero indicates to use all cores on the system.} 40 | 41 | \item{...}{ignored.} 42 | 43 | \item{x}{object of class \code{hidalgo_psm}, obtained from the function 44 | \code{clustering()}.} 45 | } 46 | \value{ 47 | list containing the posterior similarity matrix (\code{psm}) and 48 | the estimated partition \code{clust}. 49 | } 50 | \description{ 51 | The function computes the posterior similarity (coclustering) matrix (psm) 52 | and estimates a representative partition of the observations from the MCMC 53 | output. The user can provide the desired number of clusters or estimate a 54 | optimal clustering solution by minimizing a loss function on the space 55 | of the partitions. 56 | In the latter case, the function uses the package \code{salso} 57 | (\href{https://cran.r-project.org/package=salso}{Dahl et al., 2021}), 58 | that the user needs to load. 59 | } 60 | \examples{ 61 | \donttest{ 62 | library(salso) 63 | X <- replicate(5,rnorm(500)) 64 | X[1:250,1:2] <- 0 65 | h_out <- Hidalgo(X) 66 | clustering(h_out) 67 | } 68 | } 69 | \references{ 70 | D. B. Dahl, D. J. Johnson, and P. Müller (2022), 71 | "Search Algorithms and Loss Functions for Bayesian Clustering", 72 | Journal of Computational and Graphical Statistics, 73 | \doi{10.1080/10618600.2022.2069779}. 74 | 75 | David B. Dahl, Devin J. Johnson and Peter Müller (2022). "salso: Search 76 | Algorithms and Loss Functions for Bayesian Clustering". 77 | R package version 78 | 0.3.0. 
\url{https://CRAN.R-project.org/package=salso} 79 | } 80 | \seealso{ 81 | \code{\link{Hidalgo}}, \code{\link[salso]{salso}} 82 | } 83 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # intRinsic 1.1.2 2 | 3 | * Removed the dependency from the package `latex2exp` 4 | 5 | # intRinsic 1.1.1 6 | 7 | * Replaced `arma::is_finite` with `std::isfinite` to ensure compatibility with the latest `RcppArmadillo` release and CRAN policies. 8 | * Implemented small code adjustments to address and resolve compiler warnings. 9 | 10 | # intRinsic 1.1.0 11 | 12 | * Re-organized the cpp code into multiple files 13 | * Solved an indexing problem with `log_Zeta` and `log_corr`, sometimes causing `na` with single-manifold data when using `Hidalgo()` 14 | * Translated some `R` parts of the `Hidalgo()` code into `C++` 15 | * Corrected a bug affecting the `Truncated-pointmass` approach 16 | * Updated the README file 17 | * Set up the new `rhub` checks workflows 18 | 19 | # intRinsic 1.0.2 20 | 21 | * Removed the dependency from the package `MCMCpack` 22 | * Fixed and updated the documentation (minor changes) 23 | 24 | # intRinsic 1.0.1 25 | 26 | * Fixed a typo in `compute_mus` when computing ratios from `dist_matrix`. Now the function handles also the `dissimilarity` class 27 | * Fixed a typo in `compute_mus` involving the class of the returned object 28 | * Adjusted some indentations in `print` methods 29 | * Adjusted the `autoplot` method related to `gride_mle` 30 | 31 | # intRinsic 1.0.0 32 | 33 | * This update marks the acceptance of the `intRinsic` vignette in the `Journal of Statistical Software`. Note: the DOI in the CITATION is for a new JSS publication that will be registered after publication on CRAN 34 | * Fixed some spelling typos and new-line inconsistencies 35 | * Removed the dependencies on the `as_tibble()` function (deprecated) 36 | * The function `twonn_decimated()` is now deprecated and will be removed in future releases. The function to use is `twonn_decimation()` 37 | * Now `gride_evolution()` and `twonn_decimation()` remove duplicated observations by default 38 | 39 | # intRinsic 0.2.2 40 | 41 | * Fixed bug in `Hidalgo()`, causing errors when setting `verbose = FALSE` 42 | * Fixed bug in `Hidalgo()` initialization. Added warning if `alpha_Dirichlet` is set so small it causes underflow problems 43 | * Fixed bug in `clustering()`. Now the returned `K` when the option `salso` is selected, is correct 44 | * Fixed bug in `print.mus()`. Now it checks if the passed object is a list 45 | * Corrected a few typos in the documentation 46 | * Updated `doi` of recently published references 47 | * Added additional checks in `compute_mus`: NA, symmetry, non-negativity 48 | 49 | # intRinsic 0.2.1 50 | 51 | * Fixed minor bugs involving the `gride()` family of functions 52 | * Added several methods (`print()`, `summary()`, `plot()`,...) 
to facilitate the 53 | extraction of the results 54 | * Now, the `autoplot()` method is directly exported from the `ggplot2` package 55 | 56 | # intRinsic 0.2.0 57 | 58 | * Submitted to CRAN 59 | * Released to the public the first official version 60 | 61 | -------------------------------------------------------------------------------- /R/twonn_dec_depr.R: -------------------------------------------------------------------------------- 1 | #' Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 2 | #' proportions 3 | #' 4 | #' The estimation of the \code{id} is related to the scale of the 5 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 6 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 7 | #' proposed to subsample the original dataset in order to induce greater 8 | #' distances between the data points. By investigating the estimates' evolution 9 | #' as a function of the size of the neighborhood, it is possible to obtain 10 | #' information about the validity of the modeling assumptions and the robustness 11 | #' of the model in the presence of noise. 12 | #' 13 | #' @param X data matrix with \code{n} observations and \code{D} variables. 14 | #' @param method method to use for decimation: 15 | #' \describe{ 16 | #' \item{\code{"steps"}}{the number of times the dataset is halved;} 17 | #' \item{\code{"proportion"}}{the dataset is subsampled according to a vector 18 | #' of proportions.} 19 | #' } 20 | #' @param proportions vector containing the fractions of the dataset to be 21 | #' considered. 22 | #' @param steps number of times the dataset is halved. 23 | #' @param seed random seed controlling the sequence of sub-sampled observations. 24 | #' 25 | #' 26 | #' @references 27 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 28 | #' dimension of datasets by a minimal neighborhood information." 29 | #' Scientific Reports, 7(1). 30 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 31 | #' 32 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 33 | #' dimension estimator." 34 | #' Scientific Reports, 12(20005). 35 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 36 | #' 37 | #' @return list containing the \code{TWO-NN} evolution 38 | #' (maximum likelihood estimation and confidence intervals), the average 39 | #' distance from the second NN, and the vector of proportions that were 40 | #' considered. According to the chosen estimation method, it is accompanied with 41 | #' the vector of proportions or halving steps considered. 
42 | #' 43 | #' @export 44 | #' 45 | #' @seealso \code{\link{twonn}} 46 | #' 47 | twonn_decimated <- function(X, 48 | method = c("steps", "proportions"), 49 | steps = 0, 50 | proportions = 1, 51 | seed = NULL) { 52 | .Deprecated("twonn_decimation") 53 | method <- match.arg(method) 54 | 55 | if (steps == 0 & length(proportions) == 1) { 56 | if (proportions == 1) 57 | method <- "mle" 58 | } 59 | 60 | res <- switch( 61 | method, 62 | steps = twonn_dec_by(X = X, 63 | steps = steps, 64 | seed = seed), 65 | proportions = twonn_dec_prop( 66 | X = X, 67 | proportions = proportions, 68 | seed = seed 69 | ), 70 | mle = twonn(X = X, 71 | method = "mle") 72 | ) 73 | return(res) 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/G_GRIDE.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // [[Rcpp::export]] 4 | double gride_log_likelihood(double d, 5 | int n1, 6 | int n2, 7 | arma::colvec mus_n1_n2){ 8 | 9 | 10 | if(n2 1 ) ); 27 | } else{ 28 | logB = arma::accu(log(vn1)) + arma::accu(log(vn12))-arma::accu(log(vn2)); 29 | lognum = nn * log(d) + 30 | (d_n12 - 1) * arma::accu( log( pow( mus_n1_n2, d) - 1 ) ); 31 | logden = ((n2 - 1) * d + 1) * sum(log(mus_n1_n2)); 32 | log_dens = lognum - logden + sum(log(mus_n1_n2 > 1.)) - logB * nn; 33 | } 34 | 35 | return(log_dens); 36 | } 37 | 38 | 39 | // [[Rcpp::export]] 40 | double gride_log_posterior(double z, 41 | int n1, 42 | int n2, 43 | double a_d, 44 | double b_d, 45 | arma::colvec mus_n1_n2){ 46 | 47 | 48 | if(n2 < n1){ 49 | Rcpp::stop("n2 should be greater than n1"); 50 | } 51 | 52 | double d = exp(z) + 1; 53 | double log_post = gride_log_likelihood(d, n1, n2, mus_n1_n2) + 54 | z + 55 | R::dgamma( d, a_d, 1/b_d, 1 ); 56 | 57 | return(log_post); 58 | } 59 | 60 | // [[Rcpp::export]] 61 | arma::colvec gride_mh_sampler(double start_d, 62 | int n1, 63 | int n2, 64 | double a_d, 65 | double b_d, 66 | arma::colvec mus_n1_n2, 67 | int nsim, 68 | int burn_in, 69 | double sigma){ 70 | 71 | arma::colvec mh_sample(nsim); 72 | 73 | double oldpar, newpar, alpha; 74 | oldpar = start_d; 75 | for(int i=0; i<(nsim + burn_in); i++) { 76 | newpar = arma::randn(1)[0] * sigma + oldpar; 77 | alpha = gride_log_posterior( 78 | newpar, 79 | n1, 80 | n2, 81 | a_d, 82 | b_d, 83 | mus_n1_n2) - 84 | gride_log_posterior( 85 | oldpar, 86 | n1, 87 | n2, 88 | a_d, 89 | b_d, 90 | mus_n1_n2); 91 | 92 | if (log(arma::randu(1)[0]) < alpha) { 93 | oldpar = newpar; 94 | } 95 | if(i >= burn_in){ 96 | mh_sample(i-burn_in) = oldpar; 97 | } 98 | } 99 | return(mh_sample); 100 | } 101 | 102 | -------------------------------------------------------------------------------- /R/twonn_dec.R: -------------------------------------------------------------------------------- 1 | #' Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 2 | #' proportions 3 | #' 4 | #' The estimation of the \code{id} is related to the scale of the 5 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 6 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 7 | #' proposed to subsample the original dataset in order to induce greater 8 | #' distances between the data points. By investigating the estimates' evolution 9 | #' as a function of the size of the neighborhood, it is possible to obtain 10 | #' information about the validity of the modeling assumptions and the robustness 11 | #' of the model in the presence of noise. 
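# A minimal R analogue of the Metropolis-Hastings scheme coded in G_GRIDE.cpp
# above: the chain moves on z = log(d - 1), so the intrinsic dimension d stays
# above 1, and the "+ z" term accounts for the change of variables. The target
# below is a stand-in (Pareto(1, d) likelihood with a flat prior on d), not the
# Gride posterior used by the package.
set.seed(2)
mu <- runif(500)^(-1 / 3)              # synthetic TWO-NN ratios with d = 3
log_target <- function(z) {
  d <- exp(z) + 1
  length(mu) * log(d) - (d + 1) * sum(log(mu)) + z
}
nsim <- 3000; burn_in <- 500; sigma <- 0.3
z   <- 0                               # i.e., start from d = 2
out <- numeric(nsim)
for (i in seq_len(nsim + burn_in)) {
  z_new <- rnorm(1, z, sigma)
  if (log(runif(1)) < log_target(z_new) - log_target(z)) z <- z_new
  if (i > burn_in) out[i - burn_in] <- exp(z) + 1
}
mean(out)                              # close to the true value 3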
12 | #' 13 | #' @param X data matrix with \code{n} observations and \code{D} variables. 14 | #' @param method method to use for decimation: 15 | #' \describe{ 16 | #' \item{\code{"steps"}}{the number of times the dataset is halved;} 17 | #' \item{\code{"proportion"}}{the dataset is subsampled according to a vector 18 | #' of proportions.} 19 | #' } 20 | #' @param proportions vector containing the fractions of the dataset to be 21 | #' considered. 22 | #' @param steps number of times the dataset is halved. 23 | #' @param seed random seed controlling the sequence of sub-sampled observations. 24 | #' 25 | #' 26 | #' @references 27 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 28 | #' dimension of datasets by a minimal neighborhood information." 29 | #' Scientific Reports, 7(1). 30 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 31 | #' 32 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 33 | #' dimension estimator." 34 | #' Scientific Reports, 12(20005). 35 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 36 | #' 37 | #' @return list containing the \code{TWO-NN} evolution 38 | #' (maximum likelihood estimation and confidence intervals), the average 39 | #' distance from the second NN, and the vector of proportions that were 40 | #' considered. According to the chosen estimation method, it is accompanied with 41 | #' the vector of proportions or halving steps considered. 42 | #' 43 | #' @export 44 | #' 45 | #' @seealso \code{\link{twonn}} 46 | #' 47 | #' @examples 48 | #' X <- replicate(4,rnorm(1000)) 49 | #' twonn_decimation(X,,method = "proportions", 50 | #' proportions = c(1,.5,.2,.1,.01)) 51 | #' 52 | twonn_decimation <- function(X, 53 | method = c("steps", "proportions"), 54 | steps = 0, 55 | proportions = 1, 56 | seed = NULL) { 57 | method <- match.arg(method) 58 | 59 | if (steps == 0 & length(proportions) == 1) { 60 | if (proportions == 1) 61 | method <- "mle" 62 | } 63 | 64 | res <- switch( 65 | method, 66 | steps = twonn_dec_by(X = X, 67 | steps = steps, 68 | seed = seed), 69 | proportions = twonn_dec_prop( 70 | X = X, 71 | proportions = proportions, 72 | seed = seed 73 | ), 74 | mle = twonn(X = X, 75 | method = "mle") 76 | ) 77 | return(res) 78 | } 79 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 
24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /man/twonn_decimation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/twonn_dec.R, R/twonn_dec_prop.R, 3 | % R/twonn_dec_steps.R 4 | \name{twonn_decimation} 5 | \alias{twonn_decimation} 6 | \alias{print.twonn_dec_prop} 7 | \alias{plot.twonn_dec_prop} 8 | \alias{print.twonn_dec_by} 9 | \alias{plot.twonn_dec_by} 10 | \title{Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 11 | proportions} 12 | \usage{ 13 | twonn_decimation( 14 | X, 15 | method = c("steps", "proportions"), 16 | steps = 0, 17 | proportions = 1, 18 | seed = NULL 19 | ) 20 | 21 | \method{print}{twonn_dec_prop}(x, ...) 22 | 23 | \method{plot}{twonn_dec_prop}(x, CI = FALSE, proportions = FALSE, ...) 24 | 25 | \method{print}{twonn_dec_by}(x, ...) 26 | 27 | \method{plot}{twonn_dec_by}(x, CI = FALSE, steps = FALSE, ...) 
28 | } 29 | \arguments{ 30 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 31 | 32 | \item{method}{method to use for decimation: 33 | \describe{ 34 | \item{\code{"steps"}}{the number of times the dataset is halved;} 35 | \item{\code{"proportion"}}{the dataset is subsampled according to a vector 36 | of proportions.} 37 | }} 38 | 39 | \item{steps}{logical, if \code{TRUE}, the x-axis reports the number of halving steps. 40 | If \code{FALSE}, the x-axis reports the log10 average distance.} 41 | 42 | \item{proportions}{logical, if \code{TRUE}, the x-axis reports the number of decimating proportions. 43 | If \code{FALSE}, the x-axis reports the log10 average distance.} 44 | 45 | \item{seed}{random seed controlling the sequence of sub-sampled observations.} 46 | 47 | \item{x}{object of class \code{twonn_dec_prop}, obtained from the function 48 | \code{twonn_dec_prop()}.} 49 | 50 | \item{...}{ignored.} 51 | 52 | \item{CI}{logical, if \code{TRUE}, the confidence intervals are plotted} 53 | } 54 | \value{ 55 | list containing the \code{TWO-NN} evolution 56 | (maximum likelihood estimation and confidence intervals), the average 57 | distance from the second NN, and the vector of proportions that were 58 | considered. According to the chosen estimation method, it is accompanied with 59 | the vector of proportions or halving steps considered. 60 | } 61 | \description{ 62 | The estimation of the \code{id} is related to the scale of the 63 | dataset. To escape the local reach of the \code{TWO-NN} estimator, 64 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 65 | proposed to subsample the original dataset in order to induce greater 66 | distances between the data points. By investigating the estimates' evolution 67 | as a function of the size of the neighborhood, it is possible to obtain 68 | information about the validity of the modeling assumptions and the robustness 69 | of the model in the presence of noise. 70 | } 71 | \examples{ 72 | X <- replicate(4,rnorm(1000)) 73 | twonn_decimation(X,,method = "proportions", 74 | proportions = c(1,.5,.2,.1,.01)) 75 | 76 | } 77 | \references{ 78 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 79 | dimension of datasets by a minimal neighborhood information." 80 | Scientific Reports, 7(1). 81 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 82 | 83 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 84 | dimension estimator." 85 | Scientific Reports, 12(20005). 86 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 87 | } 88 | \seealso{ 89 | \code{\link{twonn}} 90 | } 91 | -------------------------------------------------------------------------------- /R/gride_mle_aux.R: -------------------------------------------------------------------------------- 1 | #' Bootstrap sample generator for the \code{Gride} MLE 2 | #' 3 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 4 | #' @param n1 order of the first NN considered. 5 | #' @param n2 order of the second NN considered. 6 | #' @param nsim integer, the number of bootstrap replications to consider. 7 | #' @param upper_D nominal dimension of the dataset (upper bound for the 8 | #' maximization routine). 9 | #' 10 | #' @keywords Internal 11 | #' @noRd 12 | #' 13 | #' @return list containing the MLE, confidence interval, and the 14 | #' bootstrap sample used for the estimation. 
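# The parametric-bootstrap recipe described above, sketched for the simpler
# TWO-NN case: under the model the ratios are Pareto(1, d), so new samples can
# be drawn as runif(n)^(-1 / d_hat), re-estimated, and summarized by quantiles.
# gride_bootstrap() follows the same logic, using rgera() and the Gride MLE.
set.seed(42)
n     <- 500
mu    <- runif(n)^(-1 / 3)                       # data generated with d = 3
d_hat <- (n - 1) / sum(log(mu))                  # TWO-NN MLE
boot  <- replicate(2000, {
  mu_b <- runif(n)^(-1 / d_hat)                  # simulate from the fitted model
  (n - 1) / sum(log(mu_b))                       # re-estimate on each replicate
})
quantile(boot, c(0.025, 0.975))                  # 95% bootstrap interval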
15 | #' 16 | gride_bootstrap <- function(mus_n1_n2, 17 | n1 = 1, 18 | n2 = 2, 19 | nsim = 2000, 20 | upper_D = NULL) { 21 | 22 | if (n2 < n1) { 23 | stop("n2 should be greater than n1", call. = FALSE) 24 | } 25 | 26 | if (is.null(upper_D)) { 27 | stop("Please provide the nominal dimension of the dataset D in upper_D", 28 | call. = FALSE) 29 | } 30 | 31 | n <- length(mus_n1_n2) 32 | 33 | mle_est <- gride_mle_point( 34 | mus_n1_n2 = mus_n1_n2, 35 | n1 = n1, 36 | n2 = n2, 37 | upper_D = upper_D 38 | ) 39 | 40 | boot_mus_samples <- replicate(nsim, rgera( 41 | nsim = n, 42 | n1 = n1, 43 | n2 = n2, 44 | d = mle_est 45 | )) 46 | 47 | bootstrap_sample <- apply(boot_mus_samples, 48 | 2, 49 | function(x) 50 | stats::optimize( 51 | gride_log_likelihood, 52 | interval = c(1.01, upper_D), 53 | n1 = n1, 54 | n2 = n2, 55 | mus_n1_n2 = x, 56 | maximum = TRUE 57 | )$max) 58 | 59 | Res <- list(mle = mle_est, boot_sample = bootstrap_sample) 60 | 61 | structure(Res, class = c("bootstrap_sample", "gride", class(Res))) 62 | 63 | } 64 | 65 | #' Maximum Likelihood Estimator (MLE) for the id using generic ratio statistics 66 | #' 67 | #' @param mus_n1_n2 vector of generalized order NN distance ratios statistics. 68 | #' @param n1 order of the first NN considered. Default is 1. 69 | #' @param n2 order of the second NN considered. Default is 2. 70 | #' @param upper_D nominal dimension of the dataset (upper bound for the 71 | #' maximization routine). 72 | #' 73 | #' @keywords Internal 74 | #' @noRd 75 | #' 76 | #' @return Gride MLE point estimate obtained via numeric optimization. 77 | #' 78 | gride_mle_point <- function(mus_n1_n2, 79 | n1 = 1, 80 | n2 = 2, 81 | upper_D = NULL) { 82 | if (class(mus_n1_n2)[1] == "mus") { 83 | n1 <- attr(mus_n1_n2, which = "n1") 84 | n2 <- attr(mus_n1_n2, which = "n2") 85 | } 86 | 87 | if (n2 < n1) { 88 | stop("n2 should be greater than n1", call. = FALSE) 89 | } 90 | 91 | if (is.null(upper_D)) { 92 | stop("Please provide the nominal dimension of the dataset D in upper_D", 93 | call. = FALSE) 94 | } 95 | 96 | O <- stats::optimize( 97 | gride_log_likelihood, 98 | interval = c(1.01, upper_D), 99 | n1 = n1, 100 | n2 = n2, 101 | mus_n1_n2 = mus_n1_n2, 102 | maximum = TRUE 103 | ) 104 | 105 | 106 | structure(O$max, class = c("mle_point", "gride", class(O$max))) 107 | 108 | } 109 | -------------------------------------------------------------------------------- /man/autoplot.Hidalgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_hidalgo.R 3 | \name{autoplot.Hidalgo} 4 | \alias{autoplot.Hidalgo} 5 | \title{Plot the output of the \code{Hidalgo} function} 6 | \usage{ 7 | \method{autoplot}{Hidalgo}( 8 | object, 9 | type = c("raw_chains", "point_estimates", "class_plot", "clustering"), 10 | class_plot_type = c("histogram", "density", "boxplot", "violin"), 11 | class = NULL, 12 | psm = NULL, 13 | clust = NULL, 14 | title = NULL, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{object}{object of class \code{Hidalgo}, the output of the 20 | \code{Hidalgo()} function.} 21 | 22 | \item{type}{character that indicates the requested type of plot. 
23 | It can be: 24 | \describe{ 25 | \item{\code{"raw_chains"}}{plot the MCMC and the ergodic means NOT corrected 26 | for label switching;} 27 | \item{\code{"point_estimates"}}{plot the posterior mean and median of the id 28 | for each observation, after the chains are processed for label switching;} 29 | \item{\code{"class_plot"}}{plot the estimated id distributions stratified by 30 | the groups specified in the class vector;} 31 | \item{\code{"clustering"}}{plot the posterior coclustering matrix. Rows and 32 | columns can be stratified by an exogenous class and/or a clustering 33 | solution.} 34 | }} 35 | 36 | \item{class_plot_type}{if \code{type} is chosen to be \code{"class_plot"}, 37 | one can plot the stratified id estimates with a \code{"density"} plot or a 38 | \code{"histogram"}, or using \code{"boxplots"} or \code{"violin"} plots.} 39 | 40 | \item{class}{factor variable used to stratify observations according to 41 | their the \code{id} estimates.} 42 | 43 | \item{psm}{posterior similarity matrix containing the posterior probability 44 | of coclustering.} 45 | 46 | \item{clust}{vector containing the cluster membership labels.} 47 | 48 | \item{title}{character string used as title of the plot.} 49 | 50 | \item{...}{other arguments passed to specific methods.} 51 | } 52 | \value{ 53 | a \code{\link[ggplot2]{ggplot2}} object produced by the function 54 | according to the \code{type} chosen. 55 | More precisely, if 56 | \describe{ 57 | \item{\code{method = "raw_chains"}}{The functions produces the traceplots 58 | of the parameters \code{d_k}, for \code{k=1...K}. 59 | The ergodic means for all the chains are superimposed. The \code{K} chains 60 | that are plotted are not post-processed. 61 | Ergo, they are subjected to label switching;} 62 | \item{\code{method = "point_estimates"}}{The function returns two 63 | scatterplots displaying 64 | the posterior mean and median \code{id} for each observation, after that the 65 | MCMC has been postprocessed to handle label switching;} 66 | \item{\code{method = "class_plot"}}{The function returns a plot that can be 67 | used to visually assess the relationship between the posterior \code{id} 68 | estimates and an external, categorical variable. The type of plot varies 69 | according to the specification of \code{class_plot_type}, and it can be 70 | either a set of boxplots or violin plots or a collection of overlapping 71 | densities or histograms;} 72 | \item{\code{method = "clustering"}}{The function displays the posterior 73 | similarity matrix, to allow the study of the clustering structure present in 74 | the data estimated via the mixture model. Rows and columns can be stratified 75 | by an exogenous class and/or a clustering structure.} 76 | } 77 | } 78 | \description{ 79 | Use this method without the \code{.Hidalgo} suffix. 80 | It produces several plots to explore the output of 81 | the \code{Hidalgo} model. 
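# What the "clustering" option described above displays: a posterior similarity
# matrix, i.e. the relative frequency with which two observations are allocated
# to the same mixture component across MCMC draws. A base-R sketch from a fake
# (draws x observations) matrix of cluster labels:
set.seed(3)
labels <- matrix(sample(1:3, 100 * 20, replace = TRUE), nrow = 100)
n_obs  <- ncol(labels)
psm    <- matrix(1, n_obs, n_obs)
for (i in 1:(n_obs - 1)) for (j in (i + 1):n_obs) {
  psm[i, j] <- psm[j, i] <- mean(labels[, i] == labels[, j])
}
round(psm[1:5, 1:5], 2)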
82 | } 83 | \seealso{ 84 | \code{\link{Hidalgo}} 85 | 86 | Other autoplot methods: 87 | \code{\link{autoplot.gride_bayes}()}, 88 | \code{\link{autoplot.twonn_bayes}()}, 89 | \code{\link{autoplot.twonn_linfit}()}, 90 | \code{\link{autoplot.twonn_mle}()} 91 | } 92 | \concept{autoplot methods} 93 | -------------------------------------------------------------------------------- /R/twonn_linfit.R: -------------------------------------------------------------------------------- 1 | #' Least Squares Estimator for the \code{TWO-NN} model 2 | #' 3 | #' The function fits the \code{TWO-NN} model via least squares estimation, as 4 | #' originally proposed in 5 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017}. 6 | #' 7 | #' @param mus vector of second to first NN distance ratios. 8 | #' @param alpha confidence level needed for the computation of the confidence 9 | #' interval. 10 | #' @param c_trimmed proportion of trimmed observations. 11 | #' 12 | #' @return object of class \code{twonn_linfit}, which is a list containing the 13 | #' least squares estimate of the intrinsic dimension, along with the \code{lm()} 14 | #' output used for the computation. 15 | #' 16 | #' @seealso \code{\link{twonn}}, \code{\link{autoplot.twonn_linfit}} 17 | #' @keywords Internal 18 | #' 19 | #' @noRd 20 | #' 21 | #' @references 22 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 23 | #' dimension of datasets by a minimal neighborhood information." 24 | #' Scientific Reports, 7(1). 25 | #' ISSN 20452322, doi: 10.1038/s41598-017-11873-y. 26 | #' 27 | twonn_linfit <- function(mus, 28 | alpha = 0.95, 29 | c_trimmed = .01) { 30 | c_considered <- 1 - c_trimmed 31 | n <- n_original <- base::length(mus) 32 | 33 | if (c_trimmed > 0) { 34 | mus <- base::sort(mus)[1:floor(n * c_considered)] 35 | n <- base::length(mus) 36 | } 37 | 38 | F_musi <- (0:(n - 1)) / n 39 | y <- -base::log(1 - (F_musi)) 40 | x <- base::sort(base::log(mus)) 41 | 42 | modlin1 <- stats::lm(y ~ x - 1) 43 | 44 | Res <- numeric(3) 45 | Res[c(1, 3)] <- stats::confint.lm(modlin1, level = alpha) 46 | Res[2] <- stats::coef(modlin1) 47 | Res <- list( 48 | est = Res, 49 | lm_output = modlin1, 50 | cl = alpha, 51 | c_trimmed = c_trimmed, 52 | n_original = n_original, 53 | n = n 54 | ) 55 | 56 | structure(Res, class = c("twonn_linfit", class(Res))) 57 | } 58 | 59 | #' @name twonn 60 | #' 61 | #' @param x object of class \code{twonn_mle}, obtained from the function 62 | #' \code{twonn_mle()}. 63 | #' @param ... ignored. 64 | #' 65 | #' 66 | #' @export 67 | print.twonn_linfit <- function(x, ...) { 68 | y <- c("TWONN - Linfit" = x[["est"]][2]) 69 | cat(y) 70 | cat("\n") 71 | invisible(x) 72 | } 73 | 74 | #' @name twonn 75 | #' 76 | #' @param object object of class \code{twonn_mle}, obtained from the function 77 | #' \code{twonn_mle()}. 78 | #' @param ... ignored. 79 | #' 80 | #' @export 81 | summary.twonn_linfit <- function(object, ...) { 82 | y <- cbind( 83 | `Original sample size` = object[["n_original"]], 84 | `Used sample size` = object[["n"]], 85 | `Trimming proportion` = object[["c_trimmed"]], 86 | `Confidence level` = object[["cl"]], 87 | `Lower Bound` = object[["est"]][1], 88 | `Estimate` = object[["est"]][2], 89 | `Upper Bound` = object[["est"]][3] 90 | ) 91 | structure(y, class = c("summary.twonn_linfit","matrix")) 92 | } 93 | 94 | 95 | #' @name twonn 96 | #' 97 | #' @param x object of class \code{twonn_mle}, obtained from the function 98 | #' \code{twonn_mle()}. 99 | #' @param ... ignored. 
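# The least-squares idea implemented in twonn_linfit() above, replayed on
# simulated data: under the TWO-NN model the ratios are Pareto(1, d), so
# -log(1 - F(mu)) is linear in log(mu) with slope d and no intercept.
set.seed(4)
mu <- runif(1000)^(-1 / 4)            # ratios with true id d = 4
n  <- length(mu)
y  <- -log(1 - (0:(n - 1)) / n)       # empirical CDF transform, as in the code above
x  <- sort(log(mu))
coef(lm(y ~ x - 1))                   # slope through the origin, close to 4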
100 | #' 101 | #' @export 102 | print.summary.twonn_linfit <- function(x, ...) { 103 | cat("Model: TWO-NN\n") 104 | cat("Method: Least Squares Estimation\n") 105 | cat(paste0( 106 | "Sample size: ", 107 | x[1], 108 | ", Obs. used: ", 109 | x[2], 110 | ". Trimming proportion: ", 111 | 100 * x[3], 112 | "%\n" 113 | )) 114 | cat(paste0("ID estimates (confidence level: ", x[4], ")")) 115 | y <- cbind( 116 | `Lower Bound` = x[5], 117 | `Estimate` = x[6], 118 | `Upper Bound` = x[7] 119 | ) 120 | print(knitr::kable(y)) 121 | cat("\n") 122 | invisible(x) 123 | } 124 | 125 | 126 | 127 | #' @name twonn 128 | #' 129 | #' @param x object of class \code{twonn_linfit}, the output of the 130 | #' \code{twonn} function when \code{method = "linfit"}. 131 | #' 132 | #' @export 133 | #' 134 | plot.twonn_linfit <- function(x, 135 | ...) { 136 | lmod <- x$lm_output 137 | xx <- lmod$model$x 138 | y <- lmod$model$y 139 | Res <- x$est 140 | 141 | plot( y ~ xx, pch = 21, bg = 1,cex=.4, 142 | ylab = ("-log(1-(i/N))"), 143 | xlab = expression(log(mu))) 144 | abline(lmod,col=2,lwd=1.3) 145 | graphics::title("TWO-NN Linear Fit") 146 | expr <- expression(R^2) 147 | legend("topleft",legend = paste0("ID : ", round(Res[2], 3),"\n" , 148 | expr," : ", round(summary(lmod)$r.squared, 3) ) ) 149 | invisible() 150 | 151 | } 152 | -------------------------------------------------------------------------------- /man/gride.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gride.R, R/gride_bayes.R, R/gride_mle.R 3 | \name{gride} 4 | \alias{gride} 5 | \alias{print.gride_bayes} 6 | \alias{summary.gride_bayes} 7 | \alias{print.summary.gride_bayes} 8 | \alias{plot.gride_bayes} 9 | \alias{print.gride_mle} 10 | \alias{summary.gride_mle} 11 | \alias{print.summary.gride_mle} 12 | \alias{plot.gride_mle} 13 | \title{\code{Gride}: the Generalized Ratios ID Estimator} 14 | \usage{ 15 | gride( 16 | X = NULL, 17 | dist_mat = NULL, 18 | mus_n1_n2 = NULL, 19 | method = c("mle", "bayes"), 20 | n1 = 1, 21 | n2 = 2, 22 | alpha = 0.95, 23 | nsim = 5000, 24 | upper_D = 50, 25 | burn_in = 2000, 26 | sigma = 0.5, 27 | start_d = NULL, 28 | a_d = 1, 29 | b_d = 1, 30 | ... 31 | ) 32 | 33 | \method{print}{gride_bayes}(x, ...) 34 | 35 | \method{summary}{gride_bayes}(object, ...) 36 | 37 | \method{print}{summary.gride_bayes}(x, ...) 38 | 39 | \method{plot}{gride_bayes}(x, ...) 40 | 41 | \method{print}{gride_mle}(x, ...) 42 | 43 | \method{summary}{gride_mle}(object, ...) 44 | 45 | \method{print}{summary.gride_mle}(x, ...) 46 | 47 | \method{plot}{gride_mle}(x, ...) 48 | } 49 | \arguments{ 50 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 51 | 52 | \item{dist_mat}{distance matrix computed between the \code{n} observations.} 53 | 54 | \item{mus_n1_n2}{vector of generalized order NN distance ratios.} 55 | 56 | \item{method}{the chosen estimation method. It can be 57 | \describe{ 58 | \item{\code{"mle"}}{maximum likelihood estimation;} 59 | \item{\code{"bayes"}}{estimation with the Bayesian approach.} 60 | }} 61 | 62 | \item{n1}{order of the first NN considered. Default is 1.} 63 | 64 | \item{n2}{order of the second NN considered. 
Default is 2.} 65 | 66 | \item{alpha}{confidence level (for \code{mle}) or posterior probability in 67 | the credible interval (\code{bayes}).} 68 | 69 | \item{nsim}{number of bootstrap samples or posterior simulation to consider.} 70 | 71 | \item{upper_D}{nominal dimension of the dataset (upper bound for the 72 | maximization routine).} 73 | 74 | \item{burn_in}{number of iterations to discard from the MCMC sample. 75 | Applicable if \code{method = "bayes"}.} 76 | 77 | \item{sigma}{standard deviation of the Gaussian proposal used in the MH step. 78 | Applicable if \code{method = "bayes"}.} 79 | 80 | \item{start_d}{initial value for the MCMC chain. If \code{NULL}, 81 | the MLE is used. Applicable if \code{method = "bayes"}.} 82 | 83 | \item{a_d}{shape parameter of the Gamma prior distribution for \code{d}. 84 | Applicable if \code{method = "bayes"}.} 85 | 86 | \item{b_d}{rate parameter of the Gamma prior distribution for \code{d}. 87 | Applicable if \code{method = "bayes"}.} 88 | 89 | \item{...}{other arguments passed to specific methods.} 90 | 91 | \item{x}{object of class \code{gride_mle}. 92 | It is obtained using the output of the \code{gride} function when 93 | \code{method = "mle"}.} 94 | 95 | \item{object}{object of class \code{gride_mle}, obtained from the function 96 | \code{gride_mle()}.} 97 | } 98 | \value{ 99 | a list containing the \code{id} estimate obtained with the Gride 100 | method, along with the relative confidence or credible interval 101 | (object \code{est}). The class of the output object changes according to the 102 | chosen \code{method}. Similarly, 103 | the remaining elements stored in the list reports a summary of the key 104 | quantities involved in the estimation process, e.g., 105 | the NN orders \code{n1} and \code{n2}. 106 | } 107 | \description{ 108 | The function can fit the Generalized ratios ID estimator under both the 109 | frequentist and the Bayesian frameworks, depending on the specification of 110 | the argument \code{method}. The model is the direct extension of the 111 | \code{TWO-NN} method presented in 112 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 113 | . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} \ 114 | for more details. 115 | } 116 | \examples{ 117 | \donttest{ 118 | X <- replicate(2,rnorm(500)) 119 | dm <- as.matrix(dist(X,method = "manhattan")) 120 | res <- gride(X, nsim = 500) 121 | res 122 | plot(res) 123 | gride(dist_mat = dm, method = "bayes", upper_D =10, 124 | nsim = 500, burn_in = 100) 125 | } 126 | } 127 | \references{ 128 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 129 | dimension of datasets by a minimal neighborhood information." 130 | Scientific Reports, 7(1). 131 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 132 | 133 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 134 | dimension estimator." 135 | Scientific Reports, 12(20005). 136 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 137 | } 138 | -------------------------------------------------------------------------------- /R/autoplot_hidalgo.R: -------------------------------------------------------------------------------- 1 | #' @importFrom ggplot2 autoplot 2 | #' @export 3 | ggplot2::autoplot 4 | 5 | 6 | #' Plot the output of the \code{Hidalgo} function 7 | #' 8 | #' Use this method without the \code{.Hidalgo} suffix. 9 | #' It produces several plots to explore the output of 10 | #' the \code{Hidalgo} model. 
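# A short usage sketch complementing the gride() documentation shown above:
# larger NN orders (n1, n2) probe the data at a coarser scale than the default
# (1, 2) pair. The calls follow the documented signature and are meant as an
# illustration only.
library(intRinsic)
X <- replicate(3, rnorm(500))
gride(X, n1 = 1, n2 = 2, nsim = 500)   # local scale (default orders)
gride(X, n1 = 4, n2 = 8, nsim = 500)   # coarser scale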
11 | #' 12 | #' @param object object of class \code{Hidalgo}, the output of the 13 | #' \code{Hidalgo()} function. 14 | #' @param type character that indicates the requested type of plot. 15 | #' It can be: 16 | #' \describe{ 17 | #' \item{\code{"raw_chains"}}{plot the MCMC and the ergodic means NOT corrected 18 | #' for label switching;} 19 | #' \item{\code{"point_estimates"}}{plot the posterior mean and median of the id 20 | #' for each observation, after the chains are processed for label switching;} 21 | #' \item{\code{"class_plot"}}{plot the estimated id distributions stratified by 22 | #' the groups specified in the class vector;} 23 | #' \item{\code{"clustering"}}{plot the posterior coclustering matrix. Rows and 24 | #' columns can be stratified by an exogenous class and/or a clustering 25 | #' solution.} 26 | #' } 27 | #' @param class_plot_type if \code{type} is chosen to be \code{"class_plot"}, 28 | #' one can plot the stratified id estimates with a \code{"density"} plot or a 29 | #' \code{"histogram"}, or using \code{"boxplot"} or \code{"violin"} plots. 30 | #' @param class factor variable used to stratify observations according to 31 | #' their \code{id} estimates. 32 | #' @param psm posterior similarity matrix containing the posterior probability 33 | #' of coclustering. 34 | #' @param clust vector containing the cluster membership labels. 35 | #' @param title character string used as title of the plot. 36 | #' @param ... other arguments passed to specific methods. 37 | #' 38 | #' @rdname autoplot.Hidalgo 39 | #' 40 | #' @return a \code{\link[ggplot2]{ggplot2}} object produced by the function 41 | #' according to the \code{type} chosen. 42 | #' More precisely, if 43 | #' \describe{ 44 | #' \item{\code{method = "raw_chains"}}{The function produces the traceplots 45 | #' of the parameters \code{d_k}, for \code{k=1...K}. 46 | #' The ergodic means for all the chains are superimposed. The \code{K} chains 47 | #' that are plotted are not post-processed. 48 | #' Therefore, they are subject to label switching;} 49 | #' \item{\code{method = "point_estimates"}}{The function returns two 50 | #' scatterplots displaying 51 | #' the posterior mean and median \code{id} for each observation, after the 52 | #' MCMC has been post-processed to handle label switching;} 53 | #' \item{\code{method = "class_plot"}}{The function returns a plot that can be 54 | #' used to visually assess the relationship between the posterior \code{id} 55 | #' estimates and an external, categorical variable. The type of plot varies 56 | #' according to the specification of \code{class_plot_type}, and it can be 57 | #' either a set of boxplots or violin plots or a collection of overlapping 58 | #' densities or histograms;} 59 | #' \item{\code{method = "clustering"}}{The function displays the posterior 60 | #' similarity matrix, to allow the study of the clustering structure present in 61 | #' the data estimated via the mixture model.
Rows and columns can be stratified 62 | #' by an exogenous class and/or a clustering structure.} 63 | #' } 64 | #' 65 | #' 66 | #' @seealso \code{\link{Hidalgo}} 67 | #' 68 | #' 69 | #' @importFrom rlang .data 70 | #' @importFrom ggplot2 autoplot 71 | #' 72 | #' @export 73 | #' @family autoplot methods 74 | #' 75 | autoplot.Hidalgo <- function(object, 76 | type = c("raw_chains", 77 | "point_estimates", 78 | "class_plot", 79 | "clustering"), 80 | class_plot_type = c("histogram", "density", 81 | "boxplot", "violin"), 82 | class = NULL, 83 | psm = NULL, 84 | clust = NULL, 85 | title = NULL, 86 | ...) { 87 | type <- match.arg(type) 88 | 89 | if (type == "class_plot") { 90 | class_plot_type <- match.arg(class_plot_type) 91 | if (is.null(class)) 92 | stop("Please provide the factor variable to stratify the id estimates") 93 | } 94 | 95 | G1 <- switch( 96 | type, 97 | raw_chains = ggHid_chains(object), 98 | point_estimates = ggHid_mean_median(object), 99 | class_plot = ggHid_class(object, class, class_plot_type), 100 | clustering = ggHid_psm(object, psm, class, clust, ...) 101 | ) 102 | 103 | if (!is.null(title)) { 104 | G1 <- G1 + ggplot2::ggtitle(title) 105 | } 106 | 107 | G1 108 | } 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /R/twoNN.R: -------------------------------------------------------------------------------- 1 | #' \code{TWO-NN} estimator 2 | #' 3 | #' The function can fit the two-nearest neighbor estimator within the maximum 4 | #' likelihood and the Bayesian frameworks. Also, one can obtain the estimates 5 | #' using least squares estimation, depending on the specification of the 6 | #' argument \code{method}. This model has been originally presented in 7 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 8 | #' . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 9 | #' for more details. 10 | #' 11 | #' @param X data matrix with \code{n} observations and \code{D} variables. 12 | #' @param dist_mat distance matrix computed between the \code{n} observations. 13 | #' @param mus vector of second to first NN distance ratios. 14 | #' @param method chosen estimation method. It can be 15 | #' \describe{ 16 | #' \item{\code{"mle"}}{for maximum likelihood estimator;} 17 | #' \item{\code{"linfit"}}{for estimation via the least squares approach;} 18 | #' \item{\code{"bayes"}}{for estimation with the Bayesian approach.} 19 | #' } 20 | #' @param alpha the confidence level (for \code{mle} and least squares fit) or 21 | #' posterior probability in the credible interval (\code{bayes}). 22 | #' @param c_trimmed the proportion of trimmed observations. 23 | #' @param unbiased logical, applicable when \code{method = "mle"}. 24 | #' If \code{TRUE}, the MLE is corrected to ensure unbiasedness. 25 | #' @param a_d shape parameter of the Gamma prior on the parameter \code{d}, 26 | #' applicable when \code{method = "bayes"}. 27 | #' @param b_d rate parameter of the Gamma prior on the parameter \code{d}, 28 | #' applicable when \code{method = "bayes"}. 29 | #' @param ... additional arguments for the different methods. 30 | #' 31 | #' 32 | #' @return list characterized by a class type that depends on the \code{method} 33 | #' chosen. Regardless of the \code{method}, the output list always contains the 34 | #' object \code{est}, which provides the estimated intrinsic dimension along 35 | #' with uncertainty quantification. The remaining objects vary with the 36 | #' estimation method. 
In particular, if 37 | #' \describe{ 38 | #' \item{\code{method = "mle"}}{the output reports the MLE and the relative 39 | #' confidence interval;} 40 | #' \item{\code{method = "linfit"}}{the output includes the \code{lm()} object used for the computation;} 41 | #' \item{\code{method = "bayes"}}{the output contains the (1 + \code{alpha}) / 2 and (1 - \code{alpha}) / 2 quantiles, mean, mode, and median of the posterior distribution of \code{d}.} 42 | #' } 43 | #' 44 | #' @export 45 | #' 46 | #' @references 47 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 48 | #' dimension of datasets by a minimal neighborhood information." 49 | #' Scientific Reports, 7(1). 50 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 51 | #' 52 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 53 | #' dimension estimator." 54 | #' Scientific Reports, 12(20005). 55 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 56 | #' 57 | #' @examples 58 | #' # dataset with 1000 observations and id = 2 59 | #' X <- replicate(2,rnorm(1000)) 60 | #' twonn(X) 61 | #' # dataset with 1000 observations and id = 3 62 | #' Y <- replicate(3,runif(1000)) 63 | #' # Bayesian and least squares estimate from distance matrix 64 | #' dm <- as.matrix(dist(Y,method = "manhattan")) 65 | #' twonn(dist_mat = dm,method = "bayes") 66 | #' twonn(dist_mat = dm,method = "linfit") 67 | #' 68 | twonn <- function(X = NULL, 69 | dist_mat = NULL, 70 | mus = NULL, 71 | method = c("mle", "linfit", "bayes"), 72 | alpha = 0.95, 73 | c_trimmed = .01, 74 | unbiased = TRUE, 75 | a_d = 0.001, 76 | b_d = 0.001, 77 | ...) { 78 | 79 | if (!is.null(mus)) { 80 | if( !all(mus >= 1) ){ 81 | stop("Detected some values in mus below 1. 82 | Please provide a proper vector of ratios.", 83 | call. = FALSE) 84 | } 85 | } 86 | 87 | if (is.null(mus)) { 88 | if (is.null(X) & is.null(dist_mat)) { 89 | stop("Please provide either a dataset X or a distance matrix", 90 | call. = FALSE) 91 | } 92 | 93 | mus <- compute_mus( 94 | X, 95 | dist_mat = dist_mat, 96 | n1 = 1, 97 | n2 = 2, 98 | Nq = FALSE 99 | ) 100 | } 101 | 102 | method <- match.arg(method) 103 | 104 | 105 | switch( 106 | method, 107 | mle = twonn_mle( 108 | mus = mus, 109 | alpha = alpha, 110 | c_trimmed = c_trimmed, 111 | unbiased = unbiased, 112 | ... 113 | ), 114 | linfit = twonn_linfit( 115 | mus = mus, 116 | alpha = alpha, 117 | c_trimmed = c_trimmed, 118 | ... 119 | ), 120 | bayes = twonn_bayes( 121 | mus = mus, 122 | alpha = alpha, 123 | c_trimmed = c_trimmed, 124 | a_d = a_d, 125 | b_d = b_d, 126 | ... 127 | ) 128 | ) 129 | 130 | 131 | } 132 | -------------------------------------------------------------------------------- /R/twonn_mle.R: -------------------------------------------------------------------------------- 1 | #' Maximum Likelihood Estimator for the \code{TWO-NN} model 2 | #' 3 | #' The function fits the \code{TWO-NN} model via maximum likelihood estimation, 4 | #' as discussed in 5 | #' \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022}. 6 | #' This function is not exported, and can be accessed with \code{twonn()}, 7 | #' specifying \code{method = "mle"}. 8 | #' 9 | #' @param mus vector of second to first NN distance ratios. 10 | #' @param unbiased logical, if \code{TRUE} the MLE is corrected to ensure 11 | #' unbiasedness. 12 | #' @param alpha confidence level needed for the computation of the confidence 13 | #' interval. 14 | #' @param c_trimmed proportion of trimmed observations. 
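# The twonn() wrapper above also accepts precomputed ratios through `mus`; a
# sketch of building the second-to-first NN distance ratios by hand with FNN
# (the wrapper obtains them internally via compute_mus()) and passing them in:
library(FNN)
library(intRinsic)
X  <- replicate(2, rnorm(1000))
nn <- FNN::get.knn(X, k = 2)$nn.dist
twonn(mus = nn[, 2] / nn[, 1], method = "mle")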
15 | #' 16 | #' @return list of class \code{twonn_mle} containing the maximum likelihood 17 | #' estimation of the intrinsic dimension with its confidence interval. 18 | #' @keywords Internal 19 | #' 20 | #' @noRd 21 | #' 22 | #' @seealso \code{\link{twonn}} 23 | #' 24 | #' @references 25 | #' #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 26 | #' dimension of datasets by a minimal neighborhood information." 27 | #' Scientific Reports, 7(1). 28 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 29 | #' 30 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 31 | #' dimension estimator." 32 | #' Scientific Reports, 12(20005). 33 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 34 | #' 35 | twonn_mle <- 36 | function(mus, 37 | unbiased = TRUE, 38 | alpha = .95, 39 | c_trimmed = 0.01) { 40 | n <- n_original <- base::length(mus) 41 | c_considered <- 1 - c_trimmed 42 | 43 | if (c_trimmed > 0) { 44 | mus <- sort(mus)[1:floor(n * c_considered)] 45 | n <- base::length(mus) 46 | } 47 | 48 | alpha <- 1 - alpha 49 | logmus <- base::log(mus) 50 | 51 | if (unbiased) { 52 | Z <- (n - 1) / base::sum(logmus) 53 | UB <- 54 | Z * stats::qgamma(1 - alpha / 2, shape = n, rate = n - 1) 55 | LB <- Z * stats::qgamma(alpha / 2, shape = n, rate = n - 1) 56 | } else { 57 | Z <- (n) / base::sum(logmus) 58 | UB <- Z * stats::qgamma(1 - alpha / 2, shape = n, rate = n) 59 | LB <- Z * stats::qgamma(alpha / 2, shape = n, rate = n) 60 | } 61 | results <- c(LB, Z, UB) 62 | 63 | Res <- 64 | list( 65 | est = results, 66 | cl = 1 - alpha, 67 | c_trimmed = c_trimmed, 68 | n_original = n_original, 69 | n = n 70 | ) 71 | structure(Res, class = c("twonn_mle", class(Res))) 72 | } 73 | 74 | 75 | #' @name twonn 76 | #' 77 | #' @param x object of class \code{twonn_mle}, obtained from the function 78 | #' \code{twonn_mle()}. 79 | #' @param ... ignored. 80 | #' 81 | #' 82 | #' @export 83 | print.twonn_mle <- function(x, ...) { 84 | y <- c("TWONN - MLE" = x[["est"]][2]) 85 | cat(y) 86 | cat("\n") 87 | invisible(x) 88 | } 89 | 90 | #' @name twonn 91 | #' 92 | #' @param object object of class \code{twonn_mle}, obtained from the function 93 | #' \code{twonn_mle()}. 94 | #' @param ... ignored. 95 | #' 96 | #' @export 97 | summary.twonn_mle <- function(object, ...) { 98 | y <- cbind( 99 | `Original sample size` = object[["n_original"]], 100 | `Used sample size` = object[["n"]], 101 | `Trimming proportion` = object[["c_trimmed"]], 102 | `Confidence level` = object[["cl"]], 103 | `Lower Bound` = object[["est"]][1], 104 | `Estimate` = object[["est"]][2], 105 | `Upper Bound` = object[["est"]][3] 106 | ) 107 | structure(y, class = c("summary.twonn_mle","matrix")) 108 | } 109 | 110 | 111 | #' @name twonn 112 | #' 113 | #' @param x object of class \code{twonn_mle}, obtained from the function 114 | #' \code{twonn_mle()}. 115 | #' @param ... ignored. 116 | #' 117 | #' @export 118 | print.summary.twonn_mle <- function(x, ...) { 119 | cat("Model: TWO-NN\n") 120 | cat("Method: MLE\n") 121 | cat(paste0( 122 | "Sample size: ", 123 | x[1], 124 | ", Obs. used: ", 125 | x[2], 126 | ". 
Trimming proportion: ", 127 | 100 * x[3], 128 | "%\n" 129 | )) 130 | cat(paste0("ID estimates (confidence level: ", x[4], ")")) 131 | y <- cbind( 132 | `Lower Bound` = x[5], 133 | `Estimate` = x[6], 134 | `Upper Bound` = x[7] 135 | ) 136 | print(knitr::kable(y)) 137 | cat("\n") 138 | invisible(x) 139 | } 140 | 141 | 142 | #' @name twonn 143 | #' @param x object of class \code{twonn_mle}, the output of the 144 | #' \code{twonn} function when \code{method = "mle"}. 145 | #' 146 | #' 147 | #' @importFrom graphics abline legend lines matplot par points polygon 148 | #' @importFrom stats density ts 149 | #' 150 | #' @export 151 | #' 152 | plot.twonn_mle <- 153 | function(x, 154 | ...) { 155 | 156 | plot(rep(0,3) ~ x$est,type="n",xlab = ("Maximum Likelihood Estimation"),ylab="") 157 | graphics::abline(v = x$est,col = "gray",lwd=1,lty=2) 158 | graphics::points(0~x$est[2],cex = 2,pch=21,bg=1) 159 | graphics::arrows(y0 = 0,code = 3,y1 = 0,x0 = x$est[1],x1 = x$est[3],angle=90,lwd=2) 160 | graphics::title("MLE TWO-NN") 161 | invisible() 162 | } 163 | 164 | 165 | -------------------------------------------------------------------------------- /R/gride_mle.R: -------------------------------------------------------------------------------- 1 | #' Generalized Ratios ID Estimation via MLE 2 | #' 3 | #' The function fits the frequentist Gride model. To run this function, use the 4 | #' high-level \code{gride()} and specify \code{method = "mle"}. 5 | #' The function finds the maximum likelihood estimates for \code{d}, and 6 | #' subsequently simulate parametric bootstrap samples for uncertainty 7 | #' quantification. 8 | #' See \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 9 | #' for more details. 10 | #' 11 | #' 12 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 13 | #' @param n1 order of the first NN considered. Default is 1. 14 | #' @param n2 order of the second NN considered. Default is 2. 15 | #' @param nsim number of bootstrap simulations to consider. 16 | #' @param alpha confidence level for the computation of the confidence interval. 17 | #' @param upper_D nominal dimension of the dataset (upper bound for the 18 | #' maximization routine). 19 | #' 20 | #' @return MLE estimate obtained via numeric optimization along with the 21 | #' bootstrap confidence interval. 22 | #' @keywords internal 23 | #' @noRd 24 | #' 25 | #' @references 26 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 27 | #' dimension estimator." 28 | #' Scientific Reports, 12(20005). 29 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 30 | #' 31 | #' @seealso \code{\link{gride}} 32 | #' 33 | gride_mle <- function(mus_n1_n2 = NULL, 34 | n1 = 1, 35 | n2 = 2, 36 | nsim = 2000, 37 | alpha = .95, 38 | upper_D = NULL) { 39 | if (class(mus_n1_n2)[1] == "mus") { 40 | n1 <- attr(mus_n1_n2, which = "n1") 41 | n2 <- attr(mus_n1_n2, which = "n2") 42 | } 43 | 44 | if (n2 < n1) { 45 | stop("n2 should be greater than n1", call. 
= FALSE) 46 | } 47 | 48 | one_m_alpha <- 1 - alpha 49 | 50 | bs <- gride_bootstrap( 51 | mus_n1_n2 = mus_n1_n2, 52 | n1 = n1, 53 | n2 = n2, 54 | nsim = nsim, 55 | upper_D = upper_D 56 | ) 57 | 58 | qq <- base::unname(stats::quantile(bs$boot_sample, 59 | probs = c(one_m_alpha / 2, 60 | 1 - one_m_alpha / 2))) 61 | 62 | Res <- list( 63 | est = c( 64 | lb = qq[1], 65 | mle = bs$mle, 66 | ub = qq[2] 67 | ), 68 | cl = alpha, 69 | boot_sample = bs$boot_sample, 70 | n1 = n1, 71 | n2 = n2, 72 | nsim = nsim 73 | ) 74 | structure(Res, class = c("gride_mle", class(Res))) 75 | 76 | } 77 | 78 | 79 | 80 | 81 | #' @name gride 82 | #' 83 | #' @param object object of class \code{gride_mle}, obtained from the function 84 | #' \code{gride_mle()}. 85 | #' @param ... ignored. 86 | #' 87 | #' 88 | #' @export 89 | print.gride_mle <- function(x, ...) { 90 | cat(paste0("Gride(",x[["n1"]],",",x[["n2"]],") - MLE\n")) 91 | cat(x[["est"]][2]) 92 | cat("\n") 93 | invisible(x) 94 | } 95 | 96 | #' @name gride 97 | #' 98 | #' @param object object of class \code{gride_mle}, obtained from the function 99 | #' \code{gride_mle()}. 100 | #' @param ... ignored. 101 | #' 102 | #' @export 103 | summary.gride_mle <- function(object, ...) { 104 | y <- cbind( 105 | `NN order 1` = object[["n1"]], 106 | `NN order 2` = object[["n2"]], 107 | `Bootstrap simulations` = object[["nsim"]], 108 | `Confidence level` = object[["cl"]], 109 | 110 | `Lower Bound` = object[["est"]][1], 111 | `Estimate` = object[["est"]][2], 112 | `Upper Bound` = object[["est"]][3] 113 | ) 114 | structure(y, class = c("summary.gride_mle","matrix")) 115 | } 116 | 117 | 118 | #' @name gride 119 | #' 120 | #' @param x object of class \code{twonn_mle}, obtained from the function 121 | #' \code{twonn_mle()}. 122 | #' @param ... ignored. 123 | #' 124 | #' @export 125 | print.summary.gride_mle <- function(x, ...) { 126 | cat(paste0("Model: Gride(", x[1], ",", x[2], ")\n")) 127 | cat("Method: MLE\n") 128 | cat(paste0("CI obtained with a parametric bootstrap sample of size ", 129 | x[3], "\n")) 130 | cat(paste0("ID estimates (confidence level: ", x[4], ")")) 131 | y <- cbind( 132 | `Lower Bound` = x[5], 133 | `Estimate` = x[6], 134 | `Upper Bound` = x[7] 135 | ) 136 | print(knitr::kable(y)) 137 | cat("\n") 138 | invisible(x) 139 | } 140 | 141 | 142 | #' @name gride 143 | #' 144 | #' @param x object of class \code{gride_mle}. 145 | #' It is obtained using the output of the \code{gride} function when 146 | #' \code{method = "mle"}. 147 | #' 148 | #' @param ... other arguments passed to specific methods. 149 | #' 150 | #' @export 151 | #' 152 | plot.gride_mle <- function(x, 153 | ...) { 154 | ID <- x$boot_sample 155 | dx <- density(ID) 156 | plot(dx, xlab = "Intrinsic Dimension" , ylab = "Bootstrap Density", 157 | col="darkblue",lwd=1.3, main = "") 158 | polygon(c(dx$x), c(dx$y), 159 | col = "lightgray", border = "darkblue", main = "") 160 | abline(v = c(x$est, x$lb, x$ub), 161 | lty = 2, 162 | col = 2) 163 | graphics::title("MLE Gride: Bootstrap sample") 164 | invisible() 165 | } 166 | 167 | -------------------------------------------------------------------------------- /R/twonn_dec_steps.R: -------------------------------------------------------------------------------- 1 | #' Decimated TWO-NN evolution with halving steps 2 | #' 3 | #' The estimation of the \code{id} is related to the scale of the 4 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 5 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. 
(2017)} 6 | #' propose to subsample the original dataset in order to induce greater 7 | #' distances between the data points. By investigating the estimates' evolution 8 | #' as a function of the size of the neighborhood, it is possible to obtain 9 | #' information about the validity of the modeling assumptions and the robustness 10 | #' of the model in the presence of noise. 11 | #' 12 | #' @param X data matrix with \code{n} observations and \code{D} variables. 13 | #' @param steps number of times the dataset is halved. 14 | #' @param seed random seed controlling the sequence of subsampling. 15 | #' 16 | #' @return object of class \code{twonn_decimation_steps}, which is a list 17 | #' containing the \code{TWO-NN} evolution (maximum likelihood estimation and 18 | #' confidence intervals), the average distance from the second NN, and the 19 | #' number of steps. 20 | #' 21 | #' @keywords internal 22 | #' @noRd 23 | #' 24 | twonn_dec_by <- function(X, 25 | steps = 2, 26 | seed = NULL) { 27 | if (!is.null(seed)) { 28 | set.seed(seed) 29 | } 30 | 31 | X <- as.matrix(X) 32 | D <- ncol(X) 33 | n <- n0 <- nrow(X) 34 | check <- 0 35 | check <- duplicated(X) 36 | 37 | if (sum(check) > 0) { 38 | X <- X[-which(check),] 39 | n <- nrow(X) 40 | warning( 41 | paste0( 42 | "\n Duplicates are present and will be removed. 43 | \n Original sample size: ", 44 | n0, 45 | ". New sample size: ", 46 | n, 47 | "." 48 | ), 49 | call. = FALSE 50 | ) 51 | } 52 | 53 | if (floor(2 ^ (-steps) * n) <= 2) { 54 | stop( 55 | "Too many steps, no observations left. Please lower the number of steps considered", 56 | call. = FALSE 57 | ) 58 | } 59 | 60 | W <- steps + 1 61 | twonns <- matrix(NA, W, 3) 62 | avg_distance_n2 <- numeric(W) 63 | inds <- 1:n 64 | 65 | # Classic TWO-NN 66 | K <- FNN::get.knn(X, k = 2) 67 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 68 | avg_distance_n2[1] <- mean(K$nn.dist[, 2]) 69 | ests <- twonn_mle(mudots) 70 | twonns[1, ] <- ests$est 71 | 72 | 73 | for (w in 2:(W)) { 74 | sub_ind <- sample(x = inds, 75 | size = floor(.5 * length(inds))) 76 | 77 | subX <- X[sub_ind, ] 78 | K <- FNN::get.knn(subX, k = 2) 79 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 80 | avg_distance_n2[w] <- mean(K$nn.dist[, 2]) 81 | ests <- twonn_mle(mudots) 82 | twonns[w, ] <- ests$est 83 | inds <- sub_ind 84 | 85 | } 86 | 87 | res <- list( 88 | decimated_twonn = twonns, 89 | avg_distance_n2 = avg_distance_n2, 90 | steps = steps 91 | ) 92 | structure(res, class = c("twonn_dec_by", class(res))) 93 | 94 | } 95 | 96 | 97 | 98 | 99 | #' @name twonn_decimation 100 | #' 101 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 102 | #' \code{twonn_dec_prop()}. 103 | #' @param ... ignored. 104 | #' 105 | #' 106 | #' @export 107 | print.twonn_dec_by <- function(x, ...) { 108 | cat(paste0("Model: decimated TWO-NN\n")) 109 | cat(paste0("Dataset halved ", x[["steps"]], " times \n")) 110 | cat(paste0( 111 | "Average distance from the n2-th NN ranging from ", 112 | round(min(x[["avg_distance_n2"]]), 4), 113 | " to ", 114 | round(max(x[["avg_distance_n2"]]), 4), 115 | "\n" 116 | )) 117 | invisible(x) 118 | } 119 | 120 | 121 | 122 | #' @name twonn_decimation 123 | #' 124 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 125 | #' \code{twonn_dec_prop()}. 126 | #' @param CI logical, if \code{TRUE}, the confidence intervals are plotted 127 | #' @param steps logical, if \code{TRUE}, the x-axis reports the number of halving steps. 
128 | #' If \code{FALSE}, the x-axis reports the log10 average distance. 129 | #' @param ... ignored. 130 | #' 131 | #' 132 | #' @export 133 | plot.twonn_dec_by <- function(x, CI = FALSE, steps = FALSE, ...) { 134 | 135 | if(steps){ 136 | xx <- 1:(x$steps+1) 137 | logscale <- "" 138 | xlab <- "Halving steps" 139 | }else{ 140 | xx <- x$avg_distance_n2 141 | logscale <- "x" 142 | xlab <- Log[10] ~ average ~ n[2] ~ distance 143 | } 144 | if(CI){ 145 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 146 | xlab = xlab, ylab = "ID estimate", log = logscale, 147 | ylim = c(min(x$decimated_twonn),max(x$decimated_twonn))) 148 | graphics::polygon(c(xx, rev(xx)), c(x$decimated_twonn[,1], rev(x$decimated_twonn[,3])), 149 | col = "lightblue", border = NA) 150 | lines(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5) 151 | }else{ 152 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 153 | xlab = xlab, ylab = "ID estimate", log = logscale) 154 | } 155 | graphics::title("Decimated TWO-NN: Halving steps") 156 | invisible() 157 | } 158 | -------------------------------------------------------------------------------- /man/twonn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/twoNN.R, R/twonn_bayes.R, R/twonn_linfit.R, 3 | % R/twonn_mle.R 4 | \name{twonn} 5 | \alias{twonn} 6 | \alias{print.twonn_bayes} 7 | \alias{summary.twonn_bayes} 8 | \alias{print.summary.twonn_bayes} 9 | \alias{plot.twonn_bayes} 10 | \alias{print.twonn_linfit} 11 | \alias{summary.twonn_linfit} 12 | \alias{print.summary.twonn_linfit} 13 | \alias{plot.twonn_linfit} 14 | \alias{print.twonn_mle} 15 | \alias{summary.twonn_mle} 16 | \alias{print.summary.twonn_mle} 17 | \alias{plot.twonn_mle} 18 | \title{\code{TWO-NN} estimator} 19 | \usage{ 20 | twonn( 21 | X = NULL, 22 | dist_mat = NULL, 23 | mus = NULL, 24 | method = c("mle", "linfit", "bayes"), 25 | alpha = 0.95, 26 | c_trimmed = 0.01, 27 | unbiased = TRUE, 28 | a_d = 0.001, 29 | b_d = 0.001, 30 | ... 31 | ) 32 | 33 | \method{print}{twonn_bayes}(x, ...) 34 | 35 | \method{summary}{twonn_bayes}(object, ...) 36 | 37 | \method{print}{summary.twonn_bayes}(x, ...) 38 | 39 | \method{plot}{twonn_bayes}(x, plot_low = 0.001, plot_upp = NULL, by = 0.05, ...) 40 | 41 | \method{print}{twonn_linfit}(x, ...) 42 | 43 | \method{summary}{twonn_linfit}(object, ...) 44 | 45 | \method{print}{summary.twonn_linfit}(x, ...) 46 | 47 | \method{plot}{twonn_linfit}(x, ...) 48 | 49 | \method{print}{twonn_mle}(x, ...) 50 | 51 | \method{summary}{twonn_mle}(object, ...) 52 | 53 | \method{print}{summary.twonn_mle}(x, ...) 54 | 55 | \method{plot}{twonn_mle}(x, ...) 56 | } 57 | \arguments{ 58 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 59 | 60 | \item{dist_mat}{distance matrix computed between the \code{n} observations.} 61 | 62 | \item{mus}{vector of second to first NN distance ratios.} 63 | 64 | \item{method}{chosen estimation method.
It can be 65 | \describe{ 66 | \item{\code{"mle"}}{for maximum likelihood estimator;} 67 | \item{\code{"linfit"}}{for estimation via the least squares approach;} 68 | \item{\code{"bayes"}}{for estimation with the Bayesian approach.} 69 | }} 70 | 71 | \item{alpha}{the confidence level (for \code{mle} and least squares fit) or 72 | posterior probability in the credible interval (\code{bayes}).} 73 | 74 | \item{c_trimmed}{the proportion of trimmed observations.} 75 | 76 | \item{unbiased}{logical, applicable when \code{method = "mle"}. 77 | If \code{TRUE}, the MLE is corrected to ensure unbiasedness.} 78 | 79 | \item{a_d}{shape parameter of the Gamma prior on the parameter \code{d}, 80 | applicable when \code{method = "bayes"}.} 81 | 82 | \item{b_d}{rate parameter of the Gamma prior on the parameter \code{d}, 83 | applicable when \code{method = "bayes"}.} 84 | 85 | \item{...}{ignored.} 86 | 87 | \item{x}{object of class \code{twonn_mle}, the output of the 88 | \code{twonn} function when \code{method = "mle"}.} 89 | 90 | \item{object}{object of class \code{twonn_mle}, obtained from the function 91 | \code{twonn_mle()}.} 92 | 93 | \item{plot_low}{lower bound of the interval on which the posterior density 94 | is plotted.} 95 | 96 | \item{plot_upp}{upper bound of the interval on which the posterior density 97 | is plotted.} 98 | 99 | \item{by}{step-size at which the sequence spanning the interval is 100 | incremented.} 101 | } 102 | \value{ 103 | list characterized by a class type that depends on the \code{method} 104 | chosen. Regardless of the \code{method}, the output list always contains the 105 | object \code{est}, which provides the estimated intrinsic dimension along 106 | with uncertainty quantification. The remaining objects vary with the 107 | estimation method. In particular, if 108 | \describe{ 109 | \item{\code{method = "mle"}}{the output reports the MLE and the relative 110 | confidence interval;} 111 | \item{\code{method = "linfit"}}{the output includes the \code{lm()} object used for the computation;} 112 | \item{\code{method = "bayes"}}{the output contains the (1 + \code{alpha}) / 2 and (1 - \code{alpha}) / 2 quantiles, mean, mode, and median of the posterior distribution of \code{d}.} 113 | } 114 | } 115 | \description{ 116 | The function can fit the two-nearest neighbor estimator within the maximum 117 | likelihood and the Bayesian frameworks. Also, one can obtain the estimates 118 | using least squares estimation, depending on the specification of the 119 | argument \code{method}. This model has been originally presented in 120 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 121 | . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 122 | for more details. 123 | } 124 | \examples{ 125 | # dataset with 1000 observations and id = 2 126 | X <- replicate(2,rnorm(1000)) 127 | twonn(X) 128 | # dataset with 1000 observations and id = 3 129 | Y <- replicate(3,runif(1000)) 130 | # Bayesian and least squares estimate from distance matrix 131 | dm <- as.matrix(dist(Y,method = "manhattan")) 132 | twonn(dist_mat = dm,method = "bayes") 133 | twonn(dist_mat = dm,method = "linfit") 134 | 135 | } 136 | \references{ 137 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 138 | dimension of datasets by a minimal neighborhood information." 139 | Scientific Reports, 7(1). 140 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 141 | 142 | Denti F, Doimo D, Laio A, Mira A (2022). 
"The generalized ratios intrinsic 143 | dimension estimator." 144 | Scientific Reports, 12(20005). 145 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 146 | } 147 | -------------------------------------------------------------------------------- /R/autoplot_gride.R: -------------------------------------------------------------------------------- 1 | #' Plot the simulated MCMC chains for the Bayesian \code{Gride} 2 | #' 3 | #' Use this method without the \code{.gride_bayes} suffix. 4 | #' It displays the traceplot of the chain generated 5 | #' with Metropolis-Hasting updates to visually assess mixing and convergence. 6 | #' Alternatively, it is possible to plot the posterior density. 7 | #' 8 | #' @param object object of class \code{gride_bayes}. 9 | #' It is obtained using the output of the \code{gride} function when 10 | #' \code{method = "bayes"}. 11 | #' @param traceplot logical. If \code{FALSE}, the function returns a plot of the 12 | #' posterior density. If \code{TRUE}, the function returns the traceplots of the 13 | #' MCMC used to simulate from the posterior distribution. 14 | #' @param title optional string to display as title. 15 | #' @param ... other arguments passed to specific methods. 16 | #' 17 | #' @rdname autoplot.gride_bayes 18 | #' 19 | #' @return object of class \code{\link[ggplot2]{ggplot}}. 20 | #' It could represent the traceplot of the posterior simulations for the 21 | #' Bayesian \code{Gride} model (\code{traceplot = TRUE}) or a density plot 22 | #' of the simulated posterior distribution (\code{traceplot = FALSE}). 23 | #' 24 | #' @seealso \code{\link{gride}} 25 | #' 26 | #' @family autoplot methods 27 | #' 28 | #' @export 29 | autoplot.gride_bayes <- function(object, 30 | traceplot = FALSE, 31 | title = "Bayesian Gride - Posterior distribution", 32 | ...) 
{ 33 | if (traceplot) { 34 | sam <- c(object$post_sample) 35 | cmm <- cumsum(sam) / seq_along(sam) 36 | G1 <- 37 | ggplot2::ggplot(dplyr::tibble(value = sam, ind = seq_along(sam), cmm = cmm)) + 38 | ggplot2::geom_line(ggplot2::aes(x = .data$ind, y = .data$value), col = 39 | "lightgray") + 40 | ggplot2::geom_line(ggplot2::aes(x = .data$ind, y = .data$cmm), col = 41 | "darkblue") + 42 | ggplot2::theme_bw() + 43 | ggplot2::ylab("Intrinsic Dimension") + 44 | ggplot2::xlab("MCMC Iteration") + 45 | ggplot2::theme( 46 | axis.title.x = ggplot2::element_text(size = 15), 47 | axis.title.y = ggplot2::element_text(size = 15), 48 | title = ggplot2::element_text(size = 15) 49 | ) + 50 | ggplot2::ggtitle(title, 51 | subtitle = bquote( 52 | n[1] == .(object$n1) ~ "," ~ n[2] == .(object$n2) ~ "," ~ sigma == .(object$sigma) 53 | )) 54 | 55 | 56 | } else{ 57 | sam <- object$post_sample 58 | G1 <- ggplot2::ggplot(dplyr::tibble(value = c(sam))) + 59 | ggplot2::geom_density(ggplot2::aes(x = .data$value), 60 | col = "darkblue", 61 | fill = "lightgray") + 62 | ggplot2::xlab("Intrinsic Dimension") + 63 | ggplot2::ylab("Simulated Posterior Density") + 64 | ggplot2::theme_bw() + 65 | ggplot2::geom_vline(xintercept = object$est, 66 | lty = 2, 67 | col = 2) + 68 | ggplot2::theme( 69 | axis.title.x = ggplot2::element_text(size = 15), 70 | axis.title.y = ggplot2::element_text(size = 15), 71 | title = ggplot2::element_text(size = 15) 72 | ) + 73 | ggplot2::ggtitle(title, 74 | subtitle = bquote( 75 | n[1] == .(object$n1) ~ "," ~ n[2] == .(object$n2) ~ "," ~ sigma == .(object$sigma) 76 | )) 77 | 78 | 79 | } 80 | return(G1) 81 | } 82 | 83 | 84 | 85 | 86 | #' Plot the simulated bootstrap sample for the MLE \code{Gride} 87 | #' 88 | #' Use this method without the \code{.gride_mle} suffix. 89 | #' It displays the density plot of sample obtained via 90 | #' parametric bootstrap for the \code{Gride} model. 91 | #' 92 | #' @param object object of class \code{gride_mle}. 93 | #' It is obtained using the output of the \code{gride} function when 94 | #' \code{method = "mle"}. 95 | #' @param title title for the plot. 96 | #' @param ... other arguments passed to specific methods. 97 | #' 98 | #' @rdname autoplot.gride_mle 99 | #' 100 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It displays the 101 | #' density plot of the sample generated via parametric bootstrap to help the 102 | #' visual assessment of the uncertainty of the \code{id} estimates. 103 | #' 104 | #' 105 | #' @seealso \code{\link{gride}} 106 | #' 107 | #' @export 108 | autoplot.gride_mle <- function(object, 109 | title = "MLE Gride - Bootstrap sample", 110 | ...) 
{ 111 | G1 <- ggplot2::ggplot(dplyr::tibble(value = object$boot_sample)) + 112 | ggplot2::geom_density(ggplot2::aes(x = .data$value), 113 | col = "darkblue", 114 | fill = "lightgray") + 115 | ggplot2::xlab("Intrinsic Dimension") + 116 | ggplot2::ylab("Bootstrap Density") + 117 | ggplot2::theme_bw() + 118 | ggplot2::geom_vline( 119 | xintercept = c(object$est, object$lb, object$ub), 120 | lty = 2, 121 | col = 2 122 | ) + 123 | ggplot2::theme( 124 | axis.title.x = ggplot2::element_text(size = 15), 125 | axis.title.y = ggplot2::element_text(size = 15), 126 | title = ggplot2::element_text(size = 15) 127 | ) + 128 | ggplot2::ggtitle(title, 129 | subtitle = bquote(n[1] == .(object$n1) ~ "," ~ n[2] == .(object$n2))) 130 | 131 | return(G1) 132 | } 133 | 134 | 135 | -------------------------------------------------------------------------------- /R/gride.R: -------------------------------------------------------------------------------- 1 | #' \code{Gride}: the Generalized Ratios ID Estimator 2 | #' 3 | #' The function can fit the Generalized ratios ID estimator under both the 4 | #' frequentist and the Bayesian frameworks, depending on the specification of 5 | #' the argument \code{method}. The model is the direct extension of the 6 | #' \code{TWO-NN} method presented in 7 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 8 | #' . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} \ 9 | #' for more details. 10 | #' 11 | #' @param X data matrix with \code{n} observations and \code{D} variables. 12 | #' @param dist_mat distance matrix computed between the \code{n} observations. 13 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 14 | #' @param method the chosen estimation method. It can be 15 | #' \describe{ 16 | #' \item{\code{"mle"}}{maximum likelihood estimation;} 17 | #' \item{\code{"bayes"}}{estimation with the Bayesian approach.} 18 | #' } 19 | #' @param n1 order of the first NN considered. Default is 1. 20 | #' @param n2 order of the second NN considered. Default is 2. 21 | #' @param alpha confidence level (for \code{mle}) or posterior probability in 22 | #' the credible interval (\code{bayes}). 23 | #' @param upper_D nominal dimension of the dataset (upper bound for the 24 | #' maximization routine). 25 | #' @param nsim number of bootstrap samples or posterior simulation to consider. 26 | #' @param burn_in number of iterations to discard from the MCMC sample. 27 | #' Applicable if \code{method = "bayes"}. 28 | #' @param sigma standard deviation of the Gaussian proposal used in the MH step. 29 | #' Applicable if \code{method = "bayes"}. 30 | #' @param start_d initial value for the MCMC chain. If \code{NULL}, 31 | #' the MLE is used. Applicable if \code{method = "bayes"}. 32 | #' @param a_d shape parameter of the Gamma prior distribution for \code{d}. 33 | #' Applicable if \code{method = "bayes"}. 34 | #' @param b_d rate parameter of the Gamma prior distribution for \code{d}. 35 | #' Applicable if \code{method = "bayes"}. 36 | #' @param ... additional arguments for the different methods. 37 | #' 38 | #' 39 | #' @return a list containing the \code{id} estimate obtained with the Gride 40 | #' method, along with the relative confidence or credible interval 41 | #' (object \code{est}). The class of the output object changes according to the 42 | #' chosen \code{method}. 
Similarly, 43 | #' the remaining elements stored in the list reports a summary of the key 44 | #' quantities involved in the estimation process, e.g., 45 | #' the NN orders \code{n1} and \code{n2}. 46 | #' 47 | #' @export 48 | #' 49 | #' @references 50 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 51 | #' dimension of datasets by a minimal neighborhood information." 52 | #' Scientific Reports, 7(1). 53 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 54 | #' 55 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 56 | #' dimension estimator." 57 | #' Scientific Reports, 12(20005). 58 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 59 | #' 60 | #' @examples 61 | #' \donttest{ 62 | #' X <- replicate(2,rnorm(500)) 63 | #' dm <- as.matrix(dist(X,method = "manhattan")) 64 | #' res <- gride(X, nsim = 500) 65 | #' res 66 | #' plot(res) 67 | #' gride(dist_mat = dm, method = "bayes", upper_D =10, 68 | #' nsim = 500, burn_in = 100) 69 | #' } 70 | gride <- function(X = NULL, 71 | dist_mat = NULL, 72 | mus_n1_n2 = NULL, 73 | method = c("mle", "bayes"), 74 | n1 = 1, 75 | n2 = 2, 76 | alpha = 0.95, 77 | nsim = 5000, 78 | upper_D = 50, 79 | burn_in = 2000, 80 | sigma = 0.5, 81 | start_d = NULL, 82 | a_d = 1, 83 | b_d = 1, 84 | ...) { 85 | 86 | 87 | if (is.null(mus_n1_n2)) { 88 | if (is.null(X) & is.null(dist_mat)) { 89 | stop("Please provide either a dataset X or a distance matrix", 90 | call. = FALSE)} 91 | 92 | mus_n1_n2 <- compute_mus( 93 | X = X, 94 | dist_mat = dist_mat, 95 | n1 = n1, 96 | n2 = n2 97 | ) 98 | 99 | }else{ 100 | if (class(mus_n1_n2)[1] == "mus") { 101 | n1 <- attr(mus_n1_n2, which = "n1") 102 | n2 <- attr(mus_n1_n2, which = "n2") 103 | if(attr(mus_n1_n2, which = "upper_D") != "unknown"){ 104 | upper_D <- attr(mus_n1_n2, which = "upper_D") + 5 105 | } 106 | 107 | } 108 | 109 | 110 | } 111 | 112 | 113 | if (!is.null(X)) { 114 | upper_D <- ncol(X) + 1 115 | } 116 | if (is.null(X) & is.null(upper_D)) { 117 | stop("Please provide the nominal dimension of the dataset D in upper_D", 118 | call. = FALSE) 119 | } 120 | 121 | method <- match.arg(method) 122 | switch( 123 | method, 124 | mle = gride_mle( 125 | mus_n1_n2 = mus_n1_n2, 126 | n1 = n1, 127 | n2 = n2, 128 | alpha = alpha, 129 | upper_D = upper_D, 130 | nsim = nsim, 131 | ... 132 | ), 133 | bayes = gride_bayes( 134 | mus_n1_n2 = mus_n1_n2, 135 | alpha = alpha, 136 | n1 = n1, 137 | n2 = n2, 138 | upper_D = upper_D, 139 | nsim = nsim, 140 | burn_in = burn_in, 141 | sigma = sigma, 142 | start_d = start_d, 143 | a_d = a_d, b_d = b_d, 144 | ... 145 | ) 146 | ) 147 | 148 | } 149 | -------------------------------------------------------------------------------- /R/twonn_dec_prop.R: -------------------------------------------------------------------------------- 1 | #' Decimated \code{TWO-NN} evolution with proportions 2 | #' 3 | #' The estimation of the intrinsic dimension is related to the scale of the 4 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 5 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 6 | #' proposed to subsample the original dataset in order to induce greater 7 | #' distances between the data points. By investigating the estimates' evolution 8 | #' as a function of the size of the neighborhood, it is possible to obtain 9 | #' information about the validity of the modeling assumptions and the robustness 10 | #' of the model in the presence of noise. 
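#'
#' A minimal standalone sketch of the same idea (illustration only; it assumes
#' the plain, uncorrected TWO-NN MLE, d = n / sum(log(mu)), rather than the
#' package's internal routines):
#'
#' X <- replicate(5, rnorm(2000))
#' for (p in c(1, 0.5, 0.25)) {
#'   sub <- X[sample(nrow(X), floor(nrow(X) * p)), ]
#'   K   <- FNN::get.knn(sub, k = 2)          # first and second NN distances
#'   mu  <- K$nn.dist[, 2] / K$nn.dist[, 1]   # TWO-NN distance ratios
#'   print(c(prop = p, id = length(mu) / sum(log(mu))))
#' }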
11 | #' 12 | #' @param X data matrix with \code{n} observations and \code{D} variables. 13 | #' @param proportions vector containing the fractions of the dataset to be 14 | #' considered. 15 | #' @param seed random seed controlling the sequence of subsampling. 16 | #' 17 | #' @return list containing the \code{TWO-NN} evolution (maximum likelihood 18 | #' estimation and confidence intervals), the average distance from the second 19 | #' NN, and the vector of proportions that were considered. 20 | #' 21 | #' @keywords internal 22 | #' @noRd 23 | #' 24 | twonn_dec_prop <- function(X, 25 | proportions = 1, 26 | seed = NULL) { 27 | if (!is.null(seed)) { 28 | set.seed(seed) 29 | } 30 | 31 | if (any(proportions > 1) | any(proportions < 0)) { 32 | stop( 33 | "The vector proportions must contain values between 0 (excluded) and 1 (included)", 34 | call. = FALSE 35 | ) 36 | } 37 | 38 | X <- as.matrix(X) 39 | D <- ncol(X) 40 | n <- n0 <- nrow(X) 41 | check <- 0 42 | check <- duplicated(X) 43 | 44 | if (sum(check) > 0) { 45 | X <- X[-which(check),] 46 | n <- nrow(X) 47 | warning( 48 | paste0( 49 | "\n Duplicates are present and will be removed. 50 | \n Original sample size: ", 51 | n0, 52 | ". New sample size: ", 53 | n, 54 | "." 55 | ), 56 | call. = FALSE 57 | ) 58 | } 59 | 60 | if (floor(min(proportions) * n) <= 2) { 61 | stop( 62 | "Proportions are too low, no observations left. Please lower the number of steps considered" 63 | ) 64 | } 65 | 66 | proportions <- sort(proportions,decreasing = T) 67 | W <- length(proportions) 68 | twonns <- matrix(NA, W, 3) 69 | avg_distance_n2 <- numeric(W) 70 | sub_ind <- 1:n 71 | n_new <- n 72 | 73 | # Classic TWO-NN 74 | K <- FNN::get.knn(X, k = 2) 75 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 76 | avg_distance_n2[1] <- mean(K$nn.dist[, 2]) 77 | ests <- twonn_mle(mudots) 78 | twonns[1, ] <- ests$est 79 | 80 | for (w in 2:(W)) { 81 | prop <- proportions[(w)] / proportions[w - 1] 82 | 83 | sub_ind <- sample(x = sub_ind, 84 | size = floor(n_new * prop)) 85 | 86 | n_new <- length(sub_ind) 87 | 88 | subX <- X[sub_ind, ] 89 | K <- FNN::get.knn(subX, k = 2) 90 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 91 | avg_distance_n2[w] <- mean(K$nn.dist[, 2]) 92 | ests <- twonn_mle(mudots) 93 | twonns[w, ] <- ests$est 94 | 95 | } 96 | 97 | res <- list( 98 | decimated_twonn = twonns, 99 | avg_distance_n2 = avg_distance_n2, 100 | proportions = proportions 101 | ) 102 | structure(res, class = c("twonn_dec_prop", class(res))) 103 | 104 | } 105 | 106 | 107 | 108 | #' @name twonn_decimation 109 | #' 110 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 111 | #' \code{twonn_dec_prop()}. 112 | #' @param ... ignored. 113 | #' 114 | #' @export 115 | print.twonn_dec_prop <- function(x, ...) { 116 | cat(paste0("Model: decimated TWO-NN\n")) 117 | if (length(x[["proportions"]]) > 10) { 118 | cat(paste0( 119 | "Decimating proportions ranging from ", 120 | round(max(x[["proportions"]]), 5), 121 | " to ", 122 | round(min(x[["proportions"]]), 5) 123 | ), 124 | "\n") 125 | } else{ 126 | cat("Decimating proportions: ", round(x[["proportions"]], 5), "\n") 127 | } 128 | cat(paste0( 129 | "Average distance from the n2-th NN ranging from ", 130 | round(min(x[["avg_distance_n2"]]), 4), 131 | " to ", 132 | round(max(x[["avg_distance_n2"]]), 4), 133 | "\n" 134 | )) 135 | invisible(x) 136 | } 137 | 138 | 139 | 140 | #' @name twonn_decimation 141 | #' 142 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 143 | #' \code{twonn_dec_prop()}. 
144 | #' @param CI logical, if \code{TRUE}, the confidence intervals are plotted 145 | #' @param proportions logical, if \code{TRUE}, the x-axis reports the number of decimating proportions. 146 | #' If \code{FALSE}, the x-axis reports the log10 average distance. 147 | #' @param ... ignored. 148 | #' 149 | #' 150 | #' @export 151 | plot.twonn_dec_prop <- function(x, 152 | CI = FALSE, 153 | proportions = FALSE, 154 | ...) { 155 | 156 | if(proportions){ 157 | xx <- (x$proportions) 158 | logscale <- "" 159 | xlab <- "Decimating proportions" 160 | }else{ 161 | xx <- x$avg_distance_n2 162 | logscale <- "x" 163 | xlab <- Log[10] ~ average ~ n[2] ~ distance 164 | } 165 | 166 | if(CI){ 167 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 168 | xlab = xlab, ylab = "ID estimate", log = logscale, 169 | ylim = c(min(x$decimated_twonn),max(x$decimated_twonn))) 170 | graphics::polygon(c(xx, rev(xx)), c(x$decimated_twonn[,1], rev(x$decimated_twonn[,3])), 171 | col = "lightblue", border = NA) 172 | lines(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5) 173 | }else{ 174 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 175 | xlab = xlab, ylab = "ID estimate", log = logscale) 176 | } 177 | graphics::title("Decimated TWO-NN: Proportions") 178 | invisible() 179 | } 180 | -------------------------------------------------------------------------------- /man/Hidalgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo.R 3 | \name{Hidalgo} 4 | \alias{Hidalgo} 5 | \alias{print.Hidalgo} 6 | \alias{plot.Hidalgo} 7 | \alias{summary.Hidalgo} 8 | \alias{print.summary.Hidalgo} 9 | \title{Fit the \code{Hidalgo} model} 10 | \usage{ 11 | Hidalgo( 12 | X = NULL, 13 | dist_mat = NULL, 14 | K = 10, 15 | nsim = 5000, 16 | burn_in = 5000, 17 | thinning = 1, 18 | verbose = TRUE, 19 | q = 3, 20 | xi = 0.75, 21 | alpha_Dirichlet = 0.05, 22 | a0_d = 1, 23 | b0_d = 1, 24 | prior_type = c("Conjugate", "Truncated", "Truncated_PointMass"), 25 | D = NULL, 26 | pi_mass = 0.5 27 | ) 28 | 29 | \method{print}{Hidalgo}(x, ...) 30 | 31 | \method{plot}{Hidalgo}(x, type = c("A", "B", "C"), class = NULL, ...) 32 | 33 | \method{summary}{Hidalgo}(object, ...) 34 | 35 | \method{print}{summary.Hidalgo}(x, ...) 36 | } 37 | \arguments{ 38 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 39 | 40 | \item{dist_mat}{distance matrix computed between the \code{n} observations.} 41 | 42 | \item{K}{integer, number of mixture components.} 43 | 44 | \item{nsim}{number of MCMC iterations to run.} 45 | 46 | \item{burn_in}{number of MCMC iterations to discard as burn-in period.} 47 | 48 | \item{thinning}{integer indicating the thinning interval.} 49 | 50 | \item{verbose}{logical, should the progress of the sampler be printed?} 51 | 52 | \item{q}{integer, first local homogeneity parameter. Default is 3.} 53 | 54 | \item{xi}{real number between 0 and 1, second local homogeneity parameter. 55 | Default is 0.75.} 56 | 57 | \item{alpha_Dirichlet}{parameter of the symmetric Dirichlet prior 58 | on the mixture weights. Default is 0.05, inducing a sparse mixture. 
59 | Values that are too small (i.e., lower than 0.005) may cause underflow.} 60 | 61 | \item{a0_d}{shape parameter of the Gamma prior on \code{d}.} 62 | 63 | \item{b0_d}{rate parameter of the Gamma prior on \code{d}.} 64 | 65 | \item{prior_type}{character, type of Gamma prior on \code{d}, can be 66 | \describe{ 67 | \item{\code{"Conjugate"}}{a conjugate Gamma distribution is elicited;} 68 | \item{\code{"Truncated"}}{the conjugate Gamma prior is truncated over the 69 | interval \code{(0,D)};} 70 | \item{\code{"Truncated_PointMass"}}{same as \code{"Truncated"}, but a 71 | point mass is placed on \code{D}, to allow the \code{id} to be 72 | identically equal to the nominal dimension.} 73 | }} 74 | 75 | \item{D}{integer, the maximal dimension of the dataset.} 76 | 77 | \item{pi_mass}{probability placed a priori on \code{D} when 78 | \code{Truncated_PointMass} is chosen.} 79 | 80 | \item{x}{object of class \code{Hidalgo}, the output of the 81 | \code{Hidalgo()} function.} 82 | 83 | \item{...}{other arguments passed to specific methods.} 84 | 85 | \item{type}{character that indicates the type of plot that is requested. 86 | It can be: 87 | \describe{ 88 | \item{\code{"A"}}{plot the MCMC and the ergodic means NOT corrected 89 | for label switching;} 90 | \item{\code{"B"}}{plot the posterior mean and median of the id 91 | for each observation, after the chains are processed for label switching;} 92 | \item{\code{"C"}}{plot the estimated id distributions stratified by 93 | the groups specified in the class vector;} 94 | }} 95 | 96 | \item{class}{factor variable used to stratify observations according to 97 | their \code{id} estimates.} 98 | 99 | \item{object}{object of class \code{Hidalgo}, the output of the 100 | \code{Hidalgo()} function.} 101 | } 102 | \value{ 103 | object of class \code{Hidalgo}, which is a list containing 104 | \describe{ 105 | \item{\code{cluster_prob}}{chains of the posterior mixture weights;} 106 | \item{\code{membership_labels}}{chains of the membership labels for all 107 | the observations;} 108 | \item{\code{id_raw}}{chains of the \code{K} intrinsic dimension 109 | parameters, one per mixture component;} 110 | \item{\code{id_postpr}}{a chain for each observation, corrected for 111 | label switching;} 112 | \item{\code{id_summary}}{a matrix containing, for each observation, the 113 | value of the posterior mean and the 5\%, 25\%, 50\%, 75\%, 95\% quantiles;} 114 | \item{\code{recap}}{a list with the objects and specifications passed to 115 | the function used in the estimation.} 116 | } 117 | } 118 | \description{ 119 | The function fits the Heterogeneous intrinsic dimension algorithm, developed 120 | in Allegra et al., 2020. The model is a Bayesian mixture of Pareto 121 | distributions with a modified likelihood to induce homogeneity across 122 | neighboring observations. The model can segment the observations into 123 | multiple clusters characterized by different 124 | intrinsic dimensions. This makes it possible to capture hidden patterns in the data. 125 | For more details on the algorithm, refer to 126 | \href{https://www.nature.com/articles/s41598-020-72222-0}{Allegra et al., 2020}. 127 | For an example of application to basketball data, see 128 | \href{https://imstat.org/journals-and-publications/annals-of-applied-statistics/annals-of-applied-statistics-next-issues/}{Santos-Fernandez et al., 2021}.
129 | } 130 | \examples{ 131 | \donttest{ 132 | set.seed(1234) 133 | X <- replicate(5,rnorm(500)) 134 | X[1:250,1:2] <- 0 135 | X[1:250,] <- X[1:250,] + 4 136 | oracle <- rep(1:2,rep(250,2)) 137 | # this is just a short example 138 | # increase the number of iterations to improve mixing and convergence 139 | h_out <- Hidalgo(X, nsim = 500, burn_in = 500) 140 | plot(h_out, type = "B") 141 | id_by_class(h_out, oracle) 142 | } 143 | 144 | 145 | } 146 | \references{ 147 | Allegra M, Facco E, Denti F, Laio A, Mira A (2020). 148 | “Data segmentation based on the local intrinsic dimension.” 149 | Scientific Reports, 10(1), 1–27. ISSN 20452322, 150 | \doi{10.1038/s41598-020-72222-0}. 151 | 152 | Santos-Fernandez E, Denti F, Mengersen K, Mira A (2021). 153 | “The role of intrinsic dimension in high-resolution player tracking data – 154 | Insights in basketball.” Annals of Applied Statistics - Forthcoming. 155 | ISSN 2331-8422, 2002.04148, \doi{10.1038/s41598-022-20991-1} 156 | } 157 | \seealso{ 158 | \code{\link{id_by_class}} and \code{\link{clustering}} 159 | to understand how to further postprocess the results. 160 | } 161 | -------------------------------------------------------------------------------- /R/twonn_bayes.R: -------------------------------------------------------------------------------- 1 | #' Bayesian estimator for the \code{TWO-NN} model 2 | #' 3 | #' The function fits the \code{TWO-NN} model via Bayesian estimation, employing 4 | #' a conjugate prior. The formulas can be found in 5 | #' \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022}. 6 | #' 7 | #' @param mus vector of second to first NN distance ratios. 8 | #' @param a_d shape parameter of the Gamma prior on the parameter \code{d}. 9 | #' @param b_d rate parameter of the Gamma prior on the parameter \code{d}. 10 | #' @param alpha posterior probability contained in the computed credible 11 | #' interval. 12 | #' @param c_trimmed proportion of trimmed observations. 13 | #' 14 | #' @return object of class \code{twonn_bayes}, which is a list containing the 15 | #' (1 + \code{alpha}) / 2 and (1 - \code{alpha}) / 2 quantiles, mean, mode, and 16 | #' median of the posterior distribution of \code{d}. 17 | #' 18 | #' @seealso \code{\link{twonn}}, \code{\link{autoplot.twonn_bayes}} 19 | #' 20 | #' @keywords internal 21 | #' @noRd 22 | #' 23 | #' @references 24 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 25 | #' dimension estimator." 26 | #' Scientific Reports, 12(20005). 27 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}.
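#'
#' In brief, with a Gamma(\code{a_d}, \code{b_d}) prior on \code{d}, the posterior is
#' Gamma(\code{a_d} + n, \code{b_d} + sum(log(mus))). A minimal sketch of the update
#' (ignoring the trimming step; \code{compute_mus()} is the exported helper):
#'
#' mu    <- compute_mus(replicate(2, rnorm(500)))   # second-to-first NN ratios
#' shape <- 0.001 + length(mu)                      # posterior shape
#' rate  <- 0.001 + sum(log(mu))                    # posterior rate
#' c(mean  = shape / rate,
#'   lower = qgamma(0.025, shape, rate),
#'   upper = qgamma(0.975, shape, rate))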
28 | #' 29 | twonn_bayes <- function(mus, 30 | a_d = 0.001, 31 | b_d = 0.001, 32 | alpha = 0.95, 33 | c_trimmed = 0.01) { 34 | n <- n_original <- base::length(mus) 35 | c_considered <- 1 - c_trimmed 36 | 37 | n <- base::length(mus) 38 | 39 | if (c_trimmed) { 40 | mus <- sort(mus)[1:floor(n * c_considered)] 41 | n <- base::length(mus) 42 | } 43 | 44 | logmus <- base::log(mus) 45 | Res <- numeric(5) 46 | Res[2] <- (a_d + length(logmus)) / (b_d + sum(logmus)) 47 | Res[3] <- 48 | stats::qgamma(.5, shape = (a_d + length(logmus)), 49 | rate = (b_d + sum(logmus))) 50 | Res[4] <- (a_d + length(logmus) - 1) / (b_d + sum(logmus)) 51 | Res[c(1, 5)] <- 52 | stats::qgamma(c((1 - alpha) / 2, (1 + alpha) / 2), 53 | shape = (a_d + length(logmus)), 54 | rate = (b_d + sum(logmus))) 55 | Res <- list( 56 | est = Res, 57 | alpha = alpha, 58 | hp_prior = c(a_d, b_d), 59 | hp_posterior = c(a_d + length(logmus), b_d + sum(logmus)), 60 | c_trimmed = c_trimmed, 61 | n_original = n_original, 62 | n = n 63 | ) 64 | 65 | structure(Res, class = c("twonn_bayes", class(Res))) 66 | } 67 | 68 | 69 | #' @name twonn 70 | #' 71 | #' @param x object of class \code{twonn_bayes}, obtained from the function 72 | #' \code{twonn_bayes()}. 73 | #' @param ... ignored. 74 | #' 75 | #' 76 | #' @export 77 | print.twonn_bayes <- function(x, ...) { 78 | y <- c("TWONN - Bayes - Posterior Mean" = x[["est"]][2]) 79 | cat(y) 80 | cat("\n") 81 | invisible(x) 82 | } 83 | 84 | #' @name twonn 85 | #' 86 | #' @param object object of class \code{twonn_bayes}, obtained from the function 87 | #' \code{twonn_bayes()}. 88 | #' @param ... ignored. 89 | #' 90 | #' @export 91 | summary.twonn_bayes <- function(object, ...) { 92 | y <- c( 93 | `Original sample size` = object[["n_original"]], 94 | `Used sample size` = object[["n"]], 95 | `Trimming proportion` = object[["c_trimmed"]], 96 | 97 | `Prior shape` = object[["hp_prior"]][1], 98 | `Prior scale` = object[["hp_prior"]][2], 99 | 100 | `Credible interval level` = object[["alpha"]], 101 | 102 | `Lower bound` = object[["est"]][1], 103 | `Posterior mean` = object[["est"]][2], 104 | `Posterior median` = object[["est"]][3], 105 | `Posterior mode` = object[["est"]][4], 106 | `Upper bound` = object[["est"]][5] 107 | ) 108 | structure(y, class = c("summary.twonn_bayes","matrix")) 109 | } 110 | 111 | 112 | #' @name twonn 113 | #' 114 | #' @param x object of class \code{twonn_bayes}, obtained from the function 115 | #' \code{twonn_bayes()}. 116 | #' @param ... ignored. 117 | #' 118 | #' @export 119 | print.summary.twonn_bayes <- function(x, ...) { 120 | cat("Model: TWO-NN\n") 121 | cat("Method: Bayesian Estimation\n") 122 | cat(paste0( 123 | "Sample size: ", 124 | x[1], 125 | ", Obs. used: ", 126 | x[2], 127 | ". Trimming proportion: ", 128 | 100 * x[3], 129 | "%\n" 130 | )) 131 | cat(paste0("Prior d ~ Gamma(", 132 | x[4], 133 | ", ", 134 | x[5], 135 | ")\n")) 136 | cat(paste0( 137 | "Credible Interval quantiles: ", 138 | (1 - x[6]) / 2 * 100, 139 | "%, ", 140 | (1 + x[6]) / 2 * 100, 141 | "%\n" 142 | )) 143 | cat(paste0("Posterior ID estimates:")) 144 | y <- cbind( 145 | `Lower Bound` = x[7], 146 | `Mean` = x[8], 147 | `Median` = x[9], 148 | `Mode` = x[10], 149 | `Upper Bound` = x[11] 150 | ) 151 | rownames(y) <- NULL 152 | print(knitr::kable(y)) 153 | cat("\n") 154 | invisible(x) 155 | } 156 | 157 | 158 | 159 | #' @name twonn 160 | #' 161 | #' @param x object of class \code{twonn_bayes}, the output of the 162 | #' \code{twonn} function when \code{method = "bayes"}. 
163 | #' @param plot_low lower bound of the interval on which the posterior density 164 | #' is plotted. 165 | #' @param plot_upp upper bound of the interval on which the posterior density 166 | #' is plotted. 167 | #' @param by step-size at which the sequence spanning the interval is 168 | #' incremented. 169 | #' @param ... other arguments passed to specific methods. 170 | #' 171 | #' @export 172 | #' 173 | plot.twonn_bayes <- 174 | function(x, 175 | plot_low = 0.001, 176 | plot_upp = NULL, 177 | by = .05, 178 | ...) { 179 | 180 | if (is.null(plot_upp)) { 181 | plot_upp <- x$est[5] + 3 182 | } 183 | 184 | xx <- seq(plot_low, plot_upp, by = by) 185 | y0 <- stats::dgamma(xx, 186 | shape = x$hp_prior[1], 187 | rate = x$hp_prior[2]) 188 | y <- stats::dgamma(xx, 189 | shape = x$hp_posterior[1], 190 | rate = x$hp_posterior[2]) 191 | 192 | plot(y0~xx, type="l", col = 4, 193 | xlab=("Intrinsic Dimension"), 194 | ylab=("Posterior density"), ylim = c(0, max(c(y,y0)) )) 195 | lines(y~xx, type="l", col = 1) 196 | abline(v = x$est, 197 | lty = 2, 198 | col = 2) 199 | graphics::title("Bayesian TWO-NN") 200 | invisible() 201 | } 202 | 203 | 204 | -------------------------------------------------------------------------------- /R/hidalgo_postproc.R: -------------------------------------------------------------------------------- 1 | #' Stratification of the \code{id} by an external categorical variable 2 | #' 3 | #' The function computes summary statistics (mean, median, and standard 4 | #' deviation) of the post-processed chains of the intrinsic dimension stratified 5 | #' by an external categorical variable. 6 | #' 7 | #' @param object object of class \code{Hidalgo}, the output of the 8 | #' \code{Hidalgo()} function. 9 | #' @param class factor according to which the observations should be stratified. 10 | #' 11 | #' @return a \code{data.frame} containing the posterior \code{id} means, 12 | #' medians, and standard deviations stratified by the levels of the variable 13 | #' \code{class}. 14 | #' @export 15 | #' 16 | #' @seealso \code{\link{Hidalgo}} 17 | #' 18 | #' @examples 19 | #' \donttest{ 20 | #' X <- replicate(5,rnorm(500)) 21 | #' X[1:250,1:2] <- 0 22 | #' oracle <- rep(1:2,rep(250,2)) 23 | #' h_out <- Hidalgo(X) 24 | #' id_by_class(h_out,oracle) 25 | #' } 26 | #' 27 | id_by_class <- function(object, class) { 28 | 29 | if (class(object)[1] != "Hidalgo") { 30 | stop("object is not of class 'Hidalgo'", call. = FALSE) 31 | } 32 | 33 | class <- factor(class) 34 | REV <- object$id_summary 35 | 36 | means <- tapply(REV$MEAN, class, mean) 37 | medians <- tapply(REV$MEAN, class, stats::median) 38 | sds <- tapply(REV$MEAN, class, stats::sd) 39 | 40 | Res <- data.frame( 41 | class = levels(class), 42 | mean = means, 43 | median = medians, 44 | sd = sds 45 | ) 46 | 47 | structure(Res, class = c("hidalgo_class", class(Res))) 48 | } 49 | 50 | #' @name id_by_class 51 | #' 52 | #' @param x object of class \code{hidalgo_class}, the output of the \code{id_by_class()} function. 53 | #' @param ... other arguments passed to specific methods. 54 | #' 55 | #' @export 56 | #' 57 | print.hidalgo_class <- function(x, ... ){ 58 | cat("Posterior ID by class:") 59 | rownames(x) <- NULL 60 | print(knitr::kable(x)) 61 | cat("\n") 62 | invisible(x) 63 | } 64 | 65 | #' Posterior similarity matrix and partition estimation 66 | #' 67 | #' The function computes the posterior similarity (coclustering) matrix (psm) 68 | #' and estimates a representative partition of the observations from the MCMC 69 | #' output.
The user can provide the desired number of clusters or estimate a 70 | #' optimal clustering solution by minimizing a loss function on the space 71 | #' of the partitions. 72 | #' In the latter case, the function uses the package \code{salso} 73 | #' (\href{https://cran.r-project.org/package=salso}{Dahl et al., 2021}), 74 | #' that the user needs to load. 75 | #' 76 | #' @param object object of class \code{Hidalgo}, the output of the 77 | #' \code{Hidalgo} function. 78 | #' @param clustering_method character indicating the method to use to perform 79 | #' clustering. It can be 80 | #' \describe{ 81 | #' \item{"dendrogram"}{thresholding the adjacency dendrogram with a given 82 | #' number (\code{K});} 83 | #' \item{"salso"}{estimation via minimization of several partition 84 | #' estimation criteria. 85 | #' The default loss function is the variation of information.} 86 | #' } 87 | #' @param K number of clusters to recover by thresholding the 88 | #' dendrogram obtained from the psm. 89 | #' @param nCores parameter for the \code{salso} function: the number of CPU 90 | #' cores to use. A value of zero indicates to use all cores on the system. 91 | #' @param ... optional additional parameter to pass to \code{salso()}. 92 | #' 93 | #' @return list containing the posterior similarity matrix (\code{psm}) and 94 | #' the estimated partition \code{clust}. 95 | #' @export 96 | #' 97 | #' @seealso \code{\link{Hidalgo}}, \code{\link[salso]{salso}} 98 | #' 99 | #' @references 100 | #' D. B. Dahl, D. J. Johnson, and P. Müller (2022), 101 | #' "Search Algorithms and Loss Functions for Bayesian Clustering", 102 | #' Journal of Computational and Graphical Statistics, 103 | #' \doi{10.1080/10618600.2022.2069779}. 104 | #' 105 | #' David B. Dahl, Devin J. Johnson and Peter Müller (2022). "salso: Search 106 | #' Algorithms and Loss Functions for Bayesian Clustering". 107 | #' R package version 108 | #' 0.3.0. \url{https://CRAN.R-project.org/package=salso} 109 | #' 110 | #' @examples 111 | #' \donttest{ 112 | #' library(salso) 113 | #' X <- replicate(5,rnorm(500)) 114 | #' X[1:250,1:2] <- 0 115 | #' h_out <- Hidalgo(X) 116 | #' clustering(h_out) 117 | #' } 118 | clustering <- function(object, 119 | clustering_method = c("dendrogram", "salso"), 120 | K = 2, 121 | nCores = 1, 122 | ...) { 123 | 124 | if (class(object)[1] != "Hidalgo") { 125 | stop("object is not of class 'Hidalgo'", call. = FALSE) 126 | } 127 | 128 | clustering_method <- match.arg(clustering_method) 129 | 130 | psm <- salso::psm(object$membership_labels, 131 | nCores = nCores) 132 | 133 | clust <- switch(clustering_method, 134 | "dendrogram" = { 135 | dendr <- stats::hclust(stats::as.dist(1 - psm)) 136 | stats::cutree(dendr, k = K) 137 | }, 138 | "salso" = { 139 | salso::salso(object$membership_labels, ...) 140 | }) 141 | 142 | Res <- 143 | list( 144 | clust = factor(clust), 145 | psm = psm, 146 | chosen_method = clustering_method, 147 | K = length(unique(clust)) 148 | ) 149 | structure(Res, class = c("hidalgo_psm", class(Res))) 150 | } 151 | 152 | 153 | #' @name clustering 154 | #' 155 | #' @param x object of class \code{hidalgo_psm}, obtained from the function 156 | #' \code{clustering()}. 157 | #' @param ... ignored. 158 | #' 159 | #' @export 160 | print.hidalgo_psm <- function(x, ...) 
{ 161 | cat("Estimated clustering solution summary:\n\n") 162 | cat(paste0("Method: ", x$chosen_method, "\n")) 163 | 164 | if (x$chosen_method == "dendrogram") { 165 | cat(paste0("Retrieved clusters: ", x$K, "\n")) 166 | } else{ 167 | cat(paste0("Retrieved clusters: ", length(unique(x$clust)), "\n")) 168 | } 169 | 170 | cat("Clustering frequencies:") 171 | tab <- t(table(Cluster = x$clust)) 172 | colnames(tab) <- paste("Cluster", colnames(tab)) 173 | print(knitr::kable(tab)) 174 | cat("\n") 175 | invisible(x) 176 | } 177 | 178 | #' @name clustering 179 | #' 180 | #' 181 | #' @param x object of class \code{hidalgo_psm}, obtained from the function 182 | #' \code{clustering()}. 183 | #' @param ... ignored. 184 | #' 185 | #' @export 186 | #' 187 | plot.hidalgo_psm <- function(x, ...) { 188 | 189 | psm <- x$psm 190 | stats::heatmap(psm) 191 | invisible() 192 | } 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /R/autoplot_twonn.R: -------------------------------------------------------------------------------- 1 | #' Plot the output of the \code{TWO-NN} model estimated via least squares 2 | #' 3 | #' Use this method without the \code{.twonn_linfit} suffix. 4 | #' The function returns the representation of the linear 5 | #' regression that is fitted with the \code{linfit} method. 6 | #' 7 | #' @param object object of class \code{twonn_linfit}, the output of the 8 | #' \code{twonn} function when \code{method = "linfit"}. 9 | #' @param title string used as title of the plot. 10 | #' @param ... other arguments passed to specific methods. 11 | #' 12 | #' @rdname autoplot.twonn_linfit 13 | #' 14 | #' @seealso \code{\link{twonn}} 15 | #' 16 | #' @return a \code{\link[ggplot2]{ggplot2}} object displaying the goodness of 17 | #' the linear fit of the TWO-NN model. 18 | #' 19 | #' @family autoplot methods 20 | #' 21 | #' @export 22 | autoplot.twonn_linfit <- function(object, 23 | title = "TWO-NN Linear Fit", 24 | ...) { 25 | lmod <- object$lm_output 26 | x <- lmod$model$x 27 | y <- lmod$model$y 28 | Res <- object$est 29 | 30 | p1 <- ggplot2::ggplot()+ 31 | ggplot2::geom_point(ggplot2::aes(x = x, y = y)) + 32 | ggplot2::theme_bw() + 33 | ggplot2::geom_abline( 34 | intercept = 0, 35 | slope = lmod$coefficients, 36 | col = I("red") 37 | ) + 38 | ggplot2::ylab("-log(1-(i/N))") + 39 | ggplot2::xlab(expression(log(mu))) + 40 | ggplot2::annotate( 41 | "label", 42 | -Inf, 43 | Inf, 44 | label = paste("ID:", round(Res[2], 3)), 45 | hjust = -0.05, 46 | vjust = 1.1 47 | ) + 48 | ggplot2::annotate( 49 | "label", 50 | Inf, 51 | -Inf, 52 | label = paste("R^2:", round(summary(lmod)$r.squared, 3)), 53 | parse = TRUE, 54 | hjust = 1.05, 55 | vjust = -0.1 56 | ) + 57 | ggplot2::theme( 58 | axis.title.x = ggplot2::element_text(size = 15), 59 | axis.title.y = ggplot2::element_text(size = 15), 60 | title = ggplot2::element_text(size = 15) 61 | ) 62 | 63 | p1 + ggplot2::ggtitle(title) 64 | } 65 | 66 | 67 | #' Plot the output of the \code{TWO-NN} model estimated via the Bayesian 68 | #' approach 69 | #' 70 | #' Use this method without the \code{.twonn_bayes} suffix. 71 | #' The function returns the density plot of the 72 | #' posterior distribution computed with the \code{bayes} method. 73 | #' 74 | #' @param object object of class \code{twonn_bayes}, the output of the 75 | #' \code{twonn} function when \code{method = "bayes"}. 76 | #' @param title character string used as title of the plot. 
77 | #' @param plot_low lower bound of the interval on which the posterior density 78 | #' is plotted. 79 | #' @param plot_upp upper bound of the interval on which the posterior density 80 | #' is plotted. 81 | #' @param by step-size at which the sequence spanning the interval is 82 | #' incremented. 83 | #' @param ... other arguments passed to specific methods. 84 | #' 85 | #' @rdname autoplot.twonn_bayes 86 | #' 87 | #' @seealso \code{\link{twonn}} 88 | #' 89 | #' @return \code{\link[ggplot2]{ggplot2}} object displaying the posterior 90 | #' distribution of the intrinsic dimension parameter. 91 | #' 92 | #' @family autoplot methods 93 | #' 94 | #' @export 95 | autoplot.twonn_bayes <- 96 | function(object, 97 | plot_low = 0, 98 | plot_upp = NULL, 99 | by = .05, 100 | title = "Bayesian TWO-NN", 101 | ...) { 102 | if (is.null(plot_upp)) { 103 | plot_upp <- object$est[5] + 3 104 | } 105 | 106 | x <- seq(plot_low, plot_upp, by = by) 107 | y0 <- stats::dgamma(x, 108 | shape = object$hp_prior[1], 109 | rate = object$hp_prior[2]) 110 | y <- stats::dgamma(x, 111 | shape = object$hp_posterior[1], 112 | rate = object$hp_posterior[2]) 113 | G1 <- ggplot2::ggplot(dplyr::tibble(x, y)) + 114 | ggplot2::geom_line(ggplot2::aes(x = x, y = y0), col = 4) + 115 | ggplot2::geom_line(ggplot2::aes(x = x, y = y)) + 116 | ggplot2::xlab("Intrinsic Dimension") + 117 | ggplot2::ylab("Posterior density") + 118 | ggplot2::theme_bw() + 119 | ggplot2::geom_vline(xintercept = object$est, 120 | lty = 2, 121 | col = 2) + 122 | ggplot2::theme( 123 | axis.title.x = ggplot2::element_text(size = 20), 124 | axis.title.y = ggplot2::element_text(size = 20), 125 | title = ggplot2::element_text(size = 20) 126 | ) 127 | G1 + ggplot2::ggtitle(title) 128 | } 129 | 130 | 131 | #' Plot the output of the \code{TWO-NN} model estimated via the Maximum 132 | #' Likelihood approach 133 | #' 134 | #' Use this method without the \code{.twonn_mle} suffix. 135 | #' The function returns the point estimate along with the confidence bands 136 | #' computed via the \code{mle} method. 137 | #' 138 | #' @param object object of class \code{twonn_mle}, the output of the 139 | #' \code{twonn} function when \code{method = "mle"}. 140 | #' @param title character string used as title of the plot. 141 | #' @param ... other arguments passed to specific methods. 142 | #' 143 | #' @rdname autoplot.twonn_mle 144 | #' 145 | #' @seealso \code{\link{twonn}} 146 | #' 147 | #' @return \code{\link[ggplot2]{ggplot2}} object displaying the point estimate 148 | #' and confidence interval obtained via the maximum likelihood approach of the 149 | #' \code{id} parameter. 150 | #' 151 | #' @family autoplot methods 152 | #' 153 | #' @export 154 | autoplot.twonn_mle <- 155 | function(object, 156 | title = "MLE TWO-NN", 157 | ...) 
{ 158 | D <- data.frame(x = object$est, y = 0) 159 | G1 <- ggplot2::ggplot() + 160 | ggplot2::geom_segment(ggplot2::aes( 161 | x = D[1, 1], 162 | xend = D[3, 1], 163 | y = D[1, 2], 164 | yend = D[3, 2] 165 | )) + 166 | ggplot2::geom_vline( 167 | xintercept = D[, 1], 168 | col = "gray", 169 | lty = 2, 170 | size = .5 171 | ) + 172 | ggplot2::geom_point(ggplot2::aes(x = D[c(1), 1], y = D[c(1), 2]), size = 173 | 10, pch = "[") + 174 | ggplot2::geom_point(ggplot2::aes(x = D[c(3), 1], y = D[c(3), 2]), size = 175 | 10, pch = "]") + 176 | ggplot2::geom_point(ggplot2::aes(x = D[c(2), 1], y = D[c(2), 2]), size = 177 | 10) + 178 | ggplot2::xlab("Maximum Likelihood Estimation") + 179 | ggplot2::theme_bw() + 180 | ggplot2::theme( 181 | axis.title.y = ggplot2::element_blank(), 182 | axis.text.y = ggplot2::element_blank(), 183 | axis.ticks.y = ggplot2::element_blank(), 184 | axis.title.x = ggplot2::element_text(size = 20), 185 | title = ggplot2::element_text(size = 20) 186 | ) 187 | G1 + ggplot2::ggtitle(title) 188 | } 189 | 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /R/gride_evolution.R: -------------------------------------------------------------------------------- 1 | #' \code{Gride} evolution based on Maximum Likelihood Estimation 2 | #' 3 | #' The function allows the study of the evolution of the \code{id} estimates 4 | #' as a function of the scale of a dataset. A scale-dependent analysis 5 | #' is essential to identify the correct number of relevant directions in noisy 6 | #' data. To increase the average distance from the second NN (and thus the 7 | #' average neighborhood size) involved in the estimation, the function computes 8 | #' a sequence of \code{Gride} models with increasing NN orders, \code{n1} and 9 | #' \code{n2}. 10 | #' See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 11 | #' for more details. 12 | #' 13 | #' @param X data matrix with \code{n} observations and \code{D} variables. 14 | #' @param vec_n1 vector of integers, containing the smaller NN orders considered 15 | #' in the evolution. 16 | #' @param vec_n2 vector of integers, containing the larger NN orders considered 17 | #' in the evolution. 18 | #' @param upp_bound upper bound for the interval used in the numerical 19 | #' optimization (via \code{optimize}). Default is set to 50. 20 | #' 21 | #' @return list containing the Gride evolution, the corresponding NN distance 22 | #' ratios, the average n2-th NN order distances, and the NN orders considered. 23 | #' 24 | #' @name gride_evolution 25 | #' 26 | #' @export 27 | #' 28 | #' @references 29 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 30 | #' dimension estimator." 31 | #' Scientific Reports, 12(20005). 32 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 33 | #' 34 | #' @examples 35 | #' \donttest{ 36 | #' X <- replicate(5,rnorm(10000,0,.1)) 37 | #' gride_evolution(X = X,vec_n1 = 2^(0:5),vec_n2 = 2^(1:6)) 38 | #'} 39 | #' 40 | gride_evolution <- function(X, vec_n1, vec_n2, upp_bound = 50) { 41 | if (any(vec_n2 < vec_n1)) { 42 | stop("at least one n2 is lower than one n1", call. = FALSE) 43 | } 44 | if (length(vec_n1) != length(vec_n2)) { 45 | stop("the considered NN orders in n1 and n2 do not match", call. 
= FALSE) 46 | } 47 | 48 | X <- as.matrix(X) 49 | D <- ncol(X) 50 | n <- n0 <- nrow(X) 51 | check <- 0 52 | check <- duplicated(X) 53 | 54 | if (sum(check) > 0) { 55 | X <- X[-which(check),] 56 | n <- nrow(X) 57 | warning( 58 | paste0( 59 | "\n Duplicates are present and will be removed. 60 | \n Original sample size: ", 61 | n0, 62 | ". New sample size: ", 63 | n, 64 | "." 65 | ), 66 | call. = FALSE 67 | ) 68 | } 69 | 70 | 71 | 72 | W <- length(vec_n1) 73 | MUdots <- matrix(NA, n, W) 74 | K <- FNN::get.knn(X, k = max(vec_n2)) 75 | path <- numeric(W) 76 | avg_distance_n2 <- numeric(W) 77 | 78 | for (w in 1:W) { 79 | MUdots[, w] <- (K$nn.dist[, vec_n2[w]]) / 80 | (K$nn.dist[, vec_n1[w]]) 81 | 82 | avg_distance_n2[w] <- mean(K$nn.dist[, vec_n2[w]]) 83 | path[w] <- stats::optimize( 84 | gride_log_likelihood, 85 | interval = c(0.01, min(D, upp_bound) + 1), 86 | n1 = vec_n1[w], 87 | n2 = vec_n2[w], 88 | mus_n1_n2 = MUdots[, w], 89 | maximum = TRUE 90 | )$maximum 91 | } 92 | 93 | res <- list( 94 | path = path, 95 | MUdots = MUdots, 96 | NNorders = rbind(vec_n1, vec_n2), 97 | avg_distance_n2 = avg_distance_n2 98 | ) 99 | 100 | structure(res, class = c("gride_evolution", class(res))) 101 | 102 | } 103 | 104 | 105 | #' @name gride_evolution 106 | #' 107 | #' @param x object of class \code{gride_evolution}, obtained from the function 108 | #' \code{gride_evolution()}. 109 | #' @param ... ignored. 110 | #' 111 | #' @return the function prints a summary of the Gride evolution to 112 | #' the console. 113 | #' 114 | #' @export 115 | print.gride_evolution <- function(x, ...) { 116 | cat(paste0("Model: Gride evolution\n")) 117 | cat(paste0("Smaller NN order ranging from ", 118 | min(x[["NNorders"]][1, ]), 119 | " to ", 120 | max(x[["NNorders"]][1, ]), "\n")) 121 | cat(paste0("Larger NN order ranging from ", 122 | min(x[["NNorders"]][2, ]), 123 | " to ", 124 | max(x[["NNorders"]][2, ]), "\n")) 125 | cat(paste0( 126 | "Average distance from the n2-th NN ranging from ", 127 | round(min(x[["avg_distance_n2"]]), 4), 128 | " to ", 129 | round(max(x[["avg_distance_n2"]]), 4), 130 | "\n" 131 | )) 132 | invisible(x) 133 | } 134 | 135 | 136 | 137 | #' @name gride_evolution 138 | #' 139 | #' @param x an object of class \code{gride_evolution}. 140 | #' 141 | #' @param ... other arguments passed to specific methods. 142 | #' 143 | #' @export 144 | #' 145 | plot.gride_evolution <- function(x, 146 | ...) { 147 | id <- x$path 148 | avg_n2 <- x$avg_distance_n2 149 | 150 | plot(id~avg_n2,type = "b", col = "darkblue",log = "x", 151 | ylab = ("Intrinsic dimension"), 152 | xlab = Log[10] ~ average ~ n[2] ~ distance) 153 | graphics::title("Gride Evolution") 154 | invisible() 155 | } 156 | 157 | #' Plot the evolution of \code{Gride} estimates 158 | #' 159 | #' Use this method without the \code{.gride_evolution} suffix. 160 | #' It plots the evolution of the \code{id} 161 | #' estimates as a function of the average distance from the furthest NN of 162 | #' each point. 163 | #' 164 | #' @param object an object of class \code{gride_evolution}. 165 | #' @param title an optional string to customize the title of the plot. 166 | #' @param ... other arguments passed to specific methods. 167 | #' 168 | #' @rdname autoplot.gride_evolution 169 | #' 170 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It displays the 171 | #' evolution of the Gride maximum likelihood estimates as a function 172 | #' of the average distance from \code{n2}.
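#'
#' A minimal usage sketch (assuming the package is loaded so that the re-exported
#' \code{autoplot()} generic dispatches on \code{gride_evolution} objects):
#'
#' X   <- replicate(5, rnorm(2000, 0, 0.1))
#' evo <- gride_evolution(X, vec_n1 = 2^(0:4), vec_n2 = 2^(1:5))
#' autoplot(evo, title = "Gride evolution, 5-d Gaussian noise")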
173 | #' 174 | #' @export 175 | #' 176 | autoplot.gride_evolution <- function(object, 177 | title = "Gride Evolution", 178 | ...) { 179 | D <- data.frame(id = object$path, 180 | n2 = object$avg_distance_n2) 181 | G1 <- ggplot2::ggplot(D) + 182 | ggplot2::geom_path(ggplot2::aes(x = .data$n2, 183 | y = .data$id), 184 | col = "darkblue") + 185 | ggplot2::geom_point(ggplot2::aes(x = .data$n2, 186 | y = .data$id), 187 | col = "darkblue") + 188 | ggplot2::ylab("Intrinsic dimension") + 189 | ggplot2::theme_bw() + 190 | ggplot2::scale_x_continuous(trans = "log10") + 191 | ggplot2::xlab(Log[10] ~ average ~ n[2] ~ distance) + 192 | ggplot2::theme( 193 | axis.title.x = ggplot2::element_text(size = 20), 194 | axis.title.y = ggplot2::element_text(size = 20), 195 | title = ggplot2::element_text(size = 20) 196 | ) + 197 | ggplot2::ggtitle(title) 198 | 199 | return(G1) 200 | } 201 | -------------------------------------------------------------------------------- /R/gride_bayes.R: -------------------------------------------------------------------------------- 1 | #' Generalized Ratios ID Estimation via the Bayesian approach 2 | #' 3 | #' The function fits the Bayesian Gride model. To run this function, use the 4 | #' high-level \code{gride()} and specify \code{method = "bayes"}. The function 5 | #' runs a Metropolis-Hastings (MH) algorithm over \code{log(d)}, adopting a 6 | #' Normal distribution with pre-specified standard deviation \code{sigma} as 7 | #' the proposal distribution. 8 | #' See \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 9 | #' for more details. 10 | #' 11 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 12 | #' @param n1 order of the first NN considered. Default is 1. 13 | #' @param n2 order of the second NN considered. Default is 2. 14 | #' @param nsim number of MCMC iterations to collect in the sample. 15 | #' @param burn_in number of iterations to discard from the MCMC sample. 16 | #' @param sigma standard deviation of the Gaussian proposal used in the MH step. 17 | #' @param start_d initial value for the MCMC chain. If \code{NULL}, the MLE is 18 | #' computed. 19 | #' @param a_d shape parameter of the Gamma prior distribution for \code{d}. 20 | #' @param b_d rate parameter of the Gamma prior distribution for \code{d}. 21 | #' @param alpha posterior probability contained in the computed credible interval. 22 | #' @param upper_D upper bound for the id parameter, needed if \code{start_d} 23 | #' is not provided to initiate the chain with the MLE estimate. 24 | #' 25 | #' @return a list containing the MCMC sample and the summary of the 26 | #' specified arguments. 27 | #' 28 | #' @keywords internal 29 | #' @noRd 30 | #' 31 | #' @references 32 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 33 | #' dimension estimator." 34 | #' Scientific Reports, 12(20005). 35 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 36 | #' 37 | #' @seealso \code{\link{gride}} 38 | #' 39 | gride_bayes <- function(mus_n1_n2 = NULL, 40 | n1 = 1, 41 | n2 = 2, 42 | nsim = 5000, 43 | burn_in = 2000, 44 | sigma = 0.5, 45 | start_d = NULL, 46 | a_d = 1, 47 | b_d = 1, 48 | alpha = 0.95, 49 | upper_D = NULL) { 50 | 51 | 52 | if (n2 < n1) { 53 | stop("n2 should be greater than n1", call.
= FALSE) 54 | } 55 | 56 | 57 | if (is.null(start_d)) { 58 | start_d <- gride_mle_point( 59 | mus_n1_n2 = mus_n1_n2, 60 | n1 = n1, 61 | n2 = n2, 62 | upper_D = upper_D 63 | ) 64 | start_d <- log(start_d - 1) 65 | } else{ 66 | if (start_d <= 1) { 67 | stop("Invalid starting point: it has to be > 1", call. = FALSE) 68 | } 69 | start_d <- log(start_d - 1) 70 | } 71 | 72 | post_sample <- gride_mh_sampler( 73 | start_d = start_d, 74 | n1 = n1, 75 | n2 = n2, 76 | a_d = a_d, 77 | b_d = b_d, 78 | mus_n1_n2 = mus_n1_n2, 79 | nsim = nsim, 80 | burn_in = burn_in, 81 | sigma = sigma 82 | ) 83 | 84 | sam <- (exp(post_sample) + 1) 85 | est <- numeric(5) 86 | est[2] <- mean(sam) 87 | est[3] <- stats::median(sam) 88 | dens <- stats::density(sam) 89 | est[4] <- dens$x[which.max(dens$y)] 90 | est[c(1, 5)] <- stats::quantile(sam, c((1 - alpha) / 2, (1 + alpha) / 91 | 2)) 92 | Res <- list( 93 | est = est, 94 | post_sample = sam, 95 | hp_prior = c(a_d, b_d), 96 | alpha = alpha, 97 | n1 = n1, 98 | n2 = n2, 99 | nsim = nsim, 100 | sigma = sigma 101 | ) 102 | structure(Res, class = c("gride_bayes", class(Res))) 103 | } 104 | 105 | 106 | 107 | #' @name gride 108 | #' 109 | #' @param x object of class \code{gride_bayes}, obtained from the function 110 | #' \code{gride_bayes()}. 111 | #' @param ... ignored. 112 | #' 113 | #' 114 | #' @export 115 | print.gride_bayes <- function(x, ...) { 116 | cat(paste0("Gride(",x[["n1"]],",",x[["n2"]],") - Bayes - Posterior Mean\n")) 117 | cat(x[["est"]][2]) 118 | cat("\n") 119 | invisible(x) 120 | } 121 | 122 | #' @name gride 123 | #' 124 | #' @param object object of class \code{gride_bayes}, obtained from the function 125 | #' \code{gride_bayes()}. 126 | #' @param ... ignored. 127 | #' 128 | #' @export 129 | summary.gride_bayes <- function(object, ...) { 130 | y <- c( 131 | `MCMC Iterations` = object[["nsim"]], 132 | 133 | 134 | `Prior shape` = object[["hp_prior"]][1], 135 | `Prior scale` = object[["hp_prior"]][2], 136 | 137 | `Credible interval level` = object[["alpha"]], 138 | 139 | `n1` = object[["n1"]], 140 | `n2` = object[["n2"]], 141 | 142 | `Lower bound` = object[["est"]][1], 143 | `Posterior mean` = object[["est"]][2], 144 | `Posterior median` = object[["est"]][3], 145 | `Posterior mode` = object[["est"]][4], 146 | `Upper bound` = object[["est"]][5] 147 | ) 148 | structure(y, class = c("summary.gride_bayes","matrix")) 149 | } 150 | 151 | 152 | #' @name gride 153 | #' 154 | #' @param x object of class \code{gride_bayes()}, obtained from the function 155 | #' \code{gride_bayes()}. 156 | #' @param ... ignored. 157 | #' 158 | #' 159 | #' @export 160 | print.summary.gride_bayes <- function(x, ...) { 161 | cat(paste0("Model: Gride(", x[5], ",", x[6], ")\n")) 162 | cat("Method: Bayesian Estimation\n") 163 | cat(paste0("Prior d ~ Gamma(", 164 | x[2], 165 | ", ", 166 | x[3], 167 | ")\n")) 168 | cat(paste0("MCMC posterior sample size: ", x[1], "\n")) 169 | cat(paste0( 170 | "Credible Interval quantiles: ", 171 | (1 - x[4]) / 2 * 100, 172 | "%, ", 173 | (1 + x[4]) / 2 * 100, 174 | "%\n" 175 | )) 176 | 177 | cat(paste0("Posterior ID estimates:")) 178 | y <- cbind( 179 | `Lower Bound` = x[7], 180 | `Mean` = x[8], 181 | `Median` = x[9], 182 | `Mode` = x[10], 183 | `Upper Bound` = x[11] 184 | ) 185 | rownames(y) <- NULL 186 | print(knitr::kable(y)) 187 | cat("\n") 188 | invisible(x) 189 | } 190 | 191 | 192 | 193 | #' @name gride 194 | #' 195 | #' @param x object of class \code{gride_bayes}. 
196 | #' It is obtained using the output of the \code{gride} function when 197 | #' \code{method = "bayes"}. 198 | #' 199 | #' @export 200 | #' 201 | plot.gride_bayes <- function(x, 202 | ...) { 203 | 204 | on.exit({par(my_par)}, add = TRUE, after = TRUE) 205 | my_par <- par(mfrow = c(2, 1)) 206 | 207 | ID <- c(x$post_sample) 208 | cmm <- cumsum(ID) / seq_along(ID) 209 | 210 | plot(ID ,col="gray",type="l", xlab = "MCMC Iteration") 211 | lines(stats::ts(cmm), col = "darkblue",lwd=1.3) 212 | graphics::title("Bayesian Gride: Traceplot") 213 | 214 | 215 | dx <- density(ID) 216 | plot(dx, xlab = "Intrinsic Dimension" , ylab = "Simulated Posterior Density", 217 | col="darkblue",lwd=1.3, main = "") 218 | polygon(c(dx$x), c(dx$y), 219 | col = "lightgray", border = "darkblue", main = "") 220 | graphics::title("Bayesian Gride: Posterior Density") 221 | 222 | invisible() 223 | } 224 | 225 | -------------------------------------------------------------------------------- /R/compute_mus.R: -------------------------------------------------------------------------------- 1 | #' Compute the ratio statistics needed for the intrinsic dimension estimation 2 | #' 3 | #' The function \code{compute_mus} computes the ratios of distances between 4 | #' nearest neighbors (NNs) of generic order, denoted as 5 | #' \code{mu(n_1,n_2)}. 6 | #' This quantity is at the core of all the likelihood-based methods contained 7 | #' in the package. 8 | #' 9 | #' @param X a dataset with \code{n} observations and \code{D} variables. 10 | #' @param dist_mat a distance matrix computed between \code{n} observations. 11 | #' @param n1 order of the first NN considered. Default is 1. 12 | #' @param n2 order of the second NN considered. Default is 2. 13 | #' @param Nq logical indicator. If \code{TRUE}, it provides the \code{N^q} 14 | #' matrix needed for fitting the Hidalgo model. 15 | #' @param q integer, number of NN considered to build \code{N^q}. 16 | #' 17 | #' @references 18 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 19 | #' dimension of datasets by a minimal neighborhood information." 20 | #' Scientific Reports, 7(1). 21 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 22 | #' 23 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 24 | #' dimension estimator." 25 | #' Scientific Reports, 12(20005). 26 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 27 | #' 28 | #' @return the principal output of this function is a vector containing the 29 | #' ratio statistics, an object of class \code{mus}. The length of the vector is 30 | #' equal to the number of observations considered, unless ties are present in 31 | #' the dataset. In that case, the duplicates are removed. Optionally, if 32 | #' \code{Nq} is \code{TRUE}, the function returns an object of class 33 | #' \code{mus_Nq}, a list containing both the ratio statistics \code{mus} and the 34 | #' adjacency matrix \code{NQ}. 35 | # 36 | #' @export 37 | #' 38 | #' 39 | #' @examples 40 | #' X <- replicate(2,rnorm(1000)) 41 | #' mu <- compute_mus(X, n1 = 1, n2 = 2) 42 | #' mudots <- compute_mus(X, n1 = 4, n2 = 8) 43 | #' pre_hidalgo <- compute_mus(X, n1 = 4, n2 = 8, Nq = TRUE, q = 3) 44 | compute_mus <- function(X = NULL, 45 | dist_mat = NULL, 46 | n1 = 1, 47 | n2 = 2, 48 | Nq = FALSE, 49 | q = 3) { 50 | if (n2 < n1) { 51 | stop("n2 is lower than n1", call. = FALSE) 52 | } 53 | 54 | if (is.null(dist_mat) & is.null(X)) { 55 | stop("Please provide either a dataset X or a distance matrix", 56 | call. 
= FALSE) 57 | } 58 | 59 | D <- NULL 60 | 61 | ##################################################### 62 | # data, no dist_mat 63 | if (is.null(dist_mat)) { 64 | if(any(is.na(X))){ 65 | stop("There are missing values in the provided dataset. 66 | Please remove the problematic observations and try again.", 67 | call. = FALSE) 68 | } 69 | 70 | X <- as.matrix(X) 71 | D <- ncol(X) 72 | n <- n0 <- nrow(X) 73 | check <- 0 74 | check <- duplicated(X) 75 | 76 | # remove duplicates 77 | if (sum(check) > 0) { 78 | X <- X[-which(check),] 79 | n <- nrow(X) 80 | warning( 81 | paste0( 82 | "\n Duplicates are present and will be removed. 83 | \n Original sample size: ", 84 | n0, 85 | ". New sample size: ", 86 | n, 87 | "." 88 | ), 89 | call. = FALSE 90 | ) 91 | } 92 | 93 | 94 | 95 | 96 | if (!Nq) { 97 | K <- FNN::get.knn(X, k = n2) 98 | mus <- K$nn.dist[, n2] / K$nn.dist[, n1] 99 | 100 | attr(mus, "upper_D") <- D 101 | attr(mus, "n1") <- n1 102 | attr(mus, "n2") <- n2 103 | structure(mus, class = c("mus", class(mus))) 104 | 105 | } else{ 106 | K <- FNN::get.knn(X, k = max(n2, q)) 107 | mus <- K$nn.dist[, n2] / K$nn.dist[, n1] 108 | NQ <- matrix(0, n, n) 109 | for (h in 1:n) { 110 | NQ[h, K$nn.index[h,]] <- 1 111 | } 112 | attr(mus, "upper_D") <- D 113 | attr(mus, "n1") <- n1 114 | attr(mus, "n2") <- n2 115 | mus <- structure(mus, class = c("mus", class(mus))) 116 | 117 | mus_nq <- list(mus = mus, NQ = NQ) 118 | attr(mus_nq, "upper_D") <- D 119 | attr(mus_nq, "n1") <- n1 120 | attr(mus_nq, "n2") <- n2 121 | attr(mus_nq, "q") <- q 122 | structure(mus_nq, class = c("mus_Nq", class(mus_nq))) 123 | } 124 | 125 | 126 | ################################# 127 | } else { 128 | # dist_mat is passed 129 | ################################# 130 | # checks on dist_mat 131 | 132 | # if it is of class dist, then transform it into a matrix 133 | if(inherits(dist_mat,"dist") | inherits(dist_mat,"dissimilarity")){ 134 | dist_mat <- as.matrix(dist_mat) 135 | } 136 | 137 | ## does it contain non-negative distances? 138 | if(!all(dist_mat>=0)){ 139 | stop("Negative distances detected in dist_mat. Please provide a valid distance matrix", 140 | call. = FALSE) 141 | } 142 | # is it symmetric? 143 | if(!isSymmetric(dist_mat)){ 144 | stop("The provided distance matrix is not symmetric. 145 | Please provide a valid distance matrix", 146 | call. = FALSE) 147 | } 148 | # NA? 149 | if(any(is.na(dist_mat))){ 150 | stop("There are missing values in the provided distance matrix. 151 | Please remove the problematic observations and try again.", 152 | call. = FALSE) 153 | } 154 | 155 | n <- n0 <- nrow(dist_mat) 156 | dummy <- dist_mat 157 | dummy[lower.tri(dummy, diag = TRUE)] <- -1 158 | inds <- unique(which(dummy == 0, arr.ind = TRUE)[, 2]) 159 | 160 | if (length(inds) > 0) { 161 | dist_mat <- dist_mat[-inds, -inds] 162 | n <- nrow(dist_mat) 163 | 164 | warning( 165 | paste0("\nDuplicates are present and will be removed.\n", 166 | "Original sample size: ", n0,". New sample size: ", 167 | n,"." 
168 | ) 169 | ) 170 | } 171 | 172 | sDistMat <- apply(dist_mat, 1, function(x) 173 | sort(x, index = TRUE)) 174 | mus <- unlist(lapply(sDistMat, 175 | function(z) 176 | z$x[n2 + 1] / z$x[n1 + 1])) 177 | if (!Nq) { 178 | 179 | attr(mus, "n1") <- n1 180 | attr(mus, "n2") <- n2 181 | structure(mus, class = c("mus", class(mus))) 182 | 183 | }else{ 184 | 185 | NQ <- matrix(0, n, n) 186 | for (h in 1:n) { 187 | NQ[h, (sDistMat[[h]]$ix)[2:(q+1)]] <- 1 188 | } 189 | 190 | attr(mus, "n1") <- n1 191 | attr(mus, "n2") <- n2 192 | mus <- structure(mus, class = c("mus", class(mus))) 193 | mus_nq <- list(mus = mus, NQ = NQ) 194 | attr(mus_nq, "n1") <- n1 195 | attr(mus_nq, "n2") <- n2 196 | attr(mus_nq, "q") <- q 197 | structure(mus_nq, class = c("mus_Nq", class(mus_nq))) 198 | 199 | } 200 | } 201 | 202 | } 203 | 204 | 205 | 206 | #' @name compute_mus 207 | #' 208 | #' @param x object of class \code{mus}, obtained from the 209 | #' function \code{compute_mus()}. 210 | #' @param ... ignored. 211 | #' 212 | #' @export 213 | print.mus <- function(x, ...) { 214 | 215 | if(is.list(x)){ #check if the mus object contains Nq 216 | nn <- length(x[[1]]) 217 | }else{ 218 | nn <- length(x) 219 | } 220 | 221 | cat("Ratio statistics mu's\n") 222 | cat(paste0("NN orders: n1 = ", attr(x, "n1"), ", n2 = ", 223 | attr(x, "n2"), "\n")) 224 | cat(paste0("Sample size: ", nn, "\n")) 225 | if (!is.null(attr(x, "upper_D"))) { 226 | cat(paste0("Nominal Dimension: ", attr(x, "upper_D"), "\n")) 227 | } 228 | invisible(x) 229 | } 230 | 231 | 232 | #' @name compute_mus 233 | #' 234 | #' @param x object of class \code{mus_Nq}, obtained from the 235 | #' function \code{compute_mus()}. 236 | #' @param ... ignored. 237 | #' 238 | #' @export 239 | print.mus_Nq <- function(x, ...) { 240 | 241 | if(is.list(x)){ #check if the mus object contains Nq 242 | nn <- length(x[[1]]) 243 | }else{ 244 | nn <- length(x) 245 | } 246 | 247 | cat("List containing the ratio statistics mu's and the Nq adjacency matrix\n") 248 | 249 | cat(paste0("NN orders: n1 = ", attr(x, "n1"), ", n2 = ", 250 | attr(x, "n2"), "\n")) 251 | cat(paste0("Sample size: ", nn, "\n")) 252 | if (!is.null(attr(x, "upper_D"))) { 253 | cat(paste0("Nominal Dimension: ", attr(x, "upper_D"), "\n")) 254 | } 255 | cat(paste0("NNs considered: q = ", attr(x, "q"), "\n")) 256 | 257 | invisible(x) 258 | } 259 | 260 | 261 | #' @name compute_mus 262 | #' 263 | #' @importFrom graphics hist curve 264 | #' 265 | #' @param x object of class \code{mus}, obtained from the 266 | #' function \code{compute_mus()}. 267 | #' @param range_d a sequence of values for which the generalized ratios density 268 | #' is superimposed to the histogram of \code{mus}. 269 | #' @param ... ignored. 270 | #' 271 | #' @export 272 | plot.mus <- function(x, range_d = NULL, ...) 
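# Draws a histogram of the observed ratio statistics and, when range_d is
# supplied, overlays the theoretical generalized-ratio density dgera() for
# each candidate intrinsic dimension in range_d, adding a matching legend.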
{ 273 | 274 | 275 | n1 <- attr(x, "n1") 276 | n2 <- attr(x, "n2") 277 | hist(x, 278 | breaks = 30, 279 | freq = F, 280 | col = "white", 281 | main = "", xlab = bquote(mu)) 282 | 283 | if(!is.null(range_d)){ 284 | 285 | for (i in seq_along(range_d)) { 286 | y <- dgera(sort(x), 287 | n1 = n1, 288 | n2 = n2, 289 | d = range_d[i]) 290 | lines(y ~ sort(x), 291 | col = range_d[i] - min(range_d) + 1, 292 | lwd = 2, 293 | lty = 3) 294 | } 295 | 296 | legend( 297 | "topright", 298 | legend = range_d, 299 | col = (range_d) - min(range_d) + 1, 300 | lty = 3, 301 | lwd = 2 302 | ) 303 | } 304 | invisible() 305 | } 306 | 307 | 308 | -------------------------------------------------------------------------------- /R/autoplot_hidalgo_aux.R: -------------------------------------------------------------------------------- 1 | #' Plot the raw traceplots of the \code{id} parameters 2 | #' 3 | #' The function produces the traceplots of the parameters 4 | #' \code{d_k}, for \code{k=1...K}. The ergodic means for all the chains 5 | #' are superimposed. 6 | #' The \code{K} chains that are plotted are not post-processed. 7 | #' Ergo, they are subjected to label switching. 8 | #' 9 | #' @param object object of class \code{Hidalgo}, the output of the 10 | #' \code{Hidalgo()} function. 11 | #' @param ... other arguments passed to specific methods. 12 | #' 13 | #' @importFrom rlang .data 14 | #' 15 | #' @seealso \code{\link{autoplot.Hidalgo}} 16 | #' @rdname autoplot.gride_bayes 17 | #' 18 | #' @return object of class \code{\link[ggplot2]{ggplot}}, which displays the 19 | #' chains of the \code{id} parameters sampled from the mixture model. 20 | #' @keywords Internal 21 | #' @noRd 22 | #' 23 | ggHid_chains <- function(object, ...) { 24 | ID <- object$id_raw 25 | cmm <- (apply(ID, 2, function(x) 26 | cumsum(x) / seq_along(x))) 27 | D <- reshape2::melt(ID) 28 | D1 <- reshape2::melt(cmm) 29 | 30 | ggplot2::ggplot() + 31 | ggplot2::geom_line( 32 | data = D, 33 | ggplot2::aes( 34 | x = .data$Var1, 35 | y = .data$value, 36 | group = .data$Var2 37 | ), 38 | col = "gray", 39 | alpha = .2 40 | ) + 41 | ggplot2::theme_bw() + 42 | ggplot2::ylab("Raw MCMC - Intrinsic Dimension") + 43 | ggplot2::xlab("MCMC Iteration") + 44 | ggplot2::geom_line( 45 | data = D1, 46 | ggplot2::aes( 47 | x = .data$Var1, 48 | y = .data$value, 49 | group = .data$Var2, 50 | col = factor(.data$Var2) 51 | ), 52 | alpha = 1, 53 | lwd = 1 54 | ) + 55 | ggplot2::theme( 56 | axis.title.x = ggplot2::element_text(size = 20), 57 | axis.title.y = ggplot2::element_text(size = 20), 58 | title = ggplot2::element_text(size = 20), 59 | legend.position = "none" 60 | ) 61 | } 62 | 63 | 64 | #' Plot a summary of the distributions of re-arranged chains 65 | #' 66 | #' The function produces two panels, reporting the means (left) and the medians 67 | #' (right) of the processed chains. Each observation is mapped to its own 68 | #' intrinsic dimension value assumed at each iteration \code{t} of the MCMC, 69 | #' denoted as \code{d(t,z_i)}. The 90% credible intervals are also depicted 70 | #' with gray lines. 71 | #' 72 | #' @param object object of class \code{Hidalgo}, the output of the 73 | #' \code{Hidalgo()} function. 74 | #' @param ... other arguments passed to specific methods. 75 | #' @seealso \code{\link{autoplot.Hidalgo}} 76 | #' 77 | #' 78 | #' @return object of class \code{\link[ggplot2]{ggplot}}. 
It displays two 79 | #' scatterplots containing the posterior mean and median \code{id} for each 80 | #' observation, once the MCMC has been postprocessed to handle label 81 | #' switching. 82 | #' 83 | #' @keywords internal 84 | #' @noRd 85 | #' 86 | ggHid_mean_median <- function(object, ...) { 87 | a <- object$id_summary 88 | data <- rbind( 89 | data.frame( 90 | x = a$OBS, 91 | low = a$Q.05, 92 | est = a$MEAN, 93 | upp = a$Q.95, 94 | type = "Mean" 95 | ), 96 | data.frame( 97 | x = a$OBS, 98 | low = a$Q.05, 99 | est = a$MEDIAN, 100 | upp = a$Q.95, 101 | type = "Median" 102 | ) 103 | ) 104 | ggplot2::ggplot(data = data) + 105 | ggplot2::geom_segment( 106 | ggplot2::aes( 107 | x = .data$x, 108 | xend = .data$x, 109 | y = .data$low, 110 | yend = .data$upp 111 | ), 112 | col = "gray", 113 | alpha = .4 114 | ) + 115 | ggplot2::theme_bw() + 116 | ggplot2::theme(text = ggplot2::element_text(size = 20), 117 | legend.position = "none") + 118 | ggplot2::geom_point( 119 | ggplot2::aes(x = .data$x, y = .data$est), 120 | col = "darkblue", 121 | alpha = 1, 122 | pch = 21 123 | ) + 124 | ggplot2::facet_wrap( ~ type) + 125 | ggplot2::xlab("Observation") + 126 | ggplot2::ylab("Intrinsic Dimension") 127 | } 128 | 129 | #' Plot posterior \code{id} for each observation stratified by external factor 130 | #' 131 | #' The function produces different plots to investigate the relationship between 132 | #' the posterior estimates of the \code{id} and an external, categorical 133 | #' variable \code{class}. 134 | #' 135 | #' @param object object of class \code{Hidalgo}, the output of the 136 | #' \code{Hidalgo()} function. 137 | #' @param class factor variable used to stratify observations according to their 138 | #' \code{id} estimates. 139 | #' @param class_plot_type a string indicating the preferred type of plot used 140 | #' for the comparison. It can be: 141 | #' \describe{ 142 | #' \item{\code{"histogram"} or \code{"density"}}{which produces overlapping 143 | #' plots of the stratified distributions} 144 | #' \item{\code{"boxplot"} or \code{"violin"}}{which produces side-to-side 145 | #' boxplots or violin plots} 146 | #' } 147 | #' @param ... other arguments passed to specific methods. 148 | #' 149 | #' @seealso \code{\link{autoplot.Hidalgo}} 150 | #' @keywords internal 151 | #' @noRd 152 | #' 153 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It can be used to 154 | #' visually study the relation between the posterior \code{id} estimates and an 155 | #' exogenous, categorical variable. 156 | #' The type of plot varies according to the specification of 157 | #' \code{class_plot_type}, and it can be either a set of boxplots or 158 | #' violin plots, or a collection of overlapping densities or histograms. 159 | #' 160 | ggHid_class <- function(object, 161 | class, 162 | class_plot_type = c("histogram", "density", 163 | "boxplot", "violin"), 164 | ...) 
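# Validates class_plot_type via match.arg(), stacks the posterior mean and
# median estimates into a single data frame labelled by estimate type, and
# draws the requested comparison (overlapping histograms/densities or
# side-by-side boxplots/violins), faceted by Mean/Median.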
{ 165 | class_plot_type <- match.arg(class_plot_type) 166 | D <- object$id_summary 167 | D <- rbind( 168 | data.frame( 169 | Class = as.factor(class), 170 | est = D$MEAN, 171 | type = "Mean" 172 | ), 173 | data.frame( 174 | Class = as.factor(class), 175 | est = D$MEDIAN, 176 | type = "Median" 177 | ) 178 | ) 179 | 180 | if (class_plot_type == "histogram") { 181 | p1 <- ggplot2::ggplot() + 182 | ggplot2::geom_histogram( 183 | data = D, 184 | ggplot2::aes(x = .data$est, 185 | fill = .data$Class), 186 | position = "identity", 187 | col = 1, 188 | alpha = .5, 189 | bins = 25 190 | ) 191 | } else if (class_plot_type == "density") { 192 | p1 <- ggplot2::ggplot() + 193 | ggplot2::geom_density( 194 | data = D, 195 | ggplot2::aes(x = .data$est, 196 | fill = .data$Class), 197 | position = "identity", 198 | col = 1, 199 | alpha = .5 200 | ) 201 | } else if (class_plot_type == "boxplot") { 202 | p1 <- ggplot2::ggplot() + 203 | ggplot2::geom_boxplot( 204 | data = D, 205 | ggplot2::aes( 206 | x = .data$est, 207 | y = .data$Class, 208 | fill = .data$Class 209 | ), 210 | col = 1 211 | ) 212 | } else if (class_plot_type == "violin") { 213 | p1 <- ggplot2::ggplot() + 214 | ggplot2::geom_violin( 215 | data = D, 216 | ggplot2::aes( 217 | x = .data$est, 218 | y = .data$Class, 219 | fill = .data$Class 220 | ), 221 | col = 1 222 | ) 223 | } 224 | 225 | p1 + 226 | ggplot2::theme_bw() + 227 | ggplot2::theme(text = ggplot2::element_text(size = 20), 228 | legend.position = "none") + 229 | ggplot2::facet_wrap( ~ type) + 230 | ggplot2::xlab("ID posterior estimate") + 231 | ggplot2::ylab("") 232 | 233 | } 234 | 235 | #' Plot the posterior similarity matrix 236 | #' 237 | #' The function produces a heatmap of the posterior similarity (coclustering) 238 | #' matrix (psm) computed from the MCMC output of the function \code{Hidalgo()}. 239 | #' Rows and columns can be organized according to a clustering solution or an 240 | #' exogenous categorical variable. 241 | #' 242 | #' @param object object of class \code{Hidalgo}, the output of the 243 | #' \code{Hidalgo()} function. 244 | #' @param psm posterior similarity matrix that can be provided directly to the 245 | #' function. 246 | #' @param class factor variable used to order the observations according to 247 | #' their \code{id} estimates. 248 | #' @param ... other arguments passed to specific methods. 249 | #' 250 | #' @keywords internal 251 | #' @noRd 252 | #' 253 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It is a heatmap 254 | #' representing the psm, which allows to study the clustering structure present 255 | #' in the data estimated via the mixture model. 256 | #' 257 | #' @seealso \code{\link{autoplot.Hidalgo}} 258 | #' 259 | ggHid_psm <- function(object, 260 | psm = NULL, 261 | class = NULL, 262 | ...) 
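# If no psm is supplied, it is computed from the Hidalgo fit via clustering().
# Rows and columns are then ordered either by the optional class factor or by
# a hierarchical clustering of 1 - psm, and the matrix is rendered as a
# geom_tile() heatmap of posterior coclustering probabilities.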
{ 263 | if (is.null(psm)) { 264 | psm <- clustering(object)$psm 265 | } 266 | 267 | n <- nrow(psm) 268 | 269 | if (!is.null(class)) { 270 | ind <- sort(as.numeric(class),index=T)$ix 271 | D <- data.frame(reshape2::melt(psm[ind,ind])) 272 | 273 | Q1 <- ggplot2::ggplot(D) + 274 | ggplot2::geom_tile(ggplot2::aes( 275 | x = .data$Var2, 276 | y = .data$Var1, 277 | fill = .data$value 278 | )) 279 | 280 | } else { 281 | cl <- stats::hclust(stats::as.dist(1-psm)) 282 | 283 | D <- data.frame(reshape2::melt(psm[cl$order,cl$order])) 284 | 285 | Q1 <- ggplot2::ggplot(D) + 286 | ggplot2::geom_tile(ggplot2::aes( 287 | x = (.data$Var1), 288 | y = (.data$Var2), 289 | fill = .data$value 290 | )) 291 | } 292 | Q <- Q1 + 293 | ggplot2::theme_bw() + 294 | ggplot2::theme( 295 | text = ggplot2::element_text(size = 20), 296 | legend.position = "bottom", 297 | legend.margin = ggplot2::margin(), 298 | legend.key.width = ggplot2::unit(2, "cm") 299 | ) + 300 | ggplot2::xlab("") + 301 | ggplot2::ylab("") + 302 | ggplot2::scale_fill_gradient("PCP ", 303 | low = "white", high = 4) 304 | 305 | 306 | return(Q) 307 | } 308 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include <RcppArmadillo.h> 5 | #include <Rcpp.h> 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // log_Likelihood_double 15 | double log_Likelihood_double(double mu_obs, double d); 16 | RcppExport SEXP _intRinsic_log_Likelihood_double(SEXP mu_obsSEXP, SEXP dSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< double >::type mu_obs(mu_obsSEXP); 21 | Rcpp::traits::input_parameter< double >::type d(dSEXP); 22 | rcpp_result_gen = Rcpp::wrap(log_Likelihood_double(mu_obs, d)); 23 | return rcpp_result_gen; 24 | END_RCPP 25 | } 26 | // Groups_quantities 27 | arma::mat Groups_quantities(arma::colvec mu_obser, arma::vec Ci, int K); 28 | RcppExport SEXP _intRinsic_Groups_quantities(SEXP mu_obserSEXP, SEXP CiSEXP, SEXP KSEXP) { 29 | BEGIN_RCPP 30 | Rcpp::RObject rcpp_result_gen; 31 | Rcpp::RNGScope rcpp_rngScope_gen; 32 | Rcpp::traits::input_parameter< arma::colvec >::type mu_obser(mu_obserSEXP); 33 | Rcpp::traits::input_parameter< arma::vec >::type Ci(CiSEXP); 34 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 35 | rcpp_result_gen = Rcpp::wrap(Groups_quantities(mu_obser, Ci, K)); 36 | return rcpp_result_gen; 37 | END_RCPP 38 | } 39 | // Norm_Constant_Z_l2 40 | double Norm_Constant_Z_l2(int Nzi_l, int N, double xi, int q); 41 | RcppExport SEXP _intRinsic_Norm_Constant_Z_l2(SEXP Nzi_lSEXP, SEXP NSEXP, SEXP xiSEXP, SEXP qSEXP) { 42 | BEGIN_RCPP 43 | Rcpp::RObject rcpp_result_gen; 44 | Rcpp::RNGScope rcpp_rngScope_gen; 45 | Rcpp::traits::input_parameter< int >::type Nzi_l(Nzi_lSEXP); 46 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 47 | Rcpp::traits::input_parameter< double >::type xi(xiSEXP); 48 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 49 | rcpp_result_gen = Rcpp::wrap(Norm_Constant_Z_l2(Nzi_l, N, xi, q)); 50 | return rcpp_result_gen; 51 | END_RCPP 52 | } 53 | // log_Zeta_maker 54 | arma::vec log_Zeta_maker(int N, double xi, int q); 55 | RcppExport
SEXP _intRinsic_log_Zeta_maker(SEXP NSEXP, SEXP xiSEXP, SEXP qSEXP) { 56 | BEGIN_RCPP 57 | Rcpp::RObject rcpp_result_gen; 58 | Rcpp::RNGScope rcpp_rngScope_gen; 59 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 60 | Rcpp::traits::input_parameter< double >::type xi(xiSEXP); 61 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 62 | rcpp_result_gen = Rcpp::wrap(log_Zeta_maker(N, xi, q)); 63 | return rcpp_result_gen; 64 | END_RCPP 65 | } 66 | // index_row_col 67 | Rcpp::List index_row_col(arma::mat Nq, int q, int N); 68 | RcppExport SEXP _intRinsic_index_row_col(SEXP NqSEXP, SEXP qSEXP, SEXP NSEXP) { 69 | BEGIN_RCPP 70 | Rcpp::RObject rcpp_result_gen; 71 | Rcpp::RNGScope rcpp_rngScope_gen; 72 | Rcpp::traits::input_parameter< arma::mat >::type Nq(NqSEXP); 73 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 74 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 75 | rcpp_result_gen = Rcpp::wrap(index_row_col(Nq, q, N)); 76 | return rcpp_result_gen; 77 | END_RCPP 78 | } 79 | // rdir_cpp 80 | arma::colvec rdir_cpp(arma::colvec alpha); 81 | RcppExport SEXP _intRinsic_rdir_cpp(SEXP alphaSEXP) { 82 | BEGIN_RCPP 83 | Rcpp::RObject rcpp_result_gen; 84 | Rcpp::RNGScope rcpp_rngScope_gen; 85 | Rcpp::traits::input_parameter< arma::colvec >::type alpha(alphaSEXP); 86 | rcpp_result_gen = Rcpp::wrap(rdir_cpp(alpha)); 87 | return rcpp_result_gen; 88 | END_RCPP 89 | } 90 | // Update_memberships_faster 91 | arma::colvec Update_memberships_faster(arma::colvec mu_obser, arma::colvec dl, arma::colvec pl, int K, int N, int q, arma::colvec possible_label, arma::colvec Ci, double QQ, arma::umat index_row, Rcpp::List index_col, arma::colvec log_Precomp_Z, arma::colvec log_Precomp_ratios); 92 | RcppExport SEXP _intRinsic_Update_memberships_faster(SEXP mu_obserSEXP, SEXP dlSEXP, SEXP plSEXP, SEXP KSEXP, SEXP NSEXP, SEXP qSEXP, SEXP possible_labelSEXP, SEXP CiSEXP, SEXP QQSEXP, SEXP index_rowSEXP, SEXP index_colSEXP, SEXP log_Precomp_ZSEXP, SEXP log_Precomp_ratiosSEXP) { 93 | BEGIN_RCPP 94 | Rcpp::RObject rcpp_result_gen; 95 | Rcpp::RNGScope rcpp_rngScope_gen; 96 | Rcpp::traits::input_parameter< arma::colvec >::type mu_obser(mu_obserSEXP); 97 | Rcpp::traits::input_parameter< arma::colvec >::type dl(dlSEXP); 98 | Rcpp::traits::input_parameter< arma::colvec >::type pl(plSEXP); 99 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 100 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 101 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 102 | Rcpp::traits::input_parameter< arma::colvec >::type possible_label(possible_labelSEXP); 103 | Rcpp::traits::input_parameter< arma::colvec >::type Ci(CiSEXP); 104 | Rcpp::traits::input_parameter< double >::type QQ(QQSEXP); 105 | Rcpp::traits::input_parameter< arma::umat >::type index_row(index_rowSEXP); 106 | Rcpp::traits::input_parameter< Rcpp::List >::type index_col(index_colSEXP); 107 | Rcpp::traits::input_parameter< arma::colvec >::type log_Precomp_Z(log_Precomp_ZSEXP); 108 | Rcpp::traits::input_parameter< arma::colvec >::type log_Precomp_ratios(log_Precomp_ratiosSEXP); 109 | rcpp_result_gen = Rcpp::wrap(Update_memberships_faster(mu_obser, dl, pl, K, N, q, possible_label, Ci, QQ, index_row, index_col, log_Precomp_Z, log_Precomp_ratios)); 110 | return rcpp_result_gen; 111 | END_RCPP 112 | } 113 | // gam_trunc 114 | arma::colvec gam_trunc(int D, int K, double a0_d, double b0_d, arma::colvec n_l, arma::colvec sLog); 115 | RcppExport SEXP _intRinsic_gam_trunc(SEXP DSEXP, SEXP KSEXP, SEXP a0_dSEXP, SEXP b0_dSEXP, SEXP n_lSEXP, SEXP sLogSEXP) { 116 | 
BEGIN_RCPP 117 | Rcpp::RObject rcpp_result_gen; 118 | Rcpp::RNGScope rcpp_rngScope_gen; 119 | Rcpp::traits::input_parameter< int >::type D(DSEXP); 120 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 121 | Rcpp::traits::input_parameter< double >::type a0_d(a0_dSEXP); 122 | Rcpp::traits::input_parameter< double >::type b0_d(b0_dSEXP); 123 | Rcpp::traits::input_parameter< arma::colvec >::type n_l(n_lSEXP); 124 | Rcpp::traits::input_parameter< arma::colvec >::type sLog(sLogSEXP); 125 | rcpp_result_gen = Rcpp::wrap(gam_trunc(D, K, a0_d, b0_d, n_l, sLog)); 126 | return rcpp_result_gen; 127 | END_RCPP 128 | } 129 | // gam_trunc_pmass 130 | arma::colvec gam_trunc_pmass(int D, int K, double a0_d, double b0_d, arma::colvec n_l, arma::colvec sLog, double pi_mass); 131 | RcppExport SEXP _intRinsic_gam_trunc_pmass(SEXP DSEXP, SEXP KSEXP, SEXP a0_dSEXP, SEXP b0_dSEXP, SEXP n_lSEXP, SEXP sLogSEXP, SEXP pi_massSEXP) { 132 | BEGIN_RCPP 133 | Rcpp::RObject rcpp_result_gen; 134 | Rcpp::RNGScope rcpp_rngScope_gen; 135 | Rcpp::traits::input_parameter< int >::type D(DSEXP); 136 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 137 | Rcpp::traits::input_parameter< double >::type a0_d(a0_dSEXP); 138 | Rcpp::traits::input_parameter< double >::type b0_d(b0_dSEXP); 139 | Rcpp::traits::input_parameter< arma::colvec >::type n_l(n_lSEXP); 140 | Rcpp::traits::input_parameter< arma::colvec >::type sLog(sLogSEXP); 141 | Rcpp::traits::input_parameter< double >::type pi_mass(pi_massSEXP); 142 | rcpp_result_gen = Rcpp::wrap(gam_trunc_pmass(D, K, a0_d, b0_d, n_l, sLog, pi_mass)); 143 | return rcpp_result_gen; 144 | END_RCPP 145 | } 146 | // gride_log_likelihood 147 | double gride_log_likelihood(double d, int n1, int n2, arma::colvec mus_n1_n2); 148 | RcppExport SEXP _intRinsic_gride_log_likelihood(SEXP dSEXP, SEXP n1SEXP, SEXP n2SEXP, SEXP mus_n1_n2SEXP) { 149 | BEGIN_RCPP 150 | Rcpp::RObject rcpp_result_gen; 151 | Rcpp::RNGScope rcpp_rngScope_gen; 152 | Rcpp::traits::input_parameter< double >::type d(dSEXP); 153 | Rcpp::traits::input_parameter< int >::type n1(n1SEXP); 154 | Rcpp::traits::input_parameter< int >::type n2(n2SEXP); 155 | Rcpp::traits::input_parameter< arma::colvec >::type mus_n1_n2(mus_n1_n2SEXP); 156 | rcpp_result_gen = Rcpp::wrap(gride_log_likelihood(d, n1, n2, mus_n1_n2)); 157 | return rcpp_result_gen; 158 | END_RCPP 159 | } 160 | // gride_log_posterior 161 | double gride_log_posterior(double z, int n1, int n2, double a_d, double b_d, arma::colvec mus_n1_n2); 162 | RcppExport SEXP _intRinsic_gride_log_posterior(SEXP zSEXP, SEXP n1SEXP, SEXP n2SEXP, SEXP a_dSEXP, SEXP b_dSEXP, SEXP mus_n1_n2SEXP) { 163 | BEGIN_RCPP 164 | Rcpp::RObject rcpp_result_gen; 165 | Rcpp::RNGScope rcpp_rngScope_gen; 166 | Rcpp::traits::input_parameter< double >::type z(zSEXP); 167 | Rcpp::traits::input_parameter< int >::type n1(n1SEXP); 168 | Rcpp::traits::input_parameter< int >::type n2(n2SEXP); 169 | Rcpp::traits::input_parameter< double >::type a_d(a_dSEXP); 170 | Rcpp::traits::input_parameter< double >::type b_d(b_dSEXP); 171 | Rcpp::traits::input_parameter< arma::colvec >::type mus_n1_n2(mus_n1_n2SEXP); 172 | rcpp_result_gen = Rcpp::wrap(gride_log_posterior(z, n1, n2, a_d, b_d, mus_n1_n2)); 173 | return rcpp_result_gen; 174 | END_RCPP 175 | } 176 | // gride_mh_sampler 177 | arma::colvec gride_mh_sampler(double start_d, int n1, int n2, double a_d, double b_d, arma::colvec mus_n1_n2, int nsim, int burn_in, double sigma); 178 | RcppExport SEXP _intRinsic_gride_mh_sampler(SEXP start_dSEXP, SEXP n1SEXP, SEXP n2SEXP, SEXP 
a_dSEXP, SEXP b_dSEXP, SEXP mus_n1_n2SEXP, SEXP nsimSEXP, SEXP burn_inSEXP, SEXP sigmaSEXP) { 179 | BEGIN_RCPP 180 | Rcpp::RObject rcpp_result_gen; 181 | Rcpp::RNGScope rcpp_rngScope_gen; 182 | Rcpp::traits::input_parameter< double >::type start_d(start_dSEXP); 183 | Rcpp::traits::input_parameter< int >::type n1(n1SEXP); 184 | Rcpp::traits::input_parameter< int >::type n2(n2SEXP); 185 | Rcpp::traits::input_parameter< double >::type a_d(a_dSEXP); 186 | Rcpp::traits::input_parameter< double >::type b_d(b_dSEXP); 187 | Rcpp::traits::input_parameter< arma::colvec >::type mus_n1_n2(mus_n1_n2SEXP); 188 | Rcpp::traits::input_parameter< int >::type nsim(nsimSEXP); 189 | Rcpp::traits::input_parameter< int >::type burn_in(burn_inSEXP); 190 | Rcpp::traits::input_parameter< double >::type sigma(sigmaSEXP); 191 | rcpp_result_gen = Rcpp::wrap(gride_mh_sampler(start_d, n1, n2, a_d, b_d, mus_n1_n2, nsim, burn_in, sigma)); 192 | return rcpp_result_gen; 193 | END_RCPP 194 | } 195 | 196 | static const R_CallMethodDef CallEntries[] = { 197 | {"_intRinsic_log_Likelihood_double", (DL_FUNC) &_intRinsic_log_Likelihood_double, 2}, 198 | {"_intRinsic_Groups_quantities", (DL_FUNC) &_intRinsic_Groups_quantities, 3}, 199 | {"_intRinsic_Norm_Constant_Z_l2", (DL_FUNC) &_intRinsic_Norm_Constant_Z_l2, 4}, 200 | {"_intRinsic_log_Zeta_maker", (DL_FUNC) &_intRinsic_log_Zeta_maker, 3}, 201 | {"_intRinsic_index_row_col", (DL_FUNC) &_intRinsic_index_row_col, 3}, 202 | {"_intRinsic_rdir_cpp", (DL_FUNC) &_intRinsic_rdir_cpp, 1}, 203 | {"_intRinsic_Update_memberships_faster", (DL_FUNC) &_intRinsic_Update_memberships_faster, 13}, 204 | {"_intRinsic_gam_trunc", (DL_FUNC) &_intRinsic_gam_trunc, 6}, 205 | {"_intRinsic_gam_trunc_pmass", (DL_FUNC) &_intRinsic_gam_trunc_pmass, 7}, 206 | {"_intRinsic_gride_log_likelihood", (DL_FUNC) &_intRinsic_gride_log_likelihood, 4}, 207 | {"_intRinsic_gride_log_posterior", (DL_FUNC) &_intRinsic_gride_log_posterior, 6}, 208 | {"_intRinsic_gride_mh_sampler", (DL_FUNC) &_intRinsic_gride_mh_sampler, 9}, 209 | {NULL, NULL, 0} 210 | }; 211 | 212 | RcppExport void R_init_intRinsic(DllInfo *dll) { 213 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 214 | R_useDynamicSymbols(dll, FALSE); 215 | } 216 | --------------------------------------------------------------------------------
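The registered routines above (in particular gride_mh_sampler) drive the Bayesian Gride estimator whose R-level wrapper and S3 methods appear earlier in this dump. The following is a minimal usage sketch, not taken from the package sources: it assumes the exported gride() interface accepts method = "bayes" (as stated in its documentation above) and forwards the NN orders n1/n2 together with the MCMC settings nsim and burn_in used by gride_bayes(); those last argument names are illustrative assumptions.

library(intRinsic)
set.seed(1234)
X  <- replicate(5, rnorm(500))        # synthetic data with nominal dimension 5
mu <- compute_mus(X, n1 = 1, n2 = 2)  # ratio statistics used by the estimators
plot(mu, range_d = 2:6)               # histogram with generalized-ratio overlays
fit <- gride(X, n1 = 1, n2 = 2, method = "bayes",
             nsim = 5000, burn_in = 2000)
summary(fit)                          # posterior mean, median, mode, and credible bounds
plot(fit)                             # traceplot and posterior density (plot.gride_bayes)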