├── .github ├── .gitignore └── workflows │ └── rhub.yaml ├── src ├── .gitignore ├── Makevars ├── Makevars.win ├── C_GAMMA.h ├── A_AUX.h ├── B_FULLCOND.h ├── G_GRIDE.h ├── C_GAMMA.cpp ├── Z_unused.cpp ├── A_AUX.cpp ├── B_FULLCOND.cpp ├── G_GRIDE.cpp └── RcppExports.cpp ├── LICENSE ├── man ├── figures │ ├── logo.png │ ├── intLogo.png │ └── README-pressure-1.png ├── reexports.Rd ├── Swissroll.Rd ├── autoplot.gride_evolution.Rd ├── autoplot.gride_mle.Rd ├── autoplot.twonn_linfit.Rd ├── autoplot.twonn_mle.Rd ├── auxHidalgo.Rd ├── id_by_class.Rd ├── generalized_ratios_distribution.Rd ├── intRinsic-package.Rd ├── autoplot.twonn_bayes.Rd ├── autoplot.gride_bayes.Rd ├── gride_evolution.Rd ├── twonn_decimated.Rd ├── compute_mus.Rd ├── clustering.Rd ├── twonn_decimation.Rd ├── autoplot.Hidalgo.Rd ├── gride.Rd ├── twonn.Rd └── Hidalgo.Rd ├── CRAN-SUBMISSION ├── .gitignore ├── R ├── zzz.R ├── rdir_exmcmcPack.R ├── intRinsic-package.R ├── general_auxiliary.R ├── RcppExports.R ├── hidalgo_Aux.R ├── gera_distribution.R ├── twonn_dec_depr.R ├── twonn_dec.R ├── gride_mle_aux.R ├── twonn_linfit.R ├── autoplot_hidalgo.R ├── twoNN.R ├── twonn_mle.R ├── gride_mle.R ├── twonn_dec_steps.R ├── autoplot_gride.R ├── gride.R ├── twonn_dec_prop.R ├── twonn_bayes.R ├── hidalgo_postproc.R ├── autoplot_twonn.R ├── gride_evolution.R ├── gride_bayes.R ├── compute_mus.R └── autoplot_hidalgo_aux.R ├── .Rbuildignore ├── intRinsic.Rproj ├── inst └── CITATION ├── cran-comments.md ├── DESCRIPTION ├── README.md ├── README.Rmd ├── NAMESPACE └── NEWS.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Francesco Denti 3 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fradenti/intRinsic/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/intLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fradenti/intRinsic/HEAD/man/figures/intLogo.png -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 1.1.2 2 | Date: 2025-12-17 14:16:39 UTC 3 | SHA: 8e0028cb9ad281128165c453e9219c8f4cec78a4 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fradenti/intRinsic/HEAD/man/figures/README-pressure-1.png 
-------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # Clean after unload 2 | .onUnload <- function(libpath) { 3 | library.dynam.unload("intRinsic", libpath) 4 | } 5 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^\.github$ 5 | ^cran-comments\.md$ 6 | ^\.travis\.yml$ 7 | ^CRAN-SUBMISSION$ 8 | ^codecov\.yml$ 9 | ^README\.Rmd$ 10 | -------------------------------------------------------------------------------- /R/rdir_exmcmcPack.R: -------------------------------------------------------------------------------- 1 | #' #' @keywords internal 2 | #' .mcmc_pack_rdirichlet <- function (n, alpha) 3 | #' { 4 | #' l <- length(alpha) 5 | #' x <- matrix(stats::rgamma(l * n, alpha), ncol = l, byrow = TRUE) 6 | #' sm <- x %*% rep(1, l) 7 | #' return(x/as.vector(sm)) 8 | #' } 9 | -------------------------------------------------------------------------------- /R/intRinsic-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | # The following block is used by usethis to automatically manage 5 | # roxygen namespace tags. Modify with care! 6 | ## usethis namespace: start 7 | #' @importFrom ggplot2 autoplot 8 | #' @importFrom ggplot2 ggplot 9 | #' @importFrom Rcpp sourceCpp 10 | #' @useDynLib intRinsic, .registration = TRUE 11 | ## usethis namespace: end 12 | NULL 13 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_hidalgo.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{autoplot} 7 | \title{Objects exported from other packages} 8 | \keyword{internal} 9 | \description{ 10 | These objects are imported from other packages. Follow the links 11 | below to see their documentation. 
12 | 13 | \describe{ 14 | \item{ggplot2}{\code{\link[ggplot2]{autoplot}}} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /intRinsic.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran --no-manual 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry(bibtype = "Article", 2 | title = "{intRinsic}: An {R} Package for Model-Based Estimation of the Intrinsic Dimension of a Dataset", 3 | author = person(given = "Francesco", family = "Denti", email = "francesco.denti@unicatt.it"), 4 | journal = "Journal of Statistical Software", 5 | year = "2023", 6 | volume = "106", 7 | number = "9", 8 | pages = "1--45", 9 | doi = "10.18637/jss.v106.i09", 10 | header = "To cite intRinsic in publications use:" 11 | ) 12 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Version 1.1.2 2 | 3 | With this version, we: 4 | 5 | - removed the strong dependency of `intRinsic` on the `latex2exp` package, which has been scheduled for archival. 6 | 7 | ## R CMD check results 8 | 9 | Running `devtools::check(args = c('--as-cran','--no-manual'))` locally, together with the functions `devtools::check_win_devel()`, 10 | `devtools::check_win_release()`, and `devtools::check_mac_release()`, does not produce any ERRORs, WARNINGs, or NOTEs. 11 | 12 | ## Downstream dependencies 13 | 14 | There are currently no downstream dependencies for this package.
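For reference, the check routine described above can be reproduced with the calls below. This is only a convenience sketch: it assumes `devtools` is installed and that the commands are run from the package root.

```r
# Local check with the same flags reported above
devtools::check(args = c("--as-cran", "--no-manual"))

# Remote checks on the win-builder and macOS builder services
devtools::check_win_devel()
devtools::check_win_release()
devtools::check_mac_release()
```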
15 | -------------------------------------------------------------------------------- /src/C_GAMMA.h: -------------------------------------------------------------------------------- 1 | #ifndef C_GAMMA 2 | #define C_GAMMA 3 | 4 | #include 5 | using namespace Rcpp; 6 | 7 | arma::colvec gam_trunc(int D, int K, 8 | double a0_d, double b0_d, 9 | arma::colvec n_l, 10 | arma::colvec sLog); 11 | 12 | arma::colvec gam_trunc_pmass(int D, int K, 13 | double a0_d, double b0_d, 14 | arma::colvec n_l, 15 | arma::colvec sLog, 16 | double pi_mass); 17 | #endif 18 | -------------------------------------------------------------------------------- /src/A_AUX.h: -------------------------------------------------------------------------------- 1 | #ifndef A_AUX 2 | #define A_AUX 3 | 4 | #include 5 | 6 | double log_Likelihood_double(double mu_obs, double d); 7 | arma::rowvec Stratified_operations_0(arma::vec x, 8 | arma::vec col1, 9 | int val1); 10 | arma::mat Groups_quantities(arma::colvec mu_obser, 11 | arma::vec Ci, 12 | int K); 13 | Rcpp::List index_row_col(arma::mat Nq, 14 | int q, 15 | int N); 16 | 17 | double Norm_Constant_Z_l2( int Nzi_l, int N, double xi, int q); 18 | 19 | arma::colvec rdir_cpp(arma::colvec alpha); 20 | 21 | #endif 22 | 23 | -------------------------------------------------------------------------------- /man/Swissroll.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/general_auxiliary.R 3 | \name{Swissroll} 4 | \alias{Swissroll} 5 | \title{Generates a noise-free Swiss roll dataset} 6 | \usage{ 7 | Swissroll(n) 8 | } 9 | \arguments{ 10 | \item{n}{number of observations contained in the output dataset.} 11 | } 12 | \value{ 13 | a three-dimensional \code{data.frame} containing the coordinates of 14 | the points generated via the Swiss roll mapping. 15 | } 16 | \description{ 17 | The function creates a three-dimensional dataset with coordinates 18 | following the Swiss roll mapping, transforming random uniform data points 19 | sampled on the interval \code{(0,10)}. 20 | } 21 | \examples{ 22 | Data <- Swissroll(1000) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /R/general_auxiliary.R: -------------------------------------------------------------------------------- 1 | #' Generates a noise-free Swiss roll dataset 2 | #' 3 | #' The function creates a three-dimensional dataset with coordinates 4 | #' following the Swiss roll mapping, transforming random uniform data points 5 | #' sampled on the interval \code{(0,10)}. 6 | #' 7 | #' @param n number of observations contained in the output dataset. 8 | #' 9 | #' @return a three-dimensional \code{data.frame} containing the coordinates of 10 | #' the points generated via the Swiss roll mapping. 
11 | #' 12 | #' @export 13 | #' 14 | #' @examples 15 | #' Data <- Swissroll(1000) 16 | #' 17 | Swissroll <- function(n) { 18 | X <- stats::runif(n, 0, 10) 19 | Y <- stats::runif(n, 0, 10) 20 | return(SR = data.frame( 21 | x = X * cos(X), 22 | y = Y, 23 | z = X * sin(X) 24 | )) 25 | } 26 | 27 | -------------------------------------------------------------------------------- /src/B_FULLCOND.h: -------------------------------------------------------------------------------- 1 | #ifndef B_FULLCOND 2 | #define B_FULLCOND 3 | 4 | #include 5 | #include "A_AUX.h" 6 | //[[Rcpp::depends(RcppArmadillo)]] 7 | #include 8 | using namespace Rcpp; 9 | 10 | arma::colvec Update_memberships_faster(arma::colvec mu_obser, 11 | arma::colvec dl, 12 | arma::colvec pl, 13 | int K, int N,int q, 14 | arma::colvec possible_label, 15 | arma::colvec Ci, double QQ, 16 | arma::umat index_row, 17 | Rcpp::List index_col, 18 | arma::colvec log_Precomp_Z, 19 | arma::colvec log_Precomp_ratios); 20 | #endif 21 | -------------------------------------------------------------------------------- /src/G_GRIDE.h: -------------------------------------------------------------------------------- 1 | #ifndef G_GRIDE 2 | #define G_GRIDE 3 | 4 | 5 | double gride_log_likelihood(double d, 6 | int n1, 7 | int n2, 8 | arma::colvec mus_n1_n2); 9 | 10 | double gride_log_posterior(double z, 11 | int n1, 12 | int n2, 13 | double a_d, 14 | double b_d, 15 | arma::colvec mus_n1_n2); 16 | 17 | arma::colvec gride_mh_sampler(double start_d, 18 | int n1, 19 | int n2, 20 | double a_d, 21 | double b_d, 22 | arma::colvec mus_n1_n2, 23 | int nsim, 24 | int burn_in, 25 | double sigma); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /man/autoplot.gride_evolution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gride_evolution.R 3 | \name{autoplot.gride_evolution} 4 | \alias{autoplot.gride_evolution} 5 | \title{Plot the evolution of \code{Gride} estimates} 6 | \usage{ 7 | \method{autoplot}{gride_evolution}(object, title = "Gride Evolution", ...) 8 | } 9 | \arguments{ 10 | \item{object}{an object of class \code{gride_evolution}.} 11 | 12 | \item{title}{an optional string to customize the title of the plot.} 13 | 14 | \item{...}{other arguments passed to specific methods.} 15 | } 16 | \value{ 17 | object of class \code{\link[ggplot2]{ggplot}}. It displays the 18 | the evolution of the Gride maximum likelihood estimates as a function 19 | of the average distance from \code{n2}. 20 | } 21 | \description{ 22 | Use this method without the \code{.gride_evolution} suffix. 23 | It plots the evolution of the \code{id} 24 | estimates as a function of the average distance from the furthest NN of 25 | each point. 26 | } 27 | -------------------------------------------------------------------------------- /man/autoplot.gride_mle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_gride.R 3 | \name{autoplot.gride_mle} 4 | \alias{autoplot.gride_mle} 5 | \title{Plot the simulated bootstrap sample for the MLE \code{Gride}} 6 | \usage{ 7 | \method{autoplot}{gride_mle}(object, title = "MLE Gride - Bootstrap sample", ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{gride_mle}. 
11 | It is obtained using the output of the \code{gride} function when 12 | \code{method = "mle"}.} 13 | 14 | \item{title}{title for the plot.} 15 | 16 | \item{...}{other arguments passed to specific methods.} 17 | } 18 | \value{ 19 | object of class \code{\link[ggplot2]{ggplot}}. It displays the 20 | density plot of the sample generated via parametric bootstrap to help the 21 | visual assessment of the uncertainty of the \code{id} estimates. 22 | } 23 | \description{ 24 | Use this method without the \code{.gride_mle} suffix. 25 | It displays the density plot of sample obtained via 26 | parametric bootstrap for the \code{Gride} model. 27 | } 28 | \seealso{ 29 | \code{\link{gride}} 30 | } 31 | -------------------------------------------------------------------------------- /man/autoplot.twonn_linfit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_twonn.R 3 | \name{autoplot.twonn_linfit} 4 | \alias{autoplot.twonn_linfit} 5 | \title{Plot the output of the \code{TWO-NN} model estimated via least squares} 6 | \usage{ 7 | \method{autoplot}{twonn_linfit}(object, title = "TWO-NN Linear Fit", ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{twonn_linfit}, the output of the 11 | \code{twonn} function when \code{method = "linfit"}.} 12 | 13 | \item{title}{string used as title of the plot.} 14 | 15 | \item{...}{other arguments passed to specific methods.} 16 | } 17 | \value{ 18 | a \code{\link[ggplot2]{ggplot2}} object displaying the goodness of 19 | the linear fit of the TWO-NN model. 20 | } 21 | \description{ 22 | Use this method without the \code{.twonn_linfit} suffix. 23 | The function returns the representation of the linear 24 | regression that is fitted with the \code{linfit} method. 25 | } 26 | \seealso{ 27 | \code{\link{twonn}} 28 | 29 | Other autoplot methods: 30 | \code{\link{autoplot.Hidalgo}()}, 31 | \code{\link{autoplot.gride_bayes}()}, 32 | \code{\link{autoplot.twonn_bayes}()}, 33 | \code{\link{autoplot.twonn_mle}()} 34 | } 35 | \concept{autoplot methods} 36 | -------------------------------------------------------------------------------- /man/autoplot.twonn_mle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_twonn.R 3 | \name{autoplot.twonn_mle} 4 | \alias{autoplot.twonn_mle} 5 | \title{Plot the output of the \code{TWO-NN} model estimated via the Maximum 6 | Likelihood approach} 7 | \usage{ 8 | \method{autoplot}{twonn_mle}(object, title = "MLE TWO-NN", ...) 9 | } 10 | \arguments{ 11 | \item{object}{object of class \code{twonn_mle}, the output of the 12 | \code{twonn} function when \code{method = "mle"}.} 13 | 14 | \item{title}{character string used as title of the plot.} 15 | 16 | \item{...}{other arguments passed to specific methods.} 17 | } 18 | \value{ 19 | \code{\link[ggplot2]{ggplot2}} object displaying the point estimate 20 | and confidence interval obtained via the maximum likelihood approach of the 21 | \code{id} parameter. 22 | } 23 | \description{ 24 | Use this method without the \code{.twonn_mle} suffix. 25 | The function returns the point estimate along with the confidence bands 26 | computed via the \code{mle} method. 
27 | } 28 | \seealso{ 29 | \code{\link{twonn}} 30 | 31 | Other autoplot methods: 32 | \code{\link{autoplot.Hidalgo}()}, 33 | \code{\link{autoplot.gride_bayes}()}, 34 | \code{\link{autoplot.twonn_bayes}()}, 35 | \code{\link{autoplot.twonn_linfit}()} 36 | } 37 | \concept{autoplot methods} 38 | -------------------------------------------------------------------------------- /man/auxHidalgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo_Aux.R 3 | \name{auxHidalgo} 4 | \alias{auxHidalgo} 5 | \alias{posterior_means} 6 | \alias{initial_values} 7 | \alias{posterior_medians} 8 | \alias{credible_intervals} 9 | \title{Auxiliary functions for the \code{Hidalgo} model} 10 | \usage{ 11 | posterior_means(x) 12 | 13 | initial_values(x) 14 | 15 | posterior_medians(x) 16 | 17 | credible_intervals(x, alpha = 0.95) 18 | } 19 | \arguments{ 20 | \item{x}{object of class \code{Hidalgo}, the output of the 21 | \code{Hidalgo()} function.} 22 | 23 | \item{alpha}{posterior probability contained in the computed credible 24 | interval.} 25 | } 26 | \value{ 27 | \code{posterior_mean} returns the observation-specific \code{id} posterior means estimated with \code{Hidalgo}. 28 | 29 | \code{initial_values} returns a list with the parameter specification 30 | passed to the model. 31 | 32 | \code{posterior_median} returns the observation-specific \code{id} posterior medians estimated with \code{Hidalgo}. 33 | 34 | \code{credible_interval} returns the observation-specific credible intervals for a specific 35 | probability \code{alpha}. 36 | } 37 | \description{ 38 | Collection of functions used to extract meaningful information from the object returned 39 | by the function \code{Hidalgo} 40 | } 41 | -------------------------------------------------------------------------------- /man/id_by_class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo_postproc.R 3 | \name{id_by_class} 4 | \alias{id_by_class} 5 | \alias{print.hidalgo_class} 6 | \title{Stratification of the \code{id} by an external categorical variable} 7 | \usage{ 8 | id_by_class(object, class) 9 | 10 | \method{print}{hidalgo_class}(x, ...) 11 | } 12 | \arguments{ 13 | \item{object}{object of class \code{Hidalgo}, the output of the 14 | \code{Hidalgo()} function.} 15 | 16 | \item{class}{factor according to the observations should be stratified by.} 17 | 18 | \item{x}{object of class \code{hidalgo_class}, the output of the \code{id_by_class()} function.} 19 | 20 | \item{...}{other arguments passed to specific methods.} 21 | } 22 | \value{ 23 | a \code{data.frame} containing the posterior \code{id} means, 24 | medians, and standard deviations stratified by the levels of the variable 25 | \code{class}. 26 | } 27 | \description{ 28 | The function computes summary statistics (mean, median, and standard 29 | deviation) of the post-processed chains of the intrinsic dimension stratified 30 | by an external categorical variable. 
31 | } 32 | \examples{ 33 | \donttest{ 34 | X <- replicate(5,rnorm(500)) 35 | X[1:250,1:2] <- 0 36 | oracle <- rep(1:2,rep(250,2)) 37 | h_out <- Hidalgo(X) 38 | id_by_class(h_out,oracle) 39 | } 40 | 41 | } 42 | \seealso{ 43 | \code{\link{Hidalgo}} 44 | } 45 | -------------------------------------------------------------------------------- /man/generalized_ratios_distribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gera_distribution.R 3 | \name{generalized_ratios_distribution} 4 | \alias{generalized_ratios_distribution} 5 | \alias{dgera} 6 | \alias{rgera} 7 | \title{The Generalized Ratio distribution} 8 | \usage{ 9 | rgera(nsim, n1 = 1, n2 = 2, d) 10 | 11 | dgera(x, n1 = 1, n2 = 2, d, log = FALSE) 12 | } 13 | \arguments{ 14 | \item{nsim}{integer, the number of observations to generate.} 15 | 16 | \item{n1}{order of the first NN considered. Default is 1.} 17 | 18 | \item{n2}{order of the second NN considered. Default is 2.} 19 | 20 | \item{d}{value of the intrinsic dimension.} 21 | 22 | \item{x}{vector of quantiles.} 23 | 24 | \item{log}{logical, if \code{TRUE}, it returns the log-density} 25 | } 26 | \value{ 27 | \code{dgera} gives the density. \code{rgera} returns a vector of 28 | random observations sampled from the generalized ratio distribution. 29 | } 30 | \description{ 31 | Density function and random number generator for the Generalized Ratio 32 | distribution with NN orders equal to \code{n1} and \code{n2}. 33 | See \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 34 | for more details. 35 | } 36 | \examples{ 37 | draws <- rgera(100,3,5,2) 38 | density <- dgera(3,3,5,2) 39 | 40 | } 41 | \references{ 42 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic dimension estimator." 43 | Scientific Reports, 12(20005). 44 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 45 | } 46 | -------------------------------------------------------------------------------- /man/intRinsic-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/intRinsic-package.R 3 | \docType{package} 4 | \name{intRinsic-package} 5 | \alias{intRinsic} 6 | \alias{intRinsic-package} 7 | \title{intRinsic: Likelihood-Based Intrinsic Dimension Estimators} 8 | \description{ 9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 10 | 11 | Provides functions to estimate the intrinsic dimension of a dataset via likelihood-based approaches. Specifically, the package implements the 'TWO-NN' and 'Gride' estimators and the 'Hidalgo' Bayesian mixture model. In addition, the first reference contains an extended vignette on the usage of the 'TWO-NN' and 'Hidalgo' models. References: Denti (2023, \doi{10.18637/jss.v106.i09}); Allegra et al. (2020, \doi{10.1038/s41598-020-72222-0}); Denti et al. (2022, \doi{10.1038/s41598-022-20991-1}); Facco et al. (2017, \doi{10.1038/s41598-017-11873-y}); Santos-Fernandez et al. (2021, \doi{10.1038/s41598-022-20991-1}). 
12 | } 13 | \seealso{ 14 | Useful links: 15 | \itemize{ 16 | \item \url{https://github.com/Fradenti/intRinsic} 17 | \item Report bugs at \url{https://github.com/fradenti/intRinsic/issues} 18 | } 19 | 20 | } 21 | \author{ 22 | \strong{Maintainer}: Francesco Denti \email{francescodenti.personal@gmail.com} (\href{https://orcid.org/0000-0003-2978-4702}{ORCID}) [copyright holder] 23 | 24 | Authors: 25 | \itemize{ 26 | \item Andrea Gilardi \email{andrea.gilardi@unimib.it} (\href{https://orcid.org/0000-0002-9424-7439}{ORCID}) 27 | } 28 | 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/autoplot.twonn_bayes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_twonn.R 3 | \name{autoplot.twonn_bayes} 4 | \alias{autoplot.twonn_bayes} 5 | \title{Plot the output of the \code{TWO-NN} model estimated via the Bayesian 6 | approach} 7 | \usage{ 8 | \method{autoplot}{twonn_bayes}( 9 | object, 10 | plot_low = 0, 11 | plot_upp = NULL, 12 | by = 0.05, 13 | title = "Bayesian TWO-NN", 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{object of class \code{twonn_bayes}, the output of the 19 | \code{twonn} function when \code{method = "bayes"}.} 20 | 21 | \item{plot_low}{lower bound of the interval on which the posterior density 22 | is plotted.} 23 | 24 | \item{plot_upp}{upper bound of the interval on which the posterior density 25 | is plotted.} 26 | 27 | \item{by}{step-size at which the sequence spanning the interval is 28 | incremented.} 29 | 30 | \item{title}{character string used as title of the plot.} 31 | 32 | \item{...}{other arguments passed to specific methods.} 33 | } 34 | \value{ 35 | \code{\link[ggplot2]{ggplot2}} object displaying the posterior 36 | distribution of the intrinsic dimension parameter. 37 | } 38 | \description{ 39 | Use this method without the \code{.twonn_bayes} suffix. 40 | The function returns the density plot of the 41 | posterior distribution computed with the \code{bayes} method. 42 | } 43 | \seealso{ 44 | \code{\link{twonn}} 45 | 46 | Other autoplot methods: 47 | \code{\link{autoplot.Hidalgo}()}, 48 | \code{\link{autoplot.gride_bayes}()}, 49 | \code{\link{autoplot.twonn_linfit}()}, 50 | \code{\link{autoplot.twonn_mle}()} 51 | } 52 | \concept{autoplot methods} 53 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: intRinsic 2 | Title: Likelihood-Based Intrinsic Dimension Estimators 3 | Version: 1.1.2 4 | Authors@R: c( 5 | person("Francesco", "Denti", ,"francescodenti.personal@gmail.com", 6 | role = c("aut", "cre", "cph"), 7 | comment = c(ORCID = "0000-0003-2978-4702")), 8 | person("Andrea", "Gilardi", ,"andrea.gilardi@unimib.it", 9 | role = "aut", 10 | comment = c(ORCID = "0000-0002-9424-7439")) 11 | ) 12 | Maintainer: Francesco Denti 13 | Description: Provides functions to estimate the intrinsic dimension of a dataset 14 | via likelihood-based approaches. Specifically, the package implements the 15 | 'TWO-NN' and 'Gride' estimators and the 'Hidalgo' Bayesian mixture model. 16 | In addition, the first reference contains an extended vignette on the usage of 17 | the 'TWO-NN' and 'Hidalgo' models. References: 18 | Denti (2023, ); 19 | Allegra et al. (2020, ); 20 | Denti et al. (2022, ); 21 | Facco et al. 
(2017, ); 22 | Santos-Fernandez et al. (2021, ). 23 | License: MIT + file LICENSE 24 | URL: https://github.com/Fradenti/intRinsic 25 | BugReports: https://github.com/fradenti/intRinsic/issues 26 | Depends: R (>= 4.2.0) 27 | Imports: 28 | dplyr, 29 | FNN, 30 | ggplot2, 31 | knitr, 32 | Rcpp, 33 | reshape2, 34 | rlang, 35 | stats, 36 | utils, 37 | salso 38 | LinkingTo: 39 | Rcpp, 40 | RcppArmadillo 41 | NeedsCompilation: yes 42 | ByteCompile: true 43 | Encoding: UTF-8 44 | RoxygenNote: 7.3.3 45 | -------------------------------------------------------------------------------- /man/autoplot.gride_bayes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_gride.R 3 | \name{autoplot.gride_bayes} 4 | \alias{autoplot.gride_bayes} 5 | \title{Plot the simulated MCMC chains for the Bayesian \code{Gride}} 6 | \usage{ 7 | \method{autoplot}{gride_bayes}( 8 | object, 9 | traceplot = FALSE, 10 | title = "Bayesian Gride - Posterior distribution", 11 | ... 12 | ) 13 | } 14 | \arguments{ 15 | \item{object}{object of class \code{gride_bayes}. 16 | It is obtained using the output of the \code{gride} function when 17 | \code{method = "bayes"}.} 18 | 19 | \item{traceplot}{logical. If \code{FALSE}, the function returns a plot of the 20 | posterior density. If \code{TRUE}, the function returns the traceplots of the 21 | MCMC used to simulate from the posterior distribution.} 22 | 23 | \item{title}{optional string to display as title.} 24 | 25 | \item{...}{other arguments passed to specific methods.} 26 | } 27 | \value{ 28 | object of class \code{\link[ggplot2]{ggplot}}. 29 | It could represent the traceplot of the posterior simulations for the 30 | Bayesian \code{Gride} model (\code{traceplot = TRUE}) or a density plot 31 | of the simulated posterior distribution (\code{traceplot = FALSE}). 32 | } 33 | \description{ 34 | Use this method without the \code{.gride_bayes} suffix. 35 | It displays the traceplot of the chain generated 36 | with Metropolis-Hasting updates to visually assess mixing and convergence. 37 | Alternatively, it is possible to plot the posterior density. 
38 | } 39 | \seealso{ 40 | \code{\link{gride}} 41 | 42 | Other autoplot methods: 43 | \code{\link{autoplot.Hidalgo}()}, 44 | \code{\link{autoplot.twonn_bayes}()}, 45 | \code{\link{autoplot.twonn_linfit}()}, 46 | \code{\link{autoplot.twonn_mle}()} 47 | } 48 | \concept{autoplot methods} 49 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | log_Likelihood_double <- function(mu_obs, d) { 5 | .Call(`_intRinsic_log_Likelihood_double`, mu_obs, d) 6 | } 7 | 8 | Groups_quantities <- function(mu_obser, Ci, K) { 9 | .Call(`_intRinsic_Groups_quantities`, mu_obser, Ci, K) 10 | } 11 | 12 | Norm_Constant_Z_l2 <- function(Nzi_l, N, xi, q) { 13 | .Call(`_intRinsic_Norm_Constant_Z_l2`, Nzi_l, N, xi, q) 14 | } 15 | 16 | log_Zeta_maker <- function(N, xi, q) { 17 | .Call(`_intRinsic_log_Zeta_maker`, N, xi, q) 18 | } 19 | 20 | index_row_col <- function(Nq, q, N) { 21 | .Call(`_intRinsic_index_row_col`, Nq, q, N) 22 | } 23 | 24 | rdir_cpp <- function(alpha) { 25 | .Call(`_intRinsic_rdir_cpp`, alpha) 26 | } 27 | 28 | Update_memberships_faster <- function(mu_obser, dl, pl, K, N, q, possible_label, Ci, QQ, index_row, index_col, log_Precomp_Z, log_Precomp_ratios) { 29 | .Call(`_intRinsic_Update_memberships_faster`, mu_obser, dl, pl, K, N, q, possible_label, Ci, QQ, index_row, index_col, log_Precomp_Z, log_Precomp_ratios) 30 | } 31 | 32 | gam_trunc <- function(D, K, a0_d, b0_d, n_l, sLog) { 33 | .Call(`_intRinsic_gam_trunc`, D, K, a0_d, b0_d, n_l, sLog) 34 | } 35 | 36 | gam_trunc_pmass <- function(D, K, a0_d, b0_d, n_l, sLog, pi_mass) { 37 | .Call(`_intRinsic_gam_trunc_pmass`, D, K, a0_d, b0_d, n_l, sLog, pi_mass) 38 | } 39 | 40 | gride_log_likelihood <- function(d, n1, n2, mus_n1_n2) { 41 | .Call(`_intRinsic_gride_log_likelihood`, d, n1, n2, mus_n1_n2) 42 | } 43 | 44 | gride_log_posterior <- function(z, n1, n2, a_d, b_d, mus_n1_n2) { 45 | .Call(`_intRinsic_gride_log_posterior`, z, n1, n2, a_d, b_d, mus_n1_n2) 46 | } 47 | 48 | gride_mh_sampler <- function(start_d, n1, n2, a_d, b_d, mus_n1_n2, nsim, burn_in, sigma) { 49 | .Call(`_intRinsic_gride_mh_sampler`, start_d, n1, n2, a_d, b_d, mus_n1_n2, nsim, burn_in, sigma) 50 | } 51 | 52 | -------------------------------------------------------------------------------- /src/C_GAMMA.cpp: -------------------------------------------------------------------------------- 1 | #include "C_GAMMA.h" 2 | 3 | // [[Rcpp::export]] 4 | arma::colvec gam_trunc(int D, int K, 5 | double a0_d, double b0_d, 6 | arma::colvec n_l, 7 | arma::colvec sLog){ 8 | arma::colvec d(K); 9 | arma::colvec p_upper(K); 10 | arma::colvec p_unif(K); 11 | 12 | arma::colvec a_star = n_l + a0_d; 13 | arma::colvec b_star = sLog + b0_d; 14 | 15 | for(int k=0; k(AIFD["SL0"]); 20 | arma::colvec n10 = as(AIFD["nl0"]); 21 | arma::colvec a_star = a0 + n10; 22 | arma::colvec b_star = b0 + LOGSUM; 23 | for( int l=0; l(AIFD["SL0"]); 33 | arma::colvec n10 = as(AIFD["nl0"]); 34 | arma::colvec a_star = a0 + n10; 35 | arma::colvec b_star = b0 + LOGSUM; 36 | arma::colvec lp(2); 37 | for( int l=0; l 3 | 4 | # intRinsic v1.1.2 5 | 6 | 7 | 8 | [![CRAN](https://www.r-pkg.org/badges/version/intRinsic)](https://cran.r-project.org/package=intRinsic) 9 | [![Last 10 | 
Commit](https://img.shields.io/github/last-commit/fradenti/intRinsic)](https://github.com/fradenti/intRinsic) 11 | [![Downloads 12 | (monthly)](https://cranlogs.r-pkg.org/badges/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 13 | [![Downloads 14 | (total)](https://cranlogs.r-pkg.org/badges/grand-total/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 15 | [![JSS](https://img.shields.io/badge/JSS-10.18637%2Fjss.v040.i08-brightgreen)](https://www.jstatsoft.org/article/view/v106i09) 16 | 17 | 18 | A package with functions to estimate the intrinsic dimension of a 19 | dataset via likelihood-based approaches. Specifically, the package 20 | implements the `TWO-NN` and `Gride` estimators and the `Hidalgo` 21 | Bayesian mixture model. 22 | 23 | To install the package from CRAN, run 24 | 25 | ``` r 26 | install.packages("intRinsic") 27 | ``` 28 | 29 | To install the package from this GitHub repository, run 30 | 31 | ``` r 32 | # install.packages("remotes") 33 | 34 | #Turn off warning-error-conversion regarding package versions 35 | Sys.setenv("R_REMOTES_NO_ERRORS_FROM_WARNINGS" = "true") 36 | 37 | #install from github 38 | remotes::install_github("Fradenti/intRinsic") 39 | ``` 40 | 41 | Simple example on Swissroll dataset 42 | 43 | ``` r 44 | library(intRinsic) 45 | X <- Swissroll(2000) 46 | twonn(X) 47 | ``` 48 | 49 | The vignette for this package has been published in the 50 | `Journal of Statistical Software`. The article can be found at [this 51 | link](https://doi.org/10.18637/jss.v106.i09). 52 | 53 | Please help me improve this package by reporting suggestions, typos, and 54 | issues at [this link](https://github.com/Fradenti/intRinsic/issues). 55 | 56 | Please note that the previous versions of the package (from `v0.1.0` to 57 | `v1.1.1`) are still available as GitHub Releases at [this 58 | page](https://github.com/Fradenti/intRinsic/releases). 59 | -------------------------------------------------------------------------------- /man/gride_evolution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gride_evolution.R 3 | \name{gride_evolution} 4 | \alias{gride_evolution} 5 | \alias{print.gride_evolution} 6 | \alias{plot.gride_evolution} 7 | \title{\code{Gride} evolution based on Maximum Likelihood Estimation} 8 | \usage{ 9 | gride_evolution(X, vec_n1, vec_n2, upp_bound = 50) 10 | 11 | \method{print}{gride_evolution}(x, ...) 12 | 13 | \method{plot}{gride_evolution}(x, ...) 14 | } 15 | \arguments{ 16 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 17 | 18 | \item{vec_n1}{vector of integers, containing the smaller NN orders considered 19 | in the evolution.} 20 | 21 | \item{vec_n2}{vector of integers, containing the larger NN orders considered 22 | in the evolution.} 23 | 24 | \item{upp_bound}{upper bound for the interval used in the numerical 25 | optimization (via \code{optimize}). Default is set to 50.} 26 | 27 | \item{x}{an object of class \code{gride_evolution}.} 28 | 29 | \item{...}{other arguments passed to specific methods.} 30 | } 31 | \value{ 32 | list containing the Gride evolution, the corresponding NN distance 33 | ratios, the average n2-th NN order distances, and the NN orders considered. 34 | 35 | the function prints a summary of the Gride evolution to 36 | console. 
37 | } 38 | \description{ 39 | The function allows the study of the evolution of the \code{id} estimates 40 | as a function of the scale of a dataset. A scale-dependent analysis 41 | is essential to identify the correct number of relevant directions in noisy 42 | data. To increase the average distance from the second NN (and thus the 43 | average neighborhood size) involved in the estimation, the function computes 44 | a sequence of \code{Gride} models with increasing NN orders, \code{n1} and 45 | \code{n2}. 46 | See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 47 | for more details. 48 | } 49 | \examples{ 50 | \donttest{ 51 | X <- replicate(5,rnorm(10000,0,.1)) 52 | gride_evolution(X = X,vec_n1 = 2^(0:5),vec_n2 = 2^(1:6)) 53 | } 54 | 55 | } 56 | \references{ 57 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 58 | dimension estimator." 59 | Scientific Reports, 12(20005). 60 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 61 | } 62 | -------------------------------------------------------------------------------- /R/hidalgo_Aux.R: -------------------------------------------------------------------------------- 1 | #' Auxiliary functions for the \code{Hidalgo} model 2 | #' 3 | #' Collection of functions used to extract meaningful information from the object returned 4 | #' by the function \code{Hidalgo} 5 | #' 6 | #' @name auxHidalgo 7 | #' 8 | #' @param x object of class \code{Hidalgo}, the output of the 9 | #' \code{Hidalgo()} function. 10 | #' 11 | #' @return \code{posterior_mean} returns the observation-specific \code{id} posterior means estimated with \code{Hidalgo}. 12 | #' 13 | #' @export 14 | posterior_means <- function(x){ 15 | if (class(x)[1] != "Hidalgo") { 16 | stop("object is not of class 'Hidalgo'", call. = FALSE) 17 | } 18 | return(x$id_summary$MEAN) 19 | 20 | } 21 | 22 | #' @name auxHidalgo 23 | #' 24 | #' @param x object of class \code{Hidalgo}, the output of the 25 | #' \code{Hidalgo()} function. 26 | #' 27 | #' @return \code{initial_values} returns a list with the parameter specification 28 | #' passed to the model. 29 | #' 30 | #' @export 31 | initial_values <- function(x){ 32 | if (class(x)[1] != "Hidalgo") { 33 | stop("object is not of class 'Hidalgo'", call. = FALSE) 34 | } 35 | return(x$recap) 36 | 37 | } 38 | 39 | 40 | #' @name auxHidalgo 41 | #' 42 | #' @param x object of class \code{Hidalgo}, the output of the 43 | #' \code{Hidalgo()} function. 44 | #' 45 | #' @return \code{posterior_median} returns the observation-specific \code{id} posterior medians estimated with \code{Hidalgo}. 46 | #' 47 | #' @export 48 | posterior_medians <- function(x){ 49 | if (class(x)[1] != "Hidalgo") { 50 | stop("object is not of class 'Hidalgo'", call. = FALSE) 51 | } 52 | out <- x$id_summary$MEDIAN 53 | return(out) 54 | 55 | } 56 | 57 | 58 | #' @name auxHidalgo 59 | #' 60 | #' @param x object of class \code{Hidalgo}, the output of the 61 | #' \code{Hidalgo()} function. 62 | #' @param alpha posterior probability contained in the computed credible 63 | #' interval. 64 | #' 65 | #' @return \code{credible_interval} returns the observation-specific credible intervals for a specific 66 | #' probability \code{alpha}. 67 | #' 68 | #' @export 69 | credible_intervals <- function(x, alpha = .95){ 70 | if (class(x)[1] != "Hidalgo") { 71 | stop("object is not of class 'Hidalgo'", call. 
= FALSE) 72 | } 73 | out <- t(apply(x$id_postpr,2,function(x) stats::quantile(x, probs = c((1 - alpha) / 2, (1 + alpha) / 2)))) 74 | return(out) 75 | } 76 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # intRinsic v1.1.2 17 | 18 | 19 | 20 | [![CRAN](https://www.r-pkg.org/badges/version/intRinsic)](https://cran.r-project.org/package=intRinsic) 21 | [![Last Commit](https://img.shields.io/github/last-commit/fradenti/intRinsic)](https://github.com/fradenti/intRinsic) 22 | [![Downloads (monthly)](https://cranlogs.r-pkg.org/badges/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 23 | [![Downloads (total)](https://cranlogs.r-pkg.org/badges/grand-total/intRinsic?color=brightgreen)](https://www.r-pkg.org/pkg/intRinsic) 24 | [![JSS](https://img.shields.io/badge/JSS-10.18637%2Fjss.v040.i08-brightgreen)](https://www.jstatsoft.org/article/view/v106i09) 25 | 26 | 27 | A package with functions to estimate the intrinsic dimension of a dataset via likelihood-based approaches. 28 | Specifically, the package implements the `TWO-NN` and `Gride` estimators and the `Hidalgo` Bayesian mixture model. 29 | 30 | To install the package from CRAN, run 31 | ```r 32 | install.packages("intRinsic") 33 | ``` 34 | 35 | To install the package from this GitHub repository, run 36 | ```r 37 | # install.packages("remotes") 38 | 39 | #Turn off warning-error-conversion regarding package versions 40 | Sys.setenv("R_REMOTES_NO_ERRORS_FROM_WARNINGS" = "true") 41 | 42 | #install from github 43 | remotes::install_github("Fradenti/intRinsic") 44 | ``` 45 | 46 | Simple example on Swissroll dataset 47 | 48 | ```r 49 | library(intRinsic) 50 | X <- Swissroll(2000) 51 | twonn(X) 52 | ``` 53 | 54 | The vignette for this package has been published in the `Journal of Statistical Software`. The article can be found at [this link](https://doi.org/10.18637/jss.v106.i09). 55 | 56 | Please help me improve this package by reporting suggestions, typos, and issues at [this link](https://github.com/Fradenti/intRinsic/issues). 57 | 58 | Please note that the previous versions of the package (from `v0.1.0` to `v1.1.1`) are still available as GitHub Releases at [this page](https://github.com/Fradenti/intRinsic/releases). 
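Beyond `twonn()`, the other estimators mentioned above are called in the same way. The snippet below is only an illustrative sketch that relies on the default arguments of `gride()` and `Hidalgo()`; see the corresponding help pages for the available options (for example, the `method` argument of `gride()`).

```r
library(intRinsic)
X <- Swissroll(2000)

# Gride estimate of the intrinsic dimension with the default NN orders
gride(X)

# Hidalgo: Bayesian mixture model for heterogeneous intrinsic dimensions
# (this runs an MCMC, so it may take a moment)
h_out <- Hidalgo(X)
autoplot(h_out)
```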
59 | 60 | -------------------------------------------------------------------------------- /man/twonn_decimated.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/twonn_dec_depr.R 3 | \name{twonn_decimated} 4 | \alias{twonn_decimated} 5 | \title{Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 6 | proportions} 7 | \usage{ 8 | twonn_decimated( 9 | X, 10 | method = c("steps", "proportions"), 11 | steps = 0, 12 | proportions = 1, 13 | seed = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 18 | 19 | \item{method}{method to use for decimation: 20 | \describe{ 21 | \item{\code{"steps"}}{the number of times the dataset is halved;} 22 | \item{\code{"proportion"}}{the dataset is subsampled according to a vector 23 | of proportions.} 24 | }} 25 | 26 | \item{steps}{number of times the dataset is halved.} 27 | 28 | \item{proportions}{vector containing the fractions of the dataset to be 29 | considered.} 30 | 31 | \item{seed}{random seed controlling the sequence of sub-sampled observations.} 32 | } 33 | \value{ 34 | list containing the \code{TWO-NN} evolution 35 | (maximum likelihood estimation and confidence intervals), the average 36 | distance from the second NN, and the vector of proportions that were 37 | considered. According to the chosen estimation method, it is accompanied with 38 | the vector of proportions or halving steps considered. 39 | } 40 | \description{ 41 | The estimation of the \code{id} is related to the scale of the 42 | dataset. To escape the local reach of the \code{TWO-NN} estimator, 43 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 44 | proposed to subsample the original dataset in order to induce greater 45 | distances between the data points. By investigating the estimates' evolution 46 | as a function of the size of the neighborhood, it is possible to obtain 47 | information about the validity of the modeling assumptions and the robustness 48 | of the model in the presence of noise. 49 | } 50 | \references{ 51 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 52 | dimension of datasets by a minimal neighborhood information." 53 | Scientific Reports, 7(1). 54 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 55 | 56 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 57 | dimension estimator." 58 | Scientific Reports, 12(20005). 59 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 
60 | } 61 | \seealso{ 62 | \code{\link{twonn}} 63 | } 64 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(autoplot,Hidalgo) 4 | S3method(autoplot,gride_bayes) 5 | S3method(autoplot,gride_evolution) 6 | S3method(autoplot,gride_mle) 7 | S3method(autoplot,twonn_bayes) 8 | S3method(autoplot,twonn_linfit) 9 | S3method(autoplot,twonn_mle) 10 | S3method(plot,Hidalgo) 11 | S3method(plot,gride_bayes) 12 | S3method(plot,gride_evolution) 13 | S3method(plot,gride_mle) 14 | S3method(plot,hidalgo_psm) 15 | S3method(plot,mus) 16 | S3method(plot,twonn_bayes) 17 | S3method(plot,twonn_dec_by) 18 | S3method(plot,twonn_dec_prop) 19 | S3method(plot,twonn_linfit) 20 | S3method(plot,twonn_mle) 21 | S3method(print,Hidalgo) 22 | S3method(print,gride_bayes) 23 | S3method(print,gride_evolution) 24 | S3method(print,gride_mle) 25 | S3method(print,hidalgo_class) 26 | S3method(print,hidalgo_psm) 27 | S3method(print,mus) 28 | S3method(print,mus_Nq) 29 | S3method(print,summary.Hidalgo) 30 | S3method(print,summary.gride_bayes) 31 | S3method(print,summary.gride_mle) 32 | S3method(print,summary.twonn_bayes) 33 | S3method(print,summary.twonn_linfit) 34 | S3method(print,summary.twonn_mle) 35 | S3method(print,twonn_bayes) 36 | S3method(print,twonn_dec_by) 37 | S3method(print,twonn_dec_prop) 38 | S3method(print,twonn_linfit) 39 | S3method(print,twonn_mle) 40 | S3method(summary,Hidalgo) 41 | S3method(summary,gride_bayes) 42 | S3method(summary,gride_mle) 43 | S3method(summary,twonn_bayes) 44 | S3method(summary,twonn_linfit) 45 | S3method(summary,twonn_mle) 46 | export(Hidalgo) 47 | export(Swissroll) 48 | export(autoplot) 49 | export(clustering) 50 | export(compute_mus) 51 | export(credible_intervals) 52 | export(dgera) 53 | export(gride) 54 | export(gride_evolution) 55 | export(id_by_class) 56 | export(initial_values) 57 | export(posterior_means) 58 | export(posterior_medians) 59 | export(rgera) 60 | export(twonn) 61 | export(twonn_decimated) 62 | export(twonn_decimation) 63 | importFrom(Rcpp,sourceCpp) 64 | importFrom(ggplot2,autoplot) 65 | importFrom(ggplot2,ggplot) 66 | importFrom(graphics,abline) 67 | importFrom(graphics,boxplot) 68 | importFrom(graphics,curve) 69 | importFrom(graphics,hist) 70 | importFrom(graphics,legend) 71 | importFrom(graphics,lines) 72 | importFrom(graphics,matplot) 73 | importFrom(graphics,par) 74 | importFrom(graphics,plot) 75 | importFrom(graphics,points) 76 | importFrom(graphics,polygon) 77 | importFrom(graphics,segments) 78 | importFrom(rlang,.data) 79 | importFrom(stats,density) 80 | importFrom(stats,ts) 81 | useDynLib(intRinsic, .registration = TRUE) 82 | -------------------------------------------------------------------------------- /src/A_AUX.cpp: -------------------------------------------------------------------------------- 1 | #include "A_AUX.h" 2 | 3 | // [[Rcpp::export]] 4 | double log_Likelihood_double(double mu_obs, double d){ 5 | return log(d) - (d+1.0) * log(mu_obs) ; 6 | } 7 | 8 | arma::rowvec Stratified_operations_0(arma::vec x, 9 | arma::vec col1, 10 | int val1){ 11 | 12 | arma::uvec inds1 = find( col1 == val1 ) ; 13 | arma::vec x10 = x.elem(inds1); 14 | arma::rowvec Res(2); Res.zeros(); 15 | Res(0) = x10.size(); 16 | if(Res(0) > 0){ 17 | Res(1) = accu(log(x10)); 18 | } 19 | return Res; 20 | } 21 | 22 | // [[Rcpp::export]] 23 | arma::mat Groups_quantities(arma::colvec mu_obser, 24 | arma::vec 
Ci, 25 | int K){ // performs stratified operations for every k=1,..,K 26 | arma::mat A(K,2); 27 | for(int ls = 0; ls index_col(N); 74 | 75 | for(int i=0; i 1)) 72 | } else{ 73 | logB <- sum(log(1:(n2 - n1 - 1))) + sum(log(1:(n1-1))) - 74 | sum(log(1:(n2-1))) 75 | lognum <- log(d) + (d_n12 - 1) * (log(x ^ d - 1)) 76 | logden <- ((n2 - 1) * d + 1) * (log(x)) 77 | log_dens <- lognum - logden + sum(log(x > 1)) - logB 78 | } 79 | if (!log) { 80 | log_dens <- exp(log_dens) 81 | } 82 | 83 | structure(log_dens, class = c("dgera", class(log_dens))) 84 | 85 | return(log_dens) 86 | } 87 | -------------------------------------------------------------------------------- /man/clustering.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo_postproc.R 3 | \name{clustering} 4 | \alias{clustering} 5 | \alias{print.hidalgo_psm} 6 | \alias{plot.hidalgo_psm} 7 | \title{Posterior similarity matrix and partition estimation} 8 | \usage{ 9 | clustering( 10 | object, 11 | clustering_method = c("dendrogram", "salso"), 12 | K = 2, 13 | nCores = 1, 14 | ... 15 | ) 16 | 17 | \method{print}{hidalgo_psm}(x, ...) 18 | 19 | \method{plot}{hidalgo_psm}(x, ...) 20 | } 21 | \arguments{ 22 | \item{object}{object of class \code{Hidalgo}, the output of the 23 | \code{Hidalgo} function.} 24 | 25 | \item{clustering_method}{character indicating the method to use to perform 26 | clustering. It can be 27 | \describe{ 28 | \item{"dendrogram"}{thresholding the adjacency dendrogram with a given 29 | number (\code{K});} 30 | \item{"salso"}{estimation via minimization of several partition 31 | estimation criteria. 32 | The default loss function is the variation of information.} 33 | }} 34 | 35 | \item{K}{number of clusters to recover by thresholding the 36 | dendrogram obtained from the psm.} 37 | 38 | \item{nCores}{parameter for the \code{salso} function: the number of CPU 39 | cores to use. A value of zero indicates to use all cores on the system.} 40 | 41 | \item{...}{ignored.} 42 | 43 | \item{x}{object of class \code{hidalgo_psm}, obtained from the function 44 | \code{clustering()}.} 45 | } 46 | \value{ 47 | list containing the posterior similarity matrix (\code{psm}) and 48 | the estimated partition \code{clust}. 49 | } 50 | \description{ 51 | The function computes the posterior similarity (coclustering) matrix (psm) 52 | and estimates a representative partition of the observations from the MCMC 53 | output. The user can provide the desired number of clusters or estimate a 54 | optimal clustering solution by minimizing a loss function on the space 55 | of the partitions. 56 | In the latter case, the function uses the package \code{salso} 57 | (\href{https://cran.r-project.org/package=salso}{Dahl et al., 2021}), 58 | that the user needs to load. 59 | } 60 | \examples{ 61 | \donttest{ 62 | library(salso) 63 | X <- replicate(5,rnorm(500)) 64 | X[1:250,1:2] <- 0 65 | h_out <- Hidalgo(X) 66 | clustering(h_out) 67 | } 68 | } 69 | \references{ 70 | D. B. Dahl, D. J. Johnson, and P. Müller (2022), 71 | "Search Algorithms and Loss Functions for Bayesian Clustering", 72 | Journal of Computational and Graphical Statistics, 73 | \doi{10.1080/10618600.2022.2069779}. 74 | 75 | David B. Dahl, Devin J. Johnson and Peter Müller (2022). "salso: Search 76 | Algorithms and Loss Functions for Bayesian Clustering". 77 | R package version 78 | 0.3.0. 
\url{https://CRAN.R-project.org/package=salso} 79 | } 80 | \seealso{ 81 | \code{\link{Hidalgo}}, \code{\link[salso]{salso}} 82 | } 83 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # intRinsic 1.1.2 2 | 3 | * Removed the dependency from the package `latex2exp` 4 | 5 | # intRinsic 1.1.1 6 | 7 | * Replaced `arma::is_finite` with `std::isfinite` to ensure compatibility with the latest `RcppArmadillo` release and CRAN policies. 8 | * Implemented small code adjustments to address and resolve compiler warnings. 9 | 10 | # intRinsic 1.1.0 11 | 12 | * Re-organized the cpp code into multiple files 13 | * Solved an indexing problem with `log_Zeta` and `log_corr`, sometimes causing `na` with single-manifold data when using `Hidalgo()` 14 | * Translated some `R` parts of the `Hidalgo()` code into `C++` 15 | * Corrected a bug affecting the `Truncated-pointmass` approach 16 | * Updated the README file 17 | * Set up the new `rhub` checks workflows 18 | 19 | # intRinsic 1.0.2 20 | 21 | * Removed the dependency from the package `MCMCpack` 22 | * Fixed and updated the documentation (minor changes) 23 | 24 | # intRinsic 1.0.1 25 | 26 | * Fixed a typo in `compute_mus` when computing ratios from `dist_matrix`. Now the function handles also the `dissimilarity` class 27 | * Fixed a typo in `compute_mus` involving the class of the returned object 28 | * Adjusted some indentations in `print` methods 29 | * Adjusted the `autoplot` method related to `gride_mle` 30 | 31 | # intRinsic 1.0.0 32 | 33 | * This update marks the acceptance of the `intRinsic` vignette in the `Journal of Statistical Software`. Note: the DOI in the CITATION is for a new JSS publication that will be registered after publication on CRAN 34 | * Fixed some spelling typos and new-line inconsistencies 35 | * Removed the dependencies on the `as_tibble()` function (deprecated) 36 | * The function `twonn_decimated()` is now deprecated and will be removed in future releases. The function to use is `twonn_decimation()` 37 | * Now `gride_evolution()` and `twonn_decimation()` remove duplicated observations by default 38 | 39 | # intRinsic 0.2.2 40 | 41 | * Fixed bug in `Hidalgo()`, causing errors when setting `verbose = FALSE` 42 | * Fixed bug in `Hidalgo()` initialization. Added warning if `alpha_Dirichlet` is set so small it causes underflow problems 43 | * Fixed bug in `clustering()`. Now the returned `K` when the option `salso` is selected, is correct 44 | * Fixed bug in `print.mus()`. Now it checks if the passed object is a list 45 | * Corrected a few typos in the documentation 46 | * Updated `doi` of recently published references 47 | * Added additional checks in `compute_mus`: NA, symmetry, non-negativity 48 | 49 | # intRinsic 0.2.1 50 | 51 | * Fixed minor bugs involving the `gride()` family of functions 52 | * Added several methods (`print()`, `summary()`, `plot()`,...) 
to facilitate the 53 | extraction of the results 54 | * Now, the `autoplot()` method is directly exported from the `ggplot2` package 55 | 56 | # intRinsic 0.2.0 57 | 58 | * Submitted to CRAN 59 | * Released to the public the first official version 60 | 61 | -------------------------------------------------------------------------------- /R/twonn_dec_depr.R: -------------------------------------------------------------------------------- 1 | #' Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 2 | #' proportions 3 | #' 4 | #' The estimation of the \code{id} is related to the scale of the 5 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 6 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 7 | #' proposed to subsample the original dataset in order to induce greater 8 | #' distances between the data points. By investigating the estimates' evolution 9 | #' as a function of the size of the neighborhood, it is possible to obtain 10 | #' information about the validity of the modeling assumptions and the robustness 11 | #' of the model in the presence of noise. 12 | #' 13 | #' @param X data matrix with \code{n} observations and \code{D} variables. 14 | #' @param method method to use for decimation: 15 | #' \describe{ 16 | #' \item{\code{"steps"}}{the number of times the dataset is halved;} 17 | #' \item{\code{"proportion"}}{the dataset is subsampled according to a vector 18 | #' of proportions.} 19 | #' } 20 | #' @param proportions vector containing the fractions of the dataset to be 21 | #' considered. 22 | #' @param steps number of times the dataset is halved. 23 | #' @param seed random seed controlling the sequence of sub-sampled observations. 24 | #' 25 | #' 26 | #' @references 27 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 28 | #' dimension of datasets by a minimal neighborhood information." 29 | #' Scientific Reports, 7(1). 30 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 31 | #' 32 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 33 | #' dimension estimator." 34 | #' Scientific Reports, 12(20005). 35 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 36 | #' 37 | #' @return list containing the \code{TWO-NN} evolution 38 | #' (maximum likelihood estimation and confidence intervals), the average 39 | #' distance from the second NN, and the vector of proportions that were 40 | #' considered. According to the chosen estimation method, it is accompanied with 41 | #' the vector of proportions or halving steps considered. 
42 | #' 43 | #' @export 44 | #' 45 | #' @seealso \code{\link{twonn}} 46 | #' 47 | twonn_decimated <- function(X, 48 | method = c("steps", "proportions"), 49 | steps = 0, 50 | proportions = 1, 51 | seed = NULL) { 52 | .Deprecated("twonn_decimation") 53 | method <- match.arg(method) 54 | 55 | if (steps == 0 & length(proportions) == 1) { 56 | if (proportions == 1) 57 | method <- "mle" 58 | } 59 | 60 | res <- switch( 61 | method, 62 | steps = twonn_dec_by(X = X, 63 | steps = steps, 64 | seed = seed), 65 | proportions = twonn_dec_prop( 66 | X = X, 67 | proportions = proportions, 68 | seed = seed 69 | ), 70 | mle = twonn(X = X, 71 | method = "mle") 72 | ) 73 | return(res) 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/G_GRIDE.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // [[Rcpp::export]] 4 | double gride_log_likelihood(double d, 5 | int n1, 6 | int n2, 7 | arma::colvec mus_n1_n2){ 8 | 9 | 10 | if(n2 1 ) ); 27 | } else{ 28 | logB = arma::accu(log(vn1)) + arma::accu(log(vn12))-arma::accu(log(vn2)); 29 | lognum = nn * log(d) + 30 | (d_n12 - 1) * arma::accu( log( pow( mus_n1_n2, d) - 1 ) ); 31 | logden = ((n2 - 1) * d + 1) * sum(log(mus_n1_n2)); 32 | log_dens = lognum - logden + sum(log(mus_n1_n2 > 1.)) - logB * nn; 33 | } 34 | 35 | return(log_dens); 36 | } 37 | 38 | 39 | // [[Rcpp::export]] 40 | double gride_log_posterior(double z, 41 | int n1, 42 | int n2, 43 | double a_d, 44 | double b_d, 45 | arma::colvec mus_n1_n2){ 46 | 47 | 48 | if(n2 < n1){ 49 | Rcpp::stop("n2 should be greater than n1"); 50 | } 51 | 52 | double d = exp(z) + 1; 53 | double log_post = gride_log_likelihood(d, n1, n2, mus_n1_n2) + 54 | z + 55 | R::dgamma( d, a_d, 1/b_d, 1 ); 56 | 57 | return(log_post); 58 | } 59 | 60 | // [[Rcpp::export]] 61 | arma::colvec gride_mh_sampler(double start_d, 62 | int n1, 63 | int n2, 64 | double a_d, 65 | double b_d, 66 | arma::colvec mus_n1_n2, 67 | int nsim, 68 | int burn_in, 69 | double sigma){ 70 | 71 | arma::colvec mh_sample(nsim); 72 | 73 | double oldpar, newpar, alpha; 74 | oldpar = start_d; 75 | for(int i=0; i<(nsim + burn_in); i++) { 76 | newpar = arma::randn(1)[0] * sigma + oldpar; 77 | alpha = gride_log_posterior( 78 | newpar, 79 | n1, 80 | n2, 81 | a_d, 82 | b_d, 83 | mus_n1_n2) - 84 | gride_log_posterior( 85 | oldpar, 86 | n1, 87 | n2, 88 | a_d, 89 | b_d, 90 | mus_n1_n2); 91 | 92 | if (log(arma::randu(1)[0]) < alpha) { 93 | oldpar = newpar; 94 | } 95 | if(i >= burn_in){ 96 | mh_sample(i-burn_in) = oldpar; 97 | } 98 | } 99 | return(mh_sample); 100 | } 101 | 102 | -------------------------------------------------------------------------------- /R/twonn_dec.R: -------------------------------------------------------------------------------- 1 | #' Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 2 | #' proportions 3 | #' 4 | #' The estimation of the \code{id} is related to the scale of the 5 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 6 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 7 | #' proposed to subsample the original dataset in order to induce greater 8 | #' distances between the data points. By investigating the estimates' evolution 9 | #' as a function of the size of the neighborhood, it is possible to obtain 10 | #' information about the validity of the modeling assumptions and the robustness 11 | #' of the model in the presence of noise. 
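# A minimal R analogue of the Metropolis-Hastings scheme coded in G_GRIDE.cpp
# above: the chain moves on z = log(d - 1), so the intrinsic dimension d stays
# above 1, and the "+ z" term accounts for the change of variables. The target
# below is a stand-in (Pareto(1, d) likelihood with a flat prior on d), not the
# Gride posterior used by the package.
set.seed(2)
mu <- runif(500)^(-1 / 3)              # synthetic TWO-NN ratios with d = 3
log_target <- function(z) {
  d <- exp(z) + 1
  length(mu) * log(d) - (d + 1) * sum(log(mu)) + z
}
nsim <- 3000; burn_in <- 500; sigma <- 0.3
z   <- 0                               # i.e., start from d = 2
out <- numeric(nsim)
for (i in seq_len(nsim + burn_in)) {
  z_new <- rnorm(1, z, sigma)
  if (log(runif(1)) < log_target(z_new) - log_target(z)) z <- z_new
  if (i > burn_in) out[i - burn_in] <- exp(z) + 1
}
mean(out)                              # close to the true value 3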
12 | #' 13 | #' @param X data matrix with \code{n} observations and \code{D} variables. 14 | #' @param method method to use for decimation: 15 | #' \describe{ 16 | #' \item{\code{"steps"}}{the number of times the dataset is halved;} 17 | #' \item{\code{"proportion"}}{the dataset is subsampled according to a vector 18 | #' of proportions.} 19 | #' } 20 | #' @param proportions vector containing the fractions of the dataset to be 21 | #' considered. 22 | #' @param steps number of times the dataset is halved. 23 | #' @param seed random seed controlling the sequence of sub-sampled observations. 24 | #' 25 | #' 26 | #' @references 27 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 28 | #' dimension of datasets by a minimal neighborhood information." 29 | #' Scientific Reports, 7(1). 30 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 31 | #' 32 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 33 | #' dimension estimator." 34 | #' Scientific Reports, 12(20005). 35 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 36 | #' 37 | #' @return list containing the \code{TWO-NN} evolution 38 | #' (maximum likelihood estimation and confidence intervals), the average 39 | #' distance from the second NN, and the vector of proportions that were 40 | #' considered. According to the chosen estimation method, it is accompanied with 41 | #' the vector of proportions or halving steps considered. 42 | #' 43 | #' @export 44 | #' 45 | #' @seealso \code{\link{twonn}} 46 | #' 47 | #' @examples 48 | #' X <- replicate(4,rnorm(1000)) 49 | #' twonn_decimation(X,,method = "proportions", 50 | #' proportions = c(1,.5,.2,.1,.01)) 51 | #' 52 | twonn_decimation <- function(X, 53 | method = c("steps", "proportions"), 54 | steps = 0, 55 | proportions = 1, 56 | seed = NULL) { 57 | method <- match.arg(method) 58 | 59 | if (steps == 0 & length(proportions) == 1) { 60 | if (proportions == 1) 61 | method <- "mle" 62 | } 63 | 64 | res <- switch( 65 | method, 66 | steps = twonn_dec_by(X = X, 67 | steps = steps, 68 | seed = seed), 69 | proportions = twonn_dec_prop( 70 | X = X, 71 | proportions = proportions, 72 | seed = seed 73 | ), 74 | mle = twonn(X = X, 75 | method = "mle") 76 | ) 77 | return(res) 78 | } 79 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 
24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /man/twonn_decimation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/twonn_dec.R, R/twonn_dec_prop.R, 3 | % R/twonn_dec_steps.R 4 | \name{twonn_decimation} 5 | \alias{twonn_decimation} 6 | \alias{print.twonn_dec_prop} 7 | \alias{plot.twonn_dec_prop} 8 | \alias{print.twonn_dec_by} 9 | \alias{plot.twonn_dec_by} 10 | \title{Estimate the decimated \code{TWO-NN} evolution with halving steps or vector of 11 | proportions} 12 | \usage{ 13 | twonn_decimation( 14 | X, 15 | method = c("steps", "proportions"), 16 | steps = 0, 17 | proportions = 1, 18 | seed = NULL 19 | ) 20 | 21 | \method{print}{twonn_dec_prop}(x, ...) 22 | 23 | \method{plot}{twonn_dec_prop}(x, CI = FALSE, proportions = FALSE, ...) 24 | 25 | \method{print}{twonn_dec_by}(x, ...) 26 | 27 | \method{plot}{twonn_dec_by}(x, CI = FALSE, steps = FALSE, ...) 
28 | } 29 | \arguments{ 30 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 31 | 32 | \item{method}{method to use for decimation: 33 | \describe{ 34 | \item{\code{"steps"}}{the number of times the dataset is halved;} 35 | \item{\code{"proportion"}}{the dataset is subsampled according to a vector 36 | of proportions.} 37 | }} 38 | 39 | \item{steps}{logical, if \code{TRUE}, the x-axis reports the number of halving steps. 40 | If \code{FALSE}, the x-axis reports the log10 average distance.} 41 | 42 | \item{proportions}{logical, if \code{TRUE}, the x-axis reports the number of decimating proportions. 43 | If \code{FALSE}, the x-axis reports the log10 average distance.} 44 | 45 | \item{seed}{random seed controlling the sequence of sub-sampled observations.} 46 | 47 | \item{x}{object of class \code{twonn_dec_prop}, obtained from the function 48 | \code{twonn_dec_prop()}.} 49 | 50 | \item{...}{ignored.} 51 | 52 | \item{CI}{logical, if \code{TRUE}, the confidence intervals are plotted} 53 | } 54 | \value{ 55 | list containing the \code{TWO-NN} evolution 56 | (maximum likelihood estimation and confidence intervals), the average 57 | distance from the second NN, and the vector of proportions that were 58 | considered. According to the chosen estimation method, it is accompanied with 59 | the vector of proportions or halving steps considered. 60 | } 61 | \description{ 62 | The estimation of the \code{id} is related to the scale of the 63 | dataset. To escape the local reach of the \code{TWO-NN} estimator, 64 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 65 | proposed to subsample the original dataset in order to induce greater 66 | distances between the data points. By investigating the estimates' evolution 67 | as a function of the size of the neighborhood, it is possible to obtain 68 | information about the validity of the modeling assumptions and the robustness 69 | of the model in the presence of noise. 70 | } 71 | \examples{ 72 | X <- replicate(4,rnorm(1000)) 73 | twonn_decimation(X,,method = "proportions", 74 | proportions = c(1,.5,.2,.1,.01)) 75 | 76 | } 77 | \references{ 78 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 79 | dimension of datasets by a minimal neighborhood information." 80 | Scientific Reports, 7(1). 81 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 82 | 83 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 84 | dimension estimator." 85 | Scientific Reports, 12(20005). 86 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 87 | } 88 | \seealso{ 89 | \code{\link{twonn}} 90 | } 91 | -------------------------------------------------------------------------------- /R/gride_mle_aux.R: -------------------------------------------------------------------------------- 1 | #' Bootstrap sample generator for the \code{Gride} MLE 2 | #' 3 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 4 | #' @param n1 order of the first NN considered. 5 | #' @param n2 order of the second NN considered. 6 | #' @param nsim integer, the number of bootstrap replications to consider. 7 | #' @param upper_D nominal dimension of the dataset (upper bound for the 8 | #' maximization routine). 9 | #' 10 | #' @keywords Internal 11 | #' @noRd 12 | #' 13 | #' @return list containing the MLE, confidence interval, and the 14 | #' bootstrap sample used for the estimation. 
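# The parametric-bootstrap recipe described above, sketched for the simpler
# TWO-NN case: under the model the ratios are Pareto(1, d), so new samples can
# be drawn as runif(n)^(-1 / d_hat), re-estimated, and summarized by quantiles.
# gride_bootstrap() follows the same logic, using rgera() and the Gride MLE.
set.seed(42)
n     <- 500
mu    <- runif(n)^(-1 / 3)                       # data generated with d = 3
d_hat <- (n - 1) / sum(log(mu))                  # TWO-NN MLE
boot  <- replicate(2000, {
  mu_b <- runif(n)^(-1 / d_hat)                  # simulate from the fitted model
  (n - 1) / sum(log(mu_b))                       # re-estimate on each replicate
})
quantile(boot, c(0.025, 0.975))                  # 95% bootstrap interval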
15 | #' 16 | gride_bootstrap <- function(mus_n1_n2, 17 | n1 = 1, 18 | n2 = 2, 19 | nsim = 2000, 20 | upper_D = NULL) { 21 | 22 | if (n2 < n1) { 23 | stop("n2 should be greater than n1", call. = FALSE) 24 | } 25 | 26 | if (is.null(upper_D)) { 27 | stop("Please provide the nominal dimension of the dataset D in upper_D", 28 | call. = FALSE) 29 | } 30 | 31 | n <- length(mus_n1_n2) 32 | 33 | mle_est <- gride_mle_point( 34 | mus_n1_n2 = mus_n1_n2, 35 | n1 = n1, 36 | n2 = n2, 37 | upper_D = upper_D 38 | ) 39 | 40 | boot_mus_samples <- replicate(nsim, rgera( 41 | nsim = n, 42 | n1 = n1, 43 | n2 = n2, 44 | d = mle_est 45 | )) 46 | 47 | bootstrap_sample <- apply(boot_mus_samples, 48 | 2, 49 | function(x) 50 | stats::optimize( 51 | gride_log_likelihood, 52 | interval = c(1.01, upper_D), 53 | n1 = n1, 54 | n2 = n2, 55 | mus_n1_n2 = x, 56 | maximum = TRUE 57 | )$max) 58 | 59 | Res <- list(mle = mle_est, boot_sample = bootstrap_sample) 60 | 61 | structure(Res, class = c("bootstrap_sample", "gride", class(Res))) 62 | 63 | } 64 | 65 | #' Maximum Likelihood Estimator (MLE) for the id using generic ratio statistics 66 | #' 67 | #' @param mus_n1_n2 vector of generalized order NN distance ratios statistics. 68 | #' @param n1 order of the first NN considered. Default is 1. 69 | #' @param n2 order of the second NN considered. Default is 2. 70 | #' @param upper_D nominal dimension of the dataset (upper bound for the 71 | #' maximization routine). 72 | #' 73 | #' @keywords Internal 74 | #' @noRd 75 | #' 76 | #' @return Gride MLE point estimate obtained via numeric optimization. 77 | #' 78 | gride_mle_point <- function(mus_n1_n2, 79 | n1 = 1, 80 | n2 = 2, 81 | upper_D = NULL) { 82 | if (class(mus_n1_n2)[1] == "mus") { 83 | n1 <- attr(mus_n1_n2, which = "n1") 84 | n2 <- attr(mus_n1_n2, which = "n2") 85 | } 86 | 87 | if (n2 < n1) { 88 | stop("n2 should be greater than n1", call. = FALSE) 89 | } 90 | 91 | if (is.null(upper_D)) { 92 | stop("Please provide the nominal dimension of the dataset D in upper_D", 93 | call. = FALSE) 94 | } 95 | 96 | O <- stats::optimize( 97 | gride_log_likelihood, 98 | interval = c(1.01, upper_D), 99 | n1 = n1, 100 | n2 = n2, 101 | mus_n1_n2 = mus_n1_n2, 102 | maximum = TRUE 103 | ) 104 | 105 | 106 | structure(O$max, class = c("mle_point", "gride", class(O$max))) 107 | 108 | } 109 | -------------------------------------------------------------------------------- /man/autoplot.Hidalgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_hidalgo.R 3 | \name{autoplot.Hidalgo} 4 | \alias{autoplot.Hidalgo} 5 | \title{Plot the output of the \code{Hidalgo} function} 6 | \usage{ 7 | \method{autoplot}{Hidalgo}( 8 | object, 9 | type = c("raw_chains", "point_estimates", "class_plot", "clustering"), 10 | class_plot_type = c("histogram", "density", "boxplot", "violin"), 11 | class = NULL, 12 | psm = NULL, 13 | clust = NULL, 14 | title = NULL, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{object}{object of class \code{Hidalgo}, the output of the 20 | \code{Hidalgo()} function.} 21 | 22 | \item{type}{character that indicates the requested type of plot. 
23 | It can be: 24 | \describe{ 25 | \item{\code{"raw_chains"}}{plot the MCMC and the ergodic means NOT corrected 26 | for label switching;} 27 | \item{\code{"point_estimates"}}{plot the posterior mean and median of the id 28 | for each observation, after the chains are processed for label switching;} 29 | \item{\code{"class_plot"}}{plot the estimated id distributions stratified by 30 | the groups specified in the class vector;} 31 | \item{\code{"clustering"}}{plot the posterior coclustering matrix. Rows and 32 | columns can be stratified by an exogenous class and/or a clustering 33 | solution.} 34 | }} 35 | 36 | \item{class_plot_type}{if \code{type} is chosen to be \code{"class_plot"}, 37 | one can plot the stratified id estimates with a \code{"density"} plot or a 38 | \code{"histogram"}, or using \code{"boxplots"} or \code{"violin"} plots.} 39 | 40 | \item{class}{factor variable used to stratify observations according to 41 | their the \code{id} estimates.} 42 | 43 | \item{psm}{posterior similarity matrix containing the posterior probability 44 | of coclustering.} 45 | 46 | \item{clust}{vector containing the cluster membership labels.} 47 | 48 | \item{title}{character string used as title of the plot.} 49 | 50 | \item{...}{other arguments passed to specific methods.} 51 | } 52 | \value{ 53 | a \code{\link[ggplot2]{ggplot2}} object produced by the function 54 | according to the \code{type} chosen. 55 | More precisely, if 56 | \describe{ 57 | \item{\code{method = "raw_chains"}}{The functions produces the traceplots 58 | of the parameters \code{d_k}, for \code{k=1...K}. 59 | The ergodic means for all the chains are superimposed. The \code{K} chains 60 | that are plotted are not post-processed. 61 | Ergo, they are subjected to label switching;} 62 | \item{\code{method = "point_estimates"}}{The function returns two 63 | scatterplots displaying 64 | the posterior mean and median \code{id} for each observation, after that the 65 | MCMC has been postprocessed to handle label switching;} 66 | \item{\code{method = "class_plot"}}{The function returns a plot that can be 67 | used to visually assess the relationship between the posterior \code{id} 68 | estimates and an external, categorical variable. The type of plot varies 69 | according to the specification of \code{class_plot_type}, and it can be 70 | either a set of boxplots or violin plots or a collection of overlapping 71 | densities or histograms;} 72 | \item{\code{method = "clustering"}}{The function displays the posterior 73 | similarity matrix, to allow the study of the clustering structure present in 74 | the data estimated via the mixture model. Rows and columns can be stratified 75 | by an exogenous class and/or a clustering structure.} 76 | } 77 | } 78 | \description{ 79 | Use this method without the \code{.Hidalgo} suffix. 80 | It produces several plots to explore the output of 81 | the \code{Hidalgo} model. 
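# What the "clustering" option described above displays: a posterior similarity
# matrix, i.e. the relative frequency with which two observations are allocated
# to the same mixture component across MCMC draws. A base-R sketch from a fake
# (draws x observations) matrix of cluster labels:
set.seed(3)
labels <- matrix(sample(1:3, 100 * 20, replace = TRUE), nrow = 100)
n_obs  <- ncol(labels)
psm    <- matrix(1, n_obs, n_obs)
for (i in 1:(n_obs - 1)) for (j in (i + 1):n_obs) {
  psm[i, j] <- psm[j, i] <- mean(labels[, i] == labels[, j])
}
round(psm[1:5, 1:5], 2)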
82 | } 83 | \seealso{ 84 | \code{\link{Hidalgo}} 85 | 86 | Other autoplot methods: 87 | \code{\link{autoplot.gride_bayes}()}, 88 | \code{\link{autoplot.twonn_bayes}()}, 89 | \code{\link{autoplot.twonn_linfit}()}, 90 | \code{\link{autoplot.twonn_mle}()} 91 | } 92 | \concept{autoplot methods} 93 | -------------------------------------------------------------------------------- /R/twonn_linfit.R: -------------------------------------------------------------------------------- 1 | #' Least Squares Estimator for the \code{TWO-NN} model 2 | #' 3 | #' The function fits the \code{TWO-NN} model via least squares estimation, as 4 | #' originally proposed in 5 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017}. 6 | #' 7 | #' @param mus vector of second to first NN distance ratios. 8 | #' @param alpha confidence level needed for the computation of the confidence 9 | #' interval. 10 | #' @param c_trimmed proportion of trimmed observations. 11 | #' 12 | #' @return object of class \code{twonn_linfit}, which is a list containing the 13 | #' least squares estimate of the intrinsic dimension, along with the \code{lm()} 14 | #' output used for the computation. 15 | #' 16 | #' @seealso \code{\link{twonn}}, \code{\link{autoplot.twonn_linfit}} 17 | #' @keywords Internal 18 | #' 19 | #' @noRd 20 | #' 21 | #' @references 22 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 23 | #' dimension of datasets by a minimal neighborhood information." 24 | #' Scientific Reports, 7(1). 25 | #' ISSN 20452322, doi: 10.1038/s41598-017-11873-y. 26 | #' 27 | twonn_linfit <- function(mus, 28 | alpha = 0.95, 29 | c_trimmed = .01) { 30 | c_considered <- 1 - c_trimmed 31 | n <- n_original <- base::length(mus) 32 | 33 | if (c_trimmed > 0) { 34 | mus <- base::sort(mus)[1:floor(n * c_considered)] 35 | n <- base::length(mus) 36 | } 37 | 38 | F_musi <- (0:(n - 1)) / n 39 | y <- -base::log(1 - (F_musi)) 40 | x <- base::sort(base::log(mus)) 41 | 42 | modlin1 <- stats::lm(y ~ x - 1) 43 | 44 | Res <- numeric(3) 45 | Res[c(1, 3)] <- stats::confint.lm(modlin1, level = alpha) 46 | Res[2] <- stats::coef(modlin1) 47 | Res <- list( 48 | est = Res, 49 | lm_output = modlin1, 50 | cl = alpha, 51 | c_trimmed = c_trimmed, 52 | n_original = n_original, 53 | n = n 54 | ) 55 | 56 | structure(Res, class = c("twonn_linfit", class(Res))) 57 | } 58 | 59 | #' @name twonn 60 | #' 61 | #' @param x object of class \code{twonn_mle}, obtained from the function 62 | #' \code{twonn_mle()}. 63 | #' @param ... ignored. 64 | #' 65 | #' 66 | #' @export 67 | print.twonn_linfit <- function(x, ...) { 68 | y <- c("TWONN - Linfit" = x[["est"]][2]) 69 | cat(y) 70 | cat("\n") 71 | invisible(x) 72 | } 73 | 74 | #' @name twonn 75 | #' 76 | #' @param object object of class \code{twonn_mle}, obtained from the function 77 | #' \code{twonn_mle()}. 78 | #' @param ... ignored. 79 | #' 80 | #' @export 81 | summary.twonn_linfit <- function(object, ...) { 82 | y <- cbind( 83 | `Original sample size` = object[["n_original"]], 84 | `Used sample size` = object[["n"]], 85 | `Trimming proportion` = object[["c_trimmed"]], 86 | `Confidence level` = object[["cl"]], 87 | `Lower Bound` = object[["est"]][1], 88 | `Estimate` = object[["est"]][2], 89 | `Upper Bound` = object[["est"]][3] 90 | ) 91 | structure(y, class = c("summary.twonn_linfit","matrix")) 92 | } 93 | 94 | 95 | #' @name twonn 96 | #' 97 | #' @param x object of class \code{twonn_mle}, obtained from the function 98 | #' \code{twonn_mle()}. 99 | #' @param ... ignored. 
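# The least-squares idea implemented in twonn_linfit() above, replayed on
# simulated data: under the TWO-NN model the ratios are Pareto(1, d), so
# -log(1 - F(mu)) is linear in log(mu) with slope d and no intercept.
set.seed(4)
mu <- runif(1000)^(-1 / 4)            # ratios with true id d = 4
n  <- length(mu)
y  <- -log(1 - (0:(n - 1)) / n)       # empirical CDF transform, as in the code above
x  <- sort(log(mu))
coef(lm(y ~ x - 1))                   # slope through the origin, close to 4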
100 | #' 101 | #' @export 102 | print.summary.twonn_linfit <- function(x, ...) { 103 | cat("Model: TWO-NN\n") 104 | cat("Method: Least Squares Estimation\n") 105 | cat(paste0( 106 | "Sample size: ", 107 | x[1], 108 | ", Obs. used: ", 109 | x[2], 110 | ". Trimming proportion: ", 111 | 100 * x[3], 112 | "%\n" 113 | )) 114 | cat(paste0("ID estimates (confidence level: ", x[4], ")")) 115 | y <- cbind( 116 | `Lower Bound` = x[5], 117 | `Estimate` = x[6], 118 | `Upper Bound` = x[7] 119 | ) 120 | print(knitr::kable(y)) 121 | cat("\n") 122 | invisible(x) 123 | } 124 | 125 | 126 | 127 | #' @name twonn 128 | #' 129 | #' @param x object of class \code{twonn_linfit}, the output of the 130 | #' \code{twonn} function when \code{method = "linfit"}. 131 | #' 132 | #' @export 133 | #' 134 | plot.twonn_linfit <- function(x, 135 | ...) { 136 | lmod <- x$lm_output 137 | xx <- lmod$model$x 138 | y <- lmod$model$y 139 | Res <- x$est 140 | 141 | plot( y ~ xx, pch = 21, bg = 1,cex=.4, 142 | ylab = ("-log(1-(i/N))"), 143 | xlab = expression(log(mu))) 144 | abline(lmod,col=2,lwd=1.3) 145 | graphics::title("TWO-NN Linear Fit") 146 | expr <- expression(R^2) 147 | legend("topleft",legend = paste0("ID : ", round(Res[2], 3),"\n" , 148 | expr," : ", round(summary(lmod)$r.squared, 3) ) ) 149 | invisible() 150 | 151 | } 152 | -------------------------------------------------------------------------------- /man/gride.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gride.R, R/gride_bayes.R, R/gride_mle.R 3 | \name{gride} 4 | \alias{gride} 5 | \alias{print.gride_bayes} 6 | \alias{summary.gride_bayes} 7 | \alias{print.summary.gride_bayes} 8 | \alias{plot.gride_bayes} 9 | \alias{print.gride_mle} 10 | \alias{summary.gride_mle} 11 | \alias{print.summary.gride_mle} 12 | \alias{plot.gride_mle} 13 | \title{\code{Gride}: the Generalized Ratios ID Estimator} 14 | \usage{ 15 | gride( 16 | X = NULL, 17 | dist_mat = NULL, 18 | mus_n1_n2 = NULL, 19 | method = c("mle", "bayes"), 20 | n1 = 1, 21 | n2 = 2, 22 | alpha = 0.95, 23 | nsim = 5000, 24 | upper_D = 50, 25 | burn_in = 2000, 26 | sigma = 0.5, 27 | start_d = NULL, 28 | a_d = 1, 29 | b_d = 1, 30 | ... 31 | ) 32 | 33 | \method{print}{gride_bayes}(x, ...) 34 | 35 | \method{summary}{gride_bayes}(object, ...) 36 | 37 | \method{print}{summary.gride_bayes}(x, ...) 38 | 39 | \method{plot}{gride_bayes}(x, ...) 40 | 41 | \method{print}{gride_mle}(x, ...) 42 | 43 | \method{summary}{gride_mle}(object, ...) 44 | 45 | \method{print}{summary.gride_mle}(x, ...) 46 | 47 | \method{plot}{gride_mle}(x, ...) 48 | } 49 | \arguments{ 50 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 51 | 52 | \item{dist_mat}{distance matrix computed between the \code{n} observations.} 53 | 54 | \item{mus_n1_n2}{vector of generalized order NN distance ratios.} 55 | 56 | \item{method}{the chosen estimation method. It can be 57 | \describe{ 58 | \item{\code{"mle"}}{maximum likelihood estimation;} 59 | \item{\code{"bayes"}}{estimation with the Bayesian approach.} 60 | }} 61 | 62 | \item{n1}{order of the first NN considered. Default is 1.} 63 | 64 | \item{n2}{order of the second NN considered. 
Default is 2.} 65 | 66 | \item{alpha}{confidence level (for \code{mle}) or posterior probability in 67 | the credible interval (\code{bayes}).} 68 | 69 | \item{nsim}{number of bootstrap samples or posterior simulation to consider.} 70 | 71 | \item{upper_D}{nominal dimension of the dataset (upper bound for the 72 | maximization routine).} 73 | 74 | \item{burn_in}{number of iterations to discard from the MCMC sample. 75 | Applicable if \code{method = "bayes"}.} 76 | 77 | \item{sigma}{standard deviation of the Gaussian proposal used in the MH step. 78 | Applicable if \code{method = "bayes"}.} 79 | 80 | \item{start_d}{initial value for the MCMC chain. If \code{NULL}, 81 | the MLE is used. Applicable if \code{method = "bayes"}.} 82 | 83 | \item{a_d}{shape parameter of the Gamma prior distribution for \code{d}. 84 | Applicable if \code{method = "bayes"}.} 85 | 86 | \item{b_d}{rate parameter of the Gamma prior distribution for \code{d}. 87 | Applicable if \code{method = "bayes"}.} 88 | 89 | \item{...}{other arguments passed to specific methods.} 90 | 91 | \item{x}{object of class \code{gride_mle}. 92 | It is obtained using the output of the \code{gride} function when 93 | \code{method = "mle"}.} 94 | 95 | \item{object}{object of class \code{gride_mle}, obtained from the function 96 | \code{gride_mle()}.} 97 | } 98 | \value{ 99 | a list containing the \code{id} estimate obtained with the Gride 100 | method, along with the relative confidence or credible interval 101 | (object \code{est}). The class of the output object changes according to the 102 | chosen \code{method}. Similarly, 103 | the remaining elements stored in the list reports a summary of the key 104 | quantities involved in the estimation process, e.g., 105 | the NN orders \code{n1} and \code{n2}. 106 | } 107 | \description{ 108 | The function can fit the Generalized ratios ID estimator under both the 109 | frequentist and the Bayesian frameworks, depending on the specification of 110 | the argument \code{method}. The model is the direct extension of the 111 | \code{TWO-NN} method presented in 112 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 113 | . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} \ 114 | for more details. 115 | } 116 | \examples{ 117 | \donttest{ 118 | X <- replicate(2,rnorm(500)) 119 | dm <- as.matrix(dist(X,method = "manhattan")) 120 | res <- gride(X, nsim = 500) 121 | res 122 | plot(res) 123 | gride(dist_mat = dm, method = "bayes", upper_D =10, 124 | nsim = 500, burn_in = 100) 125 | } 126 | } 127 | \references{ 128 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 129 | dimension of datasets by a minimal neighborhood information." 130 | Scientific Reports, 7(1). 131 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 132 | 133 | Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 134 | dimension estimator." 135 | Scientific Reports, 12(20005). 136 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 137 | } 138 | -------------------------------------------------------------------------------- /R/autoplot_hidalgo.R: -------------------------------------------------------------------------------- 1 | #' @importFrom ggplot2 autoplot 2 | #' @export 3 | ggplot2::autoplot 4 | 5 | 6 | #' Plot the output of the \code{Hidalgo} function 7 | #' 8 | #' Use this method without the \code{.Hidalgo} suffix. 9 | #' It produces several plots to explore the output of 10 | #' the \code{Hidalgo} model. 
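# A short usage sketch complementing the gride() documentation shown above:
# larger NN orders (n1, n2) probe the data at a coarser scale than the default
# (1, 2) pair. The calls follow the documented signature and are meant as an
# illustration only.
library(intRinsic)
X <- replicate(3, rnorm(500))
gride(X, n1 = 1, n2 = 2, nsim = 500)   # local scale (default orders)
gride(X, n1 = 4, n2 = 8, nsim = 500)   # coarser scale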
11 | #' 12 | #' @param object object of class \code{Hidalgo}, the output of the 13 | #' \code{Hidalgo()} function. 14 | #' @param type character that indicates the requested type of plot. 15 | #' It can be: 16 | #' \describe{ 17 | #' \item{\code{"raw_chains"}}{plot the MCMC and the ergodic means NOT corrected 18 | #' for label switching;} 19 | #' \item{\code{"point_estimates"}}{plot the posterior mean and median of the id 20 | #' for each observation, after the chains are processed for label switching;} 21 | #' \item{\code{"class_plot"}}{plot the estimated id distributions stratified by 22 | #' the groups specified in the class vector;} 23 | #' \item{\code{"clustering"}}{plot the posterior coclustering matrix. Rows and 24 | #' columns can be stratified by an exogenous class and/or a clustering 25 | #' solution.} 26 | #' } 27 | #' @param class_plot_type if \code{type} is chosen to be \code{"class_plot"}, 28 | #' one can plot the stratified id estimates with a \code{"density"} plot or a 29 | #' \code{"histogram"}, or using \code{"boxplot"} or \code{"violin"} plots. 30 | #' @param class factor variable used to stratify observations according to 31 | #' their \code{id} estimates. 32 | #' @param psm posterior similarity matrix containing the posterior probability 33 | #' of coclustering. 34 | #' @param clust vector containing the cluster membership labels. 35 | #' @param title character string used as title of the plot. 36 | #' @param ... other arguments passed to specific methods. 37 | #' 38 | #' @rdname autoplot.Hidalgo 39 | #' 40 | #' @return a \code{\link[ggplot2]{ggplot2}} object produced by the function 41 | #' according to the \code{type} chosen. 42 | #' More precisely, if 43 | #' \describe{ 44 | #' \item{\code{method = "raw_chains"}}{The function produces the traceplots 45 | #' of the parameters \code{d_k}, for \code{k=1...K}. 46 | #' The ergodic means for all the chains are superimposed. The \code{K} chains 47 | #' that are plotted are not post-processed. 48 | #' Therefore, they are subject to label switching;} 49 | #' \item{\code{method = "point_estimates"}}{The function returns two 50 | #' scatterplots displaying 51 | #' the posterior mean and median \code{id} for each observation, after the 52 | #' MCMC has been post-processed to handle label switching;} 53 | #' \item{\code{method = "class_plot"}}{The function returns a plot that can be 54 | #' used to visually assess the relationship between the posterior \code{id} 55 | #' estimates and an external, categorical variable. The type of plot varies 56 | #' according to the specification of \code{class_plot_type}, and it can be 57 | #' either a set of boxplots or violin plots or a collection of overlapping 58 | #' densities or histograms;} 59 | #' \item{\code{method = "clustering"}}{The function displays the posterior 60 | #' similarity matrix, to allow the study of the clustering structure present in 61 | #' the data estimated via the mixture model.
Rows and columns can be stratified 62 | #' by an exogenous class and/or a clustering structure.} 63 | #' } 64 | #' 65 | #' 66 | #' @seealso \code{\link{Hidalgo}} 67 | #' 68 | #' 69 | #' @importFrom rlang .data 70 | #' @importFrom ggplot2 autoplot 71 | #' 72 | #' @export 73 | #' @family autoplot methods 74 | #' 75 | autoplot.Hidalgo <- function(object, 76 | type = c("raw_chains", 77 | "point_estimates", 78 | "class_plot", 79 | "clustering"), 80 | class_plot_type = c("histogram", "density", 81 | "boxplot", "violin"), 82 | class = NULL, 83 | psm = NULL, 84 | clust = NULL, 85 | title = NULL, 86 | ...) { 87 | type <- match.arg(type) 88 | 89 | if (type == "class_plot") { 90 | class_plot_type <- match.arg(class_plot_type) 91 | if (is.null(class)) 92 | stop("Please provide the factor variable to stratify the id estimates") 93 | } 94 | 95 | G1 <- switch( 96 | type, 97 | raw_chains = ggHid_chains(object), 98 | point_estimates = ggHid_mean_median(object), 99 | class_plot = ggHid_class(object, class, class_plot_type), 100 | clustering = ggHid_psm(object, psm, class, clust, ...) 101 | ) 102 | 103 | if (!is.null(title)) { 104 | G1 <- G1 + ggplot2::ggtitle(title) 105 | } 106 | 107 | G1 108 | } 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /R/twoNN.R: -------------------------------------------------------------------------------- 1 | #' \code{TWO-NN} estimator 2 | #' 3 | #' The function can fit the two-nearest neighbor estimator within the maximum 4 | #' likelihood and the Bayesian frameworks. Also, one can obtain the estimates 5 | #' using least squares estimation, depending on the specification of the 6 | #' argument \code{method}. This model has been originally presented in 7 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 8 | #' . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 9 | #' for more details. 10 | #' 11 | #' @param X data matrix with \code{n} observations and \code{D} variables. 12 | #' @param dist_mat distance matrix computed between the \code{n} observations. 13 | #' @param mus vector of second to first NN distance ratios. 14 | #' @param method chosen estimation method. It can be 15 | #' \describe{ 16 | #' \item{\code{"mle"}}{for maximum likelihood estimator;} 17 | #' \item{\code{"linfit"}}{for estimation via the least squares approach;} 18 | #' \item{\code{"bayes"}}{for estimation with the Bayesian approach.} 19 | #' } 20 | #' @param alpha the confidence level (for \code{mle} and least squares fit) or 21 | #' posterior probability in the credible interval (\code{bayes}). 22 | #' @param c_trimmed the proportion of trimmed observations. 23 | #' @param unbiased logical, applicable when \code{method = "mle"}. 24 | #' If \code{TRUE}, the MLE is corrected to ensure unbiasedness. 25 | #' @param a_d shape parameter of the Gamma prior on the parameter \code{d}, 26 | #' applicable when \code{method = "bayes"}. 27 | #' @param b_d rate parameter of the Gamma prior on the parameter \code{d}, 28 | #' applicable when \code{method = "bayes"}. 29 | #' @param ... additional arguments for the different methods. 30 | #' 31 | #' 32 | #' @return list characterized by a class type that depends on the \code{method} 33 | #' chosen. Regardless of the \code{method}, the output list always contains the 34 | #' object \code{est}, which provides the estimated intrinsic dimension along 35 | #' with uncertainty quantification. The remaining objects vary with the 36 | #' estimation method. 
In particular, if 37 | #' \describe{ 38 | #' \item{\code{method = "mle"}}{the output reports the MLE and the relative 39 | #' confidence interval;} 40 | #' \item{\code{method = "linfit"}}{the output includes the \code{lm()} object used for the computation;} 41 | #' \item{\code{method = "bayes"}}{the output contains the (1 + \code{alpha}) / 2 and (1 - \code{alpha}) / 2 quantiles, mean, mode, and median of the posterior distribution of \code{d}.} 42 | #' } 43 | #' 44 | #' @export 45 | #' 46 | #' @references 47 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 48 | #' dimension of datasets by a minimal neighborhood information." 49 | #' Scientific Reports, 7(1). 50 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 51 | #' 52 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 53 | #' dimension estimator." 54 | #' Scientific Reports, 12(20005). 55 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 56 | #' 57 | #' @examples 58 | #' # dataset with 1000 observations and id = 2 59 | #' X <- replicate(2,rnorm(1000)) 60 | #' twonn(X) 61 | #' # dataset with 1000 observations and id = 3 62 | #' Y <- replicate(3,runif(1000)) 63 | #' # Bayesian and least squares estimate from distance matrix 64 | #' dm <- as.matrix(dist(Y,method = "manhattan")) 65 | #' twonn(dist_mat = dm,method = "bayes") 66 | #' twonn(dist_mat = dm,method = "linfit") 67 | #' 68 | twonn <- function(X = NULL, 69 | dist_mat = NULL, 70 | mus = NULL, 71 | method = c("mle", "linfit", "bayes"), 72 | alpha = 0.95, 73 | c_trimmed = .01, 74 | unbiased = TRUE, 75 | a_d = 0.001, 76 | b_d = 0.001, 77 | ...) { 78 | 79 | if (!is.null(mus)) { 80 | if( !all(mus >= 1) ){ 81 | stop("Detected some values in mus below 1. 82 | Please provide a proper vector of ratios.", 83 | call. = FALSE) 84 | } 85 | } 86 | 87 | if (is.null(mus)) { 88 | if (is.null(X) & is.null(dist_mat)) { 89 | stop("Please provide either a dataset X or a distance matrix", 90 | call. = FALSE) 91 | } 92 | 93 | mus <- compute_mus( 94 | X, 95 | dist_mat = dist_mat, 96 | n1 = 1, 97 | n2 = 2, 98 | Nq = FALSE 99 | ) 100 | } 101 | 102 | method <- match.arg(method) 103 | 104 | 105 | switch( 106 | method, 107 | mle = twonn_mle( 108 | mus = mus, 109 | alpha = alpha, 110 | c_trimmed = c_trimmed, 111 | unbiased = unbiased, 112 | ... 113 | ), 114 | linfit = twonn_linfit( 115 | mus = mus, 116 | alpha = alpha, 117 | c_trimmed = c_trimmed, 118 | ... 119 | ), 120 | bayes = twonn_bayes( 121 | mus = mus, 122 | alpha = alpha, 123 | c_trimmed = c_trimmed, 124 | a_d = a_d, 125 | b_d = b_d, 126 | ... 127 | ) 128 | ) 129 | 130 | 131 | } 132 | -------------------------------------------------------------------------------- /R/twonn_mle.R: -------------------------------------------------------------------------------- 1 | #' Maximum Likelihood Estimator for the \code{TWO-NN} model 2 | #' 3 | #' The function fits the \code{TWO-NN} model via maximum likelihood estimation, 4 | #' as discussed in 5 | #' \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022}. 6 | #' This function is not exported, and can be accessed with \code{twonn()}, 7 | #' specifying \code{method = "mle"}. 8 | #' 9 | #' @param mus vector of second to first NN distance ratios. 10 | #' @param unbiased logical, if \code{TRUE} the MLE is corrected to ensure 11 | #' unbiasedness. 12 | #' @param alpha confidence level needed for the computation of the confidence 13 | #' interval. 14 | #' @param c_trimmed proportion of trimmed observations. 
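# The twonn() wrapper above also accepts precomputed ratios through `mus`; a
# sketch of building the second-to-first NN distance ratios by hand with FNN
# (the wrapper obtains them internally via compute_mus()) and passing them in:
library(FNN)
library(intRinsic)
X  <- replicate(2, rnorm(1000))
nn <- FNN::get.knn(X, k = 2)$nn.dist
twonn(mus = nn[, 2] / nn[, 1], method = "mle")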
15 | #' 16 | #' @return list of class \code{twonn_mle} containing the maximum likelihood 17 | #' estimation of the intrinsic dimension with its confidence interval. 18 | #' @keywords Internal 19 | #' 20 | #' @noRd 21 | #' 22 | #' @seealso \code{\link{twonn}} 23 | #' 24 | #' @references 25 | #' #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 26 | #' dimension of datasets by a minimal neighborhood information." 27 | #' Scientific Reports, 7(1). 28 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 29 | #' 30 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 31 | #' dimension estimator." 32 | #' Scientific Reports, 12(20005). 33 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 34 | #' 35 | twonn_mle <- 36 | function(mus, 37 | unbiased = TRUE, 38 | alpha = .95, 39 | c_trimmed = 0.01) { 40 | n <- n_original <- base::length(mus) 41 | c_considered <- 1 - c_trimmed 42 | 43 | if (c_trimmed > 0) { 44 | mus <- sort(mus)[1:floor(n * c_considered)] 45 | n <- base::length(mus) 46 | } 47 | 48 | alpha <- 1 - alpha 49 | logmus <- base::log(mus) 50 | 51 | if (unbiased) { 52 | Z <- (n - 1) / base::sum(logmus) 53 | UB <- 54 | Z * stats::qgamma(1 - alpha / 2, shape = n, rate = n - 1) 55 | LB <- Z * stats::qgamma(alpha / 2, shape = n, rate = n - 1) 56 | } else { 57 | Z <- (n) / base::sum(logmus) 58 | UB <- Z * stats::qgamma(1 - alpha / 2, shape = n, rate = n) 59 | LB <- Z * stats::qgamma(alpha / 2, shape = n, rate = n) 60 | } 61 | results <- c(LB, Z, UB) 62 | 63 | Res <- 64 | list( 65 | est = results, 66 | cl = 1 - alpha, 67 | c_trimmed = c_trimmed, 68 | n_original = n_original, 69 | n = n 70 | ) 71 | structure(Res, class = c("twonn_mle", class(Res))) 72 | } 73 | 74 | 75 | #' @name twonn 76 | #' 77 | #' @param x object of class \code{twonn_mle}, obtained from the function 78 | #' \code{twonn_mle()}. 79 | #' @param ... ignored. 80 | #' 81 | #' 82 | #' @export 83 | print.twonn_mle <- function(x, ...) { 84 | y <- c("TWONN - MLE" = x[["est"]][2]) 85 | cat(y) 86 | cat("\n") 87 | invisible(x) 88 | } 89 | 90 | #' @name twonn 91 | #' 92 | #' @param object object of class \code{twonn_mle}, obtained from the function 93 | #' \code{twonn_mle()}. 94 | #' @param ... ignored. 95 | #' 96 | #' @export 97 | summary.twonn_mle <- function(object, ...) { 98 | y <- cbind( 99 | `Original sample size` = object[["n_original"]], 100 | `Used sample size` = object[["n"]], 101 | `Trimming proportion` = object[["c_trimmed"]], 102 | `Confidence level` = object[["cl"]], 103 | `Lower Bound` = object[["est"]][1], 104 | `Estimate` = object[["est"]][2], 105 | `Upper Bound` = object[["est"]][3] 106 | ) 107 | structure(y, class = c("summary.twonn_mle","matrix")) 108 | } 109 | 110 | 111 | #' @name twonn 112 | #' 113 | #' @param x object of class \code{twonn_mle}, obtained from the function 114 | #' \code{twonn_mle()}. 115 | #' @param ... ignored. 116 | #' 117 | #' @export 118 | print.summary.twonn_mle <- function(x, ...) { 119 | cat("Model: TWO-NN\n") 120 | cat("Method: MLE\n") 121 | cat(paste0( 122 | "Sample size: ", 123 | x[1], 124 | ", Obs. used: ", 125 | x[2], 126 | ". 
Trimming proportion: ", 127 | 100 * x[3], 128 | "%\n" 129 | )) 130 | cat(paste0("ID estimates (confidence level: ", x[4], ")")) 131 | y <- cbind( 132 | `Lower Bound` = x[5], 133 | `Estimate` = x[6], 134 | `Upper Bound` = x[7] 135 | ) 136 | print(knitr::kable(y)) 137 | cat("\n") 138 | invisible(x) 139 | } 140 | 141 | 142 | #' @name twonn 143 | #' @param x object of class \code{twonn_mle}, the output of the 144 | #' \code{twonn} function when \code{method = "mle"}. 145 | #' 146 | #' 147 | #' @importFrom graphics abline legend lines matplot par points polygon 148 | #' @importFrom stats density ts 149 | #' 150 | #' @export 151 | #' 152 | plot.twonn_mle <- 153 | function(x, 154 | ...) { 155 | 156 | plot(rep(0,3) ~ x$est,type="n",xlab = ("Maximum Likelihood Estimation"),ylab="") 157 | graphics::abline(v = x$est,col = "gray",lwd=1,lty=2) 158 | graphics::points(0~x$est[2],cex = 2,pch=21,bg=1) 159 | graphics::arrows(y0 = 0,code = 3,y1 = 0,x0 = x$est[1],x1 = x$est[3],angle=90,lwd=2) 160 | graphics::title("MLE TWO-NN") 161 | invisible() 162 | } 163 | 164 | 165 | -------------------------------------------------------------------------------- /R/gride_mle.R: -------------------------------------------------------------------------------- 1 | #' Generalized Ratios ID Estimation via MLE 2 | #' 3 | #' The function fits the frequentist Gride model. To run this function, use the 4 | #' high-level \code{gride()} and specify \code{method = "mle"}. 5 | #' The function finds the maximum likelihood estimates for \code{d}, and 6 | #' subsequently simulate parametric bootstrap samples for uncertainty 7 | #' quantification. 8 | #' See \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 9 | #' for more details. 10 | #' 11 | #' 12 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 13 | #' @param n1 order of the first NN considered. Default is 1. 14 | #' @param n2 order of the second NN considered. Default is 2. 15 | #' @param nsim number of bootstrap simulations to consider. 16 | #' @param alpha confidence level for the computation of the confidence interval. 17 | #' @param upper_D nominal dimension of the dataset (upper bound for the 18 | #' maximization routine). 19 | #' 20 | #' @return MLE estimate obtained via numeric optimization along with the 21 | #' bootstrap confidence interval. 22 | #' @keywords internal 23 | #' @noRd 24 | #' 25 | #' @references 26 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 27 | #' dimension estimator." 28 | #' Scientific Reports, 12(20005). 29 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 30 | #' 31 | #' @seealso \code{\link{gride}} 32 | #' 33 | gride_mle <- function(mus_n1_n2 = NULL, 34 | n1 = 1, 35 | n2 = 2, 36 | nsim = 2000, 37 | alpha = .95, 38 | upper_D = NULL) { 39 | if (class(mus_n1_n2)[1] == "mus") { 40 | n1 <- attr(mus_n1_n2, which = "n1") 41 | n2 <- attr(mus_n1_n2, which = "n2") 42 | } 43 | 44 | if (n2 < n1) { 45 | stop("n2 should be greater than n1", call. 
= FALSE) 46 | } 47 | 48 | one_m_alpha <- 1 - alpha 49 | 50 | bs <- gride_bootstrap( 51 | mus_n1_n2 = mus_n1_n2, 52 | n1 = n1, 53 | n2 = n2, 54 | nsim = nsim, 55 | upper_D = upper_D 56 | ) 57 | 58 | qq <- base::unname(stats::quantile(bs$boot_sample, 59 | probs = c(one_m_alpha / 2, 60 | 1 - one_m_alpha / 2))) 61 | 62 | Res <- list( 63 | est = c( 64 | lb = qq[1], 65 | mle = bs$mle, 66 | ub = qq[2] 67 | ), 68 | cl = alpha, 69 | boot_sample = bs$boot_sample, 70 | n1 = n1, 71 | n2 = n2, 72 | nsim = nsim 73 | ) 74 | structure(Res, class = c("gride_mle", class(Res))) 75 | 76 | } 77 | 78 | 79 | 80 | 81 | #' @name gride 82 | #' 83 | #' @param object object of class \code{gride_mle}, obtained from the function 84 | #' \code{gride_mle()}. 85 | #' @param ... ignored. 86 | #' 87 | #' 88 | #' @export 89 | print.gride_mle <- function(x, ...) { 90 | cat(paste0("Gride(",x[["n1"]],",",x[["n2"]],") - MLE\n")) 91 | cat(x[["est"]][2]) 92 | cat("\n") 93 | invisible(x) 94 | } 95 | 96 | #' @name gride 97 | #' 98 | #' @param object object of class \code{gride_mle}, obtained from the function 99 | #' \code{gride_mle()}. 100 | #' @param ... ignored. 101 | #' 102 | #' @export 103 | summary.gride_mle <- function(object, ...) { 104 | y <- cbind( 105 | `NN order 1` = object[["n1"]], 106 | `NN order 2` = object[["n2"]], 107 | `Bootstrap simulations` = object[["nsim"]], 108 | `Confidence level` = object[["cl"]], 109 | 110 | `Lower Bound` = object[["est"]][1], 111 | `Estimate` = object[["est"]][2], 112 | `Upper Bound` = object[["est"]][3] 113 | ) 114 | structure(y, class = c("summary.gride_mle","matrix")) 115 | } 116 | 117 | 118 | #' @name gride 119 | #' 120 | #' @param x object of class \code{twonn_mle}, obtained from the function 121 | #' \code{twonn_mle()}. 122 | #' @param ... ignored. 123 | #' 124 | #' @export 125 | print.summary.gride_mle <- function(x, ...) { 126 | cat(paste0("Model: Gride(", x[1], ",", x[2], ")\n")) 127 | cat("Method: MLE\n") 128 | cat(paste0("CI obtained with a parametric bootstrap sample of size ", 129 | x[3], "\n")) 130 | cat(paste0("ID estimates (confidence level: ", x[4], ")")) 131 | y <- cbind( 132 | `Lower Bound` = x[5], 133 | `Estimate` = x[6], 134 | `Upper Bound` = x[7] 135 | ) 136 | print(knitr::kable(y)) 137 | cat("\n") 138 | invisible(x) 139 | } 140 | 141 | 142 | #' @name gride 143 | #' 144 | #' @param x object of class \code{gride_mle}. 145 | #' It is obtained using the output of the \code{gride} function when 146 | #' \code{method = "mle"}. 147 | #' 148 | #' @param ... other arguments passed to specific methods. 149 | #' 150 | #' @export 151 | #' 152 | plot.gride_mle <- function(x, 153 | ...) { 154 | ID <- x$boot_sample 155 | dx <- density(ID) 156 | plot(dx, xlab = "Intrinsic Dimension" , ylab = "Bootstrap Density", 157 | col="darkblue",lwd=1.3, main = "") 158 | polygon(c(dx$x), c(dx$y), 159 | col = "lightgray", border = "darkblue", main = "") 160 | abline(v = c(x$est, x$lb, x$ub), 161 | lty = 2, 162 | col = 2) 163 | graphics::title("MLE Gride: Bootstrap sample") 164 | invisible() 165 | } 166 | 167 | -------------------------------------------------------------------------------- /R/twonn_dec_steps.R: -------------------------------------------------------------------------------- 1 | #' Decimated TWO-NN evolution with halving steps 2 | #' 3 | #' The estimation of the \code{id} is related to the scale of the 4 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 5 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. 
(2017)} 6 | #' propose to subsample the original dataset in order to induce greater 7 | #' distances between the data points. By investigating the estimates' evolution 8 | #' as a function of the size of the neighborhood, it is possible to obtain 9 | #' information about the validity of the modeling assumptions and the robustness 10 | #' of the model in the presence of noise. 11 | #' 12 | #' @param X data matrix with \code{n} observations and \code{D} variables. 13 | #' @param steps number of times the dataset is halved. 14 | #' @param seed random seed controlling the sequence of subsampling. 15 | #' 16 | #' @return object of class \code{twonn_decimation_steps}, which is a list 17 | #' containing the \code{TWO-NN} evolution (maximum likelihood estimation and 18 | #' confidence intervals), the average distance from the second NN, and the 19 | #' number of steps. 20 | #' 21 | #' @keywords internal 22 | #' @noRd 23 | #' 24 | twonn_dec_by <- function(X, 25 | steps = 2, 26 | seed = NULL) { 27 | if (!is.null(seed)) { 28 | set.seed(seed) 29 | } 30 | 31 | X <- as.matrix(X) 32 | D <- ncol(X) 33 | n <- n0 <- nrow(X) 34 | check <- 0 35 | check <- duplicated(X) 36 | 37 | if (sum(check) > 0) { 38 | X <- X[-which(check),] 39 | n <- nrow(X) 40 | warning( 41 | paste0( 42 | "\n Duplicates are present and will be removed. 43 | \n Original sample size: ", 44 | n0, 45 | ". New sample size: ", 46 | n, 47 | "." 48 | ), 49 | call. = FALSE 50 | ) 51 | } 52 | 53 | if (floor(2 ^ (-steps) * n) <= 2) { 54 | stop( 55 | "Too many steps, no observations left. Please lower the number of steps considered", 56 | call. = FALSE 57 | ) 58 | } 59 | 60 | W <- steps + 1 61 | twonns <- matrix(NA, W, 3) 62 | avg_distance_n2 <- numeric(W) 63 | inds <- 1:n 64 | 65 | # Classic TWO-NN 66 | K <- FNN::get.knn(X, k = 2) 67 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 68 | avg_distance_n2[1] <- mean(K$nn.dist[, 2]) 69 | ests <- twonn_mle(mudots) 70 | twonns[1, ] <- ests$est 71 | 72 | 73 | for (w in 2:(W)) { 74 | sub_ind <- sample(x = inds, 75 | size = floor(.5 * length(inds))) 76 | 77 | subX <- X[sub_ind, ] 78 | K <- FNN::get.knn(subX, k = 2) 79 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 80 | avg_distance_n2[w] <- mean(K$nn.dist[, 2]) 81 | ests <- twonn_mle(mudots) 82 | twonns[w, ] <- ests$est 83 | inds <- sub_ind 84 | 85 | } 86 | 87 | res <- list( 88 | decimated_twonn = twonns, 89 | avg_distance_n2 = avg_distance_n2, 90 | steps = steps 91 | ) 92 | structure(res, class = c("twonn_dec_by", class(res))) 93 | 94 | } 95 | 96 | 97 | 98 | 99 | #' @name twonn_decimation 100 | #' 101 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 102 | #' \code{twonn_dec_prop()}. 103 | #' @param ... ignored. 104 | #' 105 | #' 106 | #' @export 107 | print.twonn_dec_by <- function(x, ...) { 108 | cat(paste0("Model: decimated TWO-NN\n")) 109 | cat(paste0("Dataset halved ", x[["steps"]], " times \n")) 110 | cat(paste0( 111 | "Average distance from the n2-th NN ranging from ", 112 | round(min(x[["avg_distance_n2"]]), 4), 113 | " to ", 114 | round(max(x[["avg_distance_n2"]]), 4), 115 | "\n" 116 | )) 117 | invisible(x) 118 | } 119 | 120 | 121 | 122 | #' @name twonn_decimation 123 | #' 124 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 125 | #' \code{twonn_dec_prop()}. 126 | #' @param CI logical, if \code{TRUE}, the confidence intervals are plotted 127 | #' @param steps logical, if \code{TRUE}, the x-axis reports the number of halving steps. 
128 | #' If \code{FALSE}, the x-axis reports the log10 average distance. 129 | #' @param ... ignored. 130 | #' 131 | #' 132 | #' @export 133 | plot.twonn_dec_by <- function(x, CI = FALSE, steps = FALSE, ...) { 134 | 135 | if(steps){ 136 | xx <- 1:(x$steps+1) 137 | logscale <- "" 138 | xlab <- "Halving steps" 139 | }else{ 140 | xx <- x$avg_distance_n2 141 | logscale <- "x" 142 | xlab <- Log[10] ~ average ~ n[2] ~ distance 143 | } 144 | if(CI){ 145 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 146 | xlab = xlab, ylab = "ID estimate", log = logscale, 147 | ylim = c(min(x$decimated_twonn),max(x$decimated_twonn))) 148 | graphics::polygon(c(xx, rev(xx)), c(x$decimated_twonn[,1], rev(x$decimated_twonn[,3])), 149 | col = "lightblue", border = NA) 150 | lines(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5) 151 | }else{ 152 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 153 | xlab = xlab, ylab = "ID estimate", log = logscale) 154 | } 155 | graphics::title("Decimated TWO-NN: Halving steps") 156 | invisible() 157 | } 158 | -------------------------------------------------------------------------------- /man/twonn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/twoNN.R, R/twonn_bayes.R, R/twonn_linfit.R, 3 | % R/twonn_mle.R 4 | \name{twonn} 5 | \alias{twonn} 6 | \alias{print.twonn_bayes} 7 | \alias{summary.twonn_bayes} 8 | \alias{print.summary.twonn_bayes} 9 | \alias{plot.twonn_bayes} 10 | \alias{print.twonn_linfit} 11 | \alias{summary.twonn_linfit} 12 | \alias{print.summary.twonn_linfit} 13 | \alias{plot.twonn_linfit} 14 | \alias{print.twonn_mle} 15 | \alias{summary.twonn_mle} 16 | \alias{print.summary.twonn_mle} 17 | \alias{plot.twonn_mle} 18 | \title{\code{TWO-NN} estimator} 19 | \usage{ 20 | twonn( 21 | X = NULL, 22 | dist_mat = NULL, 23 | mus = NULL, 24 | method = c("mle", "linfit", "bayes"), 25 | alpha = 0.95, 26 | c_trimmed = 0.01, 27 | unbiased = TRUE, 28 | a_d = 0.001, 29 | b_d = 0.001, 30 | ... 31 | ) 32 | 33 | \method{print}{twonn_bayes}(x, ...) 34 | 35 | \method{summary}{twonn_bayes}(object, ...) 36 | 37 | \method{print}{summary.twonn_bayes}(x, ...) 38 | 39 | \method{plot}{twonn_bayes}(x, plot_low = 0.001, plot_upp = NULL, by = 0.05, ...) 40 | 41 | \method{print}{twonn_linfit}(x, ...) 42 | 43 | \method{summary}{twonn_linfit}(object, ...) 44 | 45 | \method{print}{summary.twonn_linfit}(x, ...) 46 | 47 | \method{plot}{twonn_linfit}(x, ...) 48 | 49 | \method{print}{twonn_mle}(x, ...) 50 | 51 | \method{summary}{twonn_mle}(object, ...) 52 | 53 | \method{print}{summary.twonn_mle}(x, ...) 54 | 55 | \method{plot}{twonn_mle}(x, ...) 56 | } 57 | \arguments{ 58 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 59 | 60 | \item{dist_mat}{distance matrix computed between the \code{n} observations.} 61 | 62 | \item{mus}{vector of second to first NN distance ratios.} 63 | 64 | \item{method}{chosen estimation method.
It can be 65 | \describe{ 66 | \item{\code{"mle"}}{for maximum likelihood estimator;} 67 | \item{\code{"linfit"}}{for estimation via the least squares approach;} 68 | \item{\code{"bayes"}}{for estimation with the Bayesian approach.} 69 | }} 70 | 71 | \item{alpha}{the confidence level (for \code{mle} and least squares fit) or 72 | posterior probability in the credible interval (\code{bayes}).} 73 | 74 | \item{c_trimmed}{the proportion of trimmed observations.} 75 | 76 | \item{unbiased}{logical, applicable when \code{method = "mle"}. 77 | If \code{TRUE}, the MLE is corrected to ensure unbiasedness.} 78 | 79 | \item{a_d}{shape parameter of the Gamma prior on the parameter \code{d}, 80 | applicable when \code{method = "bayes"}.} 81 | 82 | \item{b_d}{rate parameter of the Gamma prior on the parameter \code{d}, 83 | applicable when \code{method = "bayes"}.} 84 | 85 | \item{...}{ignored.} 86 | 87 | \item{x}{object of class \code{twonn_mle}, the output of the 88 | \code{twonn} function when \code{method = "mle"}.} 89 | 90 | \item{object}{object of class \code{twonn_mle}, obtained from the function 91 | \code{twonn_mle()}.} 92 | 93 | \item{plot_low}{lower bound of the interval on which the posterior density 94 | is plotted.} 95 | 96 | \item{plot_upp}{upper bound of the interval on which the posterior density 97 | is plotted.} 98 | 99 | \item{by}{step-size at which the sequence spanning the interval is 100 | incremented.} 101 | } 102 | \value{ 103 | list characterized by a class type that depends on the \code{method} 104 | chosen. Regardless of the \code{method}, the output list always contains the 105 | object \code{est}, which provides the estimated intrinsic dimension along 106 | with uncertainty quantification. The remaining objects vary with the 107 | estimation method. In particular, if 108 | \describe{ 109 | \item{\code{method = "mle"}}{the output reports the MLE and the relative 110 | confidence interval;} 111 | \item{\code{method = "linfit"}}{the output includes the \code{lm()} object used for the computation;} 112 | \item{\code{method = "bayes"}}{the output contains the (1 + \code{alpha}) / 2 and (1 - \code{alpha}) / 2 quantiles, mean, mode, and median of the posterior distribution of \code{d}.} 113 | } 114 | } 115 | \description{ 116 | The function can fit the two-nearest neighbor estimator within the maximum 117 | likelihood and the Bayesian frameworks. Also, one can obtain the estimates 118 | using least squares estimation, depending on the specification of the 119 | argument \code{method}. This model has been originally presented in 120 | \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 121 | . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 122 | for more details. 123 | } 124 | \examples{ 125 | # dataset with 1000 observations and id = 2 126 | X <- replicate(2,rnorm(1000)) 127 | twonn(X) 128 | # dataset with 1000 observations and id = 3 129 | Y <- replicate(3,runif(1000)) 130 | # Bayesian and least squares estimate from distance matrix 131 | dm <- as.matrix(dist(Y,method = "manhattan")) 132 | twonn(dist_mat = dm,method = "bayes") 133 | twonn(dist_mat = dm,method = "linfit") 134 | 135 | } 136 | \references{ 137 | Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 138 | dimension of datasets by a minimal neighborhood information." 139 | Scientific Reports, 7(1). 140 | ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 141 | 142 | Denti F, Doimo D, Laio A, Mira A (2022). 
"The generalized ratios intrinsic 143 | dimension estimator." 144 | Scientific Reports, 12(20005). 145 | ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 146 | } 147 | -------------------------------------------------------------------------------- /R/autoplot_gride.R: -------------------------------------------------------------------------------- 1 | #' Plot the simulated MCMC chains for the Bayesian \code{Gride} 2 | #' 3 | #' Use this method without the \code{.gride_bayes} suffix. 4 | #' It displays the traceplot of the chain generated 5 | #' with Metropolis-Hasting updates to visually assess mixing and convergence. 6 | #' Alternatively, it is possible to plot the posterior density. 7 | #' 8 | #' @param object object of class \code{gride_bayes}. 9 | #' It is obtained using the output of the \code{gride} function when 10 | #' \code{method = "bayes"}. 11 | #' @param traceplot logical. If \code{FALSE}, the function returns a plot of the 12 | #' posterior density. If \code{TRUE}, the function returns the traceplots of the 13 | #' MCMC used to simulate from the posterior distribution. 14 | #' @param title optional string to display as title. 15 | #' @param ... other arguments passed to specific methods. 16 | #' 17 | #' @rdname autoplot.gride_bayes 18 | #' 19 | #' @return object of class \code{\link[ggplot2]{ggplot}}. 20 | #' It could represent the traceplot of the posterior simulations for the 21 | #' Bayesian \code{Gride} model (\code{traceplot = TRUE}) or a density plot 22 | #' of the simulated posterior distribution (\code{traceplot = FALSE}). 23 | #' 24 | #' @seealso \code{\link{gride}} 25 | #' 26 | #' @family autoplot methods 27 | #' 28 | #' @export 29 | autoplot.gride_bayes <- function(object, 30 | traceplot = FALSE, 31 | title = "Bayesian Gride - Posterior distribution", 32 | ...) 
{ 33 | if (traceplot) { 34 | sam <- c(object$post_sample) 35 | cmm <- cumsum(sam) / seq_along(sam) 36 | G1 <- 37 | ggplot2::ggplot(dplyr::tibble(value = sam, ind = seq_along(sam), cmm = cmm)) + 38 | ggplot2::geom_line(ggplot2::aes(x = .data$ind, y = .data$value), col = 39 | "lightgray") + 40 | ggplot2::geom_line(ggplot2::aes(x = .data$ind, y = .data$cmm), col = 41 | "darkblue") + 42 | ggplot2::theme_bw() + 43 | ggplot2::ylab("Intrinsic Dimension") + 44 | ggplot2::xlab("MCMC Iteration") + 45 | ggplot2::theme( 46 | axis.title.x = ggplot2::element_text(size = 15), 47 | axis.title.y = ggplot2::element_text(size = 15), 48 | title = ggplot2::element_text(size = 15) 49 | ) + 50 | ggplot2::ggtitle(title, 51 | subtitle = bquote( 52 | n[1] == .(object$n1) ~ "," ~ n[2] == .(object$n2) ~ "," ~ sigma == .(object$sigma) 53 | )) 54 | 55 | 56 | } else{ 57 | sam <- object$post_sample 58 | G1 <- ggplot2::ggplot(dplyr::tibble(value = c(sam))) + 59 | ggplot2::geom_density(ggplot2::aes(x = .data$value), 60 | col = "darkblue", 61 | fill = "lightgray") + 62 | ggplot2::xlab("Intrinsic Dimension") + 63 | ggplot2::ylab("Simulated Posterior Density") + 64 | ggplot2::theme_bw() + 65 | ggplot2::geom_vline(xintercept = object$est, 66 | lty = 2, 67 | col = 2) + 68 | ggplot2::theme( 69 | axis.title.x = ggplot2::element_text(size = 15), 70 | axis.title.y = ggplot2::element_text(size = 15), 71 | title = ggplot2::element_text(size = 15) 72 | ) + 73 | ggplot2::ggtitle(title, 74 | subtitle = bquote( 75 | n[1] == .(object$n1) ~ "," ~ n[2] == .(object$n2) ~ "," ~ sigma == .(object$sigma) 76 | )) 77 | 78 | 79 | } 80 | return(G1) 81 | } 82 | 83 | 84 | 85 | 86 | #' Plot the simulated bootstrap sample for the MLE \code{Gride} 87 | #' 88 | #' Use this method without the \code{.gride_mle} suffix. 89 | #' It displays the density plot of sample obtained via 90 | #' parametric bootstrap for the \code{Gride} model. 91 | #' 92 | #' @param object object of class \code{gride_mle}. 93 | #' It is obtained using the output of the \code{gride} function when 94 | #' \code{method = "mle"}. 95 | #' @param title title for the plot. 96 | #' @param ... other arguments passed to specific methods. 97 | #' 98 | #' @rdname autoplot.gride_mle 99 | #' 100 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It displays the 101 | #' density plot of the sample generated via parametric bootstrap to help the 102 | #' visual assessment of the uncertainty of the \code{id} estimates. 103 | #' 104 | #' 105 | #' @seealso \code{\link{gride}} 106 | #' 107 | #' @export 108 | autoplot.gride_mle <- function(object, 109 | title = "MLE Gride - Bootstrap sample", 110 | ...) 
{ 111 | G1 <- ggplot2::ggplot(dplyr::tibble(value = object$boot_sample)) + 112 | ggplot2::geom_density(ggplot2::aes(x = .data$value), 113 | col = "darkblue", 114 | fill = "lightgray") + 115 | ggplot2::xlab("Intrinsic Dimension") + 116 | ggplot2::ylab("Bootstrap Density") + 117 | ggplot2::theme_bw() + 118 | ggplot2::geom_vline( 119 | xintercept = c(object$est, object$lb, object$ub), 120 | lty = 2, 121 | col = 2 122 | ) + 123 | ggplot2::theme( 124 | axis.title.x = ggplot2::element_text(size = 15), 125 | axis.title.y = ggplot2::element_text(size = 15), 126 | title = ggplot2::element_text(size = 15) 127 | ) + 128 | ggplot2::ggtitle(title, 129 | subtitle = bquote(n[1] == .(object$n1) ~ "," ~ n[2] == .(object$n2))) 130 | 131 | return(G1) 132 | } 133 | 134 | 135 | -------------------------------------------------------------------------------- /R/gride.R: -------------------------------------------------------------------------------- 1 | #' \code{Gride}: the Generalized Ratios ID Estimator 2 | #' 3 | #' The function can fit the Generalized ratios ID estimator under both the 4 | #' frequentist and the Bayesian frameworks, depending on the specification of 5 | #' the argument \code{method}. The model is the direct extension of the 6 | #' \code{TWO-NN} method presented in 7 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al., 2017} 8 | #' . See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} \ 9 | #' for more details. 10 | #' 11 | #' @param X data matrix with \code{n} observations and \code{D} variables. 12 | #' @param dist_mat distance matrix computed between the \code{n} observations. 13 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 14 | #' @param method the chosen estimation method. It can be 15 | #' \describe{ 16 | #' \item{\code{"mle"}}{maximum likelihood estimation;} 17 | #' \item{\code{"bayes"}}{estimation with the Bayesian approach.} 18 | #' } 19 | #' @param n1 order of the first NN considered. Default is 1. 20 | #' @param n2 order of the second NN considered. Default is 2. 21 | #' @param alpha confidence level (for \code{mle}) or posterior probability in 22 | #' the credible interval (\code{bayes}). 23 | #' @param upper_D nominal dimension of the dataset (upper bound for the 24 | #' maximization routine). 25 | #' @param nsim number of bootstrap samples or posterior simulation to consider. 26 | #' @param burn_in number of iterations to discard from the MCMC sample. 27 | #' Applicable if \code{method = "bayes"}. 28 | #' @param sigma standard deviation of the Gaussian proposal used in the MH step. 29 | #' Applicable if \code{method = "bayes"}. 30 | #' @param start_d initial value for the MCMC chain. If \code{NULL}, 31 | #' the MLE is used. Applicable if \code{method = "bayes"}. 32 | #' @param a_d shape parameter of the Gamma prior distribution for \code{d}. 33 | #' Applicable if \code{method = "bayes"}. 34 | #' @param b_d rate parameter of the Gamma prior distribution for \code{d}. 35 | #' Applicable if \code{method = "bayes"}. 36 | #' @param ... additional arguments for the different methods. 37 | #' 38 | #' 39 | #' @return a list containing the \code{id} estimate obtained with the Gride 40 | #' method, along with the relative confidence or credible interval 41 | #' (object \code{est}). The class of the output object changes according to the 42 | #' chosen \code{method}. 
Similarly, 43 | #' the remaining elements stored in the list reports a summary of the key 44 | #' quantities involved in the estimation process, e.g., 45 | #' the NN orders \code{n1} and \code{n2}. 46 | #' 47 | #' @export 48 | #' 49 | #' @references 50 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 51 | #' dimension of datasets by a minimal neighborhood information." 52 | #' Scientific Reports, 7(1). 53 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 54 | #' 55 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 56 | #' dimension estimator." 57 | #' Scientific Reports, 12(20005). 58 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 59 | #' 60 | #' @examples 61 | #' \donttest{ 62 | #' X <- replicate(2,rnorm(500)) 63 | #' dm <- as.matrix(dist(X,method = "manhattan")) 64 | #' res <- gride(X, nsim = 500) 65 | #' res 66 | #' plot(res) 67 | #' gride(dist_mat = dm, method = "bayes", upper_D =10, 68 | #' nsim = 500, burn_in = 100) 69 | #' } 70 | gride <- function(X = NULL, 71 | dist_mat = NULL, 72 | mus_n1_n2 = NULL, 73 | method = c("mle", "bayes"), 74 | n1 = 1, 75 | n2 = 2, 76 | alpha = 0.95, 77 | nsim = 5000, 78 | upper_D = 50, 79 | burn_in = 2000, 80 | sigma = 0.5, 81 | start_d = NULL, 82 | a_d = 1, 83 | b_d = 1, 84 | ...) { 85 | 86 | 87 | if (is.null(mus_n1_n2)) { 88 | if (is.null(X) & is.null(dist_mat)) { 89 | stop("Please provide either a dataset X or a distance matrix", 90 | call. = FALSE)} 91 | 92 | mus_n1_n2 <- compute_mus( 93 | X = X, 94 | dist_mat = dist_mat, 95 | n1 = n1, 96 | n2 = n2 97 | ) 98 | 99 | }else{ 100 | if (class(mus_n1_n2)[1] == "mus") { 101 | n1 <- attr(mus_n1_n2, which = "n1") 102 | n2 <- attr(mus_n1_n2, which = "n2") 103 | if(attr(mus_n1_n2, which = "upper_D") != "unknown"){ 104 | upper_D <- attr(mus_n1_n2, which = "upper_D") + 5 105 | } 106 | 107 | } 108 | 109 | 110 | } 111 | 112 | 113 | if (!is.null(X)) { 114 | upper_D <- ncol(X) + 1 115 | } 116 | if (is.null(X) & is.null(upper_D)) { 117 | stop("Please provide the nominal dimension of the dataset D in upper_D", 118 | call. = FALSE) 119 | } 120 | 121 | method <- match.arg(method) 122 | switch( 123 | method, 124 | mle = gride_mle( 125 | mus_n1_n2 = mus_n1_n2, 126 | n1 = n1, 127 | n2 = n2, 128 | alpha = alpha, 129 | upper_D = upper_D, 130 | nsim = nsim, 131 | ... 132 | ), 133 | bayes = gride_bayes( 134 | mus_n1_n2 = mus_n1_n2, 135 | alpha = alpha, 136 | n1 = n1, 137 | n2 = n2, 138 | upper_D = upper_D, 139 | nsim = nsim, 140 | burn_in = burn_in, 141 | sigma = sigma, 142 | start_d = start_d, 143 | a_d = a_d, b_d = b_d, 144 | ... 145 | ) 146 | ) 147 | 148 | } 149 | -------------------------------------------------------------------------------- /R/twonn_dec_prop.R: -------------------------------------------------------------------------------- 1 | #' Decimated \code{TWO-NN} evolution with proportions 2 | #' 3 | #' The estimation of the intrinsic dimension is related to the scale of the 4 | #' dataset. To escape the local reach of the \code{TWO-NN} estimator, 5 | #' \href{https://www.nature.com/articles/s41598-017-11873-y}{Facco et al. (2017)} 6 | #' proposed to subsample the original dataset in order to induce greater 7 | #' distances between the data points. By investigating the estimates' evolution 8 | #' as a function of the size of the neighborhood, it is possible to obtain 9 | #' information about the validity of the modeling assumptions and the robustness 10 | #' of the model in the presence of noise. 
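#'
#' A minimal standalone sketch of the same idea (illustration only; it assumes
#' the plain, uncorrected TWO-NN MLE, d = n / sum(log(mu)), rather than the
#' package's internal routines):
#'
#' X <- replicate(5, rnorm(2000))
#' for (p in c(1, 0.5, 0.25)) {
#'   sub <- X[sample(nrow(X), floor(nrow(X) * p)), ]
#'   K   <- FNN::get.knn(sub, k = 2)          # first and second NN distances
#'   mu  <- K$nn.dist[, 2] / K$nn.dist[, 1]   # TWO-NN distance ratios
#'   print(c(prop = p, id = length(mu) / sum(log(mu))))
#' }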
11 | #' 12 | #' @param X data matrix with \code{n} observations and \code{D} variables. 13 | #' @param proportions vector containing the fractions of the dataset to be 14 | #' considered. 15 | #' @param seed random seed controlling the sequence of subsampling. 16 | #' 17 | #' @return list containing the \code{TWO-NN} evolution (maximum likelihood 18 | #' estimation and confidence intervals), the average distance from the second 19 | #' NN, and the vector of proportions that were considered. 20 | #' 21 | #' @keywords internal 22 | #' @noRd 23 | #' 24 | twonn_dec_prop <- function(X, 25 | proportions = 1, 26 | seed = NULL) { 27 | if (!is.null(seed)) { 28 | set.seed(seed) 29 | } 30 | 31 | if (any(proportions > 1) | any(proportions < 0)) { 32 | stop( 33 | "The vector proportions must contain values between 0 (excluded) and 1 (included)", 34 | call. = FALSE 35 | ) 36 | } 37 | 38 | X <- as.matrix(X) 39 | D <- ncol(X) 40 | n <- n0 <- nrow(X) 41 | check <- 0 42 | check <- duplicated(X) 43 | 44 | if (sum(check) > 0) { 45 | X <- X[-which(check),] 46 | n <- nrow(X) 47 | warning( 48 | paste0( 49 | "\n Duplicates are present and will be removed. 50 | \n Original sample size: ", 51 | n0, 52 | ". New sample size: ", 53 | n, 54 | "." 55 | ), 56 | call. = FALSE 57 | ) 58 | } 59 | 60 | if (floor(min(proportions) * n) <= 2) { 61 | stop( 62 | "Proportions are too low, no observations left. Please lower the number of steps considered" 63 | ) 64 | } 65 | 66 | proportions <- sort(proportions,decreasing = T) 67 | W <- length(proportions) 68 | twonns <- matrix(NA, W, 3) 69 | avg_distance_n2 <- numeric(W) 70 | sub_ind <- 1:n 71 | n_new <- n 72 | 73 | # Classic TWO-NN 74 | K <- FNN::get.knn(X, k = 2) 75 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 76 | avg_distance_n2[1] <- mean(K$nn.dist[, 2]) 77 | ests <- twonn_mle(mudots) 78 | twonns[1, ] <- ests$est 79 | 80 | for (w in 2:(W)) { 81 | prop <- proportions[(w)] / proportions[w - 1] 82 | 83 | sub_ind <- sample(x = sub_ind, 84 | size = floor(n_new * prop)) 85 | 86 | n_new <- length(sub_ind) 87 | 88 | subX <- X[sub_ind, ] 89 | K <- FNN::get.knn(subX, k = 2) 90 | mudots <- K$nn.dist[, 2] / K$nn.dist[, 1] 91 | avg_distance_n2[w] <- mean(K$nn.dist[, 2]) 92 | ests <- twonn_mle(mudots) 93 | twonns[w, ] <- ests$est 94 | 95 | } 96 | 97 | res <- list( 98 | decimated_twonn = twonns, 99 | avg_distance_n2 = avg_distance_n2, 100 | proportions = proportions 101 | ) 102 | structure(res, class = c("twonn_dec_prop", class(res))) 103 | 104 | } 105 | 106 | 107 | 108 | #' @name twonn_decimation 109 | #' 110 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 111 | #' \code{twonn_dec_prop()}. 112 | #' @param ... ignored. 113 | #' 114 | #' @export 115 | print.twonn_dec_prop <- function(x, ...) { 116 | cat(paste0("Model: decimated TWO-NN\n")) 117 | if (length(x[["proportions"]]) > 10) { 118 | cat(paste0( 119 | "Decimating proportions ranging from ", 120 | round(max(x[["proportions"]]), 5), 121 | " to ", 122 | round(min(x[["proportions"]]), 5) 123 | ), 124 | "\n") 125 | } else{ 126 | cat("Decimating proportions: ", round(x[["proportions"]], 5), "\n") 127 | } 128 | cat(paste0( 129 | "Average distance from the n2-th NN ranging from ", 130 | round(min(x[["avg_distance_n2"]]), 4), 131 | " to ", 132 | round(max(x[["avg_distance_n2"]]), 4), 133 | "\n" 134 | )) 135 | invisible(x) 136 | } 137 | 138 | 139 | 140 | #' @name twonn_decimation 141 | #' 142 | #' @param x object of class \code{twonn_dec_prop}, obtained from the function 143 | #' \code{twonn_dec_prop()}. 
144 | #' @param CI logical, if \code{TRUE}, the confidence intervals are plotted 145 | #' @param proportions logical, if \code{TRUE}, the x-axis reports the number of decimating proportions. 146 | #' If \code{FALSE}, the x-axis reports the log10 average distance. 147 | #' @param ... ignored. 148 | #' 149 | #' 150 | #' @export 151 | plot.twonn_dec_prop <- function(x, 152 | CI = FALSE, 153 | proportions = FALSE, 154 | ...) { 155 | 156 | if(proportions){ 157 | xx <- (x$proportions) 158 | logscale <- "" 159 | xlab <- "Decimating proportions" 160 | }else{ 161 | xx <- x$avg_distance_n2 162 | logscale <- "x" 163 | xlab <- Log[10] ~ average ~ n[2] ~ distance 164 | } 165 | 166 | if(CI){ 167 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 168 | xlab = xlab, ylab = "ID estimate", log = logscale, 169 | ylim = c(min(x$decimated_twonn),max(x$decimated_twonn))) 170 | graphics::polygon(c(xx, rev(xx)), c(x$decimated_twonn[,1], rev(x$decimated_twonn[,3])), 171 | col = "lightblue", border = NA) 172 | lines(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5) 173 | }else{ 174 | plot(x$decimated_twonn[,2]~xx,col="darkblue",type = "b",lwd=1.5, 175 | xlab = xlab, ylab = "ID estimate", log = logscale) 176 | } 177 | graphics::title("Decimated TWO-NN: Proportions") 178 | invisible() 179 | } 180 | -------------------------------------------------------------------------------- /man/Hidalgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hidalgo.R 3 | \name{Hidalgo} 4 | \alias{Hidalgo} 5 | \alias{print.Hidalgo} 6 | \alias{plot.Hidalgo} 7 | \alias{summary.Hidalgo} 8 | \alias{print.summary.Hidalgo} 9 | \title{Fit the \code{Hidalgo} model} 10 | \usage{ 11 | Hidalgo( 12 | X = NULL, 13 | dist_mat = NULL, 14 | K = 10, 15 | nsim = 5000, 16 | burn_in = 5000, 17 | thinning = 1, 18 | verbose = TRUE, 19 | q = 3, 20 | xi = 0.75, 21 | alpha_Dirichlet = 0.05, 22 | a0_d = 1, 23 | b0_d = 1, 24 | prior_type = c("Conjugate", "Truncated", "Truncated_PointMass"), 25 | D = NULL, 26 | pi_mass = 0.5 27 | ) 28 | 29 | \method{print}{Hidalgo}(x, ...) 30 | 31 | \method{plot}{Hidalgo}(x, type = c("A", "B", "C"), class = NULL, ...) 32 | 33 | \method{summary}{Hidalgo}(object, ...) 34 | 35 | \method{print}{summary.Hidalgo}(x, ...) 36 | } 37 | \arguments{ 38 | \item{X}{data matrix with \code{n} observations and \code{D} variables.} 39 | 40 | \item{dist_mat}{distance matrix computed between the \code{n} observations.} 41 | 42 | \item{K}{integer, number of mixture components.} 43 | 44 | \item{nsim}{number of MCMC iterations to run.} 45 | 46 | \item{burn_in}{number of MCMC iterations to discard as burn-in period.} 47 | 48 | \item{thinning}{integer indicating the thinning interval.} 49 | 50 | \item{verbose}{logical, should the progress of the sampler be printed?} 51 | 52 | \item{q}{integer, first local homogeneity parameter. Default is 3.} 53 | 54 | \item{xi}{real number between 0 and 1, second local homogeneity parameter. 55 | Default is 0.75.} 56 | 57 | \item{alpha_Dirichlet}{parameter of the symmetric Dirichlet prior 58 | on the mixture weights. Default is 0.05, inducing a sparse mixture. 
59 | Values that are too small (i.e., lower than 0.005) may cause underflow.} 60 | 61 | \item{a0_d}{shape parameter of the Gamma prior on \code{d}.} 62 | 63 | \item{b0_d}{rate parameter of the Gamma prior on \code{d}.} 64 | 65 | \item{prior_type}{character, type of Gamma prior on \code{d}, can be 66 | \describe{ 67 | \item{\code{"Conjugate"}}{a conjugate Gamma distribution is elicited;} 68 | \item{\code{"Truncated"}}{the conjugate Gamma prior is truncated over the 69 | interval \code{(0,D)};} 70 | \item{\code{"Truncated_PointMass"}}{same as \code{"Truncated"}, but a 71 | point mass is placed on \code{D}, to allow the \code{id} to be 72 | identically equal to the nominal dimension.} 73 | }} 74 | 75 | \item{D}{integer, the maximal dimension of the dataset.} 76 | 77 | \item{pi_mass}{probability placed a priori on \code{D} when 78 | \code{Truncated_PointMass} is chosen.} 79 | 80 | \item{x}{object of class \code{Hidalgo}, the output of the 81 | \code{Hidalgo()} function.} 82 | 83 | \item{...}{other arguments passed to specific methods.} 84 | 85 | \item{type}{character that indicates the type of plot that is requested. 86 | It can be: 87 | \describe{ 88 | \item{\code{"A"}}{plot the MCMC and the ergodic means NOT corrected 89 | for label switching;} 90 | \item{\code{"B"}}{plot the posterior mean and median of the id 91 | for each observation, after the chains are processed for label switching;} 92 | \item{\code{"C"}}{plot the estimated id distributions stratified by 93 | the groups specified in the class vector;} 94 | }} 95 | 96 | \item{class}{factor variable used to stratify observations according to 97 | their \code{id} estimates.} 98 | 99 | \item{object}{object of class \code{Hidalgo}, the output of the 100 | \code{Hidalgo()} function.} 101 | } 102 | \value{ 103 | object of class \code{Hidalgo}, which is a list containing 104 | \describe{ 105 | \item{\code{cluster_prob}}{chains of the posterior mixture weights;} 106 | \item{\code{membership_labels}}{chains of the membership labels for all 107 | the observations;} 108 | \item{\code{id_raw}}{chains of the \code{K} intrinsic dimension 109 | parameters, one per mixture component;} 110 | \item{\code{id_postpr}}{a chain for each observation, corrected for 111 | label switching;} 112 | \item{\code{id_summary}}{a matrix containing, for each observation, the 113 | value of the posterior mean and the 5\%, 25\%, 50\%, 75\%, 95\% quantiles;} 114 | \item{\code{recap}}{a list with the objects and specifications passed to 115 | the function used in the estimation.} 116 | } 117 | } 118 | \description{ 119 | The function fits the Heterogeneous intrinsic dimension algorithm, developed 120 | in Allegra et al., 2020. The model is a Bayesian mixture of Pareto 121 | distributions with a modified likelihood to induce homogeneity across 122 | neighboring observations. The model can segment the observations into 123 | multiple clusters characterized by different 124 | intrinsic dimensions. This makes it possible to capture hidden patterns in the data. 125 | For more details on the algorithm, refer to 126 | \href{https://www.nature.com/articles/s41598-020-72222-0}{Allegra et al., 2020}. 127 | For an example of application to basketball data, see 128 | \href{https://imstat.org/journals-and-publications/annals-of-applied-statistics/annals-of-applied-statistics-next-issues/}{Santos-Fernandez et al., 2021}.
129 | } 130 | \examples{ 131 | \donttest{ 132 | set.seed(1234) 133 | X <- replicate(5,rnorm(500)) 134 | X[1:250,1:2] <- 0 135 | X[1:250,] <- X[1:250,] + 4 136 | oracle <- rep(1:2,rep(250,2)) 137 | # this is just a short example 138 | # increase the number of iterations to improve mixing and convergence 139 | h_out <- Hidalgo(X, nsim = 500, burn_in = 500) 140 | plot(h_out, type = "B") 141 | id_by_class(h_out, oracle) 142 | } 143 | 144 | 145 | } 146 | \references{ 147 | Allegra M, Facco E, Denti F, Laio A, Mira A (2020). 148 | “Data segmentation based on the local intrinsic dimension.” 149 | Scientific Reports, 10(1), 1–27. ISSN 20452322, 150 | \doi{10.1038/s41598-020-72222-0}. 151 | 152 | Santos-Fernandez E, Denti F, Mengersen K, Mira A (2021). 153 | “The role of intrinsic dimension in high-resolution player tracking data – 154 | Insights in basketball.” Annals of Applied Statistics - Forthcoming. 155 | ISSN 2331-8422, 2002.04148, \doi{10.1038/s41598-022-20991-1} 156 | } 157 | \seealso{ 158 | \code{\link{id_by_class}} and \code{\link{clustering}} 159 | to understand how to further postprocess the results. 160 | } 161 | -------------------------------------------------------------------------------- /R/twonn_bayes.R: -------------------------------------------------------------------------------- 1 | #' Bayesian estimator for the \code{TWO-NN} model 2 | #' 3 | #' The function fits the \code{TWO-NN} model via Bayesian estimation, employing 4 | #' a conjugate prior. The formulas can be found in 5 | #' \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022}. 6 | #' 7 | #' @param mus vector of second to first NN distance ratios. 8 | #' @param a_d shape parameter of the Gamma prior on the parameter \code{d}. 9 | #' @param b_d rate parameter of the Gamma prior on the parameter \code{d}. 10 | #' @param alpha posterior probability contained in the computed credible 11 | #' interval. 12 | #' @param c_trimmed proportion of trimmed observations. 13 | #' 14 | #' @return object of class \code{twonn_bayes}, which is a list containing the 15 | #' (1 + \code{alpha}) / 2 and (1 - \code{alpha}) / 2 quantiles, mean, mode, and 16 | #' median of the posterior distribution of \code{d}. 17 | #' 18 | #' @seealso \code{\link{twonn}}, \code{\link{autoplot.twonn_bayes}} 19 | #' 20 | #' @keywords internal 21 | #' @noRd 22 | #' 23 | #' @references 24 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 25 | #' dimension estimator." 26 | #' Scientific Reports, 12(20005). 27 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}.
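#'
#' In brief, with a Gamma(\code{a_d}, \code{b_d}) prior on \code{d}, the posterior is
#' Gamma(\code{a_d} + n, \code{b_d} + sum(log(mus))). A minimal sketch of the update
#' (ignoring the trimming step; \code{compute_mus()} is the exported helper):
#'
#' mu    <- compute_mus(replicate(2, rnorm(500)))   # second-to-first NN ratios
#' shape <- 0.001 + length(mu)                      # posterior shape
#' rate  <- 0.001 + sum(log(mu))                    # posterior rate
#' c(mean  = shape / rate,
#'   lower = qgamma(0.025, shape, rate),
#'   upper = qgamma(0.975, shape, rate))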
28 | #' 29 | twonn_bayes <- function(mus, 30 | a_d = 0.001, 31 | b_d = 0.001, 32 | alpha = 0.95, 33 | c_trimmed = 0.01) { 34 | n <- n_original <- base::length(mus) 35 | c_considered <- 1 - c_trimmed 36 | 37 | n <- base::length(mus) 38 | 39 | if (c_trimmed) { 40 | mus <- sort(mus)[1:floor(n * c_considered)] 41 | n <- base::length(mus) 42 | } 43 | 44 | logmus <- base::log(mus) 45 | Res <- numeric(5) 46 | Res[2] <- (a_d + length(logmus)) / (b_d + sum(logmus)) 47 | Res[3] <- 48 | stats::qgamma(.5, shape = (a_d + length(logmus)), 49 | rate = (b_d + sum(logmus))) 50 | Res[4] <- (a_d + length(logmus) - 1) / (b_d + sum(logmus)) 51 | Res[c(1, 5)] <- 52 | stats::qgamma(c((1 - alpha) / 2, (1 + alpha) / 2), 53 | shape = (a_d + length(logmus)), 54 | rate = (b_d + sum(logmus))) 55 | Res <- list( 56 | est = Res, 57 | alpha = alpha, 58 | hp_prior = c(a_d, b_d), 59 | hp_posterior = c(a_d + length(logmus), b_d + sum(logmus)), 60 | c_trimmed = c_trimmed, 61 | n_original = n_original, 62 | n = n 63 | ) 64 | 65 | structure(Res, class = c("twonn_bayes", class(Res))) 66 | } 67 | 68 | 69 | #' @name twonn 70 | #' 71 | #' @param x object of class \code{twonn_bayes}, obtained from the function 72 | #' \code{twonn_bayes()}. 73 | #' @param ... ignored. 74 | #' 75 | #' 76 | #' @export 77 | print.twonn_bayes <- function(x, ...) { 78 | y <- c("TWONN - Bayes - Posterior Mean" = x[["est"]][2]) 79 | cat(y) 80 | cat("\n") 81 | invisible(x) 82 | } 83 | 84 | #' @name twonn 85 | #' 86 | #' @param object object of class \code{twonn_bayes}, obtained from the function 87 | #' \code{twonn_bayes()}. 88 | #' @param ... ignored. 89 | #' 90 | #' @export 91 | summary.twonn_bayes <- function(object, ...) { 92 | y <- c( 93 | `Original sample size` = object[["n_original"]], 94 | `Used sample size` = object[["n"]], 95 | `Trimming proportion` = object[["c_trimmed"]], 96 | 97 | `Prior shape` = object[["hp_prior"]][1], 98 | `Prior scale` = object[["hp_prior"]][2], 99 | 100 | `Credible interval level` = object[["alpha"]], 101 | 102 | `Lower bound` = object[["est"]][1], 103 | `Posterior mean` = object[["est"]][2], 104 | `Posterior median` = object[["est"]][3], 105 | `Posterior mode` = object[["est"]][4], 106 | `Upper bound` = object[["est"]][5] 107 | ) 108 | structure(y, class = c("summary.twonn_bayes","matrix")) 109 | } 110 | 111 | 112 | #' @name twonn 113 | #' 114 | #' @param x object of class \code{twonn_bayes}, obtained from the function 115 | #' \code{twonn_bayes()}. 116 | #' @param ... ignored. 117 | #' 118 | #' @export 119 | print.summary.twonn_bayes <- function(x, ...) { 120 | cat("Model: TWO-NN\n") 121 | cat("Method: Bayesian Estimation\n") 122 | cat(paste0( 123 | "Sample size: ", 124 | x[1], 125 | ", Obs. used: ", 126 | x[2], 127 | ". Trimming proportion: ", 128 | 100 * x[3], 129 | "%\n" 130 | )) 131 | cat(paste0("Prior d ~ Gamma(", 132 | x[4], 133 | ", ", 134 | x[5], 135 | ")\n")) 136 | cat(paste0( 137 | "Credible Interval quantiles: ", 138 | (1 - x[6]) / 2 * 100, 139 | "%, ", 140 | (1 + x[6]) / 2 * 100, 141 | "%\n" 142 | )) 143 | cat(paste0("Posterior ID estimates:")) 144 | y <- cbind( 145 | `Lower Bound` = x[7], 146 | `Mean` = x[8], 147 | `Median` = x[9], 148 | `Mode` = x[10], 149 | `Upper Bound` = x[11] 150 | ) 151 | rownames(y) <- NULL 152 | print(knitr::kable(y)) 153 | cat("\n") 154 | invisible(x) 155 | } 156 | 157 | 158 | 159 | #' @name twonn 160 | #' 161 | #' @param x object of class \code{twonn_bayes}, the output of the 162 | #' \code{twonn} function when \code{method = "bayes"}. 
163 | #' @param plot_low lower bound of the interval on which the posterior density 164 | #' is plotted. 165 | #' @param plot_upp upper bound of the interval on which the posterior density 166 | #' is plotted. 167 | #' @param by step-size at which the sequence spanning the interval is 168 | #' incremented. 169 | #' @param ... other arguments passed to specific methods. 170 | #' 171 | #' @export 172 | #' 173 | plot.twonn_bayes <- 174 | function(x, 175 | plot_low = 0.001, 176 | plot_upp = NULL, 177 | by = .05, 178 | ...) { 179 | 180 | if (is.null(plot_upp)) { 181 | plot_upp <- x$est[5] + 3 182 | } 183 | 184 | xx <- seq(plot_low, plot_upp, by = by) 185 | y0 <- stats::dgamma(xx, 186 | shape = x$hp_prior[1], 187 | rate = x$hp_prior[2]) 188 | y <- stats::dgamma(xx, 189 | shape = x$hp_posterior[1], 190 | rate = x$hp_posterior[2]) 191 | 192 | plot(y0~xx, type="l", col = 4, 193 | xlab=("Intrinsic Dimension"), 194 | ylab=("Posterior density"), ylim = c(0, max(c(y,y0)) )) 195 | lines(y~xx, type="l", col = 1) 196 | abline(v = x$est, 197 | lty = 2, 198 | col = 2) 199 | graphics::title("Bayesian TWO-NN") 200 | invisible() 201 | } 202 | 203 | 204 | -------------------------------------------------------------------------------- /R/hidalgo_postproc.R: -------------------------------------------------------------------------------- 1 | #' Stratification of the \code{id} by an external categorical variable 2 | #' 3 | #' The function computes summary statistics (mean, median, and standard 4 | #' deviation) of the post-processed chains of the intrinsic dimension stratified 5 | #' by an external categorical variable. 6 | #' 7 | #' @param object object of class \code{Hidalgo}, the output of the 8 | #' \code{Hidalgo()} function. 9 | #' @param class factor according to which the observations should be stratified. 10 | #' 11 | #' @return a \code{data.frame} containing the posterior \code{id} means, 12 | #' medians, and standard deviations stratified by the levels of the variable 13 | #' \code{class}. 14 | #' @export 15 | #' 16 | #' @seealso \code{\link{Hidalgo}} 17 | #' 18 | #' @examples 19 | #' \donttest{ 20 | #' X <- replicate(5,rnorm(500)) 21 | #' X[1:250,1:2] <- 0 22 | #' oracle <- rep(1:2,rep(250,2)) 23 | #' h_out <- Hidalgo(X) 24 | #' id_by_class(h_out,oracle) 25 | #' } 26 | #' 27 | id_by_class <- function(object, class) { 28 | 29 | if (class(object)[1] != "Hidalgo") { 30 | stop("object is not of class 'Hidalgo'", call. = FALSE) 31 | } 32 | 33 | class <- factor(class) 34 | REV <- object$id_summary 35 | 36 | means <- tapply(REV$MEAN, class, mean) 37 | medians <- tapply(REV$MEAN, class, stats::median) 38 | sds <- tapply(REV$MEAN, class, stats::sd) 39 | 40 | Res <- data.frame( 41 | class = levels(class), 42 | mean = means, 43 | median = medians, 44 | sd = sds 45 | ) 46 | 47 | structure(Res, class = c("hidalgo_class", class(Res))) 48 | } 49 | 50 | #' @name id_by_class 51 | #' 52 | #' @param x object of class \code{hidalgo_class}, the output of the \code{id_by_class()} function. 53 | #' @param ... other arguments passed to specific methods. 54 | #' 55 | #' @export 56 | #' 57 | print.hidalgo_class <- function(x, ... ){ 58 | cat("Posterior ID by class:") 59 | rownames(x) <- NULL 60 | print(knitr::kable(x)) 61 | cat("\n") 62 | invisible(x) 63 | } 64 | 65 | #' Posterior similarity matrix and partition estimation 66 | #' 67 | #' The function computes the posterior similarity (coclustering) matrix (psm) 68 | #' and estimates a representative partition of the observations from the MCMC 69 | #' output.
The user can provide the desired number of clusters or estimate a 70 | #' optimal clustering solution by minimizing a loss function on the space 71 | #' of the partitions. 72 | #' In the latter case, the function uses the package \code{salso} 73 | #' (\href{https://cran.r-project.org/package=salso}{Dahl et al., 2021}), 74 | #' that the user needs to load. 75 | #' 76 | #' @param object object of class \code{Hidalgo}, the output of the 77 | #' \code{Hidalgo} function. 78 | #' @param clustering_method character indicating the method to use to perform 79 | #' clustering. It can be 80 | #' \describe{ 81 | #' \item{"dendrogram"}{thresholding the adjacency dendrogram with a given 82 | #' number (\code{K});} 83 | #' \item{"salso"}{estimation via minimization of several partition 84 | #' estimation criteria. 85 | #' The default loss function is the variation of information.} 86 | #' } 87 | #' @param K number of clusters to recover by thresholding the 88 | #' dendrogram obtained from the psm. 89 | #' @param nCores parameter for the \code{salso} function: the number of CPU 90 | #' cores to use. A value of zero indicates to use all cores on the system. 91 | #' @param ... optional additional parameter to pass to \code{salso()}. 92 | #' 93 | #' @return list containing the posterior similarity matrix (\code{psm}) and 94 | #' the estimated partition \code{clust}. 95 | #' @export 96 | #' 97 | #' @seealso \code{\link{Hidalgo}}, \code{\link[salso]{salso}} 98 | #' 99 | #' @references 100 | #' D. B. Dahl, D. J. Johnson, and P. Müller (2022), 101 | #' "Search Algorithms and Loss Functions for Bayesian Clustering", 102 | #' Journal of Computational and Graphical Statistics, 103 | #' \doi{10.1080/10618600.2022.2069779}. 104 | #' 105 | #' David B. Dahl, Devin J. Johnson and Peter Müller (2022). "salso: Search 106 | #' Algorithms and Loss Functions for Bayesian Clustering". 107 | #' R package version 108 | #' 0.3.0. \url{https://CRAN.R-project.org/package=salso} 109 | #' 110 | #' @examples 111 | #' \donttest{ 112 | #' library(salso) 113 | #' X <- replicate(5,rnorm(500)) 114 | #' X[1:250,1:2] <- 0 115 | #' h_out <- Hidalgo(X) 116 | #' clustering(h_out) 117 | #' } 118 | clustering <- function(object, 119 | clustering_method = c("dendrogram", "salso"), 120 | K = 2, 121 | nCores = 1, 122 | ...) { 123 | 124 | if (class(object)[1] != "Hidalgo") { 125 | stop("object is not of class 'Hidalgo'", call. = FALSE) 126 | } 127 | 128 | clustering_method <- match.arg(clustering_method) 129 | 130 | psm <- salso::psm(object$membership_labels, 131 | nCores = nCores) 132 | 133 | clust <- switch(clustering_method, 134 | "dendrogram" = { 135 | dendr <- stats::hclust(stats::as.dist(1 - psm)) 136 | stats::cutree(dendr, k = K) 137 | }, 138 | "salso" = { 139 | salso::salso(object$membership_labels, ...) 140 | }) 141 | 142 | Res <- 143 | list( 144 | clust = factor(clust), 145 | psm = psm, 146 | chosen_method = clustering_method, 147 | K = length(unique(clust)) 148 | ) 149 | structure(Res, class = c("hidalgo_psm", class(Res))) 150 | } 151 | 152 | 153 | #' @name clustering 154 | #' 155 | #' @param x object of class \code{hidalgo_psm}, obtained from the function 156 | #' \code{clustering()}. 157 | #' @param ... ignored. 158 | #' 159 | #' @export 160 | print.hidalgo_psm <- function(x, ...) 
{ 161 | cat("Estimated clustering solution summary:\n\n") 162 | cat(paste0("Method: ", x$chosen_method, "\n")) 163 | 164 | if (x$chosen_method == "dendrogram") { 165 | cat(paste0("Retrieved clusters: ", x$K, "\n")) 166 | } else{ 167 | cat(paste0("Retrieved clusters: ", length(unique(x$clust)), "\n")) 168 | } 169 | 170 | cat("Clustering frequencies:") 171 | tab <- t(table(Cluster = x$clust)) 172 | colnames(tab) <- paste("Cluster", colnames(tab)) 173 | print(knitr::kable(tab)) 174 | cat("\n") 175 | invisible(x) 176 | } 177 | 178 | #' @name clustering 179 | #' 180 | #' 181 | #' @param x object of class \code{hidalgo_psm}, obtained from the function 182 | #' \code{clustering()}. 183 | #' @param ... ignored. 184 | #' 185 | #' @export 186 | #' 187 | plot.hidalgo_psm <- function(x, ...) { 188 | 189 | psm <- x$psm 190 | stats::heatmap(psm) 191 | invisible() 192 | } 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /R/autoplot_twonn.R: -------------------------------------------------------------------------------- 1 | #' Plot the output of the \code{TWO-NN} model estimated via least squares 2 | #' 3 | #' Use this method without the \code{.twonn_linfit} suffix. 4 | #' The function returns the representation of the linear 5 | #' regression that is fitted with the \code{linfit} method. 6 | #' 7 | #' @param object object of class \code{twonn_linfit}, the output of the 8 | #' \code{twonn} function when \code{method = "linfit"}. 9 | #' @param title string used as title of the plot. 10 | #' @param ... other arguments passed to specific methods. 11 | #' 12 | #' @rdname autoplot.twonn_linfit 13 | #' 14 | #' @seealso \code{\link{twonn}} 15 | #' 16 | #' @return a \code{\link[ggplot2]{ggplot2}} object displaying the goodness of 17 | #' the linear fit of the TWO-NN model. 18 | #' 19 | #' @family autoplot methods 20 | #' 21 | #' @export 22 | autoplot.twonn_linfit <- function(object, 23 | title = "TWO-NN Linear Fit", 24 | ...) { 25 | lmod <- object$lm_output 26 | x <- lmod$model$x 27 | y <- lmod$model$y 28 | Res <- object$est 29 | 30 | p1 <- ggplot2::ggplot()+ 31 | ggplot2::geom_point(ggplot2::aes(x = x, y = y)) + 32 | ggplot2::theme_bw() + 33 | ggplot2::geom_abline( 34 | intercept = 0, 35 | slope = lmod$coefficients, 36 | col = I("red") 37 | ) + 38 | ggplot2::ylab("-log(1-(i/N))") + 39 | ggplot2::xlab(expression(log(mu))) + 40 | ggplot2::annotate( 41 | "label", 42 | -Inf, 43 | Inf, 44 | label = paste("ID:", round(Res[2], 3)), 45 | hjust = -0.05, 46 | vjust = 1.1 47 | ) + 48 | ggplot2::annotate( 49 | "label", 50 | Inf, 51 | -Inf, 52 | label = paste("R^2:", round(summary(lmod)$r.squared, 3)), 53 | parse = TRUE, 54 | hjust = 1.05, 55 | vjust = -0.1 56 | ) + 57 | ggplot2::theme( 58 | axis.title.x = ggplot2::element_text(size = 15), 59 | axis.title.y = ggplot2::element_text(size = 15), 60 | title = ggplot2::element_text(size = 15) 61 | ) 62 | 63 | p1 + ggplot2::ggtitle(title) 64 | } 65 | 66 | 67 | #' Plot the output of the \code{TWO-NN} model estimated via the Bayesian 68 | #' approach 69 | #' 70 | #' Use this method without the \code{.twonn_bayes} suffix. 71 | #' The function returns the density plot of the 72 | #' posterior distribution computed with the \code{bayes} method. 73 | #' 74 | #' @param object object of class \code{twonn_bayes}, the output of the 75 | #' \code{twonn} function when \code{method = "bayes"}. 76 | #' @param title character string used as title of the plot. 
77 | #' @param plot_low lower bound of the interval on which the posterior density 78 | #' is plotted. 79 | #' @param plot_upp upper bound of the interval on which the posterior density 80 | #' is plotted. 81 | #' @param by step-size at which the sequence spanning the interval is 82 | #' incremented. 83 | #' @param ... other arguments passed to specific methods. 84 | #' 85 | #' @rdname autoplot.twonn_bayes 86 | #' 87 | #' @seealso \code{\link{twonn}} 88 | #' 89 | #' @return \code{\link[ggplot2]{ggplot2}} object displaying the posterior 90 | #' distribution of the intrinsic dimension parameter. 91 | #' 92 | #' @family autoplot methods 93 | #' 94 | #' @export 95 | autoplot.twonn_bayes <- 96 | function(object, 97 | plot_low = 0, 98 | plot_upp = NULL, 99 | by = .05, 100 | title = "Bayesian TWO-NN", 101 | ...) { 102 | if (is.null(plot_upp)) { 103 | plot_upp <- object$est[5] + 3 104 | } 105 | 106 | x <- seq(plot_low, plot_upp, by = by) 107 | y0 <- stats::dgamma(x, 108 | shape = object$hp_prior[1], 109 | rate = object$hp_prior[2]) 110 | y <- stats::dgamma(x, 111 | shape = object$hp_posterior[1], 112 | rate = object$hp_posterior[2]) 113 | G1 <- ggplot2::ggplot(dplyr::tibble(x, y)) + 114 | ggplot2::geom_line(ggplot2::aes(x = x, y = y0), col = 4) + 115 | ggplot2::geom_line(ggplot2::aes(x = x, y = y)) + 116 | ggplot2::xlab("Intrinsic Dimension") + 117 | ggplot2::ylab("Posterior density") + 118 | ggplot2::theme_bw() + 119 | ggplot2::geom_vline(xintercept = object$est, 120 | lty = 2, 121 | col = 2) + 122 | ggplot2::theme( 123 | axis.title.x = ggplot2::element_text(size = 20), 124 | axis.title.y = ggplot2::element_text(size = 20), 125 | title = ggplot2::element_text(size = 20) 126 | ) 127 | G1 + ggplot2::ggtitle(title) 128 | } 129 | 130 | 131 | #' Plot the output of the \code{TWO-NN} model estimated via the Maximum 132 | #' Likelihood approach 133 | #' 134 | #' Use this method without the \code{.twonn_mle} suffix. 135 | #' The function returns the point estimate along with the confidence bands 136 | #' computed via the \code{mle} method. 137 | #' 138 | #' @param object object of class \code{twonn_mle}, the output of the 139 | #' \code{twonn} function when \code{method = "mle"}. 140 | #' @param title character string used as title of the plot. 141 | #' @param ... other arguments passed to specific methods. 142 | #' 143 | #' @rdname autoplot.twonn_mle 144 | #' 145 | #' @seealso \code{\link{twonn}} 146 | #' 147 | #' @return \code{\link[ggplot2]{ggplot2}} object displaying the point estimate 148 | #' and confidence interval obtained via the maximum likelihood approach of the 149 | #' \code{id} parameter. 150 | #' 151 | #' @family autoplot methods 152 | #' 153 | #' @export 154 | autoplot.twonn_mle <- 155 | function(object, 156 | title = "MLE TWO-NN", 157 | ...) 
{ 158 | D <- data.frame(x = object$est, y = 0) 159 | G1 <- ggplot2::ggplot() + 160 | ggplot2::geom_segment(ggplot2::aes( 161 | x = D[1, 1], 162 | xend = D[3, 1], 163 | y = D[1, 2], 164 | yend = D[3, 2] 165 | )) + 166 | ggplot2::geom_vline( 167 | xintercept = D[, 1], 168 | col = "gray", 169 | lty = 2, 170 | size = .5 171 | ) + 172 | ggplot2::geom_point(ggplot2::aes(x = D[c(1), 1], y = D[c(1), 2]), size = 173 | 10, pch = "[") + 174 | ggplot2::geom_point(ggplot2::aes(x = D[c(3), 1], y = D[c(3), 2]), size = 175 | 10, pch = "]") + 176 | ggplot2::geom_point(ggplot2::aes(x = D[c(2), 1], y = D[c(2), 2]), size = 177 | 10) + 178 | ggplot2::xlab("Maximum Likelihood Estimation") + 179 | ggplot2::theme_bw() + 180 | ggplot2::theme( 181 | axis.title.y = ggplot2::element_blank(), 182 | axis.text.y = ggplot2::element_blank(), 183 | axis.ticks.y = ggplot2::element_blank(), 184 | axis.title.x = ggplot2::element_text(size = 20), 185 | title = ggplot2::element_text(size = 20) 186 | ) 187 | G1 + ggplot2::ggtitle(title) 188 | } 189 | 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /R/gride_evolution.R: -------------------------------------------------------------------------------- 1 | #' \code{Gride} evolution based on Maximum Likelihood Estimation 2 | #' 3 | #' The function allows the study of the evolution of the \code{id} estimates 4 | #' as a function of the scale of a dataset. A scale-dependent analysis 5 | #' is essential to identify the correct number of relevant directions in noisy 6 | #' data. To increase the average distance from the second NN (and thus the 7 | #' average neighborhood size) involved in the estimation, the function computes 8 | #' a sequence of \code{Gride} models with increasing NN orders, \code{n1} and 9 | #' \code{n2}. 10 | #' See also \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 11 | #' for more details. 12 | #' 13 | #' @param X data matrix with \code{n} observations and \code{D} variables. 14 | #' @param vec_n1 vector of integers, containing the smaller NN orders considered 15 | #' in the evolution. 16 | #' @param vec_n2 vector of integers, containing the larger NN orders considered 17 | #' in the evolution. 18 | #' @param upp_bound upper bound for the interval used in the numerical 19 | #' optimization (via \code{optimize}). Default is set to 50. 20 | #' 21 | #' @return list containing the Gride evolution, the corresponding NN distance 22 | #' ratios, the average n2-th NN order distances, and the NN orders considered. 23 | #' 24 | #' @name gride_evolution 25 | #' 26 | #' @export 27 | #' 28 | #' @references 29 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 30 | #' dimension estimator." 31 | #' Scientific Reports, 12(20005). 32 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 33 | #' 34 | #' @examples 35 | #' \donttest{ 36 | #' X <- replicate(5,rnorm(10000,0,.1)) 37 | #' gride_evolution(X = X,vec_n1 = 2^(0:5),vec_n2 = 2^(1:6)) 38 | #'} 39 | #' 40 | gride_evolution <- function(X, vec_n1, vec_n2, upp_bound = 50) { 41 | if (any(vec_n2 < vec_n1)) { 42 | stop("at least one n2 is lower than one n1", call. = FALSE) 43 | } 44 | if (length(vec_n1) != length(vec_n2)) { 45 | stop("the considered NN orders in n1 and n2 do not match", call. 
= FALSE) 46 | } 47 | 48 | X <- as.matrix(X) 49 | D <- ncol(X) 50 | n <- n0 <- nrow(X) 51 | check <- 0 52 | check <- duplicated(X) 53 | 54 | if (sum(check) > 0) { 55 | X <- X[-which(check),] 56 | n <- nrow(X) 57 | warning( 58 | paste0( 59 | "\n Duplicates are present and will be removed. 60 | \n Original sample size: ", 61 | n0, 62 | ". New sample size: ", 63 | n, 64 | "." 65 | ), 66 | call. = FALSE 67 | ) 68 | } 69 | 70 | 71 | 72 | W <- length(vec_n1) 73 | MUdots <- matrix(NA, n, W) 74 | K <- FNN::get.knn(X, k = max(vec_n2)) 75 | path <- numeric(W) 76 | avg_distance_n2 <- numeric(W) 77 | 78 | for (w in 1:W) { 79 | MUdots[, w] <- (K$nn.dist[, vec_n2[w]]) / 80 | (K$nn.dist[, vec_n1[w]]) 81 | 82 | avg_distance_n2[w] <- mean(K$nn.dist[, vec_n2[w]]) 83 | path[w] <- stats::optimize( 84 | gride_log_likelihood, 85 | interval = c(0.01, min(D, upp_bound) + 1), 86 | n1 = vec_n1[w], 87 | n2 = vec_n2[w], 88 | mus_n1_n2 = MUdots[, w], 89 | maximum = TRUE 90 | )$maximum 91 | } 92 | 93 | res <- list( 94 | path = path, 95 | MUdots = MUdots, 96 | NNorders = rbind(vec_n1, vec_n2), 97 | avg_distance_n2 = avg_distance_n2 98 | ) 99 | 100 | structure(res, class = c("gride_evolution", class(res))) 101 | 102 | } 103 | 104 | 105 | #' @name gride_evolution 106 | #' 107 | #' @param x object of class \code{gride_evolution}, obtained from the function 108 | #' \code{gride_evolution()}. 109 | #' @param ... ignored. 110 | #' 111 | #' @return the function prints a summary of the Gride evolution to 112 | #' the console. 113 | #' 114 | #' @export 115 | print.gride_evolution <- function(x, ...) { 116 | cat(paste0("Model: Gride evolution\n")) 117 | cat(paste0("Smaller NN order ranging from ", 118 | min(x[["NNorders"]][1, ]), 119 | " to ", 120 | max(x[["NNorders"]][1, ]), "\n")) 121 | cat(paste0("Larger NN order ranging from ", 122 | min(x[["NNorders"]][2, ]), 123 | " to ", 124 | max(x[["NNorders"]][2, ]), "\n")) 125 | cat(paste0( 126 | "Average distance from the n2-th NN ranging from ", 127 | round(min(x[["avg_distance_n2"]]), 4), 128 | " to ", 129 | round(max(x[["avg_distance_n2"]]), 4), 130 | "\n" 131 | )) 132 | invisible(x) 133 | } 134 | 135 | 136 | 137 | #' @name gride_evolution 138 | #' 139 | #' @param x an object of class \code{gride_evolution}. 140 | #' 141 | #' @param ... other arguments passed to specific methods. 142 | #' 143 | #' @export 144 | #' 145 | plot.gride_evolution <- function(x, 146 | ...) { 147 | id <- x$path 148 | avg_n2 <- x$avg_distance_n2 149 | 150 | plot(id~avg_n2,type = "b", col = "darkblue",log = "x", 151 | ylab = ("Intrinsic dimension"), 152 | xlab = Log[10] ~ average ~ n[2] ~ distance) 153 | graphics::title("Gride Evolution") 154 | invisible() 155 | } 156 | 157 | #' Plot the evolution of \code{Gride} estimates 158 | #' 159 | #' Use this method without the \code{.gride_evolution} suffix. 160 | #' It plots the evolution of the \code{id} 161 | #' estimates as a function of the average distance from the furthest NN of 162 | #' each point. 163 | #' 164 | #' @param object an object of class \code{gride_evolution}. 165 | #' @param title an optional string to customize the title of the plot. 166 | #' @param ... other arguments passed to specific methods. 167 | #' 168 | #' @rdname autoplot.gride_evolution 169 | #' 170 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It displays the 171 | #' evolution of the Gride maximum likelihood estimates as a function 172 | #' of the average distance from \code{n2}.
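#'
#' A minimal usage sketch (assuming the package is loaded so that the re-exported
#' \code{autoplot()} generic dispatches on \code{gride_evolution} objects):
#'
#' X   <- replicate(5, rnorm(2000, 0, 0.1))
#' evo <- gride_evolution(X, vec_n1 = 2^(0:4), vec_n2 = 2^(1:5))
#' autoplot(evo, title = "Gride evolution, 5-d Gaussian noise")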
173 | #' 174 | #' @export 175 | #' 176 | autoplot.gride_evolution <- function(object, 177 | title = "Gride Evolution", 178 | ...) { 179 | D <- data.frame(id = object$path, 180 | n2 = object$avg_distance_n2) 181 | G1 <- ggplot2::ggplot(D) + 182 | ggplot2::geom_path(ggplot2::aes(x = .data$n2, 183 | y = .data$id), 184 | col = "darkblue") + 185 | ggplot2::geom_point(ggplot2::aes(x = .data$n2, 186 | y = .data$id), 187 | col = "darkblue") + 188 | ggplot2::ylab("Intrinsic dimension") + 189 | ggplot2::theme_bw() + 190 | ggplot2::scale_x_continuous(trans = "log10") + 191 | ggplot2::xlab(Log[10] ~ average ~ n[2] ~ distance) + 192 | ggplot2::theme( 193 | axis.title.x = ggplot2::element_text(size = 20), 194 | axis.title.y = ggplot2::element_text(size = 20), 195 | title = ggplot2::element_text(size = 20) 196 | ) + 197 | ggplot2::ggtitle(title) 198 | 199 | return(G1) 200 | } 201 | -------------------------------------------------------------------------------- /R/gride_bayes.R: -------------------------------------------------------------------------------- 1 | #' Generalized Ratios ID Estimation via the Bayesian approach 2 | #' 3 | #' The function fits the Bayesian Gride model. To run this function, use the 4 | #' high-level \code{gride()} and specify \code{method = "bayes"}. The function 5 | #' runs a Metropolis-Hastings (MH) algorithm over \code{log(d)}, adopting a 6 | #' Normal distribution with pre-specified standard deviation \code{sigma} as 7 | #' the proposal distribution. 8 | #' See \href{https://www.nature.com/articles/s41598-022-20991-1}{Denti et al., 2022} 9 | #' for more details. 10 | #' 11 | #' @param mus_n1_n2 vector of generalized order NN distance ratios. 12 | #' @param n1 order of the first NN considered. Default is 1. 13 | #' @param n2 order of the second NN considered. Default is 2. 14 | #' @param nsim number of MCMC iterations to collect in the sample. 15 | #' @param burn_in number of iterations to discard from the MCMC sample. 16 | #' @param sigma standard deviation of the Gaussian proposal used in the MH step. 17 | #' @param start_d initial value for the MCMC chain. If \code{NULL}, the MLE is 18 | #' computed. 19 | #' @param a_d shape parameter of the Gamma prior distribution for \code{d}. 20 | #' @param b_d rate parameter of the Gamma prior distribution for \code{d}. 21 | #' @param alpha posterior probability contained in the computed credible interval. 22 | #' @param upper_D upper bound for the id parameter, needed if \code{start_d} 23 | #' is not provided to initiate the chain with the MLE estimate. 24 | #' 25 | #' @return a list containing the MCMC sample and the summary of the 26 | #' specified arguments. 27 | #' 28 | #' @keywords internal 29 | #' @noRd 30 | #' 31 | #' @references 32 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 33 | #' dimension estimator." 34 | #' Scientific Reports, 12(20005). 35 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 36 | #' 37 | #' @seealso \code{\link{gride}} 38 | #' 39 | gride_bayes <- function(mus_n1_n2 = NULL, 40 | n1 = 1, 41 | n2 = 2, 42 | nsim = 5000, 43 | burn_in = 2000, 44 | sigma = 0.5, 45 | start_d = NULL, 46 | a_d = 1, 47 | b_d = 1, 48 | alpha = 0.95, 49 | upper_D = NULL) { 50 | 51 | 52 | if (n2 < n1) { 53 | stop("n2 should be greater than n1", call.
= FALSE) 54 | } 55 | 56 | 57 | if (is.null(start_d)) { 58 | start_d <- gride_mle_point( 59 | mus_n1_n2 = mus_n1_n2, 60 | n1 = n1, 61 | n2 = n2, 62 | upper_D = upper_D 63 | ) 64 | start_d <- log(start_d - 1) 65 | } else{ 66 | if (start_d <= 1) { 67 | stop("Invalid starting point: it has to be > 1", call. = FALSE) 68 | } 69 | start_d <- log(start_d - 1) 70 | } 71 | 72 | post_sample <- gride_mh_sampler( 73 | start_d = start_d, 74 | n1 = n1, 75 | n2 = n2, 76 | a_d = a_d, 77 | b_d = b_d, 78 | mus_n1_n2 = mus_n1_n2, 79 | nsim = nsim, 80 | burn_in = burn_in, 81 | sigma = sigma 82 | ) 83 | 84 | sam <- (exp(post_sample) + 1) 85 | est <- numeric(5) 86 | est[2] <- mean(sam) 87 | est[3] <- stats::median(sam) 88 | dens <- stats::density(sam) 89 | est[4] <- dens$x[which.max(dens$y)] 90 | est[c(1, 5)] <- stats::quantile(sam, c((1 - alpha) / 2, (1 + alpha) / 91 | 2)) 92 | Res <- list( 93 | est = est, 94 | post_sample = sam, 95 | hp_prior = c(a_d, b_d), 96 | alpha = alpha, 97 | n1 = n1, 98 | n2 = n2, 99 | nsim = nsim, 100 | sigma = sigma 101 | ) 102 | structure(Res, class = c("gride_bayes", class(Res))) 103 | } 104 | 105 | 106 | 107 | #' @name gride 108 | #' 109 | #' @param x object of class \code{gride_bayes}, obtained from the function 110 | #' \code{gride_bayes()}. 111 | #' @param ... ignored. 112 | #' 113 | #' 114 | #' @export 115 | print.gride_bayes <- function(x, ...) { 116 | cat(paste0("Gride(",x[["n1"]],",",x[["n2"]],") - Bayes - Posterior Mean\n")) 117 | cat(x[["est"]][2]) 118 | cat("\n") 119 | invisible(x) 120 | } 121 | 122 | #' @name gride 123 | #' 124 | #' @param object object of class \code{gride_bayes}, obtained from the function 125 | #' \code{gride_bayes()}. 126 | #' @param ... ignored. 127 | #' 128 | #' @export 129 | summary.gride_bayes <- function(object, ...) { 130 | y <- c( 131 | `MCMC Iterations` = object[["nsim"]], 132 | 133 | 134 | `Prior shape` = object[["hp_prior"]][1], 135 | `Prior scale` = object[["hp_prior"]][2], 136 | 137 | `Credible interval level` = object[["alpha"]], 138 | 139 | `n1` = object[["n1"]], 140 | `n2` = object[["n2"]], 141 | 142 | `Lower bound` = object[["est"]][1], 143 | `Posterior mean` = object[["est"]][2], 144 | `Posterior median` = object[["est"]][3], 145 | `Posterior mode` = object[["est"]][4], 146 | `Upper bound` = object[["est"]][5] 147 | ) 148 | structure(y, class = c("summary.gride_bayes","matrix")) 149 | } 150 | 151 | 152 | #' @name gride 153 | #' 154 | #' @param x object of class \code{gride_bayes()}, obtained from the function 155 | #' \code{gride_bayes()}. 156 | #' @param ... ignored. 157 | #' 158 | #' 159 | #' @export 160 | print.summary.gride_bayes <- function(x, ...) { 161 | cat(paste0("Model: Gride(", x[5], ",", x[6], ")\n")) 162 | cat("Method: Bayesian Estimation\n") 163 | cat(paste0("Prior d ~ Gamma(", 164 | x[2], 165 | ", ", 166 | x[3], 167 | ")\n")) 168 | cat(paste0("MCMC posterior sample size: ", x[1], "\n")) 169 | cat(paste0( 170 | "Credible Interval quantiles: ", 171 | (1 - x[4]) / 2 * 100, 172 | "%, ", 173 | (1 + x[4]) / 2 * 100, 174 | "%\n" 175 | )) 176 | 177 | cat(paste0("Posterior ID estimates:")) 178 | y <- cbind( 179 | `Lower Bound` = x[7], 180 | `Mean` = x[8], 181 | `Median` = x[9], 182 | `Mode` = x[10], 183 | `Upper Bound` = x[11] 184 | ) 185 | rownames(y) <- NULL 186 | print(knitr::kable(y)) 187 | cat("\n") 188 | invisible(x) 189 | } 190 | 191 | 192 | 193 | #' @name gride 194 | #' 195 | #' @param x object of class \code{gride_bayes}. 
196 | #' It is obtained using the output of the \code{gride} function when 197 | #' \code{method = "bayes"}. 198 | #' 199 | #' @export 200 | #' 201 | plot.gride_bayes <- function(x, 202 | ...) { 203 | 204 | on.exit({par(my_par)}, add = TRUE, after = TRUE) 205 | my_par <- par(mfrow = c(2, 1)) 206 | 207 | ID <- c(x$post_sample) 208 | cmm <- cumsum(ID) / seq_along(ID) 209 | 210 | plot(ID ,col="gray",type="l", xlab = "MCMC Iteration") 211 | lines(stats::ts(cmm), col = "darkblue",lwd=1.3) 212 | graphics::title("Bayesian Gride: Traceplot") 213 | 214 | 215 | dx <- density(ID) 216 | plot(dx, xlab = "Intrinsic Dimension" , ylab = "Simulated Posterior Density", 217 | col="darkblue",lwd=1.3, main = "") 218 | polygon(c(dx$x), c(dx$y), 219 | col = "lightgray", border = "darkblue", main = "") 220 | graphics::title("Bayesian Gride: Posterior Density") 221 | 222 | invisible() 223 | } 224 | 225 | -------------------------------------------------------------------------------- /R/compute_mus.R: -------------------------------------------------------------------------------- 1 | #' Compute the ratio statistics needed for the intrinsic dimension estimation 2 | #' 3 | #' The function \code{compute_mus} computes the ratios of distances between 4 | #' nearest neighbors (NNs) of generic order, denoted as 5 | #' \code{mu(n_1,n_2)}. 6 | #' This quantity is at the core of all the likelihood-based methods contained 7 | #' in the package. 8 | #' 9 | #' @param X a dataset with \code{n} observations and \code{D} variables. 10 | #' @param dist_mat a distance matrix computed between \code{n} observations. 11 | #' @param n1 order of the first NN considered. Default is 1. 12 | #' @param n2 order of the second NN considered. Default is 2. 13 | #' @param Nq logical indicator. If \code{TRUE}, it provides the \code{N^q} 14 | #' matrix needed for fitting the Hidalgo model. 15 | #' @param q integer, number of NN considered to build \code{N^q}. 16 | #' 17 | #' @references 18 | #' Facco E, D'Errico M, Rodriguez A, Laio A (2017). "Estimating the intrinsic 19 | #' dimension of datasets by a minimal neighborhood information." 20 | #' Scientific Reports, 7(1). 21 | #' ISSN 20452322, \doi{10.1038/s41598-017-11873-y}. 22 | #' 23 | #' Denti F, Doimo D, Laio A, Mira A (2022). "The generalized ratios intrinsic 24 | #' dimension estimator." 25 | #' Scientific Reports, 12(20005). 26 | #' ISSN 20452322, \doi{10.1038/s41598-022-20991-1}. 27 | #' 28 | #' @return the principal output of this function is a vector containing the 29 | #' ratio statistics, an object of class \code{mus}. The length of the vector is 30 | #' equal to the number of observations considered, unless ties are present in 31 | #' the dataset. In that case, the duplicates are removed. Optionally, if 32 | #' \code{Nq} is \code{TRUE}, the function returns an object of class 33 | #' \code{mus_Nq}, a list containing both the ratio statistics \code{mus} and the 34 | #' adjacency matrix \code{NQ}. 35 | # 36 | #' @export 37 | #' 38 | #' 39 | #' @examples 40 | #' X <- replicate(2,rnorm(1000)) 41 | #' mu <- compute_mus(X, n1 = 1, n2 = 2) 42 | #' mudots <- compute_mus(X, n1 = 4, n2 = 8) 43 | #' pre_hidalgo <- compute_mus(X, n1 = 4, n2 = 8, Nq = TRUE, q = 3) 44 | compute_mus <- function(X = NULL, 45 | dist_mat = NULL, 46 | n1 = 1, 47 | n2 = 2, 48 | Nq = FALSE, 49 | q = 3) { 50 | if (n2 < n1) { 51 | stop("n2 is lower than n1", call. = FALSE) 52 | } 53 | 54 | if (is.null(dist_mat) & is.null(X)) { 55 | stop("Please provide either a dataset X or a distance matrix", 56 | call. 
= FALSE) 57 | } 58 | 59 | D <- NULL 60 | 61 | ##################################################### 62 | # data, no dist_mat 63 | if (is.null(dist_mat)) { 64 | if(any(is.na(X))){ 65 | stop("There are missing values in the provided dataset. 66 | Please remove the problematic observations and try again.", 67 | call. = FALSE) 68 | } 69 | 70 | X <- as.matrix(X) 71 | D <- ncol(X) 72 | n <- n0 <- nrow(X) 73 | check <- 0 74 | check <- duplicated(X) 75 | 76 | # remove duplicates 77 | if (sum(check) > 0) { 78 | X <- X[-which(check),] 79 | n <- nrow(X) 80 | warning( 81 | paste0( 82 | "\n Duplicates are present and will be removed. 83 | \n Original sample size: ", 84 | n0, 85 | ". New sample size: ", 86 | n, 87 | "." 88 | ), 89 | call. = FALSE 90 | ) 91 | } 92 | 93 | 94 | 95 | 96 | if (!Nq) { 97 | K <- FNN::get.knn(X, k = n2) 98 | mus <- K$nn.dist[, n2] / K$nn.dist[, n1] 99 | 100 | attr(mus, "upper_D") <- D 101 | attr(mus, "n1") <- n1 102 | attr(mus, "n2") <- n2 103 | structure(mus, class = c("mus", class(mus))) 104 | 105 | } else{ 106 | K <- FNN::get.knn(X, k = max(n2, q)) 107 | mus <- K$nn.dist[, n2] / K$nn.dist[, n1] 108 | NQ <- matrix(0, n, n) 109 | for (h in 1:n) { 110 | NQ[h, K$nn.index[h,]] <- 1 111 | } 112 | attr(mus, "upper_D") <- D 113 | attr(mus, "n1") <- n1 114 | attr(mus, "n2") <- n2 115 | mus <- structure(mus, class = c("mus", class(mus))) 116 | 117 | mus_nq <- list(mus = mus, NQ = NQ) 118 | attr(mus_nq, "upper_D") <- D 119 | attr(mus_nq, "n1") <- n1 120 | attr(mus_nq, "n2") <- n2 121 | attr(mus_nq, "q") <- q 122 | structure(mus_nq, class = c("mus_Nq", class(mus_nq))) 123 | } 124 | 125 | 126 | ################################# 127 | } else { 128 | # dist_mat is passed 129 | ################################# 130 | # checks on dist_mat 131 | 132 | # if it is of class dist, then transform it into a matrix 133 | if(inherits(dist_mat,"dist") | inherits(dist_mat,"dissimilarity")){ 134 | dist_mat <- as.matrix(dist_mat) 135 | } 136 | 137 | ## does it contain non-negative distances? 138 | if(!all(dist_mat>=0)){ 139 | stop("Negative distances detected in dist_mat. Please provide a valid distance matrix", 140 | call. = FALSE) 141 | } 142 | # is it symmetric? 143 | if(!isSymmetric(dist_mat)){ 144 | stop("The provided distance matrix is not symmetric. 145 | Please provide a valid distance matrix", 146 | call. = FALSE) 147 | } 148 | # NA? 149 | if(any(is.na(dist_mat))){ 150 | stop("There are missing values in the provided distance matrix. 151 | Please remove the problematic observations and try again.", 152 | call. = FALSE) 153 | } 154 | 155 | n <- n0 <- nrow(dist_mat) 156 | dummy <- dist_mat 157 | dummy[lower.tri(dummy, diag = TRUE)] <- -1 158 | inds <- unique(which(dummy == 0, arr.ind = TRUE)[, 2]) 159 | 160 | if (length(inds) > 0) { 161 | dist_mat <- dist_mat[-inds, -inds] 162 | n <- nrow(dist_mat) 163 | 164 | warning( 165 | paste0("\nDuplicates are present and will be removed.\n", 166 | "Original sample size: ", n0,". New sample size: ", 167 | n,"." 
168 | ) 169 | ) 170 | } 171 | 172 | sDistMat <- apply(dist_mat, 1, function(x) 173 | sort(x, index = TRUE)) 174 | mus <- unlist(lapply(sDistMat, 175 | function(z) 176 | z$x[n2 + 1] / z$x[n1 + 1])) 177 | if (!Nq) { 178 | 179 | attr(mus, "n1") <- n1 180 | attr(mus, "n2") <- n2 181 | structure(mus, class = c("mus", class(mus))) 182 | 183 | }else{ 184 | 185 | NQ <- matrix(0, n, n) 186 | for (h in 1:n) { 187 | NQ[h, (sDistMat[[h]]$ix)[2:(q+1)]] <- 1 188 | } 189 | 190 | attr(mus, "n1") <- n1 191 | attr(mus, "n2") <- n2 192 | mus <- structure(mus, class = c("mus", class(mus))) 193 | mus_nq <- list(mus = mus, NQ = NQ) 194 | attr(mus_nq, "n1") <- n1 195 | attr(mus_nq, "n2") <- n2 196 | attr(mus_nq, "q") <- q 197 | structure(mus_nq, class = c("mus_Nq", class(mus_nq))) 198 | 199 | } 200 | } 201 | 202 | } 203 | 204 | 205 | 206 | #' @name compute_mus 207 | #' 208 | #' @param x object of class \code{mus}, obtained from the 209 | #' function \code{compute_mus()}. 210 | #' @param ... ignored. 211 | #' 212 | #' @export 213 | print.mus <- function(x, ...) { 214 | 215 | if(is.list(x)){ #check if the mus object contains Nq 216 | nn <- length(x[[1]]) 217 | }else{ 218 | nn <- length(x) 219 | } 220 | 221 | cat("Ratio statistics mu's\n") 222 | cat(paste0("NN orders: n1 = ", attr(x, "n1"), ", n2 = ", 223 | attr(x, "n2"), "\n")) 224 | cat(paste0("Sample size: ", nn, "\n")) 225 | if (!is.null(attr(x, "upper_D"))) { 226 | cat(paste0("Nominal Dimension: ", attr(x, "upper_D"), "\n")) 227 | } 228 | invisible(x) 229 | } 230 | 231 | 232 | #' @name compute_mus 233 | #' 234 | #' @param x object of class \code{mus_Nq}, obtained from the 235 | #' function \code{compute_mus()}. 236 | #' @param ... ignored. 237 | #' 238 | #' @export 239 | print.mus_Nq <- function(x, ...) { 240 | 241 | if(is.list(x)){ #check if the mus object contains Nq 242 | nn <- length(x[[1]]) 243 | }else{ 244 | nn <- length(x) 245 | } 246 | 247 | cat("List containing the ratio statistics mu's and the Nq adjacency matrix\n") 248 | 249 | cat(paste0("NN orders: n1 = ", attr(x, "n1"), ", n2 = ", 250 | attr(x, "n2"), "\n")) 251 | cat(paste0("Sample size: ", nn, "\n")) 252 | if (!is.null(attr(x, "upper_D"))) { 253 | cat(paste0("Nominal Dimension: ", attr(x, "upper_D"), "\n")) 254 | } 255 | cat(paste0("NNs considered: q = ", attr(x, "q"), "\n")) 256 | 257 | invisible(x) 258 | } 259 | 260 | 261 | #' @name compute_mus 262 | #' 263 | #' @importFrom graphics hist curve 264 | #' 265 | #' @param x object of class \code{mus}, obtained from the 266 | #' function \code{compute_mus()}. 267 | #' @param range_d a sequence of values for which the generalized ratios density 268 | #' is superimposed to the histogram of \code{mus}. 269 | #' @param ... ignored. 270 | #' 271 | #' @export 272 | plot.mus <- function(x, range_d = NULL, ...) 
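# Draws a histogram of the observed ratio statistics and, when range_d is
# supplied, overlays the theoretical generalized-ratio density dgera() for
# each candidate intrinsic dimension in range_d, adding a matching legend.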
{ 273 | 274 | 275 | n1 <- attr(x, "n1") 276 | n2 <- attr(x, "n2") 277 | hist(x, 278 | breaks = 30, 279 | freq = F, 280 | col = "white", 281 | main = "", xlab = bquote(mu)) 282 | 283 | if(!is.null(range_d)){ 284 | 285 | for (i in seq_along(range_d)) { 286 | y <- dgera(sort(x), 287 | n1 = n1, 288 | n2 = n2, 289 | d = range_d[i]) 290 | lines(y ~ sort(x), 291 | col = range_d[i] - min(range_d) + 1, 292 | lwd = 2, 293 | lty = 3) 294 | } 295 | 296 | legend( 297 | "topright", 298 | legend = range_d, 299 | col = (range_d) - min(range_d) + 1, 300 | lty = 3, 301 | lwd = 2 302 | ) 303 | } 304 | invisible() 305 | } 306 | 307 | 308 | -------------------------------------------------------------------------------- /R/autoplot_hidalgo_aux.R: -------------------------------------------------------------------------------- 1 | #' Plot the raw traceplots of the \code{id} parameters 2 | #' 3 | #' The function produces the traceplots of the parameters 4 | #' \code{d_k}, for \code{k=1...K}. The ergodic means for all the chains 5 | #' are superimposed. 6 | #' The \code{K} chains that are plotted are not post-processed. 7 | #' Ergo, they are subjected to label switching. 8 | #' 9 | #' @param object object of class \code{Hidalgo}, the output of the 10 | #' \code{Hidalgo()} function. 11 | #' @param ... other arguments passed to specific methods. 12 | #' 13 | #' @importFrom rlang .data 14 | #' 15 | #' @seealso \code{\link{autoplot.Hidalgo}} 16 | #' @rdname autoplot.gride_bayes 17 | #' 18 | #' @return object of class \code{\link[ggplot2]{ggplot}}, which displays the 19 | #' chains of the \code{id} parameters sampled from the mixture model. 20 | #' @keywords Internal 21 | #' @noRd 22 | #' 23 | ggHid_chains <- function(object, ...) { 24 | ID <- object$id_raw 25 | cmm <- (apply(ID, 2, function(x) 26 | cumsum(x) / seq_along(x))) 27 | D <- reshape2::melt(ID) 28 | D1 <- reshape2::melt(cmm) 29 | 30 | ggplot2::ggplot() + 31 | ggplot2::geom_line( 32 | data = D, 33 | ggplot2::aes( 34 | x = .data$Var1, 35 | y = .data$value, 36 | group = .data$Var2 37 | ), 38 | col = "gray", 39 | alpha = .2 40 | ) + 41 | ggplot2::theme_bw() + 42 | ggplot2::ylab("Raw MCMC - Intrinsic Dimension") + 43 | ggplot2::xlab("MCMC Iteration") + 44 | ggplot2::geom_line( 45 | data = D1, 46 | ggplot2::aes( 47 | x = .data$Var1, 48 | y = .data$value, 49 | group = .data$Var2, 50 | col = factor(.data$Var2) 51 | ), 52 | alpha = 1, 53 | lwd = 1 54 | ) + 55 | ggplot2::theme( 56 | axis.title.x = ggplot2::element_text(size = 20), 57 | axis.title.y = ggplot2::element_text(size = 20), 58 | title = ggplot2::element_text(size = 20), 59 | legend.position = "none" 60 | ) 61 | } 62 | 63 | 64 | #' Plot a summary of the distributions of re-arranged chains 65 | #' 66 | #' The function produces two panels, reporting the means (left) and the medians 67 | #' (right) of the processed chains. Each observation is mapped to its own 68 | #' intrinsic dimension value assumed at each iteration \code{t} of the MCMC, 69 | #' denoted as \code{d(t,z_i)}. The 90% credible intervals are also depicted 70 | #' with gray lines. 71 | #' 72 | #' @param object object of class \code{Hidalgo}, the output of the 73 | #' \code{Hidalgo()} function. 74 | #' @param ... other arguments passed to specific methods. 75 | #' @seealso \code{\link{autoplot.Hidalgo}} 76 | #' 77 | #' 78 | #' @return object of class \code{\link[ggplot2]{ggplot}}. 
It displays two 79 | #' scatterplots containing the posterior mean and median \code{id} for each 80 | #' observation, once the MCMC has been postprocessed to handle label 81 | #' switching. 82 | #' 83 | #' @keywords internal 84 | #' @noRd 85 | #' 86 | ggHid_mean_median <- function(object, ...) { 87 | a <- object$id_summary 88 | data <- rbind( 89 | data.frame( 90 | x = a$OBS, 91 | low = a$Q.05, 92 | est = a$MEAN, 93 | upp = a$Q.95, 94 | type = "Mean" 95 | ), 96 | data.frame( 97 | x = a$OBS, 98 | low = a$Q.05, 99 | est = a$MEDIAN, 100 | upp = a$Q.95, 101 | type = "Median" 102 | ) 103 | ) 104 | ggplot2::ggplot(data = data) + 105 | ggplot2::geom_segment( 106 | ggplot2::aes( 107 | x = .data$x, 108 | xend = .data$x, 109 | y = .data$low, 110 | yend = .data$upp 111 | ), 112 | col = "gray", 113 | alpha = .4 114 | ) + 115 | ggplot2::theme_bw() + 116 | ggplot2::theme(text = ggplot2::element_text(size = 20), 117 | legend.position = "none") + 118 | ggplot2::geom_point( 119 | ggplot2::aes(x = .data$x, y = .data$est), 120 | col = "darkblue", 121 | alpha = 1, 122 | pch = 21 123 | ) + 124 | ggplot2::facet_wrap( ~ type) + 125 | ggplot2::xlab("Observation") + 126 | ggplot2::ylab("Intrinsic Dimension") 127 | } 128 | 129 | #' Plot posterior \code{id} for each observation stratified by external factor 130 | #' 131 | #' The function produces different plots to investigate the relationship between 132 | #' the posterior estimates of the \code{id} and an external, categorical 133 | #' variable \code{class}. 134 | #' 135 | #' @param object object of class \code{Hidalgo}, the output of the 136 | #' \code{Hidalgo()} function. 137 | #' @param class factor variable used to stratify observations according to their 138 | #' \code{id} estimates. 139 | #' @param class_plot_type a string indicating the preferred type of plot used 140 | #' for the comparison. It can be: 141 | #' \describe{ 142 | #' \item{\code{"histogram"} or \code{"density"}}{which produces overlapping 143 | #' plots of the stratified distributions} 144 | #' \item{\code{"boxplot"} or \code{"violin"}}{which produces side-to-side 145 | #' boxplots or violin plots} 146 | #' } 147 | #' @param ... other arguments passed to specific methods. 148 | #' 149 | #' @seealso \code{\link{autoplot.Hidalgo}} 150 | #' @keywords internal 151 | #' @noRd 152 | #' 153 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It can be used to 154 | #' visually study the relation between the posterior \code{id} estimates and an 155 | #' exogenous, categorical variable. 156 | #' The type of plot varies according to the specification of 157 | #' \code{class_plot_type}, and it can be either a set of boxplots or 158 | #' violin plots, or a collection of overlapping densities or histograms. 159 | #' 160 | ggHid_class <- function(object, 161 | class, 162 | class_plot_type = c("histogram", "density", 163 | "boxplot", "violin"), 164 | ...) 
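# Validates class_plot_type via match.arg(), stacks the posterior mean and
# median estimates into a single data frame labelled by estimate type, and
# draws the requested comparison (overlapping histograms/densities or
# side-by-side boxplots/violins), faceted by Mean/Median.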
{ 165 | class_plot_type <- match.arg(class_plot_type) 166 | D <- object$id_summary 167 | D <- rbind( 168 | data.frame( 169 | Class = as.factor(class), 170 | est = D$MEAN, 171 | type = "Mean" 172 | ), 173 | data.frame( 174 | Class = as.factor(class), 175 | est = D$MEDIAN, 176 | type = "Median" 177 | ) 178 | ) 179 | 180 | if (class_plot_type == "histogram") { 181 | p1 <- ggplot2::ggplot() + 182 | ggplot2::geom_histogram( 183 | data = D, 184 | ggplot2::aes(x = .data$est, 185 | fill = .data$Class), 186 | position = "identity", 187 | col = 1, 188 | alpha = .5, 189 | bins = 25 190 | ) 191 | } else if (class_plot_type == "density") { 192 | p1 <- ggplot2::ggplot() + 193 | ggplot2::geom_density( 194 | data = D, 195 | ggplot2::aes(x = .data$est, 196 | fill = .data$Class), 197 | position = "identity", 198 | col = 1, 199 | alpha = .5 200 | ) 201 | } else if (class_plot_type == "boxplot") { 202 | p1 <- ggplot2::ggplot() + 203 | ggplot2::geom_boxplot( 204 | data = D, 205 | ggplot2::aes( 206 | x = .data$est, 207 | y = .data$Class, 208 | fill = .data$Class 209 | ), 210 | col = 1 211 | ) 212 | } else if (class_plot_type == "violin") { 213 | p1 <- ggplot2::ggplot() + 214 | ggplot2::geom_violin( 215 | data = D, 216 | ggplot2::aes( 217 | x = .data$est, 218 | y = .data$Class, 219 | fill = .data$Class 220 | ), 221 | col = 1 222 | ) 223 | } 224 | 225 | p1 + 226 | ggplot2::theme_bw() + 227 | ggplot2::theme(text = ggplot2::element_text(size = 20), 228 | legend.position = "none") + 229 | ggplot2::facet_wrap( ~ type) + 230 | ggplot2::xlab("ID posterior estimate") + 231 | ggplot2::ylab("") 232 | 233 | } 234 | 235 | #' Plot the posterior similarity matrix 236 | #' 237 | #' The function produces a heatmap of the posterior similarity (coclustering) 238 | #' matrix (psm) computed from the MCMC output of the function \code{Hidalgo()}. 239 | #' Rows and columns can be organized according to a clustering solution or an 240 | #' exogenous categorical variable. 241 | #' 242 | #' @param object object of class \code{Hidalgo}, the output of the 243 | #' \code{Hidalgo()} function. 244 | #' @param psm posterior similarity matrix that can be provided directly to the 245 | #' function. 246 | #' @param class factor variable used to order the observations according to 247 | #' their \code{id} estimates. 248 | #' @param ... other arguments passed to specific methods. 249 | #' 250 | #' @keywords internal 251 | #' @noRd 252 | #' 253 | #' @return object of class \code{\link[ggplot2]{ggplot}}. It is a heatmap 254 | #' representing the psm, which allows to study the clustering structure present 255 | #' in the data estimated via the mixture model. 256 | #' 257 | #' @seealso \code{\link{autoplot.Hidalgo}} 258 | #' 259 | ggHid_psm <- function(object, 260 | psm = NULL, 261 | class = NULL, 262 | ...) 
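# If no psm is supplied, it is computed from the Hidalgo fit via clustering().
# Rows and columns are then ordered either by the optional class factor or by
# a hierarchical clustering of 1 - psm, and the matrix is rendered as a
# geom_tile() heatmap of posterior coclustering probabilities.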
{ 263 | if (is.null(psm)) { 264 | psm <- clustering(object)$psm 265 | } 266 | 267 | n <- nrow(psm) 268 | 269 | if (!is.null(class)) { 270 | ind <- sort(as.numeric(class),index=T)$ix 271 | D <- data.frame(reshape2::melt(psm[ind,ind])) 272 | 273 | Q1 <- ggplot2::ggplot(D) + 274 | ggplot2::geom_tile(ggplot2::aes( 275 | x = .data$Var2, 276 | y = .data$Var1, 277 | fill = .data$value 278 | )) 279 | 280 | } else { 281 | cl <- stats::hclust(stats::as.dist(1-psm)) 282 | 283 | D <- data.frame(reshape2::melt(psm[cl$order,cl$order])) 284 | 285 | Q1 <- ggplot2::ggplot(D) + 286 | ggplot2::geom_tile(ggplot2::aes( 287 | x = (.data$Var1), 288 | y = (.data$Var2), 289 | fill = .data$value 290 | )) 291 | } 292 | Q <- Q1 + 293 | ggplot2::theme_bw() + 294 | ggplot2::theme( 295 | text = ggplot2::element_text(size = 20), 296 | legend.position = "bottom", 297 | legend.margin = ggplot2::margin(), 298 | legend.key.width = ggplot2::unit(2, "cm") 299 | ) + 300 | ggplot2::xlab("") + 301 | ggplot2::ylab("") + 302 | ggplot2::scale_fill_gradient("PCP ", 303 | low = "white", high = 4) 304 | 305 | 306 | return(Q) 307 | } 308 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include <RcppArmadillo.h> 5 | #include <Rcpp.h> 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // log_Likelihood_double 15 | double log_Likelihood_double(double mu_obs, double d); 16 | RcppExport SEXP _intRinsic_log_Likelihood_double(SEXP mu_obsSEXP, SEXP dSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< double >::type mu_obs(mu_obsSEXP); 21 | Rcpp::traits::input_parameter< double >::type d(dSEXP); 22 | rcpp_result_gen = Rcpp::wrap(log_Likelihood_double(mu_obs, d)); 23 | return rcpp_result_gen; 24 | END_RCPP 25 | } 26 | // Groups_quantities 27 | arma::mat Groups_quantities(arma::colvec mu_obser, arma::vec Ci, int K); 28 | RcppExport SEXP _intRinsic_Groups_quantities(SEXP mu_obserSEXP, SEXP CiSEXP, SEXP KSEXP) { 29 | BEGIN_RCPP 30 | Rcpp::RObject rcpp_result_gen; 31 | Rcpp::RNGScope rcpp_rngScope_gen; 32 | Rcpp::traits::input_parameter< arma::colvec >::type mu_obser(mu_obserSEXP); 33 | Rcpp::traits::input_parameter< arma::vec >::type Ci(CiSEXP); 34 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 35 | rcpp_result_gen = Rcpp::wrap(Groups_quantities(mu_obser, Ci, K)); 36 | return rcpp_result_gen; 37 | END_RCPP 38 | } 39 | // Norm_Constant_Z_l2 40 | double Norm_Constant_Z_l2(int Nzi_l, int N, double xi, int q); 41 | RcppExport SEXP _intRinsic_Norm_Constant_Z_l2(SEXP Nzi_lSEXP, SEXP NSEXP, SEXP xiSEXP, SEXP qSEXP) { 42 | BEGIN_RCPP 43 | Rcpp::RObject rcpp_result_gen; 44 | Rcpp::RNGScope rcpp_rngScope_gen; 45 | Rcpp::traits::input_parameter< int >::type Nzi_l(Nzi_lSEXP); 46 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 47 | Rcpp::traits::input_parameter< double >::type xi(xiSEXP); 48 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 49 | rcpp_result_gen = Rcpp::wrap(Norm_Constant_Z_l2(Nzi_l, N, xi, q)); 50 | return rcpp_result_gen; 51 | END_RCPP 52 | } 53 | // log_Zeta_maker 54 | arma::vec log_Zeta_maker(int N, double xi, int q); 55 | RcppExport
SEXP _intRinsic_log_Zeta_maker(SEXP NSEXP, SEXP xiSEXP, SEXP qSEXP) { 56 | BEGIN_RCPP 57 | Rcpp::RObject rcpp_result_gen; 58 | Rcpp::RNGScope rcpp_rngScope_gen; 59 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 60 | Rcpp::traits::input_parameter< double >::type xi(xiSEXP); 61 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 62 | rcpp_result_gen = Rcpp::wrap(log_Zeta_maker(N, xi, q)); 63 | return rcpp_result_gen; 64 | END_RCPP 65 | } 66 | // index_row_col 67 | Rcpp::List index_row_col(arma::mat Nq, int q, int N); 68 | RcppExport SEXP _intRinsic_index_row_col(SEXP NqSEXP, SEXP qSEXP, SEXP NSEXP) { 69 | BEGIN_RCPP 70 | Rcpp::RObject rcpp_result_gen; 71 | Rcpp::RNGScope rcpp_rngScope_gen; 72 | Rcpp::traits::input_parameter< arma::mat >::type Nq(NqSEXP); 73 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 74 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 75 | rcpp_result_gen = Rcpp::wrap(index_row_col(Nq, q, N)); 76 | return rcpp_result_gen; 77 | END_RCPP 78 | } 79 | // rdir_cpp 80 | arma::colvec rdir_cpp(arma::colvec alpha); 81 | RcppExport SEXP _intRinsic_rdir_cpp(SEXP alphaSEXP) { 82 | BEGIN_RCPP 83 | Rcpp::RObject rcpp_result_gen; 84 | Rcpp::RNGScope rcpp_rngScope_gen; 85 | Rcpp::traits::input_parameter< arma::colvec >::type alpha(alphaSEXP); 86 | rcpp_result_gen = Rcpp::wrap(rdir_cpp(alpha)); 87 | return rcpp_result_gen; 88 | END_RCPP 89 | } 90 | // Update_memberships_faster 91 | arma::colvec Update_memberships_faster(arma::colvec mu_obser, arma::colvec dl, arma::colvec pl, int K, int N, int q, arma::colvec possible_label, arma::colvec Ci, double QQ, arma::umat index_row, Rcpp::List index_col, arma::colvec log_Precomp_Z, arma::colvec log_Precomp_ratios); 92 | RcppExport SEXP _intRinsic_Update_memberships_faster(SEXP mu_obserSEXP, SEXP dlSEXP, SEXP plSEXP, SEXP KSEXP, SEXP NSEXP, SEXP qSEXP, SEXP possible_labelSEXP, SEXP CiSEXP, SEXP QQSEXP, SEXP index_rowSEXP, SEXP index_colSEXP, SEXP log_Precomp_ZSEXP, SEXP log_Precomp_ratiosSEXP) { 93 | BEGIN_RCPP 94 | Rcpp::RObject rcpp_result_gen; 95 | Rcpp::RNGScope rcpp_rngScope_gen; 96 | Rcpp::traits::input_parameter< arma::colvec >::type mu_obser(mu_obserSEXP); 97 | Rcpp::traits::input_parameter< arma::colvec >::type dl(dlSEXP); 98 | Rcpp::traits::input_parameter< arma::colvec >::type pl(plSEXP); 99 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 100 | Rcpp::traits::input_parameter< int >::type N(NSEXP); 101 | Rcpp::traits::input_parameter< int >::type q(qSEXP); 102 | Rcpp::traits::input_parameter< arma::colvec >::type possible_label(possible_labelSEXP); 103 | Rcpp::traits::input_parameter< arma::colvec >::type Ci(CiSEXP); 104 | Rcpp::traits::input_parameter< double >::type QQ(QQSEXP); 105 | Rcpp::traits::input_parameter< arma::umat >::type index_row(index_rowSEXP); 106 | Rcpp::traits::input_parameter< Rcpp::List >::type index_col(index_colSEXP); 107 | Rcpp::traits::input_parameter< arma::colvec >::type log_Precomp_Z(log_Precomp_ZSEXP); 108 | Rcpp::traits::input_parameter< arma::colvec >::type log_Precomp_ratios(log_Precomp_ratiosSEXP); 109 | rcpp_result_gen = Rcpp::wrap(Update_memberships_faster(mu_obser, dl, pl, K, N, q, possible_label, Ci, QQ, index_row, index_col, log_Precomp_Z, log_Precomp_ratios)); 110 | return rcpp_result_gen; 111 | END_RCPP 112 | } 113 | // gam_trunc 114 | arma::colvec gam_trunc(int D, int K, double a0_d, double b0_d, arma::colvec n_l, arma::colvec sLog); 115 | RcppExport SEXP _intRinsic_gam_trunc(SEXP DSEXP, SEXP KSEXP, SEXP a0_dSEXP, SEXP b0_dSEXP, SEXP n_lSEXP, SEXP sLogSEXP) { 116 | 
BEGIN_RCPP 117 | Rcpp::RObject rcpp_result_gen; 118 | Rcpp::RNGScope rcpp_rngScope_gen; 119 | Rcpp::traits::input_parameter< int >::type D(DSEXP); 120 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 121 | Rcpp::traits::input_parameter< double >::type a0_d(a0_dSEXP); 122 | Rcpp::traits::input_parameter< double >::type b0_d(b0_dSEXP); 123 | Rcpp::traits::input_parameter< arma::colvec >::type n_l(n_lSEXP); 124 | Rcpp::traits::input_parameter< arma::colvec >::type sLog(sLogSEXP); 125 | rcpp_result_gen = Rcpp::wrap(gam_trunc(D, K, a0_d, b0_d, n_l, sLog)); 126 | return rcpp_result_gen; 127 | END_RCPP 128 | } 129 | // gam_trunc_pmass 130 | arma::colvec gam_trunc_pmass(int D, int K, double a0_d, double b0_d, arma::colvec n_l, arma::colvec sLog, double pi_mass); 131 | RcppExport SEXP _intRinsic_gam_trunc_pmass(SEXP DSEXP, SEXP KSEXP, SEXP a0_dSEXP, SEXP b0_dSEXP, SEXP n_lSEXP, SEXP sLogSEXP, SEXP pi_massSEXP) { 132 | BEGIN_RCPP 133 | Rcpp::RObject rcpp_result_gen; 134 | Rcpp::RNGScope rcpp_rngScope_gen; 135 | Rcpp::traits::input_parameter< int >::type D(DSEXP); 136 | Rcpp::traits::input_parameter< int >::type K(KSEXP); 137 | Rcpp::traits::input_parameter< double >::type a0_d(a0_dSEXP); 138 | Rcpp::traits::input_parameter< double >::type b0_d(b0_dSEXP); 139 | Rcpp::traits::input_parameter< arma::colvec >::type n_l(n_lSEXP); 140 | Rcpp::traits::input_parameter< arma::colvec >::type sLog(sLogSEXP); 141 | Rcpp::traits::input_parameter< double >::type pi_mass(pi_massSEXP); 142 | rcpp_result_gen = Rcpp::wrap(gam_trunc_pmass(D, K, a0_d, b0_d, n_l, sLog, pi_mass)); 143 | return rcpp_result_gen; 144 | END_RCPP 145 | } 146 | // gride_log_likelihood 147 | double gride_log_likelihood(double d, int n1, int n2, arma::colvec mus_n1_n2); 148 | RcppExport SEXP _intRinsic_gride_log_likelihood(SEXP dSEXP, SEXP n1SEXP, SEXP n2SEXP, SEXP mus_n1_n2SEXP) { 149 | BEGIN_RCPP 150 | Rcpp::RObject rcpp_result_gen; 151 | Rcpp::RNGScope rcpp_rngScope_gen; 152 | Rcpp::traits::input_parameter< double >::type d(dSEXP); 153 | Rcpp::traits::input_parameter< int >::type n1(n1SEXP); 154 | Rcpp::traits::input_parameter< int >::type n2(n2SEXP); 155 | Rcpp::traits::input_parameter< arma::colvec >::type mus_n1_n2(mus_n1_n2SEXP); 156 | rcpp_result_gen = Rcpp::wrap(gride_log_likelihood(d, n1, n2, mus_n1_n2)); 157 | return rcpp_result_gen; 158 | END_RCPP 159 | } 160 | // gride_log_posterior 161 | double gride_log_posterior(double z, int n1, int n2, double a_d, double b_d, arma::colvec mus_n1_n2); 162 | RcppExport SEXP _intRinsic_gride_log_posterior(SEXP zSEXP, SEXP n1SEXP, SEXP n2SEXP, SEXP a_dSEXP, SEXP b_dSEXP, SEXP mus_n1_n2SEXP) { 163 | BEGIN_RCPP 164 | Rcpp::RObject rcpp_result_gen; 165 | Rcpp::RNGScope rcpp_rngScope_gen; 166 | Rcpp::traits::input_parameter< double >::type z(zSEXP); 167 | Rcpp::traits::input_parameter< int >::type n1(n1SEXP); 168 | Rcpp::traits::input_parameter< int >::type n2(n2SEXP); 169 | Rcpp::traits::input_parameter< double >::type a_d(a_dSEXP); 170 | Rcpp::traits::input_parameter< double >::type b_d(b_dSEXP); 171 | Rcpp::traits::input_parameter< arma::colvec >::type mus_n1_n2(mus_n1_n2SEXP); 172 | rcpp_result_gen = Rcpp::wrap(gride_log_posterior(z, n1, n2, a_d, b_d, mus_n1_n2)); 173 | return rcpp_result_gen; 174 | END_RCPP 175 | } 176 | // gride_mh_sampler 177 | arma::colvec gride_mh_sampler(double start_d, int n1, int n2, double a_d, double b_d, arma::colvec mus_n1_n2, int nsim, int burn_in, double sigma); 178 | RcppExport SEXP _intRinsic_gride_mh_sampler(SEXP start_dSEXP, SEXP n1SEXP, SEXP n2SEXP, SEXP 
a_dSEXP, SEXP b_dSEXP, SEXP mus_n1_n2SEXP, SEXP nsimSEXP, SEXP burn_inSEXP, SEXP sigmaSEXP) { 179 | BEGIN_RCPP 180 | Rcpp::RObject rcpp_result_gen; 181 | Rcpp::RNGScope rcpp_rngScope_gen; 182 | Rcpp::traits::input_parameter< double >::type start_d(start_dSEXP); 183 | Rcpp::traits::input_parameter< int >::type n1(n1SEXP); 184 | Rcpp::traits::input_parameter< int >::type n2(n2SEXP); 185 | Rcpp::traits::input_parameter< double >::type a_d(a_dSEXP); 186 | Rcpp::traits::input_parameter< double >::type b_d(b_dSEXP); 187 | Rcpp::traits::input_parameter< arma::colvec >::type mus_n1_n2(mus_n1_n2SEXP); 188 | Rcpp::traits::input_parameter< int >::type nsim(nsimSEXP); 189 | Rcpp::traits::input_parameter< int >::type burn_in(burn_inSEXP); 190 | Rcpp::traits::input_parameter< double >::type sigma(sigmaSEXP); 191 | rcpp_result_gen = Rcpp::wrap(gride_mh_sampler(start_d, n1, n2, a_d, b_d, mus_n1_n2, nsim, burn_in, sigma)); 192 | return rcpp_result_gen; 193 | END_RCPP 194 | } 195 | 196 | static const R_CallMethodDef CallEntries[] = { 197 | {"_intRinsic_log_Likelihood_double", (DL_FUNC) &_intRinsic_log_Likelihood_double, 2}, 198 | {"_intRinsic_Groups_quantities", (DL_FUNC) &_intRinsic_Groups_quantities, 3}, 199 | {"_intRinsic_Norm_Constant_Z_l2", (DL_FUNC) &_intRinsic_Norm_Constant_Z_l2, 4}, 200 | {"_intRinsic_log_Zeta_maker", (DL_FUNC) &_intRinsic_log_Zeta_maker, 3}, 201 | {"_intRinsic_index_row_col", (DL_FUNC) &_intRinsic_index_row_col, 3}, 202 | {"_intRinsic_rdir_cpp", (DL_FUNC) &_intRinsic_rdir_cpp, 1}, 203 | {"_intRinsic_Update_memberships_faster", (DL_FUNC) &_intRinsic_Update_memberships_faster, 13}, 204 | {"_intRinsic_gam_trunc", (DL_FUNC) &_intRinsic_gam_trunc, 6}, 205 | {"_intRinsic_gam_trunc_pmass", (DL_FUNC) &_intRinsic_gam_trunc_pmass, 7}, 206 | {"_intRinsic_gride_log_likelihood", (DL_FUNC) &_intRinsic_gride_log_likelihood, 4}, 207 | {"_intRinsic_gride_log_posterior", (DL_FUNC) &_intRinsic_gride_log_posterior, 6}, 208 | {"_intRinsic_gride_mh_sampler", (DL_FUNC) &_intRinsic_gride_mh_sampler, 9}, 209 | {NULL, NULL, 0} 210 | }; 211 | 212 | RcppExport void R_init_intRinsic(DllInfo *dll) { 213 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 214 | R_useDynamicSymbols(dll, FALSE); 215 | } 216 | --------------------------------------------------------------------------------
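The registered routines above (in particular gride_mh_sampler) drive the Bayesian Gride estimator whose R-level wrapper and S3 methods appear earlier in this dump. The following is a minimal usage sketch, not taken from the package sources: it assumes the exported gride() interface accepts method = "bayes" (as stated in its documentation above) and forwards the NN orders n1/n2 together with the MCMC settings nsim and burn_in used by gride_bayes(); those last argument names are illustrative assumptions.

library(intRinsic)
set.seed(1234)
X  <- replicate(5, rnorm(500))        # synthetic data with nominal dimension 5
mu <- compute_mus(X, n1 = 1, n2 = 2)  # ratio statistics used by the estimators
plot(mu, range_d = 2:6)               # histogram with generalized-ratio overlays
fit <- gride(X, n1 = 1, n2 = 2, method = "bayes",
             nsim = 5000, burn_in = 2000)
summary(fit)                          # posterior mean, median, mode, and credible bounds
plot(fit)                             # traceplot and posterior density (plot.gride_bayes)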