├── .Rbuildignore ├── LICENSE ├── .gitignore ├── NAMESPACE ├── clusterpval.Rproj ├── man ├── norm_vec.Rd ├── TNProbEachInt.Rd ├── isSameIntervals.Rd ├── TNProb.Rd ├── preserve_cl.Rd ├── same_cl.Rd ├── is_integer_between_a_b.Rd ├── magicfun.Rd ├── sortE.Rd ├── finiteE.Rd ├── solve_one_ineq.Rd ├── compute_S_ward.Rd ├── compute_S_single.Rd ├── compute_S_average.Rd ├── compute_S_centroid.Rd ├── compute_S_median.Rd ├── TNRatioApprox.Rd ├── compute_S_mcquitty.Rd ├── compute_S_single_gencov.Rd ├── compute_S_ward_gencov.Rd ├── compute_S_average_gencov.Rd ├── compute_S_centroid_gencov.Rd ├── compute_S_median_gencov.Rd ├── compute_S_mcquitty_gencov.Rd ├── TChisqRatioApprox.Rd ├── TNSurv.Rd ├── rect_hier_clusters.Rd ├── test_clusters_approx.Rd ├── test_complete_hier_clusters_approx.Rd └── test_hier_clusters_exact.Rd ├── README.md ├── DESCRIPTION └── R ├── util.R ├── helper_cutree.R ├── trunc_dist.R ├── .Rhistory └── trunc_inf.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Lucy L. 
Gao 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(TChisqRatioApprox) 4 | export(rect_hier_clusters) 5 | export(test_clusters_approx) 6 | export(test_complete_hier_clusters_approx) 7 | export(test_hier_clusters_exact) 8 | -------------------------------------------------------------------------------- /clusterpval.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/norm_vec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{norm_vec} 4 | \alias{norm_vec} 5 | \title{Takes the l2-norm of a vector.} 6 | \usage{ 7 | norm_vec(x) 8 | } 9 | \arguments{ 10 | \item{x}{the vector to be normed} 11 | } 12 | \value{ 13 | Returns the l2-norm of x. 14 | } 15 | \description{ 16 | Takes the l2-norm of a vector. 
17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # clusterpval: Inference for Estimated Clusters in R 2 | 3 | See [http://lucylgao.com/clusterpval](http://lucylgao.com/clusterpval) for tutorials and examples. See [https://arxiv.org/abs/2012.02936](https://arxiv.org/abs/2012.02936) for the preprint. 4 | 5 | Install 6 | ----- 7 | 8 | Make sure that ``devtools`` is installed by running ``install.packages("devtools")``, then type 9 | 10 | ```R 11 | devtools::install_github("lucylgao/clusterpval") 12 | ``` 13 | 14 | -------------------------------------------------------------------------------- /man/TNProbEachInt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNProbEachInt} 4 | \alias{TNProbEachInt} 5 | \title{Probability of a standard normal in a single interval} 6 | \usage{ 7 | TNProbEachInt(lo, up) 8 | } 9 | \arguments{ 10 | \item{lo, up}{quantiles.} 11 | } 12 | \value{ 13 | This function returns the desired probability. 14 | } 15 | \description{ 16 | This function returns \eqn{P(lo \le Z \le up)}, where \eqn{Z ~ N(0, 1)}. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/isSameIntervals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{isSameIntervals} 4 | \alias{isSameIntervals} 5 | \title{Comparison between two intervals} 6 | \usage{ 7 | isSameIntervals(int1, int2) 8 | } 9 | \arguments{ 10 | \item{int1, int2}{"Intervals" objects.} 11 | } 12 | \value{ 13 | This function returns the desired logical result. 
14 | } 15 | \description{ 16 | This function returns \code{TRUE} if and only if two intervals are the same. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/TNProb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNProb} 4 | \alias{TNProb} 5 | \title{Probability of a standard normal in a union of intervals} 6 | \usage{ 7 | TNProb(E) 8 | } 9 | \arguments{ 10 | \item{E}{an "Intervals" object or a matrix where rows represent 11 | a union of disjoint intervals.} 12 | } 13 | \value{ 14 | This function returns the desired probability. 15 | } 16 | \description{ 17 | This function returns \eqn{P(Z \in E)}, where \eqn{Z ~ N(0, 1)}. 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/preserve_cl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{preserve_cl} 4 | \alias{preserve_cl} 5 | \title{Checks if Ck, Ck' in C(x'(phi))} 6 | \usage{ 7 | preserve_cl(cl, cl_phi, k1, k2) 8 | } 9 | \arguments{ 10 | \item{cl}{clustering of x} 11 | 12 | \item{cl_phi}{clustering of x'(phi)} 13 | 14 | \item{k1, k2}{index of clusters involved in the test} 15 | } 16 | \value{ 17 | Returns TRUE if Ck, Ck' in C(x'(phi)), and FALSE otherwise 18 | } 19 | \description{ 20 | Checks if Ck, Ck' in C(x'(phi)) 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/same_cl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{same_cl} 4 | \alias{same_cl} 5 | \title{Checks if two clusterings 
are the same up to permutation} 6 | \usage{ 7 | same_cl(cl1, cl2, K) 8 | } 9 | \arguments{ 10 | \item{cl1}{the first clustering} 11 | 12 | \item{cl2}{the second clustering} 13 | 14 | \item{K}{the number of clusters} 15 | } 16 | \value{ 17 | Returns TRUE if they are the same, and FALSE otherwise 18 | } 19 | \description{ 20 | Checks if two clusterings are the same up to permutation 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/is_integer_between_a_b.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{is_integer_between_a_b} 4 | \alias{is_integer_between_a_b} 5 | \title{Checks if input is an integer between a and b} 6 | \usage{ 7 | is_integer_between_a_b(x, a, b) 8 | } 9 | \arguments{ 10 | \item{x}{input to check} 11 | 12 | \item{a}{lower} 13 | 14 | \item{b}{upper} 15 | } 16 | \value{ 17 | Returns TRUE if input is an integer between a and b, FALSE otherwise 18 | } 19 | \description{ 20 | Checks if input is an integer between a and b 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/magicfun.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{magicfun} 4 | \alias{magicfun} 5 | \title{A helper function for approximating normal tail probabilities} 6 | \usage{ 7 | magicfun(z) 8 | } 9 | \arguments{ 10 | \item{z, }{the number where the function is evaluated.} 11 | } 12 | \value{ 13 | This function returns the value of the function evaluated at \code{z}. 14 | } 15 | \description{ 16 | For \eqn{Z ~ N(0, 1)}, we have the approximation 17 | \eqn{P(Z \ge z) \approx }\code{magicfun(z)*exp(-z^2/2)}. 18 | } 19 | \references{ 20 | Bryc, Wlodzimierz. 
"A uniform approximation to the right normal tail integral." 21 | Applied mathematics and computation 127.2 (2002): 365-374. 22 | } 23 | \keyword{internal} 24 | -------------------------------------------------------------------------------- /man/sortE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{sortE} 4 | \alias{sortE} 5 | \title{Make endpoints of intervals positive} 6 | \usage{ 7 | sortE(E) 8 | } 9 | \arguments{ 10 | \item{E}{an "Intervals" object or a matrix where rows represents 11 | a union of intervals with \emph{positive} but possibly infinite endpoints.} 12 | } 13 | \value{ 14 | This function returns an "Intervals" object or a matrix depending on the input. 15 | } 16 | \description{ 17 | This function modifies a union of intervals with possibly negative enpoints 18 | into a union of intervals with \emph{positive} endpoints, while ensuring 19 | the probability of a \eqn{N(0, 1)} falling into it numerically the same. 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: clusterpval 2 | Type: Package 3 | Title: Clusterpval: P-Values for Differences in Means after Clustering 4 | Version: 1.0.1 5 | Authors@R: 6 | person("Lucy", "Gao", email = "lucy.gao@uwaterloo.ca", role = c("aut", "cre")) 7 | Description: Clusters observations, then tests whether there 8 | is a statistically significant difference between the cluster means, using the 9 | methods described in the preprint "Selective inference for hierarchical clustering". See the package website at http://lucylgao.com/clusterpval for more information and more examples. 
10 | License: MIT + file LICENSE 11 | Encoding: UTF-8 12 | LazyData: true 13 | RoxygenNote: 7.1.2 14 | Suggests: 15 | graphics 16 | Imports: 17 | fastcluster, 18 | stats, 19 | intervals, 20 | future.apply 21 | -------------------------------------------------------------------------------- /man/finiteE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{finiteE} 4 | \alias{finiteE} 5 | \title{Make endpoints of intervals finite} 6 | \usage{ 7 | finiteE(E) 8 | } 9 | \arguments{ 10 | \item{E}{an "Intervals" object or a matrix where rows represent 11 | a union of intervals with \emph{positive} but possibly infinite endpoints.} 12 | } 13 | \value{ 14 | This function returns an "Intervals" object or a matrix depending on the input. 15 | } 16 | \description{ 17 | This function modifies a union of intervals with positive but possibly infinite endpoints 18 | into a union of intervals with positive and \emph{finite} endpoints, while ensuring 19 | the probability of a \eqn{N(0, 1)} falling into it is numerically the same. 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/solve_one_ineq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{solve_one_ineq} 4 | \alias{solve_one_ineq} 5 | \title{Solve the roots of quadratic polynomials related to testing for a difference in means} 6 | \usage{ 7 | solve_one_ineq(A, B, C, tol = 1e-10) 8 | } 9 | \arguments{ 10 | \item{A, }{B, C the coefficients of the quadratic equation.} 11 | 12 | \item{tol}{if \eqn{|a|}, \eqn{|b|}, or \eqn{|c|} is not larger than tol, then treat it as zero.} 13 | } 14 | \value{ 15 | Returns an "Intervals" object containing NA or the complement of the solution set. 
16 | } 17 | \description{ 18 | Solves \eqn{ax^2 + bx + c \ge 0}, then returns the complement of the solution set 19 | with respect to the real line, unless the complement is empty, in which case 20 | the function returns NA. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/compute_S_ward.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_ward} 4 | \alias{compute_S_ward} 5 | \title{Computes the conditioning set for Ward linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_ward(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for Ward linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_single.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_single} 4 | \alias{compute_S_single} 5 | \title{Computes the conditioning set for single linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_single(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for single linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_average.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_average} 4 | \alias{compute_S_average} 5 | \title{Computes the conditioning set for average linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_average(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for average linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_centroid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_centroid} 4 | \alias{compute_S_centroid} 5 | \title{Computes the conditioning set for centroid linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_centroid(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for centroid linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_median.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_median} 4 | \alias{compute_S_median} 5 | \title{Computes the conditioning set for median linkage hierarchical clustering (WPGMC)} 6 | \usage{ 7 | compute_S_median(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 24 | } 25 | \description{ 26 | Computes the conditioning set for median linkage hierarchical clustering (WPGMC) 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/TNRatioApprox.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNRatioApprox} 4 | \alias{TNRatioApprox} 5 | \title{Approximation of the ratio of two normal probabilities} 6 | \usage{ 7 | TNRatioApprox(E1, E2, scale = NULL) 8 | } 9 | \arguments{ 10 | \item{E1, E2}{"Intervals" objects or matrices where rows represents 11 | a union of intervals with \emph{positive and finite} endpoints.} 12 | 13 | \item{scale}{scaling parameter.} 14 | } 15 | \value{ 16 | This function returns the value of the approximation. 
17 | } 18 | \description{ 19 | This function returns an approximation of \eqn{P(Z \in E1)/P(Z \in E2)}, where \eqn{Z ~ N(0, 1)}. 20 | } 21 | \references{ 22 | Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 23 | Applied mathematics and computation 127.2 (2002): 365-374. 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/compute_S_mcquitty.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_mcquitty} 4 | \alias{compute_S_mcquitty} 5 | \title{Computes the conditioning set for McQuitty linkage hierarchical clustering (WPGMA)} 6 | \usage{ 7 | compute_S_mcquitty(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for McQuitty linkage hierarchical clustering (WPGMA) 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_single_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_single_gencov} 4 | \alias{compute_S_single_gencov} 5 | \title{Computes the conditioning set S for single linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_single_gencov(X, hcl, K, k1, k2, stat) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | } 23 | \value{ 24 | Returns an "Intervals" object containing the conditioning set. 
25 | } 26 | \description{ 27 | Computes the conditioning set S for single linkage hierarchical clustering, 28 | w/o assuming isotropic covariance matrix 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/compute_S_ward_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_ward_gencov} 4 | \alias{compute_S_ward_gencov} 5 | \title{Computes the conditioning set S for ward linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_ward_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for ward linkage hierarchical clustering, 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_average_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_average_gencov} 4 | \alias{compute_S_average_gencov} 5 | \title{Computes the conditioning set S for average linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_average_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for average linkage hierarchical clustering, 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_centroid_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_centroid_gencov} 4 | \alias{compute_S_centroid_gencov} 5 | \title{Computes the conditioning set S for centroid linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_centroid_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for centroid linkage hierarchical clustering, 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_median_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_median_gencov} 4 | \alias{compute_S_median_gencov} 5 | \title{Computes the conditioning set S for median linkage hierarchical clustering (WPGMC), 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_median_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for median linkage hierarchical clustering (WPGMC), 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_mcquitty_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_mcquitty_gencov} 4 | \alias{compute_S_mcquitty_gencov} 5 | \title{Computes the conditioning set S for McQuitty linkage hierarchical clustering (WPGMA), 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_mcquitty_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for McQuitty linkage hierarchical clustering (WPGMA), 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/TChisqRatioApprox.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TChisqRatioApprox} 4 | \alias{TChisqRatioApprox} 5 | \title{Approximation of the ratio of two chi-squared probabilities} 6 | \usage{ 7 | TChisqRatioApprox(df, E1, E2) 8 | } 9 | \arguments{ 10 | \item{df}{degree of freedom of the chi-squared random variable.} 11 | 12 | \item{E1, E2}{"Intervals" objects or matrices where rows represents 13 | a union of intervals with \emph{positive and finite} endpoints.} 14 | } 15 | \value{ 16 | This function returns the value of the approximation. 17 | } 18 | \description{ 19 | This function returns an approximation of \eqn{P(X \in E1)/P(X \in E2)}, where 20 | \eqn{X} is a central chi-squared random variable with \code{df} degrees of freedom. 21 | } 22 | \references{ 23 | Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 24 | Applied mathematics and computation 127.2 (2002): 365-374. 25 | 26 | Canal, Luisa. "A normal approximation for the chi-square distribution." 27 | Computational statistics & data analysis 48.4 (2005): 803-808. 
28 | } 29 | -------------------------------------------------------------------------------- /man/TNSurv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNSurv} 4 | \alias{TNSurv} 5 | \title{Survival function of truncated normal distribution} 6 | \usage{ 7 | TNSurv(q, mean, sd, E, approx = FALSE) 8 | } 9 | \arguments{ 10 | \item{q}{the quantile.} 11 | 12 | \item{mean}{the mean parameter} 13 | 14 | \item{sd}{the standard deviation} 15 | 16 | \item{E}{the truncation set, an "Intervals" object or a matrix where rows represents 17 | a union of disjoint intervals.} 18 | 19 | \item{approx}{should the approximation algorithm be used? Default is \code{FALSE}, 20 | where the approximation is not used in the first place. But when the result is wacky, 21 | the approximation will be used.} 22 | } 23 | \value{ 24 | This function returns the value of the survival function evaluated at quantile \code{q}. 25 | } 26 | \description{ 27 | This function returns the upper tail probability of a truncated normal distribution 28 | at quantile \code{q}. 29 | } 30 | \details{ 31 | Let \eqn{X} be a normal random variable with \code{mean} and \code{sd}. Truncating 32 | \eqn{X} to the set \eqn{E} is equivalent to conditioning on \eqn{{X \in E}}. So this function 33 | returns \eqn{P(X \ge q | X \in E)}. 34 | } 35 | \references{ 36 | Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 37 | Applied mathematics and computation 127.2 (2002): 365-374. 38 | } 39 | \keyword{internal} 40 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | # ----- general purpose helper functions ----- 2 | 3 | #' Takes the l2-norm of a vector. 
4 | #' 5 | #' @keywords internal 6 | #' 7 | #' @param x the vector to be normed 8 | #' 9 | #' @return Returns the l2-norm of x. 10 | norm_vec <- function(x) { 11 | sqrt(sum(x^2)) 12 | } 13 | 14 | #' Checks if input is an integer between a and b 15 | #' 16 | #' @keywords internal 17 | #' 18 | #' @param x input to check 19 | #' @param a lower 20 | #' @param b upper 21 | #' 22 | #' @return Returns TRUE if input is an integer between a and b, FALSE otherwise 23 | is_integer_between_a_b <- function(x, a, b) { 24 | (x>= min(c(a, b))) && (x %% 1 == 0) && (x <= max(c(a, b))) 25 | } 26 | 27 | #' Checks if two clusterings are the same up to permutation 28 | #' 29 | #' @keywords internal 30 | #' 31 | #' @param cl1 the first clustering 32 | #' @param cl2 the second clustering 33 | #' @param K the number of clusters 34 | #' 35 | #' @return Returns TRUE if they are the same, and FALSE otherwise 36 | same_cl <- function(cl1, cl2, K) { 37 | tab <- table(cl1, cl2) 38 | sum(tab != 0) == K 39 | } 40 | 41 | #' Checks if Ck, Ck' in C(x'(phi)) 42 | #' 43 | #' @keywords internal 44 | #' 45 | #' @param cl clustering of x 46 | #' @param cl_phi clustering of x'(phi) 47 | #' @param k1,k2 index of clusters involved in the test 48 | #' 49 | #' @return Returns TRUE if Ck, Ck' in C(x'(phi)), and FALSE otherwise 50 | preserve_cl <- function(cl, cl_phi, k1, k2) { 51 | tab <- table(cl, cl_phi) 52 | 53 | k1_in <- (sum(tab[k1, ] != 0) == 1) & (sum(tab[, k1] != 0) == 1) 54 | k2_in <- (sum(tab[k2, ] != 0) == 1) & (sum(tab[, k2] != 0) == 1) 55 | 56 | k1_in & k2_in 57 | } 58 | 59 | -------------------------------------------------------------------------------- /man/rect_hier_clusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/helper_cutree.R 3 | \name{rect_hier_clusters} 4 | \alias{rect_hier_clusters} 5 | \title{Draw a rectangle around a hierarchical cluster} 6 | \usage{ 7 | 
rect_hier_clusters( 8 | hcl, 9 | k = NULL, 10 | h = NULL, 11 | which, 12 | border = NULL, 13 | cluster = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{hcl}{An object of the type produced by hclust.} 18 | 19 | \item{k, h}{Scalar. Cut the dendrogram such that either exactly k clusters are produced or by 20 | cutting at height h.} 21 | 22 | \item{which}{A vector selecting the clusters around which a rectangle should be drawn. Clusters 23 | are selected by number (according to the cutree function)} 24 | 25 | \item{border}{A vector of border colours for the rectangles} 26 | 27 | \item{cluster}{Optional vector with cluster memberships as returned by 28 | cutree(hclust.obj, k = k), can be specified for efficiency if already computed.} 29 | } 30 | \value{ 31 | (Invisibly) returns a list where each element contains a vector of data 32 | points contained in the respective cluster. 33 | } 34 | \description{ 35 | Draws rectangles around the branches of a dendrogram highlighting 36 | the corresponding clusters. First, we cut the dendrogram at a certain level, 37 | then a rectangle is drawn around selected branches. This function is like the 38 | rect.hclust function, but it indexes the clusters according to the names 39 | assigned by the cutree function, rather than by left to right. 
40 | } 41 | \examples{ 42 | # Simulates a 100 x 2 data set with three clusters 43 | set.seed(123) 44 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 45 | matrix(0.2*rnorm(200), 100, 2) 46 | 47 | # Average linkage hierarchical clustering 48 | # We use the version of hclust in fastcluster because it's faster than the version in stats 49 | hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="average") 50 | 51 | # plot dendrograms with the three clusters displayed in blue, orange, and green, respectively 52 | plot(hcl) 53 | rect_hier_clusters(hcl, k=3, which=1:3, border=c("blue", "orange", "green")) 54 | } 55 | -------------------------------------------------------------------------------- /R/helper_cutree.R: -------------------------------------------------------------------------------- 1 | # ----- helpful visualization tools for interacting with cutree indexes ----- 2 | #' Draw a rectangle around a hierarchical cluster 3 | #' 4 | #' Draws rectangles around the branches of a dendrogram highlighting 5 | #' the corresponding clusters. First, we cut the dendrogram at a certain level, 6 | #' then a rectangle is drawn around selected branches. This function is like the 7 | #' rect.hclust function, but it indexes the clusters according to the names 8 | #' assigned by the cutree function, rather than by left to right. 9 | #' 10 | #' @export 11 | #' 12 | #' @param hcl An object of the type produced by hclust. 13 | #' @param k,h Scalar. Cut the dendrogram such that either exactly k clusters are produced or by 14 | #' cutting at height h. 15 | #' @param which A vector selecting the clusters around which a rectangle should be drawn. 
Clusters 16 | #' are selected by number (according to the cutree function) 17 | #' @param border A vector of border colours for the rectangles 18 | #' @param cluster Optional vector with cluster memberships as returned by 19 | #' cutree(hclust.obj, k = k), can be specified for efficiency if already computed. 20 | #' 21 | #' @return (Invisibly) returns a list where each element contains a vector of data 22 | #' points contained in the respective cluster. 23 | #' 24 | #' @examples 25 | #' # Simulates a 100 x 2 data set with three clusters 26 | #' set.seed(123) 27 | #' dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 28 | #' matrix(0.2*rnorm(200), 100, 2) 29 | #' 30 | #' # Average linkage hierarchical clustering 31 | #' # We use the version of hclust in fastcluster because it's faster than the version in stats 32 | #' hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="average") 33 | #' 34 | #' # plot dendrograms with the three clusters displayed in blue, orange, and green, respectively 35 | #' plot(hcl) 36 | #' rect_hier_clusters(hcl, k=3, which=1:3, border=c("blue", "orange", "green")) 37 | rect_hier_clusters <- function(hcl, k=NULL, h=NULL, which, border=NULL, cluster=NULL) { 38 | if (length(h) > 1L | length(k) > 1L) stop("'k' and 'h' must be a scalar") 39 | if (!is.null(h)) { 40 | if (!is.null(k)) stop("specify exactly one of 'k' and 'h'") 41 | k <- min(which(rev(hcl$height) < h)) 42 | k <- max(k, 2) 43 | } else if (is.null(k)) { 44 | stop("specify exactly one of 'k' and 'h'") 45 | } 46 | 47 | 48 | if (k < 2 | k > length(hcl$height)) { 49 | stop(gettextf("k must be between 2 and %d", length(hcl$height)), domain = NA) 50 | } 51 | 52 | if(is.null(cluster)) cluster <- stats::cutree(hcl, k = k) 53 | 54 | if(is.null(border)) border <- c("red", "green3", "blue", "cyan", "magenta", "yellow", "gray")[1:k] 55 | 56 | # unique(cluster[hcl$order]) gives the left to right ordering of the cutree clusters 57 | ltr_index <- match(which, 
unique(cluster[hcl$order])) 58 | stats::rect.hclust(hcl, k=k, which=ltr_index, border=border) 59 | } 60 | -------------------------------------------------------------------------------- /man/test_clusters_approx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_inf.R 3 | \name{test_clusters_approx} 4 | \alias{test_clusters_approx} 5 | \title{Monte Carlo significance test for any clustering method} 6 | \usage{ 7 | test_clusters_approx( 8 | X, 9 | k1, 10 | k2, 11 | iso = TRUE, 12 | sig = NULL, 13 | SigInv = NULL, 14 | ndraws = 2000, 15 | cl_fun, 16 | cl = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} 21 | 22 | \item{k1, k2}{Integers selecting the clusters to test.} 23 | 24 | \item{iso}{Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not.} 25 | 26 | \item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.} 27 | 28 | \item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.} 29 | 30 | \item{ndraws}{Integer selecting the number of importance samples, default of 2000.} 31 | 32 | \item{cl_fun}{Function returning assignments to clusters 1 through \code{K}.} 33 | 34 | \item{cl}{Optionally pass in the results of calling \code{cl_fun} on your data. 
This is for 35 | efficiency and reproducibility (when the clustering function is non-deterministic).} 36 | } 37 | \value{ 38 | \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } 39 | \item{pval}{the approximate p-value} 40 | \item{stderr}{standard error of the p-value estimate} 41 | \item{clusters}{the estimated cluster assignments} 42 | } 43 | \description{ 44 | This function performs a user-specified clustering method \code{cl_fun} on the rows of a 45 | data matrix to obtain \code{K} clusters, and tests the null hypothesis of no difference in means 46 | between clusters \code{k1} and \code{k2}. 47 | } 48 | \details{ 49 | In order to account for the fact that the clusters have been estimated from the data, 50 | the p-values are computed conditional on the fact that those clusters were estimated. 51 | This function approximates p-values via importance sampling. 52 | 53 | This function assumes that \code{cl_fun} takes a \eqn{n \times p} numeric data matrix as input 54 | and outputs integer assignments to clusters 1 through \code{K}. 55 | 56 | Thank you to August Guang for providing code to speed-up the function by 57 | parallelizing via the \code{future} package. 
58 | } 59 | \examples{ 60 | # Simulates a 100 x 2 data set with three clusters 61 | set.seed(123) 62 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 63 | matrix(0.2*rnorm(200), 100, 2) 64 | 65 | # Function to run k-means clustering w/ k = 3 and 50 random starts 66 | km_cluster <- function(X) { 67 | km <- kmeans(X, 3, nstart=50) 68 | return(km$cluster) 69 | } 70 | 71 | # Cluster data using k-means 72 | clusters <- km_cluster(dat) 73 | table(rep(1:3, length=100), clusters) 74 | 75 | # tests for a difference in means between clusters 1 and 2 76 | # We pass in earlier k-means clustering results from earlier 77 | results <- test_clusters_approx(dat, k1=1, k2=2, cl_fun=km_cluster, ndraws=500, cl=clusters) 78 | results$stat 79 | results$pval 80 | results$stderr 81 | 82 | } 83 | \references{ 84 | Lucy L. Gao et al. "Selective inference for hierarchical clustering". 85 | } 86 | -------------------------------------------------------------------------------- /man/test_complete_hier_clusters_approx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_inf.R 3 | \name{test_complete_hier_clusters_approx} 4 | \alias{test_complete_hier_clusters_approx} 5 | \title{Monte Carlo significance test for complete linkage hierarchical clustering} 6 | \usage{ 7 | test_complete_hier_clusters_approx( 8 | X, 9 | hcl, 10 | K, 11 | k1, 12 | k2, 13 | iso = TRUE, 14 | sig = NULL, 15 | SigInv = NULL, 16 | ndraws = 2000 17 | ) 18 | } 19 | \arguments{ 20 | \item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} 21 | 22 | \item{hcl}{An object of the type \code{hclust} containing the hierarchical clustering of X.} 23 | 24 | \item{K}{Integer selecting the total number of clusters.} 25 | 26 | \item{k1, k2}{Integers selecting the clusters to test.} 27 | 28 | \item{iso}{Boolean. 
If \code{TRUE}, isotropic covariance matrix model, otherwise not.} 29 | 30 | \item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.} 31 | 32 | \item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.} 33 | 34 | \item{ndraws}{Integer selecting the number of importance samples, default of 2000.} 35 | } 36 | \value{ 37 | \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } 38 | \item{pval}{the approximate p-value} 39 | \item{stderr}{estimated standard error of the p-value estimate} 40 | } 41 | \description{ 42 | This tests the null hypothesis of no difference in means between 43 | clusters \code{k1} and \code{k2} at level \code{K} in a complete 44 | linkage hierarchical clustering. (The \code{K} clusters are numbered as per 45 | the results of the \code{cutree} function in the \code{stats} package.) 46 | } 47 | \details{ 48 | Important note: Before calling \code{hclust} and this function, make sure to 49 | load the package \code{fastcluster}. This is because the p-value approximation 50 | procedure requires running hierarchical clustering on a large number of simulated 51 | data sets, and the version of \code{hclust} in the \code{fastcluster} package 52 | is much faster than the version of \code{hclust} in \code{stats}. 53 | 54 | In order to account for the fact that the clusters have been estimated from the data, 55 | the p-values are computed conditional on the fact that those clusters were estimated. 56 | This function approximates p-values via importance sampling. 57 | 58 | Currently, this function supports squared Euclidean distance as a measure of dissimilarity 59 | between observations. (Note that complete linkage is invariant under monotone transformations 60 | of the measure of dissimilarity between observations, so unsquared Euclidean distance 61 | would produce the same hierarchical clustering.) 
62 | 
63 | By default, this function assumes that the covariance matrix of the features is isotropic
64 | i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that
65 | \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sig} argument
66 | or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an
67 | estimate of \eqn{\sigma} or \eqn{\Sigma} will be used.
68 | }
69 | \examples{
70 | # Simulates a 100 x 2 data set with no clusters
71 | set.seed(1)
72 | dat <- matrix(rnorm(200), 100, 2)
73 | 
74 | # Complete linkage hierarchical clustering
75 | library(fastcluster)
76 | hcl <- hclust(dist(dat, method="euclidean")^2, method="complete")
77 | 
78 | # plot dendrograms with the 1st and 2nd clusters (cut at the third level)
79 | # displayed in blue and orange
80 | plot(hcl)
81 | rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange"))
82 | 
83 | # Monte Carlo test for a difference in means between the blue and orange clusters
84 | test_complete_hier_clusters_approx(X=dat, hcl=hcl, K=3, k1=1, k2=2, ndraws=1000)
85 | 
86 | }
87 | \references{
88 | Lucy L. Gao et al. "Selective inference for hierarchical clustering".
89 | } 90 | \seealso{ 91 | \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; 92 | 93 | \code{\link{test_hier_clusters_exact}} for exact p-values for hierarchical clustering with other linkages; 94 | 95 | \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; 96 | } 97 | -------------------------------------------------------------------------------- /man/test_hier_clusters_exact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_inf.R 3 | \name{test_hier_clusters_exact} 4 | \alias{test_hier_clusters_exact} 5 | \title{Exact significance test for hierarchical clustering} 6 | \usage{ 7 | test_hier_clusters_exact( 8 | X, 9 | link, 10 | hcl, 11 | K, 12 | k1, 13 | k2, 14 | iso = TRUE, 15 | sig = NULL, 16 | SigInv = NULL, 17 | dist = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} 22 | 23 | \item{link}{String selecting the linkage. Supported options are 24 | \code{"single", "average", "centroid", "ward.D", "median"}, and \code{"mcquitty"}.} 25 | 26 | \item{hcl}{Object of the type \code{hclust} containing the hierarchical clustering of X.} 27 | 28 | \item{K}{Integer selecting the total number of clusters.} 29 | 30 | \item{k1, k2}{Integers selecting the clusters to test, as indexed by the results of \code{cutree(hcl, K)}.} 31 | 32 | \item{iso}{Boolean. 
If \code{TRUE}, isotropic covariance matrix model, otherwise not.}
33 | 
34 | \item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.}
35 | 
36 | \item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.}
37 | 
38 | \item{dist}{The SQUARED Euclidean distances of matrix X}
39 | }
40 | \value{
41 | \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} }
42 | \item{pval}{the p-value}
43 | \item{trunc}{object of the type \code{Intervals} containing the conditioning set}
44 | }
45 | \description{
46 | This tests the null hypothesis of no difference in means between
47 | clusters \code{k1} and \code{k2} at level \code{K} in a hierarchical clustering.
48 | (The \code{K} clusters are numbered as per the results of the \code{cutree}
49 | function in the \code{stats} package.)
50 | }
51 | \details{
52 | In order to account for the fact that the clusters have been estimated from the data,
53 | the p-values are computed conditional on the fact that those clusters were estimated.
54 | This function computes p-values exactly via an analytic characterization of the conditioning set.
55 | 
56 | Currently, this function supports SQUARED Euclidean distance as a measure of dissimilarity
57 | between observations, and the following six linkages: single, average, centroid, Ward,
58 | McQuitty (also known as WPGMA), and median (also known as WPGMC).
59 | 
60 | By default, this function assumes that the covariance matrix of the features is isotropic
61 | i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that
62 | \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sig} argument
63 | or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an
64 | estimate of \eqn{\sigma} or \eqn{\Sigma} will be used.
65 | 66 | Note that passing in the SQUARED Euclidean distance object used by \code{hclust} in using the 67 | optional \code{dist} argument improves computational efficiency for all linkages except 68 | for single linkage. This may not lead to noticeable speed-ups in small data sets but 69 | leads to major speed-ups in large data sets. Thank you to Jesko Wagner for 70 | suggesting and implementing this change. 71 | } 72 | \examples{ 73 | # Simulates a 100 x 2 data set with three clusters 74 | set.seed(123) 75 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 76 | matrix(0.2*rnorm(200), 100, 2) 77 | 78 | # Average linkage hierarchical clustering 79 | hcl <- hclust(dist(dat, method="euclidean")^2, method="average") 80 | 81 | # plot dendrograms with the 1st and 2nd clusters (cut at the third split) 82 | # displayed in blue and orange 83 | plot(hcl) 84 | rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 85 | 86 | # tests for a difference in means between the blue and orange clusters 87 | test_hier_clusters_exact(X=dat, link="average", hcl=hcl, K=3, k1=1, k2=2) 88 | 89 | } 90 | \references{ 91 | Lucy L. Gao et al. "Selective inference for hierarchical clustering". 
92 | } 93 | \seealso{ 94 | \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; 95 | 96 | \code{\link{test_complete_hier_clusters_approx}} for approximate p-values for complete linkage hierarchical clustering; 97 | 98 | \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; 99 | } 100 | -------------------------------------------------------------------------------- /R/trunc_dist.R: -------------------------------------------------------------------------------- 1 | # ----- functions for computing tail probabilities of truncated chi-squared distributions ----- 2 | # ----- full credit to Shuxiao Chen, the writer of the outference package ----- 3 | #' A helper function for approximating normal tail probabilities 4 | #' 5 | #' For \eqn{Z ~ N(0, 1)}, we have the approximation 6 | #' \eqn{P(Z \ge z) \approx }\code{magicfun(z)*exp(-z^2/2)}. 7 | #' 8 | #' @keywords internal 9 | #' 10 | #' @param z, the number where the function is evaluated. 11 | #' 12 | #' @return This function returns the value of the function evaluated at \code{z}. 13 | #' 14 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 15 | #' Applied mathematics and computation 127.2 (2002): 365-374. 16 | magicfun = function(z){ 17 | z2 <- z*z 18 | z3 <- z*z*z 19 | temp <- (z2 + 5.575192695 * z + 12.77436324) / 20 | (sqrt(2*pi) * z3 + 14.38718147*z2 + 31.53531977*z + 2*12.77436324) 21 | return(temp) 22 | } 23 | 24 | #' Make endpoints of intervals finite 25 | #' 26 | #' This function modifies a union of intervals with positive but possibly infinite endpoints 27 | #' into a union of intervals with positive and \emph{finite} endpoints, while ensuring 28 | #' the probability of a \eqn{N(0, 1)} falling into it numerically the same. 
29 | #' 
30 | #' @keywords internal
31 | #' 
32 | #' @param E an "Intervals" object or a matrix where rows represent
33 | #' a union of intervals with \emph{positive} but possibly infinite endpoints.
34 | #' 
35 | #' @return This function returns an "Intervals" object or a matrix depending on the input.
36 | finiteE <- function(E) {
37 |   ind.inf <- which(E == Inf)
38 |   if (length(ind.inf) == 0) return(E)
39 |   # we know there are some infinite entries
40 |   E.max <- max(E[-ind.inf])
41 |   E[which(E == Inf)] <- max(10000, E.max * 2)
42 |   return(E)
43 | }
44 | 
45 | #' Make endpoints of intervals positive
46 | #' 
47 | #' This function modifies a union of intervals with possibly negative endpoints
48 | #' into a union of intervals with \emph{positive} endpoints, while ensuring
49 | #' the probability of a \eqn{N(0, 1)} falling into it remains numerically the same.
50 | #' 
51 | #' @keywords internal
52 | #' 
53 | #' @param E an "Intervals" object or a matrix where rows represent
54 | #' a union of intervals with possibly \emph{negative} and possibly infinite endpoints.
55 | #' 
56 | #' @return This function returns an "Intervals" object or a matrix depending on the input.
57 | sortE <- function(E) {
58 |   E.sorted <- lapply(1:nrow(E), function(i){
59 |     temp <- as.numeric(E[i, ])
60 |     if (temp[1] <= 0 & temp[2] <= 0) {
61 |       return(sort(-temp))
62 |     }
63 |     if (temp[1] >= 0 & temp[2] >= 0) {
64 |       return(sort(temp))
65 |     }
66 |     # we know temp[1] < 0, temp[2] > 0 OR temp[1] > 0, temp[2] < 0
67 |     temp <- abs(temp)
68 |     return(rbind(c(0, temp[1]), c(0, temp[2])))
69 |   })
70 |   E.sorted <- do.call(rbind, E.sorted)
71 |   # in order to use the approximation, we translate Inf to a large number
72 |   return(finiteE(E.sorted))
73 | }
74 | 
75 | #' Comparison between two intervals
76 | #' 
77 | #' This function returns \code{TRUE} if and only if two intervals are the same.
78 | #' 
79 | #' @keywords internal
80 | #' 
81 | #' @param int1,int2 "Intervals" objects.
82 | #' 
83 | #' @return This function returns the desired logical result.
84 | isSameIntervals <- function(int1, int2) {
85 | 
86 |   # first reduce int1 and int2 to the canonical (default) ordering
87 |   int1 <- intervals::reduce(int1)
88 |   int2 <- intervals::reduce(int2)
89 | 
90 |   if (nrow(int1) != nrow(int2)) return(FALSE)
91 | 
92 |   # int1 and int2 have the same number of intervals
93 | 
94 |   if (sum(int1 != int2) > 0) return(FALSE)
95 | 
96 |   # int1 and int2 have the same elements
97 |   return(TRUE)
98 | }
99 | 
100 | #' Approximation of the ratio of two normal probabilities
101 | #' 
102 | #' This function returns an approximation of \eqn{P(Z \in E1)/P(Z \in E2)}, where \eqn{Z ~ N(0, 1)}.
103 | #' 
104 | #' @keywords internal
105 | #' 
106 | #' @param E1,E2 "Intervals" objects or matrices where rows represent
107 | #' a union of intervals with \emph{positive and finite} endpoints.
108 | #' @param scale scaling parameter.
109 | #' 
110 | #' @return This function returns the value of the approximation.
111 | #' 
112 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral."
113 | #' Applied mathematics and computation 127.2 (2002): 365-374.
114 | TNRatioApprox <- function(E1, E2, scale = NULL) {
115 | 
116 |   if (is.null(scale)) {
117 |     # no scale supplied: try a grid of candidate scales built from the
118 |     # squared endpoints, returning the first one that yields a finite answer
119 |     temp <- (c(E1, E2))^2
120 |     scale.grid <- stats::quantile(temp, probs = seq(0, 1, 0.2))
121 | 
122 |     for(scale in scale.grid) {
123 |       temp <- TNRatioApprox(E1, E2, scale = scale)
124 |       if (!is.na(temp)) {
125 |         return(temp)
126 |       }
127 |       # if temp is NaN, proceed to the next iteration of the loop
128 |     }
129 | 
130 |     # if no value in scale.grid works, then return NaN
131 |     return(NaN)
132 |   }
133 |   # Bryc's approximation: P(Z >= z) ~ magicfun(z)*exp(-z^2/2); the common
134 |   # factor exp(scale/2) cancels in the ratio and guards against underflow
135 |   num1 <- magicfun(E1[, 1]) * exp(-(E1[, 1]^2 - scale)/2)
136 |   num2 <- magicfun(E1[, 2]) * exp(-(E1[, 2]^2 - scale)/2)
137 |   denom1 <- magicfun(E2[, 1]) * exp(-(E2[, 1]^2 - scale)/2)
138 |   denom2 <- magicfun(E2[, 2]) * exp(-(E2[, 2]^2 - scale)/2)
139 |   res <- sum(num1-num2)/sum(denom1-denom2)
140 |   return(res)
141 | }
142 | 
143 | #' Probability of a standard normal in a single interval
144 | #' 
145 | #' This function returns \eqn{P(lo \le Z \le up)}, where \eqn{Z ~ N(0, 1)}.
146 | #' 
147 | #' @keywords internal
148 | #' 
149 | #' @param lo,up quantiles.
150 | #' 
151 | #' @return This function returns the desired probability.
152 | TNProbEachInt <- function(lo, up) {
153 |   if (up == Inf) {
154 |     return(stats::pnorm(lo, 0, 1, lower.tail = FALSE))
155 |   }
156 |   # we know up < Inf, want P(lo <= X <= up).
157 |   # prefer subtracting two small tail probabilities: a big - big difference
158 |   # would mask small numbers (catastrophic cancellation),
159 | 
160 |   try1 <- stats::pnorm(lo, 0, 1, lower.tail = FALSE) - stats::pnorm(up, 0, 1, lower.tail = FALSE)
161 |   if (try1 != 0) return(try1)
162 | 
163 |   # upper tails cancelled to zero; fall back to the lower-tail difference
164 |   try2 <- stats::pnorm(up, 0, 1, lower.tail = TRUE) - stats::pnorm(lo, 0, 1, lower.tail = TRUE)
165 |   return(try2)
166 | 
167 | }
168 | 
169 | #' Probability of a standard normal in a union of intervals
170 | #' 
171 | #' This function returns \eqn{P(Z \in E)}, where \eqn{Z ~ N(0, 1)}.
172 | #' 
173 | #' @keywords internal
174 | #' 
175 | #' @param E an "Intervals" object or a matrix where rows represent
176 | #' a union of disjoint intervals.
171 | #' 172 | #' @return This function returns the desired probability. 173 | TNProb <- function(E) { 174 | # sum cdf over each disjoint interval of E 175 | res <- sum(sapply(1:nrow(E), function(v) { 176 | return(TNProbEachInt(E[v, 1], E[v, 2])) 177 | })) 178 | return(res) 179 | } 180 | 181 | #' Survival function of truncated normal distribution 182 | #' 183 | #' This function returns the upper tail probability of a truncated normal distribution 184 | #' at quantile \code{q}. 185 | #' 186 | #' Let \eqn{X} be a normal random variable with \code{mean} and \code{sd}. Truncating 187 | #' \eqn{X} to the set \eqn{E} is equivalent to conditioning on \eqn{{X \in E}}. So this function 188 | #' returns \eqn{P(X \ge q | X \in E)}. 189 | #' 190 | #' @keywords internal 191 | #' 192 | #' @param q the quantile. 193 | #' @param mean the mean parameter 194 | #' @param sd the standard deviation 195 | #' @param E the truncation set, an "Intervals" object or a matrix where rows represents 196 | #' a union of disjoint intervals. 197 | #' @param approx should the approximation algorithm be used? Default is \code{FALSE}, 198 | #' where the approximation is not used in the first place. But when the result is wacky, 199 | #' the approximation will be used. 200 | #' 201 | #' @return This function returns the value of the survival function evaluated at quantile \code{q}. 202 | #' 203 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 204 | #' Applied mathematics and computation 127.2 (2002): 365-374. 205 | TNSurv <- function(q, mean, sd, E, approx = FALSE) { 206 | # check if truncation is empty (i.e. 
truncated to the empty set) 207 | if (nrow(E) == 0) { 208 | stop("truncation set is empty") 209 | } 210 | 211 | # check if truncation is the whole real line 212 | if (isSameIntervals(E, intervals::Intervals(c(-Inf, Inf)))) { 213 | return(stats::pnorm(q, mean, sd, lower.tail = FALSE)) 214 | } 215 | 216 | # E is not empty and is not the whole real line, 217 | # i.e. 0 < P(X in E) < 1 218 | 219 | # we want P(X > q | X in E) = P(X >= q AND X in E) / P(X in E) 220 | # {X >= q} = {Z >= (q-mean)/sd} 221 | # {X in E} = {Z in (E-mean)/sd} 222 | # Z ~ N(0, 1) 223 | q <- (q-mean)/sd 224 | E <- (E-mean)/sd 225 | mean <- 0 226 | sd <- 1 227 | q2 <- q*q 228 | region <- suppressWarnings(intervals::interval_intersection(E, intervals::Intervals(c(q, Inf)))) 229 | # check if the result is 0 or 1 230 | if(nrow(region) == 0) return(0) 231 | if (isSameIntervals(E, region)) return(1) 232 | 233 | # transform region and E so that intervals have positive endpoints 234 | region <- sortE(region) 235 | E <- sortE(E) 236 | 237 | # we want P(Z in region) / P(Z in E) 238 | # try approximate calculation 239 | if (approx) { 240 | res <- TNRatioApprox(region, E, scale = q2) 241 | if (is.nan(res)) { # try approximation one more time 242 | res <- TNRatioApprox(region, E, scale = NULL) 243 | } 244 | return(max(0, min(1, res))) 245 | } 246 | 247 | # try exact calculation 248 | denom <- TNProb(E) 249 | num <- TNProb(region) 250 | 251 | if (denom < 1e-100 || num < 1e-100) { 252 | res <- TNRatioApprox(region, E, scale = q2) 253 | if (is.nan(res)) { # try approximation one more time 254 | res <- TNRatioApprox(region, E, scale = NULL) 255 | } 256 | return(max(0, min(1, res))) 257 | } 258 | 259 | # we know denom and num are both reasonably > 0 260 | 261 | res <- num / denom 262 | # force the result to lie in [0, 1] 263 | return(max(0, min(1, res))) 264 | } 265 | 266 | #' Approximation of the ratio of two chi-squared probabilities 267 | #' 268 | #' This function returns an approximation of \eqn{P(X \in 
E1)/P(X \in E2)}, where 269 | #' \eqn{X} is a central chi-squared random variable with \code{df} degrees of freedom. 270 | #' 271 | #' @export 272 | #' 273 | #' @param df degree of freedom of the chi-squared random variable. 274 | #' @param E1,E2 "Intervals" objects or matrices where rows represents 275 | #' a union of intervals with \emph{positive and finite} endpoints. 276 | #' 277 | #' @return This function returns the value of the approximation. 278 | #' 279 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 280 | #' Applied mathematics and computation 127.2 (2002): 365-374. 281 | #' @references Canal, Luisa. "A normal approximation for the chi-square distribution." 282 | #' Computational statistics & data analysis 48.4 (2005): 803-808. 283 | TChisqRatioApprox <- function(df, E1, E2) { 284 | 285 | # the transform that makes x into a N(0, 1) r.v. such that 286 | # P(X >= x) = P(Z >= Chisq2N(x)), X ~ chisq(df), Z ~ N(0, 1) 287 | # this function can take either scaler, vector or matrix 288 | Chisq2N <- function(x, df, tol = 1e-6) { 289 | 290 | if (is.numeric(x) && length(x) == 1) { 291 | if (x <= tol) { # x <= 0 292 | return(-Inf) 293 | } 294 | if (x == Inf) { 295 | return(Inf) 296 | } 297 | # we know x > 0 and x is finite 298 | x <- (x/df)^(1/6) - (1/2) * (x/df)^(1/3) + (1/3) * (x/df)^(1/2) 299 | mu <- 5/6 - 1/(9*df) - 7/(648*df^2) + 25/(2187*df^3) 300 | sig <- sqrt(1/(18*df) + 1/(162*df^2) - 37/(11664*df^3)) 301 | return((x-mu)/sig) 302 | } 303 | 304 | if (is.vector(x)) { 305 | return(vapply(X = x, FUN = Chisq2N, FUN.VALUE = 0.1, df = df)) 306 | } 307 | 308 | if (is.matrix(x)) { 309 | return(structure(vapply(X = x, FUN = Chisq2N, FUN.VALUE = 0.1, df = df), dim = dim(x))) 310 | } 311 | 312 | return(intervals::Intervals()) 313 | 314 | } 315 | 316 | 317 | E1 <- Chisq2N(E1, df) 318 | E1 <- sortE(E1) # notice that Chisq2N can be negative 319 | E2 <- Chisq2N(E2, df) 320 | E2 <- sortE(E2) 321 | 322 | # now we want P(Z in 
E1) / P(Z in E2), Z ~ N(0, 1) 323 | return(TNRatioApprox(E1, E2)) 324 | } 325 | -------------------------------------------------------------------------------- /R/.Rhistory: -------------------------------------------------------------------------------- 1 | } 2 | } 3 | } 4 | if(cl[min_cluster_1] != k1 & cl[min_cluster_1] != k2) { 5 | first_height <- height_merge[min_cluster_1] 6 | for(j in k1_obs) { 7 | if(first_height < height_merge[j]) { 8 | current_height <- heights[first_height] 9 | } else { 10 | current_height <- heights[height_merge[j]] 11 | } 12 | if(min_cluster_1 > j) { 13 | new_intervals <- solve_one_ineq(squared_prop_k2, B[min_cluster_1, j], C[min_cluster_1, j] - current_height) 14 | } else { 15 | new_intervals <- solve_one_ineq(squared_prop_k2, B[j, min_cluster_1], C[j, min_cluster_1] - current_height) 16 | } 17 | if(!is.null(new_intervals)) { 18 | S_complement[[list_index]] <- new_intervals 19 | list_index <- list_index + 1 20 | } 21 | } 22 | for(j in k2_obs) { 23 | if(first_height < height_merge[j]) { 24 | current_height <- heights[first_height] 25 | } else { 26 | current_height <- heights[height_merge[j]] 27 | } 28 | if(min_cluster_1 > j) { 29 | new_intervals <- solve_one_ineq(squared_prop_k1, B[min_cluster_1, j], C[min_cluster_1, j] - current_height) 30 | } else { 31 | new_intervals <- solve_one_ineq(squared_prop_k1, B[j, min_cluster_1], C[j, min_cluster_1] - current_height) 32 | } 33 | if(!is.null(new_intervals)) { 34 | S_complement[[list_index]] <- new_intervals 35 | list_index <- list_index + 1 36 | } 37 | } 38 | } 39 | } 40 | S_complement <- do.call('c', S_complement) 41 | S_complement <- matrix(S_complement, length(S_complement)/2, 2, byrow=TRUE) 42 | S_complement <- intervals::reduce(intervals::Intervals(S_complement), check_valid=FALSE) 43 | # complement the complement to get S 44 | S <- intervals::interval_complement(S_complement, check_valid=FALSE) 45 | }) 46 | cran_downloads(when = "last-week", packages = c("multiviewtest")) 47 | a <- 
cran_logs:;cran_downloads(when = "last-week", packages = c("multiviewtest")) 48 | a <- cran_logs::cran_downloads(when = "last-week", packages = c("multiviewtest")) 49 | m <- cranDownloads(packages = c("multiviewtest"), 50 | from = "2015", to = "2020-10-28") 51 | m <- packagerank::cranDownloads(packages = c("multiviewtest"), 52 | from = "2015", to = "2020-10-28") 53 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 54 | from = "2015", to = "2020-10-28") 55 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 56 | from = "2015", to = "2020-10-27") 57 | plot(m$cranlogs.data$count) 58 | sum(m$cranlogs.data$count) 59 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 60 | from = "2018", to = "2020-10-28") 61 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 62 | from = "2018", to = "2020-10-27") 63 | plot(m$cranlogs.data$count) 64 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 65 | from = "2018-10-27", to = "2020-10-27") 66 | plot(m$cranlogs.data$count) 67 | sum(m$cranlogs.data$count) 68 | setwd("~/Dropbox/My-Research/Lucy-Dissertation/Clustering-Selective-Inference/R-Code/clusterpval/R") 69 | set.seed(123) 70 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 71 | matrix(0.2*rnorm(200), 100, 2) 72 | hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="complete") 73 | set.seed(1) 74 | X <- dat 75 | K <- 3 76 | k1 <- 1 77 | k2 <- 2 78 | iso <- TRUE 79 | sig <- 0.2 80 | ndraws <- 2000 81 | n <- nrow(X) 82 | q <- ncol(X) 83 | hcl_at_K <- stats::cutree(hcl, K) 84 | k1_obs <- which(hcl_at_K == k1) 85 | k2_obs <- which(hcl_at_K == k2) 86 | n1 <- length(k1_obs) 87 | n2 <- length(k2_obs) 88 | squared_norm_nu <- 1/n1 + 1/n2 89 | diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE]) 90 | prop_k2 <- n2/(n1+n2) 91 | if(iso) { 92 | if(is.null(sig)) { 93 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 94 | } 95 | scale_factor <- 
squared_norm_nu*sig^2 96 | # compute test statistic 97 | stat <- norm_vec(diff_means) 98 | } else { 99 | if(is.null(SigInv)) { 100 | Sig <- stats::cov(scale(X, scale=FALSE)) 101 | SigInv <- solve(Sig) 102 | } 103 | scale_factor <- squared_norm_nu 104 | # compute test statistic 105 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 106 | } 107 | norm_vec <- function(x) { 108 | sqrt(sum(x^2)) 109 | } 110 | if(iso) { 111 | if(is.null(sig)) { 112 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 113 | } 114 | scale_factor <- squared_norm_nu*sig^2 115 | # compute test statistic 116 | stat <- norm_vec(diff_means) 117 | } else { 118 | if(is.null(SigInv)) { 119 | Sig <- stats::cov(scale(X, scale=FALSE)) 120 | SigInv <- solve(Sig) 121 | } 122 | scale_factor <- squared_norm_nu 123 | # compute test statistic 124 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 125 | } 126 | scale_factor <- sqrt(scale_factor) 127 | log_survives <- rep(NA, ndraws) 128 | if(iso) { 129 | if(is.null(sig)) { 130 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 131 | } 132 | scale_factor <- squared_norm_nu*sig^2 133 | # compute test statistic 134 | stat <- norm_vec(diff_means) 135 | } else { 136 | if(is.null(SigInv)) { 137 | Sig <- stats::cov(scale(X, scale=FALSE)) 138 | SigInv <- solve(Sig) 139 | } 140 | scale_factor <- squared_norm_nu 141 | # compute test statistic 142 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 143 | } 144 | scale_factor <- sqrt(scale_factor) 145 | log_survives <- rep(NA, ndraws) 146 | phi <- stats::rnorm(ndraws)*scale_factor + stat 147 | k1_constant <- prop_k2*diff_means/stat 148 | k2_constant <- (prop_k2 - 1)*diff_means/stat 149 | orig_k1 <- t(X[k1_obs, ]) 150 | orig_k2 <- t(X[k2_obs, ]) 151 | Xphi <- X 152 | for(j in 1:ndraws) { 153 | if(phi[j] < 0) next 154 | # Compute perturbed data set 155 | phi_minus_stat <- phi[j] - stat 156 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 157 | Xphi[k2_obs, ] <- 
t(orig_k2 + k2_constant*phi_minus_stat) 158 | # Recluster the perturbed data set 159 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 160 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 161 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 162 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 163 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 164 | } 165 | } 166 | # ----- general purpose helper functions ----- 167 | #' Takes the l2-norm of a vector. 168 | #' 169 | #' @keywords internal 170 | #' 171 | #' @param x the vector to be normed 172 | #' 173 | #' @return Returns the l2-norm of x. 174 | norm_vec <- function(x) { 175 | sqrt(sum(x^2)) 176 | } 177 | #' Checks if input is an integer between a and b 178 | #' 179 | #' @keywords internal 180 | #' 181 | #' @param x input to check 182 | #' @param a lower 183 | #' @param b upper 184 | #' 185 | #' @return Returns TRUE if input is an integer between a and b, FALSE otherwise 186 | is_integer_between_a_b <- function(x, a, b) { 187 | (x>= min(c(a, b))) && (x %% 1 == 0) && (x <= max(c(a, b))) 188 | } 189 | #' Checks if two clusterings are the same up to permutation 190 | #' 191 | #' @keywords internal 192 | #' 193 | #' @param cl1 the first clustering 194 | #' @param cl2 the second clustering 195 | #' @param K the number of clusters 196 | #' 197 | #' @return Returns TRUE if they are the same, and FALSE otherwise 198 | same_cl <- function(cl1, cl2, K) { 199 | tab <- table(cl1, cl2) 200 | sum(tab != 0) == K 201 | } 202 | #' Checks if Ck, Ck' in C(x'(phi)) 203 | #' 204 | #' @keywords internal 205 | #' 206 | #' @param cl clustering of x 207 | #' @param cl_phi clustering of x'(phi) 208 | #' @param k1,k2 index of clusters involved in the test 209 | #' 210 | #' @return Returns TRUE if Ck, Ck' in C(x'(phi)), and FALSE otherwise 211 | preserve_cl <- function(cl, cl_phi, k1, k2) { 212 | tab <- table(cl, cl_phi) 213 | 
k1_in <- (sum(tab[k1, ] != 0) == 1) & (sum(tab[, k1] != 0) == 1) 214 | k2_in <- (sum(tab[k2, ] != 0) == 1) & (sum(tab[, k2] != 0) == 1) 215 | k1_in & k2_in 216 | } 217 | for(j in 1:ndraws) { 218 | if(phi[j] < 0) next 219 | # Compute perturbed data set 220 | phi_minus_stat <- phi[j] - stat 221 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 222 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 223 | # Recluster the perturbed data set 224 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 225 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 226 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 227 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 228 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 229 | } 230 | } 231 | scale_factor <- sqrt(scale_factor) 232 | log_survives <- rep(NA, ndraws) 233 | phi <- stats::rnorm(ndraws)*scale_factor + stat 234 | k1_constant <- prop_k2*diff_means/stat 235 | k2_constant <- (prop_k2 - 1)*diff_means/stat 236 | orig_k1 <- t(X[k1_obs, ]) 237 | orig_k2 <- t(X[k2_obs, ]) 238 | Xphi <- X 239 | for(j in 1:ndraws) { 240 | if(phi[j] < 0) next 241 | # Compute perturbed data set 242 | phi_minus_stat <- phi[j] - stat 243 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 244 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 245 | # Recluster the perturbed data set 246 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 247 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 248 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 249 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 250 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 251 | } 252 | } 253 | phi <- phi[!is.na(log_survives)] 254 | log_survives <- log_survives[!is.na(log_survives)] 255 | survives <- length(log_survives) 
256 | if(survives == 0) { 257 | warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.") 258 | return(list(hiercl=hcl, stat=stat, pval=NA)) 259 | } 260 | if(survives < 100) { 261 | warning(paste("Only", survives, "samples were used to compute the denominator -", 262 | "p-value approximation may be unstable. Consider re-running", 263 | "with a larger value of ndraws.")) 264 | } 265 | log_survives_shift <- log_survives - max(log_survives) 266 | props <- exp(log_survives_shift)/sum(exp(log_survives_shift)) 267 | props 268 | length(props) 269 | length(pval) 270 | pval <- sum(props[phi > stat]) 271 | pval 272 | props 273 | length(props) 274 | props[phi > stat] 275 | length(props[phi > stat]) 276 | length(props) 277 | stat 278 | sum(phi > stat) 279 | summary(phi) 280 | stat 281 | survives 282 | log_survives 283 | props 284 | sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]) 285 | (sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]))/(ndraws*pval) 286 | (sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]))/(pval) 287 | sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]) 288 | pval 289 | props[phi < stat] 290 | props[phi < stat]^2 291 | sum(props[phi < stat]^2) 292 | sum(props[phi > stat]^2) 293 | sum(props) 294 | (1 - pval)^2*sum(props[phi > stat]^2) 295 | (1 - pval)^2*sum(props[phi >= stat]^2) + sum(props[phi < stat]^2) 296 | sum(props[phi < stat]^2) 297 | props[phi < stat]^2 298 | (1 - pval)^2*sum(props[phi >= stat]^2) + sum(props[phi < stat]^2) 299 | (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 300 | pval 301 | 1.96*4.409732e-23 302 | qnorm(0.975) 303 | var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 304 | pval_lower_ci <- pval - qnorm(0.975)*sqrt(var_pval) 305 | pval_upper_ci <- pval + qnorm(0.975)*sqrt(var_pval) 306 | pval_lower_ci 307 | pval_upper_ci 308 | max(0, pval - 
qnorm(0.975)*sqrt(var_pval)) 309 | min(pval + qnorm(0.975)*sqrt(var_pval), 1) 310 | sqrt(ar(pval)) 311 | sqrt(vr(pval)) 312 | sqrt(var(pval)) 313 | test_complete_hier_clusters_approx <- function(X, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000, hcl=NULL) { 314 | # error checking 315 | if(!is.matrix(X)) stop("X should be a matrix") 316 | n <- nrow(X) 317 | q <- ncol(X) 318 | if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n") 319 | if(!is_integer_between_a_b(k1, 1, K) | !is_integer_between_a_b(k2, 1, K)) stop(paste("cluster indices should be between 1 and K", sep="")) 320 | if((iso != TRUE) & (iso != FALSE)) stop("iso should be TRUE or FALSE") 321 | # hierarchical clustering with squared Euclidean distance and specified linkage 322 | if(is.null(hcl)) hcl <- fastcluster::hclust(stats::dist(X, method="euclidean")^2, method="complete") 323 | hcl_at_K <- stats::cutree(hcl, K) 324 | k1_obs <- which(hcl_at_K == k1) 325 | k2_obs <- which(hcl_at_K == k2) 326 | n1 <- length(k1_obs) 327 | n2 <- length(k2_obs) 328 | squared_norm_nu <- 1/n1 + 1/n2 329 | diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE]) 330 | prop_k2 <- n2/(n1+n2) 331 | if(iso) { 332 | if(is.null(sig)) { 333 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 334 | } 335 | scale_factor <- squared_norm_nu*sig^2 336 | # compute test statistic 337 | stat <- norm_vec(diff_means) 338 | } else { 339 | if(is.null(SigInv)) { 340 | Sig <- stats::cov(scale(X, scale=FALSE)) 341 | SigInv <- solve(Sig) 342 | } 343 | scale_factor <- squared_norm_nu 344 | # compute test statistic 345 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 346 | } 347 | scale_factor <- sqrt(scale_factor) 348 | log_survives <- rep(NA, ndraws) 349 | phi <- stats::rnorm(ndraws)*scale_factor + stat 350 | k1_constant <- prop_k2*diff_means/stat 351 | k2_constant <- (prop_k2 - 1)*diff_means/stat 352 | orig_k1 <- t(X[k1_obs, ]) 353 | orig_k2 <- t(X[k2_obs, 
]) 354 | Xphi <- X 355 | for(j in 1:ndraws) { 356 | if(phi[j] < 0) next 357 | # Compute perturbed data set 358 | phi_minus_stat <- phi[j] - stat 359 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 360 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 361 | # Recluster the perturbed data set 362 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 363 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 364 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 365 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 366 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 367 | } 368 | } 369 | # Trim down to only survives 370 | phi <- phi[!is.na(log_survives)] 371 | log_survives <- log_survives[!is.na(log_survives)] 372 | survives <- length(log_survives) 373 | # Return nothing if nothing survives 374 | if(survives == 0) { 375 | warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.") 376 | return(list(hiercl=hcl, stat=stat, pval=NA)) 377 | } 378 | if(survives < 100) { 379 | warning(paste("Only", survives, "samples were used to compute the denominator -", 380 | "p-value approximation may be unstable. 
Consider re-running", 381 | "with a larger value of ndraws.")) 382 | } 383 | # Approximate p-values 384 | log_survives_shift <- log_survives - max(log_survives) 385 | props <- exp(log_survives_shift)/sum(exp(log_survives_shift)) 386 | pval <- sum(props[phi >= stat]) 387 | var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 388 | # pval_lower_ci <- max(0, pval - qnorm(0.975)*sqrt(var_pval)) 389 | # pval_upper_ci <- min(pval + qnorm(0.975)*sqrt(var_pval), 1) 390 | # var_pval <- 391 | return(list(stat=stat, pval=pval, stderr=sqrt(var_pval), hiercl=hcl)) 392 | } 393 | set.seed(123) 394 | dat <- matrix(0.2*rnorm(200), 100, 2) 395 | hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="complete") 396 | plot(hcl) 397 | rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 398 | clusterpval::rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 399 | set.seed(1) 400 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=1000) 401 | results 402 | set.seed(1) 403 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=100) 404 | results$stat 405 | results$pval 406 | results$stderr 407 | test_complete_hier_clusters_approx <- function(X, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000, hcl=NULL) { 408 | # error checking 409 | if(!is.matrix(X)) stop("X should be a matrix") 410 | n <- nrow(X) 411 | q <- ncol(X) 412 | if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n") 413 | if(!is_integer_between_a_b(k1, 1, K) | !is_integer_between_a_b(k2, 1, K)) stop(paste("cluster indices should be between 1 and K", sep="")) 414 | if((iso != TRUE) & (iso != FALSE)) stop("iso should be TRUE or FALSE") 415 | # hierarchical clustering with squared Euclidean distance and specified linkage 416 | if(is.null(hcl)) hcl <- fastcluster::hclust(stats::dist(X, method="euclidean")^2, method="complete") 417 | hcl_at_K <- 
stats::cutree(hcl, K) 418 | k1_obs <- which(hcl_at_K == k1) 419 | k2_obs <- which(hcl_at_K == k2) 420 | n1 <- length(k1_obs) 421 | n2 <- length(k2_obs) 422 | squared_norm_nu <- 1/n1 + 1/n2 423 | diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE]) 424 | prop_k2 <- n2/(n1+n2) 425 | if(iso) { 426 | if(is.null(sig)) { 427 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 428 | } 429 | scale_factor <- squared_norm_nu*sig^2 430 | # compute test statistic 431 | stat <- norm_vec(diff_means) 432 | } else { 433 | if(is.null(SigInv)) { 434 | Sig <- stats::cov(scale(X, scale=FALSE)) 435 | SigInv <- solve(Sig) 436 | } 437 | scale_factor <- squared_norm_nu 438 | # compute test statistic 439 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 440 | } 441 | scale_factor <- sqrt(scale_factor) 442 | log_survives <- rep(NA, ndraws) 443 | phi <- stats::rnorm(ndraws)*scale_factor + stat 444 | k1_constant <- prop_k2*diff_means/stat 445 | k2_constant <- (prop_k2 - 1)*diff_means/stat 446 | orig_k1 <- t(X[k1_obs, ]) 447 | orig_k2 <- t(X[k2_obs, ]) 448 | Xphi <- X 449 | for(j in 1:ndraws) { 450 | if(phi[j] < 0) next 451 | # Compute perturbed data set 452 | phi_minus_stat <- phi[j] - stat 453 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 454 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 455 | # Recluster the perturbed data set 456 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 457 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 458 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 459 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 460 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 461 | } 462 | } 463 | # Trim down to only survives 464 | phi <- phi[!is.na(log_survives)] 465 | log_survives <- log_survives[!is.na(log_survives)] 466 | survives <- length(log_survives) 467 | # Return nothing if 
nothing survives 468 | if(survives == 0) { 469 | warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.") 470 | return(list(hiercl=hcl, stat=stat, pval=NA)) 471 | } 472 | if(survives < 100) { 473 | warning(paste("Only", survives, "samples were used to compute the denominator -", 474 | "p-value approximation may be unstable. Consider re-running", 475 | "with a larger value of ndraws.")) 476 | } 477 | # Approximate p-values 478 | log_survives_shift <- log_survives - max(log_survives) 479 | props <- exp(log_survives_shift)/sum(exp(log_survives_shift)) 480 | pval <- sum(props[phi >= stat]) 481 | var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 482 | return(list(stat=stat, pval=pval, stderr=sqrt(var_pval), hiercl=hcl)) 483 | } 484 | set.seed(1) 485 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=1000) 486 | results$pval 487 | results$stderr 488 | set.seed(1) 489 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=10000) 490 | results$pval 491 | results$stderr 492 | 0.511937- 1.96*0.01520826 493 | 0.4189947 + 1.96*sqrt(0.04481541) 494 | 0.511937- 1.96*sqrt(0.01520826) 495 | set.seed(1) 496 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=2000) 497 | results$pval - 1.96*results$stderr 498 | results$pval + 1.96*results$stderr 499 | 0.4189947 - 1.96*0.04481541 500 | 0.4189947 + 1.96*0.04481541 501 | 0.511937- 1.96*0.01520826 502 | 0.511937 + 1.96*0.01520826 503 | rbind(c(0.4189947 - 1.96*0.04481541, 0.4189947 + 1.96*0.04481541), c(results$pval - 1.96*results$stderr, results$pval + 1.96*results$stderr), c(0.511937- 1.96*0.01520826, 0.511937+ 1.96*0.01520826)) 504 | set.seed(123) 505 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=10000) 506 | c(results$pval - 1.96*results$stderr, results$pval + 1.96*results$stderr) 507 | set.seed(123) 508 | results <- 
test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=2000) 509 | c(results$pval - 1.96*results$stderr, results$pval + 1.96*results$stderr) 510 | results$pval 511 | 1.96*results$stderr 512 | results 513 | -------------------------------------------------------------------------------- /R/trunc_inf.R: -------------------------------------------------------------------------------- 1 | # ----- functions to test the means of clusters ----- 2 | 3 | #' Exact significance test for hierarchical clustering 4 | #' 5 | #' This tests the null hypothesis of no difference in means between 6 | #' clusters \code{k1} and \code{k2} at level \code{K} in a hierarchical clustering. 7 | #'(The \code{K} clusters are numbered as per the results of the \code{cutree} 8 | #' function in the \code{stats} package.) 9 | #' 10 | #' In order to account for the fact that the clusters have been estimated from the data, 11 | #' the p-values are computed conditional on the fact that those clusters were estimated. 12 | #' This function computes p-values exactly via an analytic characterization of the conditioning set. 13 | #' 14 | #' Currently, this function supports SQUARED Euclidean distance as a measure of dissimilarity 15 | #' between observations, and the following six linkages: single, average, centroid, Ward, 16 | #' McQuitty (also known as WPGMA), and median (also knßown as WPGMC). 17 | #' 18 | #' By default, this function assumes that the covariance matrix of the features is isotropic 19 | #' i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that 20 | #' \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sigma} argument 21 | #' or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an 22 | #' estimate of \eqn{\sigma} or \eqn{\Sigma} will be used. 
23 | #' 24 | #' Note that passing in the SQUARED Euclidean distance object used by \code{hclust} in using the 25 | #' optional \code{dist} argument improves computational efficiency for all linkages except 26 | #' for single linkage. This may not lead to noticeable speed-ups in small data sets but 27 | #' leads to major speed-ups in large data sets. Thank you to Jesko Wagner for 28 | #' suggesting and implementing this change. 29 | #' 30 | #' @export 31 | #' 32 | #' @param X \eqn{n} by \eqn{p} matrix containing numeric data. 33 | #' @param link String selecting the linkage. Supported options are 34 | #' \code{"single", "average", "centroid", "ward.D", "median"}, and \code{"mcquitty"}. 35 | #' @param hcl Object of the type \code{hclust} containing the hierarchical clustering of X. 36 | #' @param K Integer selecting the total number of clusters. 37 | #' @param k1,k2 Integers selecting the clusters to test, as indexed by the results of \code{cutree(hcl, K)}. 38 | #' @param iso Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not. 39 | #' @param sig Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}. 40 | #' @param SigInv Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}. 
#' @param dist The SQUARED Euclidean distances of matrix X
#' @return
#' \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} }
#' \item{pval}{the p-value}
#' \item{trunc}{object of the type \code{Intervals} containing the conditioning set}
#'
#' @examples
#' # Simulates a 100 x 2 data set with three clusters
#' set.seed(123)
#' dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] +
#' matrix(0.2*rnorm(200), 100, 2)
#'
#' # Average linkage hierarchical clustering
#' hcl <- hclust(dist(dat, method="euclidean")^2, method="average")
#'
#' # plot dendrograms with the 1st and 2nd clusters (cut at the third split)
#' # displayed in blue and orange
#' plot(hcl)
#' rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange"))
#'
#' # tests for a difference in means between the blue and orange clusters
#' test_hier_clusters_exact(X=dat, link="average", hcl=hcl, K=3, k1=1, k2=2)
#'
#' @seealso \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram;
#'
#' \code{\link{test_complete_hier_clusters_approx}} for approximate p-values for complete linkage hierarchical clustering;
#'
#' \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function;
#'
#' @references Lucy L. Gao et al. "Selective inference for hierarchical clustering".
test_hier_clusters_exact <- function(X, link, hcl, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, dist=NULL) {
  # --- error checking ---
  if(!is.matrix(X)) stop("X should be a matrix")

  n <- nrow(X)
  q <- ncol(X)

  if(link == "complete") stop("Exact p-value not supported. See 'test_complete_hier_clusters_approx' for an approximate p-value.")
  if(!link %in% c("single", "average", "centroid", "ward.D", "mcquitty", "median")) stop("Linkage should be 'single', 'average', 'centroid', 'ward.D', 'mcquitty', or 'median'")
  if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n")
  # scalar checks: use short-circuiting || rather than elementwise |
  if(!is_integer_between_a_b(k1, 1, K) || !is_integer_between_a_b(k2, 1, K)) stop("cluster indices should be between 1 and K")
  # isTRUE/isFALSE are NA-safe, so iso=NA reaches this message instead of
  # crashing with "missing value where TRUE/FALSE needed"
  if(!isTRUE(iso) && !isFALSE(iso)) stop("iso should be TRUE or FALSE")

  # hierarchical clustering with squared Euclidean distance and specified linkage
  hcl_at_K <- stats::cutree(hcl, K)

  n1 <- sum(hcl_at_K == k1)
  n2 <- sum(hcl_at_K == k2)
  squared_norm_nu <- 1/n1 + 1/n2
  # difference between the two cluster mean vectors (drop=FALSE keeps matrices
  # even when a cluster has a single observation)
  diff_means <- colMeans(X[hcl_at_K == k1, , drop=FALSE]) - colMeans(X[hcl_at_K == k2, , drop=FALSE])

  # recompute the squared Euclidean distances only if the caller did not pass them in
  if(is.null(dist)) dist <- stats::dist(X)^2

  if(iso) {
    if(is.null(sig)) {
      # plug-in estimate of sigma under the isotropic model
      sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q))
    }

    # compute test statistic
    stat <- norm_vec(diff_means)

    # compute truncation set (link was validated above, so switch always matches)
    S <- switch(link,
                single   = compute_S_single(X, hcl, K, k1, k2, dist),
                average  = compute_S_average(X, hcl, K, k1, k2, dist),
                centroid = compute_S_centroid(X, hcl, K, k1, k2, dist),
                ward.D   = compute_S_ward(X, hcl, K, k1, k2, dist),
                mcquitty = compute_S_mcquitty(X, hcl, K, k1, k2, dist),
                median   = compute_S_median(X, hcl, K, k1, k2, dist))

    # set distribution of phi
    scale_factor <- squared_norm_nu*sig^2

  } else {
    if(is.null(SigInv)) {
      # plug-in estimate of Sigma^{-1} under the general covariance model
      Sig <- stats::cov(scale(X, scale=FALSE))
      SigInv <- solve(Sig)
    }

    # compute test statistic
    stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means))

    # compute truncation set (single linkage does not benefit from the
    # precomputed distances, hence no dist argument there)
    S <- switch(link,
                single   = compute_S_single_gencov(X, hcl, K, k1, k2, stat),
                average  = compute_S_average_gencov(X, hcl, K, k1, k2, stat, dist),
                centroid = compute_S_centroid_gencov(X, hcl, K, k1, k2, stat, dist),
                ward.D   = compute_S_ward_gencov(X, hcl, K, k1, k2, stat, dist),
                mcquitty = compute_S_mcquitty_gencov(X, hcl, K, k1, k2, stat, dist),
                median   = compute_S_median_gencov(X, hcl, K, k1, k2, stat, dist))

    # set distribution of phi
    scale_factor <- squared_norm_nu
  }

  # compute p-value using truncated chi-squared distribution:
  # P(chi^2 >= stat^2/scale | chi^2 in S^2/scale)
  gestat <- intervals::Intervals(c(stat^2/scale_factor, Inf))
  denom <- S^2/scale_factor
  numer <- suppressWarnings(intervals::interval_intersection(gestat, denom))
  pval <- TChisqRatioApprox(q, numer, denom)

  return(list(stat=stat, pval=pval, trunc=S))
}

#' Monte Carlo significance test for complete linkage hierarchical clustering
#'
#' This tests the null hypothesis of no difference in means between
#' clusters \code{k1} and \code{k2} at level \code{K} in a complete
#' linkage hierarchical clustering. (The \code{K} clusters are numbered as per
#' the results of the \code{cutree} function in the \code{stats} package.)
#'
#' Important note: Before calling \code{hclust} and this function, make sure to
#' load the package \code{fastcluster}. This is because the p-value approximation
#' procedure requires running hierarchical clustering on a large number of simulated
#' data sets, and the version of \code{hclust} in the \code{fastcluster} package
#' is much faster than the version of \code{hclust} in \code{stats}.
155 | #' 156 | #' In order to account for the fact that the clusters have been estimated from the data, 157 | #' the p-values are computed conditional on the fact that those clusters were estimated. 158 | #' This function approximates p-values via importance sampling. 159 | #' 160 | #' Currently, this function supports squared Euclidean distance as a measure of dissimilarity 161 | #' between observations. (Note that complete linkage is invariant under monotone transformations 162 | #' of the measure of dissimilarity between observations, so unsquared Euclidean distance 163 | #' would produce the same hierarchical clustering.) 164 | #' 165 | #' By default, this function assumes that the covariance matrix of the features is isotropic 166 | #' i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to false instead assumes that 167 | #' \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sigma} argument 168 | #' or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an 169 | #' estimate of \eqn{\sigma} or \eqn{\Sigma} will be used. 170 | #' 171 | #' @export 172 | #' 173 | #' @param X \eqn{n} by \eqn{p} matrix containing numeric data. 174 | #' @param hcl An object of the type \code{hclust} containing the hierarchical clustering of X. 175 | #' @param K Integer selecting the total number of clusters. 176 | #' @param k1,k2 Integers selecting the clusters to test. 177 | #' @param iso Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not. 178 | #' @param sig Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}. 179 | #' @param SigInv Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}. 180 | #' @param ndraws Integer selecting the number of importance samples, default of 2000. 
181 | #' 182 | #' @return 183 | #' \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } 184 | #' \item{pval}{the approximate p-value} 185 | #' \item{stderr}{estimated standard error of the p-value estimate} 186 | #' 187 | #' @examples 188 | #' # Simulates a 100 x 2 data set with no clusters 189 | #' set.seed(1) 190 | #' dat <- matrix(rnorm(200), 100, 2) 191 | #' 192 | #' # Complete linkage hierarchical clustering 193 | #' library(fastcluster) 194 | #' hcl <- hclust(dist(dat, method="euclidean")^2, method="complete") 195 | #' 196 | #' # plot dendrograms with the 1st and 2nd clusters (cut at the third level) 197 | #' # displayed in blue and orange 198 | #' plot(hcl) 199 | #' rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 200 | #' 201 | #' # Monte Carlo test for a difference in means between the blue and orange clusters 202 | #' test_complete_hier_clusters_approx(X=dat, hcl=hcl, K=3, k1=1, k2=2, ndraws=1000) 203 | #' 204 | #' @seealso \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; 205 | #' 206 | #' \code{\link{test_hier_clusters_exact}} for exact p-values for hierarchical clustering with other linkages; 207 | #' 208 | #' \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; 209 | #' 210 | #' @references Lucy L. Gao et al. "Selective inference for hierarchical clustering". 
test_complete_hier_clusters_approx <- function(X, hcl, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000) {
  # --- error checking ---
  if(!is.matrix(X)) stop("X should be a matrix")

  n <- nrow(X)
  q <- ncol(X)

  if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n")
  # scalar checks: use short-circuiting || rather than elementwise |
  if(!is_integer_between_a_b(k1, 1, K) || !is_integer_between_a_b(k2, 1, K)) stop("cluster indices should be between 1 and K")
  # isTRUE/isFALSE are NA-safe, so iso=NA reaches this message instead of
  # crashing with "missing value where TRUE/FALSE needed"
  if(!isTRUE(iso) && !isFALSE(iso)) stop("iso should be TRUE or FALSE")
  if(!("fastcluster" %in% .packages())) stop("The fastcluster package must be loaded before calling hclust and before calling this function!")


  hcl_at_K <- stats::cutree(hcl, K)

  k1_obs <- which(hcl_at_K == k1)
  k2_obs <- which(hcl_at_K == k2)
  n1 <- length(k1_obs)
  n2 <- length(k2_obs)
  squared_norm_nu <- 1/n1 + 1/n2
  # difference between the two cluster mean vectors (drop=FALSE keeps matrices
  # even when a cluster has a single observation)
  diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE])

  prop_k2 <- n2/(n1+n2)

  if(iso) {
    if(is.null(sig)) {
      # plug-in estimate of sigma under the isotropic model
      sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q))
    }

    scale_factor <- squared_norm_nu*sig^2
    # compute test statistic
    stat <- norm_vec(diff_means)
  } else {
    if(is.null(SigInv)) {
      # plug-in estimate of Sigma^{-1} under the general covariance model
      Sig <- stats::cov(scale(X, scale=FALSE))
      SigInv <- solve(Sig)
    }

    scale_factor <- squared_norm_nu

    # compute test statistic
    stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means))
  }

  scale_factor <- sqrt(scale_factor)
  # NA_real_ so the vector is numeric from the start (no logical->double coercion)
  log_survives <- rep(NA_real_, ndraws)
  # importance samples for phi, drawn around the observed statistic
  phi <- stats::rnorm(ndraws)*scale_factor + stat

  k1_constant <- prop_k2*diff_means/stat
  k2_constant <- (prop_k2 - 1)*diff_means/stat
  orig_k1 <- t(X[k1_obs, ])
  orig_k2 <- t(X[k2_obs, ])

  Xphi <- X

  # seq_len is safe for ndraws = 0, unlike 1:ndraws
  for(j in seq_len(ndraws)) {
    if(phi[j] < 0) next

    # Compute perturbed data set
    phi_minus_stat <- phi[j] - stat
    Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat)
    Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat)

    # Recluster the perturbed data set
    hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete")
    clusters_Xphi <- stats::cutree(hcl_Xphi, K)
    if(same_cl(hcl_at_K, clusters_Xphi, K)) {
      # log importance weight: log target density (chi with q df, scaled)
      # minus log proposal density (normal around stat)
      log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - lgamma(q/2) - log(scale_factor) -
        stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE)
    }
  }

  # Trim down to only survives
  phi <- phi[!is.na(log_survives)]
  log_survives <- log_survives[!is.na(log_survives)]

  survives <- length(log_survives)

  # Return nothing if nothing survives
  if(survives == 0) {
    warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.")
    return(list(stat=stat, pval=NA, stderr=NA))
  }

  # Approximate p-values (shift log weights by the max for numerical stability)
  log_survives_shift <- log_survives - max(log_survives)
  props <- exp(log_survives_shift)/sum(exp(log_survives_shift))
  pval <- sum(props[phi >= stat])

  var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2)

  return(list(stat=stat, pval=pval, stderr=sqrt(var_pval)))
}

#' Monte Carlo significance test for any clustering method
#'
#' This function performs a user-specified clustering method \code{cl_fun} on the rows of a
#' data matrix to obtain \code{K} clusters, and tests the null hypothesis of no difference in means
#' between clusters \code{k1} and \code{k2}.
#'
#' In order to account for the fact that the clusters have been estimated from the data,
#' the p-values are computed conditional on the fact that those clusters were estimated.
313 | #' This function approximates p-values via importance sampling. 314 | #' 315 | #' This function assumes that \code{cl_fun} takes a \eqn{n \times p} numeric data matrix as input 316 | #' and outputs integer assignments to clusters 1 through \code{K}. 317 | #' 318 | #' Thank you to August Guang for providing code to speed-up the function by 319 | #' parallelizing via the \code{future} package. 320 | #' 321 | #' @export 322 | #' 323 | #' @param X \eqn{n} by \eqn{p} matrix containing numeric data. 324 | #' @param k1,k2 Integers selecting the clusters to test. 325 | #' @param iso Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not. 326 | #' @param sig Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}. 327 | #' @param SigInv Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}. 328 | #' @param ndraws Integer selecting the number of importance samples, default of 2000. 329 | #' @param cl_fun Function returning assignments to clusters 1 through \code{K}. 330 | #' @param cl Optionally pass in the results of calling \code{cl_fun} on your data. This is for 331 | #' efficiency and reproducibility (when the clustering function is non-deterministic). 
#'
#' @return
#' \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} }
#' \item{pval}{the approximate p-value}
#' \item{stderr}{standard error of the p-value estimate}
#' \item{clusters}{the estimated cluster assignments}
#'
#' @examples
#' # Simulates a 100 x 2 data set with three clusters
#' set.seed(123)
#' dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] +
#'   matrix(0.2*rnorm(200), 100, 2)
#'
#' # Function to run k-means clustering w/ k = 3 and 50 random starts
#' km_cluster <- function(X) {
#'   km <- kmeans(X, 3, nstart=50)
#'   return(km$cluster)
#' }
#'
#' # Cluster data using k-means
#' clusters <- km_cluster(dat)
#' table(rep(1:3, length=100), clusters)
#'
#' # Tests for a difference in means between clusters 1 and 2,
#' # passing in the k-means clustering results computed earlier
#' results <- test_clusters_approx(dat, k1=1, k2=2, cl_fun=km_cluster, ndraws=500, cl=clusters)
#' results$stat
#' results$pval
#' results$stderr
#'
#' @references Lucy L. Gao et al. "Selective inference for hierarchical clustering".
test_clusters_approx <- function(X, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000, cl_fun, cl=NULL) {
  # --- error checking ---
  if(!is.matrix(X)) stop("X should be a matrix")

  n <- nrow(X)
  q <- ncol(X)

  # cluster the data unless the caller supplied an existing clustering
  if(is.null(cl)) cl <- cl_fun(X)
  K <- length(unique(cl))

  if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n")
  # scalar conditions: use short-circuiting ||/&& rather than vectorized |/&
  if(!is_integer_between_a_b(k1, 1, K) || !is_integer_between_a_b(k2, 1, K)) stop("cluster indices should be between 1 and K")
  if((iso != TRUE) && (iso != FALSE)) stop("iso should be TRUE or FALSE")

  n1 <- sum(cl == k1)
  n2 <- sum(cl == k2)
  squared_norm_nu <- 1/n1 + 1/n2
  diff_means <- colMeans(X[cl == k1, , drop=FALSE]) - colMeans(X[cl == k2, , drop=FALSE])

  prop_k2 <- n2/(n1+n2)

  if(iso) {
    # isotropic model: estimate sigma from the centered data if not supplied
    if(is.null(sig)) {
      sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q))
    }

    scale_factor <- squared_norm_nu*sig^2
    # test statistic: Euclidean distance between the two cluster means
    stat <- norm_vec(diff_means)
  } else {
    # general covariance model: estimate Sigma^{-1} if not supplied
    if(is.null(SigInv)) {
      Sig <- stats::cov(scale(X, scale=FALSE))
      SigInv <- solve(Sig)
    }

    scale_factor <- squared_norm_nu

    # test statistic: Mahalanobis-type distance between the two cluster means
    stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means))
  }

  scale_factor <- sqrt(scale_factor)
  # importance samples of the test statistic, proposed around the observed value
  phi <- stats::rnorm(ndraws)*scale_factor + stat

  k1_constant <- prop_k2*diff_means/stat
  k2_constant <- (prop_k2 - 1)*diff_means/stat
  # drop=FALSE keeps these as q x n_k matrices even when a cluster contains a
  # single observation; without it, X[cl == k1, ] collapses to a vector and t()
  # yields a 1 x q matrix, so the perturbation below recycles incorrectly
  orig_k1 <- t(X[cl == k1, , drop=FALSE])
  orig_k2 <- t(X[cl == k2, , drop=FALSE])

  # parallelized over draws via the future framework; each worker builds its
  # own perturbed copy of X, so no shared state is mutated
  log_survives <- unlist(future.apply::future_lapply(X = seq_len(ndraws), FUN = function(j) {
    # negative draws cannot occur for a distance statistic; skip them
    if(phi[j] < 0) return(NA)

    # Compute perturbed data set
    Xphi <- X
    Xphi[cl == k1, ] <- t(orig_k1 + (phi[j] - stat)*k1_constant)
    Xphi[cl == k2, ] <- t(orig_k2 + (phi[j] - stat)*k2_constant)

    # Recluster the perturbed data set
    cl_Xphi <- cl_fun(Xphi)
    if(preserve_cl(cl, cl_Xphi, k1, k2)) {
      # log importance weight: log of the scaled chi_q target density at phi[j]
      # minus the log of the Gaussian proposal density
      log_survives <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - lgamma(q/2) - log(scale_factor) -
        stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE)
      return(log_survives)
    }

    return(NA)

  }, future.seed=TRUE))

  # Trim down to only the draws that preserved the clustering
  phi <- phi[!is.na(log_survives)]
  log_survives <- log_survives[!is.na(log_survives)]

  survives <- length(log_survives)

  # Return nothing if nothing survives
  if(survives == 0) {
    warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.")
    return(list(stat=stat, pval=NA, stderr=NA, clusters=cl))
  }

  # Approximate p-value via self-normalized importance sampling;
  # shift by the max log-weight before exponentiating for numerical stability
  log_survives_shift <- log_survives - max(log_survives)
  props <- exp(log_survives_shift)/sum(exp(log_survives_shift))
  pval <- sum(props[phi >= stat])

  var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2)

  return(list(stat=stat, pval=pval, stderr=sqrt(var_pval), clusters=cl))
}