├── .Rbuildignore ├── LICENSE ├── .gitignore ├── NAMESPACE ├── clusterpval.Rproj ├── man ├── norm_vec.Rd ├── TNProbEachInt.Rd ├── isSameIntervals.Rd ├── TNProb.Rd ├── preserve_cl.Rd ├── same_cl.Rd ├── is_integer_between_a_b.Rd ├── magicfun.Rd ├── sortE.Rd ├── finiteE.Rd ├── solve_one_ineq.Rd ├── compute_S_ward.Rd ├── compute_S_single.Rd ├── compute_S_average.Rd ├── compute_S_centroid.Rd ├── compute_S_median.Rd ├── TNRatioApprox.Rd ├── compute_S_mcquitty.Rd ├── compute_S_single_gencov.Rd ├── compute_S_ward_gencov.Rd ├── compute_S_average_gencov.Rd ├── compute_S_centroid_gencov.Rd ├── compute_S_median_gencov.Rd ├── compute_S_mcquitty_gencov.Rd ├── TChisqRatioApprox.Rd ├── TNSurv.Rd ├── rect_hier_clusters.Rd ├── test_clusters_approx.Rd ├── test_complete_hier_clusters_approx.Rd └── test_hier_clusters_exact.Rd ├── README.md ├── DESCRIPTION └── R ├── util.R ├── helper_cutree.R ├── trunc_dist.R ├── .Rhistory └── trunc_inf.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Lucy L. 
Gao 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(TChisqRatioApprox) 4 | export(rect_hier_clusters) 5 | export(test_clusters_approx) 6 | export(test_complete_hier_clusters_approx) 7 | export(test_hier_clusters_exact) 8 | -------------------------------------------------------------------------------- /clusterpval.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/norm_vec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{norm_vec} 4 | \alias{norm_vec} 5 | \title{Takes the l2-norm of a vector.} 6 | \usage{ 7 | norm_vec(x) 8 | } 9 | \arguments{ 10 | \item{x}{the vector to be normed} 11 | } 12 | \value{ 13 | Returns the l2-norm of x. 14 | } 15 | \description{ 16 | Takes the l2-norm of a vector. 
17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # clusterpval: Inference for Estimated Clusters in R 2 | 3 | See [http://lucylgao.com/clusterpval](http://lucylgao.com/clusterpval) for tutorials and examples. See [https://arxiv.org/abs/2012.02936](https://arxiv.org/abs/2012.02936) for the preprint. 4 | 5 | Install 6 | ----- 7 | 8 | Make sure that ``devtools`` is installed by running ``install.packages("devtools")``, then type 9 | 10 | ```R 11 | devtools::install_github("lucylgao/clusterpval") 12 | ``` 13 | 14 | -------------------------------------------------------------------------------- /man/TNProbEachInt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNProbEachInt} 4 | \alias{TNProbEachInt} 5 | \title{Probability of a standard normal in a single interval} 6 | \usage{ 7 | TNProbEachInt(lo, up) 8 | } 9 | \arguments{ 10 | \item{lo, up}{quantiles.} 11 | } 12 | \value{ 13 | This function returns the desired probability. 14 | } 15 | \description{ 16 | This function returns \eqn{P(lo \le Z \le up)}, where \eqn{Z ~ N(0, 1)}. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/isSameIntervals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{isSameIntervals} 4 | \alias{isSameIntervals} 5 | \title{Comparison between two intervals} 6 | \usage{ 7 | isSameIntervals(int1, int2) 8 | } 9 | \arguments{ 10 | \item{int1, int2}{"Intervals" objects.} 11 | } 12 | \value{ 13 | This function returns the desired logical result. 
14 | } 15 | \description{ 16 | This function returns \code{TRUE} if and only if two intervals are the same. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/TNProb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNProb} 4 | \alias{TNProb} 5 | \title{Probability of a standard normal in a union of intervals} 6 | \usage{ 7 | TNProb(E) 8 | } 9 | \arguments{ 10 | \item{E}{an "Intervals" object or a matrix where rows represent 11 | a union of disjoint intervals.} 12 | } 13 | \value{ 14 | This function returns the desired probability. 15 | } 16 | \description{ 17 | This function returns \eqn{P(Z \in E)}, where \eqn{Z ~ N(0, 1)}. 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/preserve_cl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{preserve_cl} 4 | \alias{preserve_cl} 5 | \title{Checks if Ck, Ck' in C(x'(phi))} 6 | \usage{ 7 | preserve_cl(cl, cl_phi, k1, k2) 8 | } 9 | \arguments{ 10 | \item{cl}{clustering of x} 11 | 12 | \item{cl_phi}{clustering of x'(phi)} 13 | 14 | \item{k1, k2}{index of clusters involved in the test} 15 | } 16 | \value{ 17 | Returns TRUE if Ck, Ck' in C(x'(phi)), and FALSE otherwise 18 | } 19 | \description{ 20 | Checks if Ck, Ck' in C(x'(phi)) 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/same_cl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{same_cl} 4 | \alias{same_cl} 5 | \title{Checks if two clusterings 
are the same up to permutation} 6 | \usage{ 7 | same_cl(cl1, cl2, K) 8 | } 9 | \arguments{ 10 | \item{cl1}{the first clustering} 11 | 12 | \item{cl2}{the second clustering} 13 | 14 | \item{K}{the number of clusters} 15 | } 16 | \value{ 17 | Returns TRUE if they are the same, and FALSE otherwise 18 | } 19 | \description{ 20 | Checks if two clusterings are the same up to permutation 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/is_integer_between_a_b.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{is_integer_between_a_b} 4 | \alias{is_integer_between_a_b} 5 | \title{Checks if input is an integer between a and b} 6 | \usage{ 7 | is_integer_between_a_b(x, a, b) 8 | } 9 | \arguments{ 10 | \item{x}{input to check} 11 | 12 | \item{a}{lower} 13 | 14 | \item{b}{upper} 15 | } 16 | \value{ 17 | Returns TRUE if input is an integer between a and b, FALSE otherwise 18 | } 19 | \description{ 20 | Checks if input is an integer between a and b 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/magicfun.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{magicfun} 4 | \alias{magicfun} 5 | \title{A helper function for approximating normal tail probabilities} 6 | \usage{ 7 | magicfun(z) 8 | } 9 | \arguments{ 10 | \item{z, }{the number where the function is evaluated.} 11 | } 12 | \value{ 13 | This function returns the value of the function evaluated at \code{z}. 14 | } 15 | \description{ 16 | For \eqn{Z ~ N(0, 1)}, we have the approximation 17 | \eqn{P(Z \ge z) \approx }\code{magicfun(z)*exp(-z^2/2)}. 18 | } 19 | \references{ 20 | Bryc, Wlodzimierz. 
"A uniform approximation to the right normal tail integral." 21 | Applied mathematics and computation 127.2 (2002): 365-374. 22 | } 23 | \keyword{internal} 24 | -------------------------------------------------------------------------------- /man/sortE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{sortE} 4 | \alias{sortE} 5 | \title{Make endpoints of intervals positive} 6 | \usage{ 7 | sortE(E) 8 | } 9 | \arguments{ 10 | \item{E}{an "Intervals" object or a matrix where rows represents 11 | a union of intervals with \emph{positive} but possibly infinite endpoints.} 12 | } 13 | \value{ 14 | This function returns an "Intervals" object or a matrix depending on the input. 15 | } 16 | \description{ 17 | This function modifies a union of intervals with possibly negative enpoints 18 | into a union of intervals with \emph{positive} endpoints, while ensuring 19 | the probability of a \eqn{N(0, 1)} falling into it numerically the same. 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: clusterpval 2 | Type: Package 3 | Title: Clusterpval: P-Values for Differences in Means after Clustering 4 | Version: 1.0.1 5 | Authors@R: 6 | person("Lucy", "Gao", email = "lucy.gao@uwaterloo.ca", role = c("aut", "cre")) 7 | Description: Clusters observations, then tests whether there 8 | is a statistically significant difference between the cluster means, using the 9 | methods described in the preprint "Selective inference for hierarchical clustering". See the package website at http://lucylgao.com/clusterpval for more information and more examples. 
10 | License: MIT + file LICENSE 11 | Encoding: UTF-8 12 | LazyData: true 13 | RoxygenNote: 7.1.2 14 | Suggests: 15 | graphics 16 | Imports: 17 | fastcluster, 18 | stats, 19 | intervals, 20 | future.apply 21 | -------------------------------------------------------------------------------- /man/finiteE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{finiteE} 4 | \alias{finiteE} 5 | \title{Make endpoints of intervals finite} 6 | \usage{ 7 | finiteE(E) 8 | } 9 | \arguments{ 10 | \item{E}{an "Intervals" object or a matrix where rows represent 11 | a union of intervals with \emph{positive} but possibly infinite endpoints.} 12 | } 13 | \value{ 14 | This function returns an "Intervals" object or a matrix depending on the input. 15 | } 16 | \description{ 17 | This function modifies a union of intervals with positive but possibly infinite endpoints 18 | into a union of intervals with positive and \emph{finite} endpoints, while ensuring 19 | the probability of a \eqn{N(0, 1)} falling into it is numerically the same. 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/solve_one_ineq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{solve_one_ineq} 4 | \alias{solve_one_ineq} 5 | \title{Solve the roots of quadratic polynomials related to testing for a difference in means} 6 | \usage{ 7 | solve_one_ineq(A, B, C, tol = 1e-10) 8 | } 9 | \arguments{ 10 | \item{A, }{B, C the coefficients of the quadratic equation.} 11 | 12 | \item{tol}{if \eqn{|a|}, \eqn{|b|}, or \eqn{|c|} is not larger than tol, then treat it as zero.} 13 | } 14 | \value{ 15 | Returns an "Intervals" object containing NA or the complement of the solution set. 
16 | } 17 | \description{ 18 | Solves \eqn{ax^2 + bx + c \ge 0}, then returns the complement of the solution set 19 | with respect to the real line, unless the complement is empty, in which case 20 | the function returns NA. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/compute_S_ward.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_ward} 4 | \alias{compute_S_ward} 5 | \title{Computes the conditioning set for Ward linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_ward(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for Ward linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_single.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_single} 4 | \alias{compute_S_single} 5 | \title{Computes the conditioning set for single linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_single(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for single linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_average.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_average} 4 | \alias{compute_S_average} 5 | \title{Computes the conditioning set for average linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_average(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for average linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_centroid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_centroid} 4 | \alias{compute_S_centroid} 5 | \title{Computes the conditioning set for centroid linkage hierarchical clustering} 6 | \usage{ 7 | compute_S_centroid(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for centroid linkage hierarchical clustering 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_median.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_median} 4 | \alias{compute_S_median} 5 | \title{Computes the conditioning set for median linkage hierarchical clustering (WPGMC)} 6 | \usage{ 7 | compute_S_median(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 24 | } 25 | \description{ 26 | Computes the conditioning set for median linkage hierarchical clustering (WPGMC) 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/TNRatioApprox.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNRatioApprox} 4 | \alias{TNRatioApprox} 5 | \title{Approximation of the ratio of two normal probabilities} 6 | \usage{ 7 | TNRatioApprox(E1, E2, scale = NULL) 8 | } 9 | \arguments{ 10 | \item{E1, E2}{"Intervals" objects or matrices where rows represents 11 | a union of intervals with \emph{positive and finite} endpoints.} 12 | 13 | \item{scale}{scaling parameter.} 14 | } 15 | \value{ 16 | This function returns the value of the approximation. 
17 | } 18 | \description{ 19 | This function returns an approximation of \eqn{P(Z \in E1)/P(Z \in E2)}, where \eqn{Z ~ N(0, 1)}. 20 | } 21 | \references{ 22 | Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 23 | Applied mathematics and computation 127.2 (2002): 365-374. 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/compute_S_mcquitty.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_mcquitty} 4 | \alias{compute_S_mcquitty} 5 | \title{Computes the conditioning set for McQuitty linkage hierarchical clustering (WPGMA)} 6 | \usage{ 7 | compute_S_mcquitty(X, hcl, K, k1, k2, dist) 8 | } 9 | \arguments{ 10 | \item{X}{the n x q data set} 11 | 12 | \item{hcl}{hclust object obtained by clustering X} 13 | 14 | \item{K}{number of clusters} 15 | 16 | \item{k1}{the index of first cluster involved in the test} 17 | 18 | \item{k2}{the index of second cluster involved in the test} 19 | 20 | \item{dist}{The distances of matrix X} 21 | } 22 | \value{ 23 | Returns an "Intervals" object containing the conditioning set. 
24 | } 25 | \description{ 26 | Computes the conditioning set for McQuitty linkage hierarchical clustering (WPGMA) 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/compute_S_single_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_single_gencov} 4 | \alias{compute_S_single_gencov} 5 | \title{Computes the conditioning set S for single linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_single_gencov(X, hcl, K, k1, k2, stat) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | } 23 | \value{ 24 | Returns an "Intervals" object containing the conditioning set. 
25 | } 26 | \description{ 27 | Computes the conditioning set S for single linkage hierarchical clustering, 28 | w/o assuming isotropic covariance matrix 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/compute_S_ward_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_ward_gencov} 4 | \alias{compute_S_ward_gencov} 5 | \title{Computes the conditioning set S for ward linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_ward_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for ward linkage hierarchical clustering, 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_average_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_average_gencov} 4 | \alias{compute_S_average_gencov} 5 | \title{Computes the conditioning set S for average linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_average_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for average linkage hierarchical clustering, 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_centroid_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_centroid_gencov} 4 | \alias{compute_S_centroid_gencov} 5 | \title{Computes the conditioning set S for centroid linkage hierarchical clustering, 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_centroid_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for centroid linkage hierarchical clustering, 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_median_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_median_gencov} 4 | \alias{compute_S_median_gencov} 5 | \title{Computes the conditioning set S for median linkage hierarchical clustering (WPGMC), 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_median_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for median linkage hierarchical clustering (WPGMC), 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/compute_S_mcquitty_gencov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_sets.R 3 | \name{compute_S_mcquitty_gencov} 4 | \alias{compute_S_mcquitty_gencov} 5 | \title{Computes the conditioning set S for McQuitty linkage hierarchical clustering (WPGMA), 6 | w/o assuming isotropic covariance matrix} 7 | \usage{ 8 | compute_S_mcquitty_gencov(X, hcl, K, k1, k2, stat, dist) 9 | } 10 | \arguments{ 11 | \item{X}{the n x q data set} 12 | 13 | \item{hcl}{hclust object obtained by clustering X} 14 | 15 | \item{K}{number of clusters} 16 | 17 | \item{k1}{the index of first cluster involved in the test} 18 | 19 | \item{k2}{the index of second cluster involved in the test} 20 | 21 | \item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} 22 | 23 | \item{dist}{The distances of matrix X} 24 | } 25 | \value{ 26 | Returns an "Intervals" object containing the conditioning set. 
27 | } 28 | \description{ 29 | Computes the conditioning set S for McQuitty linkage hierarchical clustering (WPGMA), 30 | w/o assuming isotropic covariance matrix 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/TChisqRatioApprox.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TChisqRatioApprox} 4 | \alias{TChisqRatioApprox} 5 | \title{Approximation of the ratio of two chi-squared probabilities} 6 | \usage{ 7 | TChisqRatioApprox(df, E1, E2) 8 | } 9 | \arguments{ 10 | \item{df}{degree of freedom of the chi-squared random variable.} 11 | 12 | \item{E1, E2}{"Intervals" objects or matrices where rows represents 13 | a union of intervals with \emph{positive and finite} endpoints.} 14 | } 15 | \value{ 16 | This function returns the value of the approximation. 17 | } 18 | \description{ 19 | This function returns an approximation of \eqn{P(X \in E1)/P(X \in E2)}, where 20 | \eqn{X} is a central chi-squared random variable with \code{df} degrees of freedom. 21 | } 22 | \references{ 23 | Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 24 | Applied mathematics and computation 127.2 (2002): 365-374. 25 | 26 | Canal, Luisa. "A normal approximation for the chi-square distribution." 27 | Computational statistics & data analysis 48.4 (2005): 803-808. 
28 | } 29 | -------------------------------------------------------------------------------- /man/TNSurv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_dist.R 3 | \name{TNSurv} 4 | \alias{TNSurv} 5 | \title{Survival function of truncated normal distribution} 6 | \usage{ 7 | TNSurv(q, mean, sd, E, approx = FALSE) 8 | } 9 | \arguments{ 10 | \item{q}{the quantile.} 11 | 12 | \item{mean}{the mean parameter} 13 | 14 | \item{sd}{the standard deviation} 15 | 16 | \item{E}{the truncation set, an "Intervals" object or a matrix where rows represents 17 | a union of disjoint intervals.} 18 | 19 | \item{approx}{should the approximation algorithm be used? Default is \code{FALSE}, 20 | where the approximation is not used in the first place. But when the result is wacky, 21 | the approximation will be used.} 22 | } 23 | \value{ 24 | This function returns the value of the survival function evaluated at quantile \code{q}. 25 | } 26 | \description{ 27 | This function returns the upper tail probability of a truncated normal distribution 28 | at quantile \code{q}. 29 | } 30 | \details{ 31 | Let \eqn{X} be a normal random variable with \code{mean} and \code{sd}. Truncating 32 | \eqn{X} to the set \eqn{E} is equivalent to conditioning on \eqn{{X \in E}}. So this function 33 | returns \eqn{P(X \ge q | X \in E)}. 34 | } 35 | \references{ 36 | Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 37 | Applied mathematics and computation 127.2 (2002): 365-374. 38 | } 39 | \keyword{internal} 40 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | # ----- general purpose helper functions ----- 2 | 3 | #' Takes the l2-norm of a vector. 
4 | #' 5 | #' @keywords internal 6 | #' 7 | #' @param x the vector to be normed 8 | #' 9 | #' @return Returns the l2-norm of x. 10 | norm_vec <- function(x) { 11 | sqrt(sum(x^2)) 12 | } 13 | 14 | #' Checks if input is an integer between a and b 15 | #' 16 | #' @keywords internal 17 | #' 18 | #' @param x input to check 19 | #' @param a lower 20 | #' @param b upper 21 | #' 22 | #' @return Returns TRUE if input is an integer between a and b, FALSE otherwise 23 | is_integer_between_a_b <- function(x, a, b) { 24 | (x>= min(c(a, b))) && (x %% 1 == 0) && (x <= max(c(a, b))) 25 | } 26 | 27 | #' Checks if two clusterings are the same up to permutation 28 | #' 29 | #' @keywords internal 30 | #' 31 | #' @param cl1 the first clustering 32 | #' @param cl2 the second clustering 33 | #' @param K the number of clusters 34 | #' 35 | #' @return Returns TRUE if they are the same, and FALSE otherwise 36 | same_cl <- function(cl1, cl2, K) { 37 | tab <- table(cl1, cl2) 38 | sum(tab != 0) == K 39 | } 40 | 41 | #' Checks if Ck, Ck' in C(x'(phi)) 42 | #' 43 | #' @keywords internal 44 | #' 45 | #' @param cl clustering of x 46 | #' @param cl_phi clustering of x'(phi) 47 | #' @param k1,k2 index of clusters involved in the test 48 | #' 49 | #' @return Returns TRUE if Ck, Ck' in C(x'(phi)), and FALSE otherwise 50 | preserve_cl <- function(cl, cl_phi, k1, k2) { 51 | tab <- table(cl, cl_phi) 52 | 53 | k1_in <- (sum(tab[k1, ] != 0) == 1) & (sum(tab[, k1] != 0) == 1) 54 | k2_in <- (sum(tab[k2, ] != 0) == 1) & (sum(tab[, k2] != 0) == 1) 55 | 56 | k1_in & k2_in 57 | } 58 | 59 | -------------------------------------------------------------------------------- /man/rect_hier_clusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/helper_cutree.R 3 | \name{rect_hier_clusters} 4 | \alias{rect_hier_clusters} 5 | \title{Draw a rectangle around a hierarchical cluster} 6 | \usage{ 7 | 
rect_hier_clusters( 8 | hcl, 9 | k = NULL, 10 | h = NULL, 11 | which, 12 | border = NULL, 13 | cluster = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{hcl}{An object of the type produced by hclust.} 18 | 19 | \item{k, h}{Scalar. Cut the dendrogram such that either exactly k clusters are produced or by 20 | cutting at height h.} 21 | 22 | \item{which}{A vector selecting the clusters around which a rectangle should be drawn. Clusters 23 | are selected by number (according to the cutree function)} 24 | 25 | \item{border}{A vector of border colours for the rectangles} 26 | 27 | \item{cluster}{Optional vector with cluster memberships as returned by 28 | cutree(hclust.obj, k = k), can be specified for efficiency if already computed.} 29 | } 30 | \value{ 31 | (Invisibly) returns a list where each element contains a vector of data 32 | points contained in the respective cluster. 33 | } 34 | \description{ 35 | Draws rectangles around the branches of a dendrogram highlighting 36 | the corresponding clusters. First, we cut the dendrogram at a certain level, 37 | then a rectangle is drawn around selected branches. This function is like the 38 | rect.hclust function, but it indexes the clusters according to the names 39 | assigned by the cutree function, rather than by left to right. 
40 | } 41 | \examples{ 42 | # Simulates a 100 x 2 data set with three clusters 43 | set.seed(123) 44 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 45 | matrix(0.2*rnorm(200), 100, 2) 46 | 47 | # Average linkage hierarchical clustering 48 | # We use the version of hclust in fastcluster because it's faster than the version in stats 49 | hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="average") 50 | 51 | # plot dendrograms with the three clusters displayed in blue, orange, and green, respectively 52 | plot(hcl) 53 | rect_hier_clusters(hcl, k=3, which=1:3, border=c("blue", "orange", "green")) 54 | } 55 | -------------------------------------------------------------------------------- /R/helper_cutree.R: -------------------------------------------------------------------------------- 1 | # ----- helpful visualization tools for interacting with cutree indexes ----- 2 | #' Draw a rectangle around a hierarchical cluster 3 | #' 4 | #' Draws rectangles around the branches of a dendrogram highlighting 5 | #' the corresponding clusters. First, we cut the dendrogram at a certain level, 6 | #' then a rectangle is drawn around selected branches. This function is like the 7 | #' rect.hclust function, but it indexes the clusters according to the names 8 | #' assigned by the cutree function, rather than by left to right. 9 | #' 10 | #' @export 11 | #' 12 | #' @param hcl An object of the type produced by hclust. 13 | #' @param k,h Scalar. Cut the dendrogram such that either exactly k clusters are produced or by 14 | #' cutting at height h. 15 | #' @param which A vector selecting the clusters around which a rectangle should be drawn. 
Clusters 16 | #' are selected by number (according to the cutree function) 17 | #' @param border A vector of border colours for the rectangles 18 | #' @param cluster Optional vector with cluster memberships as returned by 19 | #' cutree(hclust.obj, k = k), can be specified for efficiency if already computed. 20 | #' 21 | #' @return (Invisibly) returns a list where each element contains a vector of data 22 | #' points contained in the respective cluster. 23 | #' 24 | #' @examples 25 | #' # Simulates a 100 x 2 data set with three clusters 26 | #' set.seed(123) 27 | #' dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 28 | #' matrix(0.2*rnorm(200), 100, 2) 29 | #' 30 | #' # Average linkage hierarchical clustering 31 | #' # We use the version of hclust in fastcluster because it's faster than the version in stats 32 | #' hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="average") 33 | #' 34 | #' # plot dendrograms with the three clusters displayed in blue, orange, and green, respectively 35 | #' plot(hcl) 36 | #' rect_hier_clusters(hcl, k=3, which=1:3, border=c("blue", "orange", "green")) 37 | rect_hier_clusters <- function(hcl, k=NULL, h=NULL, which, border=NULL, cluster=NULL) { 38 | if (length(h) > 1L | length(k) > 1L) stop("'k' and 'h' must be a scalar") 39 | if (!is.null(h)) { 40 | if (!is.null(k)) stop("specify exactly one of 'k' and 'h'") 41 | k <- min(which(rev(hcl$height) < h)) 42 | k <- max(k, 2) 43 | } else if (is.null(k)) { 44 | stop("specify exactly one of 'k' and 'h'") 45 | } 46 | 47 | 48 | if (k < 2 | k > length(hcl$height)) { 49 | stop(gettextf("k must be between 2 and %d", length(hcl$height)), domain = NA) 50 | } 51 | 52 | if(is.null(cluster)) cluster <- stats::cutree(hcl, k = k) 53 | 54 | if(is.null(border)) border <- c("red", "green3", "blue", "cyan", "magenta", "yellow", "gray")[1:k] 55 | 56 | # unique(cluster[hcl$order]) gives the left to right ordering of the cutree clusters 57 | ltr_index <- match(which, 
unique(cluster[hcl$order])) 58 | stats::rect.hclust(hcl, k=k, which=ltr_index, border=border) 59 | } 60 | -------------------------------------------------------------------------------- /man/test_clusters_approx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_inf.R 3 | \name{test_clusters_approx} 4 | \alias{test_clusters_approx} 5 | \title{Monte Carlo significance test for any clustering method} 6 | \usage{ 7 | test_clusters_approx( 8 | X, 9 | k1, 10 | k2, 11 | iso = TRUE, 12 | sig = NULL, 13 | SigInv = NULL, 14 | ndraws = 2000, 15 | cl_fun, 16 | cl = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} 21 | 22 | \item{k1, k2}{Integers selecting the clusters to test.} 23 | 24 | \item{iso}{Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not.} 25 | 26 | \item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.} 27 | 28 | \item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.} 29 | 30 | \item{ndraws}{Integer selecting the number of importance samples, default of 2000.} 31 | 32 | \item{cl_fun}{Function returning assignments to clusters 1 through \code{K}.} 33 | 34 | \item{cl}{Optionally pass in the results of calling \code{cl_fun} on your data. 
This is for 35 | efficiency and reproducibility (when the clustering function is non-deterministic).} 36 | } 37 | \value{ 38 | \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } 39 | \item{pval}{the approximate p-value} 40 | \item{stderr}{standard error of the p-value estimate} 41 | \item{clusters}{the estimated cluster assignments} 42 | } 43 | \description{ 44 | This function performs a user-specified clustering method \code{cl_fun} on the rows of a 45 | data matrix to obtain \code{K} clusters, and tests the null hypothesis of no difference in means 46 | between clusters \code{k1} and \code{k2}. 47 | } 48 | \details{ 49 | In order to account for the fact that the clusters have been estimated from the data, 50 | the p-values are computed conditional on the fact that those clusters were estimated. 51 | This function approximates p-values via importance sampling. 52 | 53 | This function assumes that \code{cl_fun} takes a \eqn{n \times p} numeric data matrix as input 54 | and outputs integer assignments to clusters 1 through \code{K}. 55 | 56 | Thank you to August Guang for providing code to speed-up the function by 57 | parallelizing via the \code{future} package. 
58 | } 59 | \examples{ 60 | # Simulates a 100 x 2 data set with three clusters 61 | set.seed(123) 62 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 63 | matrix(0.2*rnorm(200), 100, 2) 64 | 65 | # Function to run k-means clustering w/ k = 3 and 50 random starts 66 | km_cluster <- function(X) { 67 | km <- kmeans(X, 3, nstart=50) 68 | return(km$cluster) 69 | } 70 | 71 | # Cluster data using k-means 72 | clusters <- km_cluster(dat) 73 | table(rep(1:3, length=100), clusters) 74 | 75 | # tests for a difference in means between clusters 1 and 2 76 | # We pass in earlier k-means clustering results from earlier 77 | results <- test_clusters_approx(dat, k1=1, k2=2, cl_fun=km_cluster, ndraws=500, cl=clusters) 78 | results$stat 79 | results$pval 80 | results$stderr 81 | 82 | } 83 | \references{ 84 | Lucy L. Gao et al. "Selective inference for hierarchical clustering". 85 | } 86 | -------------------------------------------------------------------------------- /man/test_complete_hier_clusters_approx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_inf.R 3 | \name{test_complete_hier_clusters_approx} 4 | \alias{test_complete_hier_clusters_approx} 5 | \title{Monte Carlo significance test for complete linkage hierarchical clustering} 6 | \usage{ 7 | test_complete_hier_clusters_approx( 8 | X, 9 | hcl, 10 | K, 11 | k1, 12 | k2, 13 | iso = TRUE, 14 | sig = NULL, 15 | SigInv = NULL, 16 | ndraws = 2000 17 | ) 18 | } 19 | \arguments{ 20 | \item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} 21 | 22 | \item{hcl}{An object of the type \code{hclust} containing the hierarchical clustering of X.} 23 | 24 | \item{K}{Integer selecting the total number of clusters.} 25 | 26 | \item{k1, k2}{Integers selecting the clusters to test.} 27 | 28 | \item{iso}{Boolean. 
If \code{TRUE}, isotropic covariance matrix model, otherwise not.} 29 | 30 | \item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.} 31 | 32 | \item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.} 33 | 34 | \item{ndraws}{Integer selecting the number of importance samples, default of 2000.} 35 | } 36 | \value{ 37 | \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } 38 | \item{pval}{the approximate p-value} 39 | \item{stderr}{estimated standard error of the p-value estimate} 40 | } 41 | \description{ 42 | This tests the null hypothesis of no difference in means between 43 | clusters \code{k1} and \code{k2} at level \code{K} in a complete 44 | linkage hierarchical clustering. (The \code{K} clusters are numbered as per 45 | the results of the \code{cutree} function in the \code{stats} package.) 46 | } 47 | \details{ 48 | Important note: Before calling \code{hclust} and this function, make sure to 49 | load the package \code{fastcluster}. This is because the p-value approximation 50 | procedure requires running hierarchical clustering on a large number of simulated 51 | data sets, and the version of \code{hclust} in the \code{fastcluster} package 52 | is much faster than the version of \code{hclust} in \code{stats}. 53 | 54 | In order to account for the fact that the clusters have been estimated from the data, 55 | the p-values are computed conditional on the fact that those clusters were estimated. 56 | This function approximates p-values via importance sampling. 57 | 58 | Currently, this function supports squared Euclidean distance as a measure of dissimilarity 59 | between observations. (Note that complete linkage is invariant under monotone transformations 60 | of the measure of dissimilarity between observations, so unsquared Euclidean distance 61 | would produce the same hierarchical clustering.) 
62 | 
63 | By default, this function assumes that the covariance matrix of the features is isotropic
64 | i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that
65 | \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sig} argument
66 | or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an
67 | estimate of \eqn{\sigma} or \eqn{\Sigma} will be used.
68 | }
69 | \examples{
70 | # Simulates a 100 x 2 data set with no clusters
71 | set.seed(1)
72 | dat <- matrix(rnorm(200), 100, 2)
73 | 
74 | # Complete linkage hierarchical clustering
75 | library(fastcluster)
76 | hcl <- hclust(dist(dat, method="euclidean")^2, method="complete")
77 | 
78 | # plot dendrograms with the 1st and 2nd clusters (cut at the third level)
79 | # displayed in blue and orange
80 | plot(hcl)
81 | rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange"))
82 | 
83 | # Monte Carlo test for a difference in means between the blue and orange clusters
84 | test_complete_hier_clusters_approx(X=dat, hcl=hcl, K=3, k1=1, k2=2, ndraws=1000)
85 | 
86 | }
87 | \references{
88 | Lucy L. Gao et al. "Selective inference for hierarchical clustering".
89 | } 90 | \seealso{ 91 | \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; 92 | 93 | \code{\link{test_hier_clusters_exact}} for exact p-values for hierarchical clustering with other linkages; 94 | 95 | \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; 96 | } 97 | -------------------------------------------------------------------------------- /man/test_hier_clusters_exact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trunc_inf.R 3 | \name{test_hier_clusters_exact} 4 | \alias{test_hier_clusters_exact} 5 | \title{Exact significance test for hierarchical clustering} 6 | \usage{ 7 | test_hier_clusters_exact( 8 | X, 9 | link, 10 | hcl, 11 | K, 12 | k1, 13 | k2, 14 | iso = TRUE, 15 | sig = NULL, 16 | SigInv = NULL, 17 | dist = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} 22 | 23 | \item{link}{String selecting the linkage. Supported options are 24 | \code{"single", "average", "centroid", "ward.D", "median"}, and \code{"mcquitty"}.} 25 | 26 | \item{hcl}{Object of the type \code{hclust} containing the hierarchical clustering of X.} 27 | 28 | \item{K}{Integer selecting the total number of clusters.} 29 | 30 | \item{k1, k2}{Integers selecting the clusters to test, as indexed by the results of \code{cutree(hcl, K)}.} 31 | 32 | \item{iso}{Boolean. 
If \code{TRUE}, isotropic covariance matrix model, otherwise not.}
33 | 
34 | \item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.}
35 | 
36 | \item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.}
37 | 
38 | \item{dist}{The SQUARED Euclidean distances of matrix X}
39 | }
40 | \value{
41 | \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} }
42 | \item{pval}{the p-value}
43 | \item{trunc}{object of the type \code{Intervals} containing the conditioning set}
44 | }
45 | \description{
46 | This tests the null hypothesis of no difference in means between
47 | clusters \code{k1} and \code{k2} at level \code{K} in a hierarchical clustering.
48 | (The \code{K} clusters are numbered as per the results of the \code{cutree}
49 | function in the \code{stats} package.)
50 | }
51 | \details{
52 | In order to account for the fact that the clusters have been estimated from the data,
53 | the p-values are computed conditional on the fact that those clusters were estimated.
54 | This function computes p-values exactly via an analytic characterization of the conditioning set.
55 | 
56 | Currently, this function supports SQUARED Euclidean distance as a measure of dissimilarity
57 | between observations, and the following six linkages: single, average, centroid, Ward,
58 | McQuitty (also known as WPGMA), and median (also known as WPGMC).
59 | 
60 | By default, this function assumes that the covariance matrix of the features is isotropic
61 | i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that
62 | \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sig} argument
63 | or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an
64 | estimate of \eqn{\sigma} or \eqn{\Sigma} will be used.
65 | 66 | Note that passing in the SQUARED Euclidean distance object used by \code{hclust} in using the 67 | optional \code{dist} argument improves computational efficiency for all linkages except 68 | for single linkage. This may not lead to noticeable speed-ups in small data sets but 69 | leads to major speed-ups in large data sets. Thank you to Jesko Wagner for 70 | suggesting and implementing this change. 71 | } 72 | \examples{ 73 | # Simulates a 100 x 2 data set with three clusters 74 | set.seed(123) 75 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 76 | matrix(0.2*rnorm(200), 100, 2) 77 | 78 | # Average linkage hierarchical clustering 79 | hcl <- hclust(dist(dat, method="euclidean")^2, method="average") 80 | 81 | # plot dendrograms with the 1st and 2nd clusters (cut at the third split) 82 | # displayed in blue and orange 83 | plot(hcl) 84 | rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 85 | 86 | # tests for a difference in means between the blue and orange clusters 87 | test_hier_clusters_exact(X=dat, link="average", hcl=hcl, K=3, k1=1, k2=2) 88 | 89 | } 90 | \references{ 91 | Lucy L. Gao et al. "Selective inference for hierarchical clustering". 
92 | } 93 | \seealso{ 94 | \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; 95 | 96 | \code{\link{test_complete_hier_clusters_approx}} for approximate p-values for complete linkage hierarchical clustering; 97 | 98 | \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; 99 | } 100 | -------------------------------------------------------------------------------- /R/trunc_dist.R: -------------------------------------------------------------------------------- 1 | # ----- functions for computing tail probabilities of truncated chi-squared distributions ----- 2 | # ----- full credit to Shuxiao Chen, the writer of the outference package ----- 3 | #' A helper function for approximating normal tail probabilities 4 | #' 5 | #' For \eqn{Z ~ N(0, 1)}, we have the approximation 6 | #' \eqn{P(Z \ge z) \approx }\code{magicfun(z)*exp(-z^2/2)}. 7 | #' 8 | #' @keywords internal 9 | #' 10 | #' @param z, the number where the function is evaluated. 11 | #' 12 | #' @return This function returns the value of the function evaluated at \code{z}. 13 | #' 14 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 15 | #' Applied mathematics and computation 127.2 (2002): 365-374. 16 | magicfun = function(z){ 17 | z2 <- z*z 18 | z3 <- z*z*z 19 | temp <- (z2 + 5.575192695 * z + 12.77436324) / 20 | (sqrt(2*pi) * z3 + 14.38718147*z2 + 31.53531977*z + 2*12.77436324) 21 | return(temp) 22 | } 23 | 24 | #' Make endpoints of intervals finite 25 | #' 26 | #' This function modifies a union of intervals with positive but possibly infinite endpoints 27 | #' into a union of intervals with positive and \emph{finite} endpoints, while ensuring 28 | #' the probability of a \eqn{N(0, 1)} falling into it numerically the same. 
29 | #' 
30 | #' @keywords internal
31 | #' 
32 | #' @param E an "Intervals" object or a matrix where rows represent
33 | #' a union of intervals with \emph{positive} but possibly infinite endpoints.
34 | #' 
35 | #' @return This function returns an "Intervals" object or a matrix depending on the input.
36 | finiteE <- function(E) {
37 |   ind.inf <- which(E == Inf)
38 |   if (length(ind.inf) == 0) return(E)
39 |   # we know there are some infinite entries
40 |   E.max <- max(E[-ind.inf])
41 |   E[which(E == Inf)] <- max(10000, E.max * 2)
42 |   return(E)
43 | }
44 | 
45 | #' Make endpoints of intervals positive
46 | #' 
47 | #' This function modifies a union of intervals with possibly negative endpoints
48 | #' into a union of intervals with \emph{positive} endpoints, while ensuring
49 | #' the probability of a \eqn{N(0, 1)} falling into it remains numerically the same.
50 | #' 
51 | #' @keywords internal
52 | #' 
53 | #' @param E an "Intervals" object or a matrix where rows represent
54 | #' a union of intervals with possibly \emph{negative} and possibly infinite endpoints.
55 | #' 
56 | #' @return This function returns an "Intervals" object or a matrix depending on the input.
57 | sortE <- function(E) {
58 |   E.sorted <- lapply(1:nrow(E), function(i){
59 |     temp <- as.numeric(E[i, ])
60 |     if (temp[1] <= 0 & temp[2] <= 0) {
61 |       return(sort(-temp))
62 |     }
63 |     if (temp[1] >= 0 & temp[2] >= 0) {
64 |       return(sort(temp))
65 |     }
66 |     # we know temp[1] < 0, temp[2] > 0 OR temp[1] > 0, temp[2] < 0
67 |     temp <- abs(temp)
68 |     return(rbind(c(0, temp[1]), c(0, temp[2])))
69 |   })
70 |   E.sorted <- do.call(rbind, E.sorted)
71 |   # in order to use the approximation, we translate Inf to a large number
72 |   return(finiteE(E.sorted))
73 | }
74 | 
75 | #' Comparison between two intervals
76 | #' 
77 | #' This function returns \code{TRUE} if and only if two intervals are the same.
78 | #' 
79 | #' @keywords internal
80 | #' 
81 | #' @param int1,int2 "Intervals" objects.
82 | #' 
83 | #' @return This function returns the desired logical result.
84 | isSameIntervals <- function(int1, int2) {
85 | 
86 |   # first reduce int1 and int2 to the canonical (default) ordering
87 |   int1 <- intervals::reduce(int1)
88 |   int2 <- intervals::reduce(int2)
89 | 
90 |   if (nrow(int1) != nrow(int2)) return(FALSE)
91 | 
92 |   # int1 and int2 have the same number of intervals
93 | 
94 |   if (sum(int1 != int2) > 0) return(FALSE)
95 | 
96 |   # int1 and int2 have the same elements
97 |   return(TRUE)
98 | }
99 | 
100 | #' Approximation of the ratio of two normal probabilities
101 | #' 
102 | #' This function returns an approximation of \eqn{P(Z \in E1)/P(Z \in E2)}, where \eqn{Z ~ N(0, 1)}.
103 | #' 
104 | #' @keywords internal
105 | #' 
106 | #' @param E1,E2 "Intervals" objects or matrices where rows represent
107 | #' a union of intervals with \emph{positive and finite} endpoints.
108 | #' @param scale scaling parameter.
109 | #' 
110 | #' @return This function returns the value of the approximation.
111 | #' 
112 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral."
113 | #' Applied mathematics and computation 127.2 (2002): 365-374.
114 | TNRatioApprox <- function(E1, E2, scale = NULL) {
115 | 
116 |   if (is.null(scale)) {
117 |     # no scale supplied: try a grid of candidate scales built from the
118 |     # squared endpoints, returning the first one that yields a finite answer
119 |     temp <- (c(E1, E2))^2
120 |     scale.grid <- stats::quantile(temp, probs = seq(0, 1, 0.2))
121 | 
122 |     for(scale in scale.grid) {
123 |       temp <- TNRatioApprox(E1, E2, scale = scale)
124 |       if (!is.na(temp)) {
125 |         return(temp)
126 |       }
127 |       # if temp is NaN, proceed to the next iteration of the loop
128 |     }
129 | 
130 |     # if no value in scale.grid works, then return NaN
131 |     return(NaN)
132 |   }
133 |   # Bryc's approximation: P(Z >= z) ~ magicfun(z)*exp(-z^2/2); the common
134 |   # factor exp(scale/2) cancels in the ratio and guards against underflow
135 |   num1 <- magicfun(E1[, 1]) * exp(-(E1[, 1]^2 - scale)/2)
136 |   num2 <- magicfun(E1[, 2]) * exp(-(E1[, 2]^2 - scale)/2)
137 |   denom1 <- magicfun(E2[, 1]) * exp(-(E2[, 1]^2 - scale)/2)
138 |   denom2 <- magicfun(E2[, 2]) * exp(-(E2[, 2]^2 - scale)/2)
139 |   res <- sum(num1-num2)/sum(denom1-denom2)
140 |   return(res)
141 | }
142 | 
143 | #' Probability of a standard normal in a single interval
144 | #' 
145 | #' This function returns \eqn{P(lo \le Z \le up)}, where \eqn{Z ~ N(0, 1)}.
146 | #' 
147 | #' @keywords internal
148 | #' 
149 | #' @param lo,up quantiles.
150 | #' 
151 | #' @return This function returns the desired probability.
152 | TNProbEachInt <- function(lo, up) {
153 |   if (up == Inf) {
154 |     return(stats::pnorm(lo, 0, 1, lower.tail = FALSE))
155 |   }
156 |   # we know up < Inf, want P(lo <= X <= up).
157 |   # prefer subtracting two small tail probabilities: a big - big difference
158 |   # would mask small numbers (catastrophic cancellation),
159 | 
160 |   try1 <- stats::pnorm(lo, 0, 1, lower.tail = FALSE) - stats::pnorm(up, 0, 1, lower.tail = FALSE)
161 |   if (try1 != 0) return(try1)
162 | 
163 |   # upper tails cancelled to zero; fall back to the lower-tail difference
164 |   try2 <- stats::pnorm(up, 0, 1, lower.tail = TRUE) - stats::pnorm(lo, 0, 1, lower.tail = TRUE)
165 |   return(try2)
166 | 
167 | }
168 | 
169 | #' Probability of a standard normal in a union of intervals
170 | #' 
171 | #' This function returns \eqn{P(Z \in E)}, where \eqn{Z ~ N(0, 1)}.
172 | #' 
173 | #' @keywords internal
174 | #' 
175 | #' @param E an "Intervals" object or a matrix where rows represent
176 | #' a union of disjoint intervals.
171 | #' 172 | #' @return This function returns the desired probability. 173 | TNProb <- function(E) { 174 | # sum cdf over each disjoint interval of E 175 | res <- sum(sapply(1:nrow(E), function(v) { 176 | return(TNProbEachInt(E[v, 1], E[v, 2])) 177 | })) 178 | return(res) 179 | } 180 | 181 | #' Survival function of truncated normal distribution 182 | #' 183 | #' This function returns the upper tail probability of a truncated normal distribution 184 | #' at quantile \code{q}. 185 | #' 186 | #' Let \eqn{X} be a normal random variable with \code{mean} and \code{sd}. Truncating 187 | #' \eqn{X} to the set \eqn{E} is equivalent to conditioning on \eqn{{X \in E}}. So this function 188 | #' returns \eqn{P(X \ge q | X \in E)}. 189 | #' 190 | #' @keywords internal 191 | #' 192 | #' @param q the quantile. 193 | #' @param mean the mean parameter 194 | #' @param sd the standard deviation 195 | #' @param E the truncation set, an "Intervals" object or a matrix where rows represents 196 | #' a union of disjoint intervals. 197 | #' @param approx should the approximation algorithm be used? Default is \code{FALSE}, 198 | #' where the approximation is not used in the first place. But when the result is wacky, 199 | #' the approximation will be used. 200 | #' 201 | #' @return This function returns the value of the survival function evaluated at quantile \code{q}. 202 | #' 203 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 204 | #' Applied mathematics and computation 127.2 (2002): 365-374. 205 | TNSurv <- function(q, mean, sd, E, approx = FALSE) { 206 | # check if truncation is empty (i.e. 
truncated to the empty set) 207 | if (nrow(E) == 0) { 208 | stop("truncation set is empty") 209 | } 210 | 211 | # check if truncation is the whole real line 212 | if (isSameIntervals(E, intervals::Intervals(c(-Inf, Inf)))) { 213 | return(stats::pnorm(q, mean, sd, lower.tail = FALSE)) 214 | } 215 | 216 | # E is not empty and is not the whole real line, 217 | # i.e. 0 < P(X in E) < 1 218 | 219 | # we want P(X > q | X in E) = P(X >= q AND X in E) / P(X in E) 220 | # {X >= q} = {Z >= (q-mean)/sd} 221 | # {X in E} = {Z in (E-mean)/sd} 222 | # Z ~ N(0, 1) 223 | q <- (q-mean)/sd 224 | E <- (E-mean)/sd 225 | mean <- 0 226 | sd <- 1 227 | q2 <- q*q 228 | region <- suppressWarnings(intervals::interval_intersection(E, intervals::Intervals(c(q, Inf)))) 229 | # check if the result is 0 or 1 230 | if(nrow(region) == 0) return(0) 231 | if (isSameIntervals(E, region)) return(1) 232 | 233 | # transform region and E so that intervals have positive endpoints 234 | region <- sortE(region) 235 | E <- sortE(E) 236 | 237 | # we want P(Z in region) / P(Z in E) 238 | # try approximate calculation 239 | if (approx) { 240 | res <- TNRatioApprox(region, E, scale = q2) 241 | if (is.nan(res)) { # try approximation one more time 242 | res <- TNRatioApprox(region, E, scale = NULL) 243 | } 244 | return(max(0, min(1, res))) 245 | } 246 | 247 | # try exact calculation 248 | denom <- TNProb(E) 249 | num <- TNProb(region) 250 | 251 | if (denom < 1e-100 || num < 1e-100) { 252 | res <- TNRatioApprox(region, E, scale = q2) 253 | if (is.nan(res)) { # try approximation one more time 254 | res <- TNRatioApprox(region, E, scale = NULL) 255 | } 256 | return(max(0, min(1, res))) 257 | } 258 | 259 | # we know denom and num are both reasonably > 0 260 | 261 | res <- num / denom 262 | # force the result to lie in [0, 1] 263 | return(max(0, min(1, res))) 264 | } 265 | 266 | #' Approximation of the ratio of two chi-squared probabilities 267 | #' 268 | #' This function returns an approximation of \eqn{P(X \in 
E1)/P(X \in E2)}, where 269 | #' \eqn{X} is a central chi-squared random variable with \code{df} degrees of freedom. 270 | #' 271 | #' @export 272 | #' 273 | #' @param df degree of freedom of the chi-squared random variable. 274 | #' @param E1,E2 "Intervals" objects or matrices where rows represents 275 | #' a union of intervals with \emph{positive and finite} endpoints. 276 | #' 277 | #' @return This function returns the value of the approximation. 278 | #' 279 | #' @references Bryc, Wlodzimierz. "A uniform approximation to the right normal tail integral." 280 | #' Applied mathematics and computation 127.2 (2002): 365-374. 281 | #' @references Canal, Luisa. "A normal approximation for the chi-square distribution." 282 | #' Computational statistics & data analysis 48.4 (2005): 803-808. 283 | TChisqRatioApprox <- function(df, E1, E2) { 284 | 285 | # the transform that makes x into a N(0, 1) r.v. such that 286 | # P(X >= x) = P(Z >= Chisq2N(x)), X ~ chisq(df), Z ~ N(0, 1) 287 | # this function can take either scaler, vector or matrix 288 | Chisq2N <- function(x, df, tol = 1e-6) { 289 | 290 | if (is.numeric(x) && length(x) == 1) { 291 | if (x <= tol) { # x <= 0 292 | return(-Inf) 293 | } 294 | if (x == Inf) { 295 | return(Inf) 296 | } 297 | # we know x > 0 and x is finite 298 | x <- (x/df)^(1/6) - (1/2) * (x/df)^(1/3) + (1/3) * (x/df)^(1/2) 299 | mu <- 5/6 - 1/(9*df) - 7/(648*df^2) + 25/(2187*df^3) 300 | sig <- sqrt(1/(18*df) + 1/(162*df^2) - 37/(11664*df^3)) 301 | return((x-mu)/sig) 302 | } 303 | 304 | if (is.vector(x)) { 305 | return(vapply(X = x, FUN = Chisq2N, FUN.VALUE = 0.1, df = df)) 306 | } 307 | 308 | if (is.matrix(x)) { 309 | return(structure(vapply(X = x, FUN = Chisq2N, FUN.VALUE = 0.1, df = df), dim = dim(x))) 310 | } 311 | 312 | return(intervals::Intervals()) 313 | 314 | } 315 | 316 | 317 | E1 <- Chisq2N(E1, df) 318 | E1 <- sortE(E1) # notice that Chisq2N can be negative 319 | E2 <- Chisq2N(E2, df) 320 | E2 <- sortE(E2) 321 | 322 | # now we want P(Z in 
E1) / P(Z in E2), Z ~ N(0, 1) 323 | return(TNRatioApprox(E1, E2)) 324 | } 325 | -------------------------------------------------------------------------------- /R/.Rhistory: -------------------------------------------------------------------------------- 1 | } 2 | } 3 | } 4 | if(cl[min_cluster_1] != k1 & cl[min_cluster_1] != k2) { 5 | first_height <- height_merge[min_cluster_1] 6 | for(j in k1_obs) { 7 | if(first_height < height_merge[j]) { 8 | current_height <- heights[first_height] 9 | } else { 10 | current_height <- heights[height_merge[j]] 11 | } 12 | if(min_cluster_1 > j) { 13 | new_intervals <- solve_one_ineq(squared_prop_k2, B[min_cluster_1, j], C[min_cluster_1, j] - current_height) 14 | } else { 15 | new_intervals <- solve_one_ineq(squared_prop_k2, B[j, min_cluster_1], C[j, min_cluster_1] - current_height) 16 | } 17 | if(!is.null(new_intervals)) { 18 | S_complement[[list_index]] <- new_intervals 19 | list_index <- list_index + 1 20 | } 21 | } 22 | for(j in k2_obs) { 23 | if(first_height < height_merge[j]) { 24 | current_height <- heights[first_height] 25 | } else { 26 | current_height <- heights[height_merge[j]] 27 | } 28 | if(min_cluster_1 > j) { 29 | new_intervals <- solve_one_ineq(squared_prop_k1, B[min_cluster_1, j], C[min_cluster_1, j] - current_height) 30 | } else { 31 | new_intervals <- solve_one_ineq(squared_prop_k1, B[j, min_cluster_1], C[j, min_cluster_1] - current_height) 32 | } 33 | if(!is.null(new_intervals)) { 34 | S_complement[[list_index]] <- new_intervals 35 | list_index <- list_index + 1 36 | } 37 | } 38 | } 39 | } 40 | S_complement <- do.call('c', S_complement) 41 | S_complement <- matrix(S_complement, length(S_complement)/2, 2, byrow=TRUE) 42 | S_complement <- intervals::reduce(intervals::Intervals(S_complement), check_valid=FALSE) 43 | # complement the complement to get S 44 | S <- intervals::interval_complement(S_complement, check_valid=FALSE) 45 | }) 46 | cran_downloads(when = "last-week", packages = c("multiviewtest")) 47 | a <- 
cran_logs:;cran_downloads(when = "last-week", packages = c("multiviewtest")) 48 | a <- cran_logs::cran_downloads(when = "last-week", packages = c("multiviewtest")) 49 | m <- cranDownloads(packages = c("multiviewtest"), 50 | from = "2015", to = "2020-10-28") 51 | m <- packagerank::cranDownloads(packages = c("multiviewtest"), 52 | from = "2015", to = "2020-10-28") 53 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 54 | from = "2015", to = "2020-10-28") 55 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 56 | from = "2015", to = "2020-10-27") 57 | plot(m$cranlogs.data$count) 58 | sum(m$cranlogs.data$count) 59 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 60 | from = "2018", to = "2020-10-28") 61 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 62 | from = "2018", to = "2020-10-27") 63 | plot(m$cranlogs.data$count) 64 | m <- packageRank::cranDownloads(packages = c("multiviewtest"), 65 | from = "2018-10-27", to = "2020-10-27") 66 | plot(m$cranlogs.data$count) 67 | sum(m$cranlogs.data$count) 68 | setwd("~/Dropbox/My-Research/Lucy-Dissertation/Clustering-Selective-Inference/R-Code/clusterpval/R") 69 | set.seed(123) 70 | dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + 71 | matrix(0.2*rnorm(200), 100, 2) 72 | hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="complete") 73 | set.seed(1) 74 | X <- dat 75 | K <- 3 76 | k1 <- 1 77 | k2 <- 2 78 | iso <- TRUE 79 | sig <- 0.2 80 | ndraws <- 2000 81 | n <- nrow(X) 82 | q <- ncol(X) 83 | hcl_at_K <- stats::cutree(hcl, K) 84 | k1_obs <- which(hcl_at_K == k1) 85 | k2_obs <- which(hcl_at_K == k2) 86 | n1 <- length(k1_obs) 87 | n2 <- length(k2_obs) 88 | squared_norm_nu <- 1/n1 + 1/n2 89 | diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE]) 90 | prop_k2 <- n2/(n1+n2) 91 | if(iso) { 92 | if(is.null(sig)) { 93 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 94 | } 95 | scale_factor <- 
squared_norm_nu*sig^2 96 | # compute test statistic 97 | stat <- norm_vec(diff_means) 98 | } else { 99 | if(is.null(SigInv)) { 100 | Sig <- stats::cov(scale(X, scale=FALSE)) 101 | SigInv <- solve(Sig) 102 | } 103 | scale_factor <- squared_norm_nu 104 | # compute test statistic 105 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 106 | } 107 | norm_vec <- function(x) { 108 | sqrt(sum(x^2)) 109 | } 110 | if(iso) { 111 | if(is.null(sig)) { 112 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 113 | } 114 | scale_factor <- squared_norm_nu*sig^2 115 | # compute test statistic 116 | stat <- norm_vec(diff_means) 117 | } else { 118 | if(is.null(SigInv)) { 119 | Sig <- stats::cov(scale(X, scale=FALSE)) 120 | SigInv <- solve(Sig) 121 | } 122 | scale_factor <- squared_norm_nu 123 | # compute test statistic 124 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 125 | } 126 | scale_factor <- sqrt(scale_factor) 127 | log_survives <- rep(NA, ndraws) 128 | if(iso) { 129 | if(is.null(sig)) { 130 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 131 | } 132 | scale_factor <- squared_norm_nu*sig^2 133 | # compute test statistic 134 | stat <- norm_vec(diff_means) 135 | } else { 136 | if(is.null(SigInv)) { 137 | Sig <- stats::cov(scale(X, scale=FALSE)) 138 | SigInv <- solve(Sig) 139 | } 140 | scale_factor <- squared_norm_nu 141 | # compute test statistic 142 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 143 | } 144 | scale_factor <- sqrt(scale_factor) 145 | log_survives <- rep(NA, ndraws) 146 | phi <- stats::rnorm(ndraws)*scale_factor + stat 147 | k1_constant <- prop_k2*diff_means/stat 148 | k2_constant <- (prop_k2 - 1)*diff_means/stat 149 | orig_k1 <- t(X[k1_obs, ]) 150 | orig_k2 <- t(X[k2_obs, ]) 151 | Xphi <- X 152 | for(j in 1:ndraws) { 153 | if(phi[j] < 0) next 154 | # Compute perturbed data set 155 | phi_minus_stat <- phi[j] - stat 156 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 157 | Xphi[k2_obs, ] <- 
t(orig_k2 + k2_constant*phi_minus_stat) 158 | # Recluster the perturbed data set 159 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 160 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 161 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 162 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 163 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 164 | } 165 | } 166 | # ----- general purpose helper functions ----- 167 | #' Takes the l2-norm of a vector. 168 | #' 169 | #' @keywords internal 170 | #' 171 | #' @param x the vector to be normed 172 | #' 173 | #' @return Returns the l2-norm of x. 174 | norm_vec <- function(x) { 175 | sqrt(sum(x^2)) 176 | } 177 | #' Checks if input is an integer between a and b 178 | #' 179 | #' @keywords internal 180 | #' 181 | #' @param x input to check 182 | #' @param a lower 183 | #' @param b upper 184 | #' 185 | #' @return Returns TRUE if input is an integer between a and b, FALSE otherwise 186 | is_integer_between_a_b <- function(x, a, b) { 187 | (x>= min(c(a, b))) && (x %% 1 == 0) && (x <= max(c(a, b))) 188 | } 189 | #' Checks if two clusterings are the same up to permutation 190 | #' 191 | #' @keywords internal 192 | #' 193 | #' @param cl1 the first clustering 194 | #' @param cl2 the second clustering 195 | #' @param K the number of clusters 196 | #' 197 | #' @return Returns TRUE if they are the same, and FALSE otherwise 198 | same_cl <- function(cl1, cl2, K) { 199 | tab <- table(cl1, cl2) 200 | sum(tab != 0) == K 201 | } 202 | #' Checks if Ck, Ck' in C(x'(phi)) 203 | #' 204 | #' @keywords internal 205 | #' 206 | #' @param cl clustering of x 207 | #' @param cl_phi clustering of x'(phi) 208 | #' @param k1,k2 index of clusters involved in the test 209 | #' 210 | #' @return Returns TRUE if Ck, Ck' in C(x'(phi)), and FALSE otherwise 211 | preserve_cl <- function(cl, cl_phi, k1, k2) { 212 | tab <- table(cl, cl_phi) 213 | 
k1_in <- (sum(tab[k1, ] != 0) == 1) & (sum(tab[, k1] != 0) == 1) 214 | k2_in <- (sum(tab[k2, ] != 0) == 1) & (sum(tab[, k2] != 0) == 1) 215 | k1_in & k2_in 216 | } 217 | for(j in 1:ndraws) { 218 | if(phi[j] < 0) next 219 | # Compute perturbed data set 220 | phi_minus_stat <- phi[j] - stat 221 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 222 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 223 | # Recluster the perturbed data set 224 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 225 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 226 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 227 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 228 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 229 | } 230 | } 231 | scale_factor <- sqrt(scale_factor) 232 | log_survives <- rep(NA, ndraws) 233 | phi <- stats::rnorm(ndraws)*scale_factor + stat 234 | k1_constant <- prop_k2*diff_means/stat 235 | k2_constant <- (prop_k2 - 1)*diff_means/stat 236 | orig_k1 <- t(X[k1_obs, ]) 237 | orig_k2 <- t(X[k2_obs, ]) 238 | Xphi <- X 239 | for(j in 1:ndraws) { 240 | if(phi[j] < 0) next 241 | # Compute perturbed data set 242 | phi_minus_stat <- phi[j] - stat 243 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 244 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 245 | # Recluster the perturbed data set 246 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 247 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 248 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 249 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 250 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 251 | } 252 | } 253 | phi <- phi[!is.na(log_survives)] 254 | log_survives <- log_survives[!is.na(log_survives)] 255 | survives <- length(log_survives) 
256 | if(survives == 0) { 257 | warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.") 258 | return(list(hiercl=hcl, stat=stat, pval=NA)) 259 | } 260 | if(survives < 100) { 261 | warning(paste("Only", survives, "samples were used to compute the denominator -", 262 | "p-value approximation may be unstable. Consider re-running", 263 | "with a larger value of ndraws.")) 264 | } 265 | log_survives_shift <- log_survives - max(log_survives) 266 | props <- exp(log_survives_shift)/sum(exp(log_survives_shift)) 267 | props 268 | length(props) 269 | length(pval) 270 | pval <- sum(props[phi > stat]) 271 | pval 272 | props 273 | length(props) 274 | props[phi > stat] 275 | length(props[phi > stat]) 276 | length(props) 277 | stat 278 | sum(phi > stat) 279 | summary(phi) 280 | stat 281 | survives 282 | log_survives 283 | props 284 | sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]) 285 | (sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]))/(ndraws*pval) 286 | (sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]))/(pval) 287 | sum(props[phi < stat]^2) + (1-pval)^2*sum(props[phi >= stat]) 288 | pval 289 | props[phi < stat] 290 | props[phi < stat]^2 291 | sum(props[phi < stat]^2) 292 | sum(props[phi > stat]^2) 293 | sum(props) 294 | (1 - pval)^2*sum(props[phi > stat]^2) 295 | (1 - pval)^2*sum(props[phi >= stat]^2) + sum(props[phi < stat]^2) 296 | sum(props[phi < stat]^2) 297 | props[phi < stat]^2 298 | (1 - pval)^2*sum(props[phi >= stat]^2) + sum(props[phi < stat]^2) 299 | (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 300 | pval 301 | 1.96*4.409732e-23 302 | qnorm(0.975) 303 | var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 304 | pval_lower_ci <- pval - qnorm(0.975)*sqrt(var_pval) 305 | pval_upper_ci <- pval + qnorm(0.975)*sqrt(var_pval) 306 | pval_lower_ci 307 | pval_upper_ci 308 | max(0, pval - 
qnorm(0.975)*sqrt(var_pval)) 309 | min(pval + qnorm(0.975)*sqrt(var_pval), 1) 310 | sqrt(ar(pval)) 311 | sqrt(vr(pval)) 312 | sqrt(var(pval)) 313 | test_complete_hier_clusters_approx <- function(X, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000, hcl=NULL) { 314 | # error checking 315 | if(!is.matrix(X)) stop("X should be a matrix") 316 | n <- nrow(X) 317 | q <- ncol(X) 318 | if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n") 319 | if(!is_integer_between_a_b(k1, 1, K) | !is_integer_between_a_b(k2, 1, K)) stop(paste("cluster indices should be between 1 and K", sep="")) 320 | if((iso != TRUE) & (iso != FALSE)) stop("iso should be TRUE or FALSE") 321 | # hierarchical clustering with squared Euclidean distance and specified linkage 322 | if(is.null(hcl)) hcl <- fastcluster::hclust(stats::dist(X, method="euclidean")^2, method="complete") 323 | hcl_at_K <- stats::cutree(hcl, K) 324 | k1_obs <- which(hcl_at_K == k1) 325 | k2_obs <- which(hcl_at_K == k2) 326 | n1 <- length(k1_obs) 327 | n2 <- length(k2_obs) 328 | squared_norm_nu <- 1/n1 + 1/n2 329 | diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE]) 330 | prop_k2 <- n2/(n1+n2) 331 | if(iso) { 332 | if(is.null(sig)) { 333 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 334 | } 335 | scale_factor <- squared_norm_nu*sig^2 336 | # compute test statistic 337 | stat <- norm_vec(diff_means) 338 | } else { 339 | if(is.null(SigInv)) { 340 | Sig <- stats::cov(scale(X, scale=FALSE)) 341 | SigInv <- solve(Sig) 342 | } 343 | scale_factor <- squared_norm_nu 344 | # compute test statistic 345 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 346 | } 347 | scale_factor <- sqrt(scale_factor) 348 | log_survives <- rep(NA, ndraws) 349 | phi <- stats::rnorm(ndraws)*scale_factor + stat 350 | k1_constant <- prop_k2*diff_means/stat 351 | k2_constant <- (prop_k2 - 1)*diff_means/stat 352 | orig_k1 <- t(X[k1_obs, ]) 353 | orig_k2 <- t(X[k2_obs, 
]) 354 | Xphi <- X 355 | for(j in 1:ndraws) { 356 | if(phi[j] < 0) next 357 | # Compute perturbed data set 358 | phi_minus_stat <- phi[j] - stat 359 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 360 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 361 | # Recluster the perturbed data set 362 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 363 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 364 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 365 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 366 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 367 | } 368 | } 369 | # Trim down to only survives 370 | phi <- phi[!is.na(log_survives)] 371 | log_survives <- log_survives[!is.na(log_survives)] 372 | survives <- length(log_survives) 373 | # Return nothing if nothing survives 374 | if(survives == 0) { 375 | warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.") 376 | return(list(hiercl=hcl, stat=stat, pval=NA)) 377 | } 378 | if(survives < 100) { 379 | warning(paste("Only", survives, "samples were used to compute the denominator -", 380 | "p-value approximation may be unstable. 
Consider re-running", 381 | "with a larger value of ndraws.")) 382 | } 383 | # Approximate p-values 384 | log_survives_shift <- log_survives - max(log_survives) 385 | props <- exp(log_survives_shift)/sum(exp(log_survives_shift)) 386 | pval <- sum(props[phi >= stat]) 387 | var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 388 | # pval_lower_ci <- max(0, pval - qnorm(0.975)*sqrt(var_pval)) 389 | # pval_upper_ci <- min(pval + qnorm(0.975)*sqrt(var_pval), 1) 390 | # var_pval <- 391 | return(list(stat=stat, pval=pval, stderr=sqrt(var_pval), hiercl=hcl)) 392 | } 393 | set.seed(123) 394 | dat <- matrix(0.2*rnorm(200), 100, 2) 395 | hcl <- fastcluster::hclust(dist(dat, method="euclidean")^2, method="complete") 396 | plot(hcl) 397 | rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 398 | clusterpval::rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 399 | set.seed(1) 400 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=1000) 401 | results 402 | set.seed(1) 403 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=100) 404 | results$stat 405 | results$pval 406 | results$stderr 407 | test_complete_hier_clusters_approx <- function(X, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000, hcl=NULL) { 408 | # error checking 409 | if(!is.matrix(X)) stop("X should be a matrix") 410 | n <- nrow(X) 411 | q <- ncol(X) 412 | if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n") 413 | if(!is_integer_between_a_b(k1, 1, K) | !is_integer_between_a_b(k2, 1, K)) stop(paste("cluster indices should be between 1 and K", sep="")) 414 | if((iso != TRUE) & (iso != FALSE)) stop("iso should be TRUE or FALSE") 415 | # hierarchical clustering with squared Euclidean distance and specified linkage 416 | if(is.null(hcl)) hcl <- fastcluster::hclust(stats::dist(X, method="euclidean")^2, method="complete") 417 | hcl_at_K <- 
stats::cutree(hcl, K) 418 | k1_obs <- which(hcl_at_K == k1) 419 | k2_obs <- which(hcl_at_K == k2) 420 | n1 <- length(k1_obs) 421 | n2 <- length(k2_obs) 422 | squared_norm_nu <- 1/n1 + 1/n2 423 | diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE]) 424 | prop_k2 <- n2/(n1+n2) 425 | if(iso) { 426 | if(is.null(sig)) { 427 | sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q)) 428 | } 429 | scale_factor <- squared_norm_nu*sig^2 430 | # compute test statistic 431 | stat <- norm_vec(diff_means) 432 | } else { 433 | if(is.null(SigInv)) { 434 | Sig <- stats::cov(scale(X, scale=FALSE)) 435 | SigInv <- solve(Sig) 436 | } 437 | scale_factor <- squared_norm_nu 438 | # compute test statistic 439 | stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means)) 440 | } 441 | scale_factor <- sqrt(scale_factor) 442 | log_survives <- rep(NA, ndraws) 443 | phi <- stats::rnorm(ndraws)*scale_factor + stat 444 | k1_constant <- prop_k2*diff_means/stat 445 | k2_constant <- (prop_k2 - 1)*diff_means/stat 446 | orig_k1 <- t(X[k1_obs, ]) 447 | orig_k2 <- t(X[k2_obs, ]) 448 | Xphi <- X 449 | for(j in 1:ndraws) { 450 | if(phi[j] < 0) next 451 | # Compute perturbed data set 452 | phi_minus_stat <- phi[j] - stat 453 | Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat) 454 | Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat) 455 | # Recluster the perturbed data set 456 | hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete") 457 | clusters_Xphi <- stats::cutree(hcl_Xphi, K) 458 | if(same_cl(hcl_at_K, clusters_Xphi, K)) { 459 | log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - log(gamma(q/2)) - log(scale_factor) - 460 | stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE) 461 | } 462 | } 463 | # Trim down to only survives 464 | phi <- phi[!is.na(log_survives)] 465 | log_survives <- log_survives[!is.na(log_survives)] 466 | survives <- length(log_survives) 467 | # Return nothing if 
nothing survives 468 | if(survives == 0) { 469 | warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.") 470 | return(list(hiercl=hcl, stat=stat, pval=NA)) 471 | } 472 | if(survives < 100) { 473 | warning(paste("Only", survives, "samples were used to compute the denominator -", 474 | "p-value approximation may be unstable. Consider re-running", 475 | "with a larger value of ndraws.")) 476 | } 477 | # Approximate p-values 478 | log_survives_shift <- log_survives - max(log_survives) 479 | props <- exp(log_survives_shift)/sum(exp(log_survives_shift)) 480 | pval <- sum(props[phi >= stat]) 481 | var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2) 482 | return(list(stat=stat, pval=pval, stderr=sqrt(var_pval), hiercl=hcl)) 483 | } 484 | set.seed(1) 485 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=1000) 486 | results$pval 487 | results$stderr 488 | set.seed(1) 489 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=10000) 490 | results$pval 491 | results$stderr 492 | 0.511937- 1.96*0.01520826 493 | 0.4189947 + 1.96*sqrt(0.04481541) 494 | 0.511937- 1.96*sqrt(0.01520826) 495 | set.seed(1) 496 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=2000) 497 | results$pval - 1.96*results$stderr 498 | results$pval + 1.96*results$stderr 499 | 0.4189947 - 1.96*0.04481541 500 | 0.4189947 + 1.96*0.04481541 501 | 0.511937- 1.96*0.01520826 502 | 0.511937 + 1.96*0.01520826 503 | rbind(c(0.4189947 - 1.96*0.04481541, 0.4189947 + 1.96*0.04481541), c(results$pval - 1.96*results$stderr, results$pval + 1.96*results$stderr), c(0.511937- 1.96*0.01520826, 0.511937+ 1.96*0.01520826)) 504 | set.seed(123) 505 | results <- test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=10000) 506 | c(results$pval - 1.96*results$stderr, results$pval + 1.96*results$stderr) 507 | set.seed(123) 508 | results <- 
test_complete_hier_clusters_approx(X=dat, K=3, k1=1, k2=2, ndraws=2000) 509 | c(results$pval - 1.96*results$stderr, results$pval + 1.96*results$stderr) 510 | results$pval 511 | 1.96*results$stderr 512 | results 513 | -------------------------------------------------------------------------------- /R/trunc_inf.R: -------------------------------------------------------------------------------- 1 | # ----- functions to test the means of clusters ----- 2 | 3 | #' Exact significance test for hierarchical clustering 4 | #' 5 | #' This tests the null hypothesis of no difference in means between 6 | #' clusters \code{k1} and \code{k2} at level \code{K} in a hierarchical clustering. 7 | #'(The \code{K} clusters are numbered as per the results of the \code{cutree} 8 | #' function in the \code{stats} package.) 9 | #' 10 | #' In order to account for the fact that the clusters have been estimated from the data, 11 | #' the p-values are computed conditional on the fact that those clusters were estimated. 12 | #' This function computes p-values exactly via an analytic characterization of the conditioning set. 13 | #' 14 | #' Currently, this function supports SQUARED Euclidean distance as a measure of dissimilarity 15 | #' between observations, and the following six linkages: single, average, centroid, Ward, 16 | #' McQuitty (also known as WPGMA), and median (also knßown as WPGMC). 17 | #' 18 | #' By default, this function assumes that the covariance matrix of the features is isotropic 19 | #' i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that 20 | #' \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sigma} argument 21 | #' or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an 22 | #' estimate of \eqn{\sigma} or \eqn{\Sigma} will be used. 
23 | #' 24 | #' Note that passing in the SQUARED Euclidean distance object used by \code{hclust} in using the 25 | #' optional \code{dist} argument improves computational efficiency for all linkages except 26 | #' for single linkage. This may not lead to noticeable speed-ups in small data sets but 27 | #' leads to major speed-ups in large data sets. Thank you to Jesko Wagner for 28 | #' suggesting and implementing this change. 29 | #' 30 | #' @export 31 | #' 32 | #' @param X \eqn{n} by \eqn{p} matrix containing numeric data. 33 | #' @param link String selecting the linkage. Supported options are 34 | #' \code{"single", "average", "centroid", "ward.D", "median"}, and \code{"mcquitty"}. 35 | #' @param hcl Object of the type \code{hclust} containing the hierarchical clustering of X. 36 | #' @param K Integer selecting the total number of clusters. 37 | #' @param k1,k2 Integers selecting the clusters to test, as indexed by the results of \code{cutree(hcl, K)}. 38 | #' @param iso Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not. 39 | #' @param sig Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}. 40 | #' @param SigInv Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}. 
#' @param dist The SQUARED Euclidean distances of matrix X
#' @return
#' \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} }
#' \item{pval}{the p-value}
#' \item{trunc}{object of the type \code{Intervals} containing the conditioning set}
#'
#' @examples
#' # Simulates a 100 x 2 data set with three clusters
#' set.seed(123)
#' dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] +
#' matrix(0.2*rnorm(200), 100, 2)
#'
#' # Average linkage hierarchical clustering
#' hcl <- hclust(dist(dat, method="euclidean")^2, method="average")
#'
#' # plot dendrograms with the 1st and 2nd clusters (cut at the third split)
#' # displayed in blue and orange
#' plot(hcl)
#' rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange"))
#'
#' # tests for a difference in means between the blue and orange clusters
#' test_hier_clusters_exact(X=dat, link="average", hcl=hcl, K=3, k1=1, k2=2)
#'
#' @seealso \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram;
#'
#' \code{\link{test_complete_hier_clusters_approx}} for approximate p-values for complete linkage hierarchical clustering;
#'
#' \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function;
#'
#' @references Lucy L. Gao et al. "Selective inference for hierarchical clustering".
test_hier_clusters_exact <- function(X, link, hcl, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, dist=NULL) {
  # --- error checking ---
  if(!is.matrix(X)) stop("X should be a matrix")

  n <- nrow(X)
  q <- ncol(X)

  if(link == "complete") stop("Exact p-value not supported. See 'test_complete_hier_clusters_approx' for an approximate p-value.")
  if(!link %in% c("single", "average", "centroid", "ward.D", "mcquitty", "median")) stop("Linkage should be 'single', 'average', 'centroid', 'ward.D', 'mcquitty', or 'median'")
  if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n")
  # scalar checks: use short-circuiting || rather than elementwise |
  if(!is_integer_between_a_b(k1, 1, K) || !is_integer_between_a_b(k2, 1, K)) stop("cluster indices should be between 1 and K")
  # isTRUE/isFALSE are NA-safe, so iso=NA reaches this message instead of
  # crashing with "missing value where TRUE/FALSE needed"
  if(!isTRUE(iso) && !isFALSE(iso)) stop("iso should be TRUE or FALSE")

  # hierarchical clustering with squared Euclidean distance and specified linkage
  hcl_at_K <- stats::cutree(hcl, K)

  n1 <- sum(hcl_at_K == k1)
  n2 <- sum(hcl_at_K == k2)
  squared_norm_nu <- 1/n1 + 1/n2
  # difference between the two cluster mean vectors (drop=FALSE keeps matrices
  # even when a cluster has a single observation)
  diff_means <- colMeans(X[hcl_at_K == k1, , drop=FALSE]) - colMeans(X[hcl_at_K == k2, , drop=FALSE])

  # recompute the squared Euclidean distances only if the caller did not pass them in
  if(is.null(dist)) dist <- stats::dist(X)^2

  if(iso) {
    if(is.null(sig)) {
      # plug-in estimate of sigma under the isotropic model
      sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q))
    }

    # compute test statistic
    stat <- norm_vec(diff_means)

    # compute truncation set (link was validated above, so switch always matches)
    S <- switch(link,
                single   = compute_S_single(X, hcl, K, k1, k2, dist),
                average  = compute_S_average(X, hcl, K, k1, k2, dist),
                centroid = compute_S_centroid(X, hcl, K, k1, k2, dist),
                ward.D   = compute_S_ward(X, hcl, K, k1, k2, dist),
                mcquitty = compute_S_mcquitty(X, hcl, K, k1, k2, dist),
                median   = compute_S_median(X, hcl, K, k1, k2, dist))

    # set distribution of phi
    scale_factor <- squared_norm_nu*sig^2

  } else {
    if(is.null(SigInv)) {
      # plug-in estimate of Sigma^{-1} under the general covariance model
      Sig <- stats::cov(scale(X, scale=FALSE))
      SigInv <- solve(Sig)
    }

    # compute test statistic
    stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means))

    # compute truncation set (single linkage does not benefit from the
    # precomputed distances, hence no dist argument there)
    S <- switch(link,
                single   = compute_S_single_gencov(X, hcl, K, k1, k2, stat),
                average  = compute_S_average_gencov(X, hcl, K, k1, k2, stat, dist),
                centroid = compute_S_centroid_gencov(X, hcl, K, k1, k2, stat, dist),
                ward.D   = compute_S_ward_gencov(X, hcl, K, k1, k2, stat, dist),
                mcquitty = compute_S_mcquitty_gencov(X, hcl, K, k1, k2, stat, dist),
                median   = compute_S_median_gencov(X, hcl, K, k1, k2, stat, dist))

    # set distribution of phi
    scale_factor <- squared_norm_nu
  }

  # compute p-value using truncated chi-squared distribution:
  # P(chi^2 >= stat^2/scale | chi^2 in S^2/scale)
  gestat <- intervals::Intervals(c(stat^2/scale_factor, Inf))
  denom <- S^2/scale_factor
  numer <- suppressWarnings(intervals::interval_intersection(gestat, denom))
  pval <- TChisqRatioApprox(q, numer, denom)

  return(list(stat=stat, pval=pval, trunc=S))
}

#' Monte Carlo significance test for complete linkage hierarchical clustering
#'
#' This tests the null hypothesis of no difference in means between
#' clusters \code{k1} and \code{k2} at level \code{K} in a complete
#' linkage hierarchical clustering. (The \code{K} clusters are numbered as per
#' the results of the \code{cutree} function in the \code{stats} package.)
#'
#' Important note: Before calling \code{hclust} and this function, make sure to
#' load the package \code{fastcluster}. This is because the p-value approximation
#' procedure requires running hierarchical clustering on a large number of simulated
#' data sets, and the version of \code{hclust} in the \code{fastcluster} package
#' is much faster than the version of \code{hclust} in \code{stats}.
155 | #' 156 | #' In order to account for the fact that the clusters have been estimated from the data, 157 | #' the p-values are computed conditional on the fact that those clusters were estimated. 158 | #' This function approximates p-values via importance sampling. 159 | #' 160 | #' Currently, this function supports squared Euclidean distance as a measure of dissimilarity 161 | #' between observations. (Note that complete linkage is invariant under monotone transformations 162 | #' of the measure of dissimilarity between observations, so unsquared Euclidean distance 163 | #' would produce the same hierarchical clustering.) 164 | #' 165 | #' By default, this function assumes that the covariance matrix of the features is isotropic 166 | #' i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to false instead assumes that 167 | #' \eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sigma} argument 168 | #' or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an 169 | #' estimate of \eqn{\sigma} or \eqn{\Sigma} will be used. 170 | #' 171 | #' @export 172 | #' 173 | #' @param X \eqn{n} by \eqn{p} matrix containing numeric data. 174 | #' @param hcl An object of the type \code{hclust} containing the hierarchical clustering of X. 175 | #' @param K Integer selecting the total number of clusters. 176 | #' @param k1,k2 Integers selecting the clusters to test. 177 | #' @param iso Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not. 178 | #' @param sig Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}. 179 | #' @param SigInv Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}. 180 | #' @param ndraws Integer selecting the number of importance samples, default of 2000. 
181 | #' 182 | #' @return 183 | #' \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } 184 | #' \item{pval}{the approximate p-value} 185 | #' \item{stderr}{estimated standard error of the p-value estimate} 186 | #' 187 | #' @examples 188 | #' # Simulates a 100 x 2 data set with no clusters 189 | #' set.seed(1) 190 | #' dat <- matrix(rnorm(200), 100, 2) 191 | #' 192 | #' # Complete linkage hierarchical clustering 193 | #' library(fastcluster) 194 | #' hcl <- hclust(dist(dat, method="euclidean")^2, method="complete") 195 | #' 196 | #' # plot dendrograms with the 1st and 2nd clusters (cut at the third level) 197 | #' # displayed in blue and orange 198 | #' plot(hcl) 199 | #' rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) 200 | #' 201 | #' # Monte Carlo test for a difference in means between the blue and orange clusters 202 | #' test_complete_hier_clusters_approx(X=dat, hcl=hcl, K=3, k1=1, k2=2, ndraws=1000) 203 | #' 204 | #' @seealso \code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; 205 | #' 206 | #' \code{\link{test_hier_clusters_exact}} for exact p-values for hierarchical clustering with other linkages; 207 | #' 208 | #' \code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; 209 | #' 210 | #' @references Lucy L. Gao et al. "Selective inference for hierarchical clustering". 
test_complete_hier_clusters_approx <- function(X, hcl, K, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000) {
  # --- error checking ---
  if(!is.matrix(X)) stop("X should be a matrix")

  n <- nrow(X)
  q <- ncol(X)

  if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n")
  # scalar checks: use short-circuiting || rather than elementwise |
  if(!is_integer_between_a_b(k1, 1, K) || !is_integer_between_a_b(k2, 1, K)) stop("cluster indices should be between 1 and K")
  # isTRUE/isFALSE are NA-safe, so iso=NA reaches this message instead of
  # crashing with "missing value where TRUE/FALSE needed"
  if(!isTRUE(iso) && !isFALSE(iso)) stop("iso should be TRUE or FALSE")
  if(!("fastcluster" %in% .packages())) stop("The fastcluster package must be loaded before calling hclust and before calling this function!")


  hcl_at_K <- stats::cutree(hcl, K)

  k1_obs <- which(hcl_at_K == k1)
  k2_obs <- which(hcl_at_K == k2)
  n1 <- length(k1_obs)
  n2 <- length(k2_obs)
  squared_norm_nu <- 1/n1 + 1/n2
  # difference between the two cluster mean vectors (drop=FALSE keeps matrices
  # even when a cluster has a single observation)
  diff_means <- colMeans(X[k1_obs, , drop=FALSE]) - colMeans(X[k2_obs, , drop=FALSE])

  prop_k2 <- n2/(n1+n2)

  if(iso) {
    if(is.null(sig)) {
      # plug-in estimate of sigma under the isotropic model
      sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q))
    }

    scale_factor <- squared_norm_nu*sig^2
    # compute test statistic
    stat <- norm_vec(diff_means)
  } else {
    if(is.null(SigInv)) {
      # plug-in estimate of Sigma^{-1} under the general covariance model
      Sig <- stats::cov(scale(X, scale=FALSE))
      SigInv <- solve(Sig)
    }

    scale_factor <- squared_norm_nu

    # compute test statistic
    stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means))
  }

  scale_factor <- sqrt(scale_factor)
  # NA_real_ so the vector is numeric from the start (no logical->double coercion)
  log_survives <- rep(NA_real_, ndraws)
  # importance samples for phi, drawn around the observed statistic
  phi <- stats::rnorm(ndraws)*scale_factor + stat

  k1_constant <- prop_k2*diff_means/stat
  k2_constant <- (prop_k2 - 1)*diff_means/stat
  orig_k1 <- t(X[k1_obs, ])
  orig_k2 <- t(X[k2_obs, ])

  Xphi <- X

  # seq_len is safe for ndraws = 0, unlike 1:ndraws
  for(j in seq_len(ndraws)) {
    if(phi[j] < 0) next

    # Compute perturbed data set
    phi_minus_stat <- phi[j] - stat
    Xphi[k1_obs, ] <- t(orig_k1 + k1_constant*phi_minus_stat)
    Xphi[k2_obs, ] <- t(orig_k2 + k2_constant*phi_minus_stat)

    # Recluster the perturbed data set
    hcl_Xphi <- fastcluster::hclust(stats::dist(Xphi)^2, method="complete")
    clusters_Xphi <- stats::cutree(hcl_Xphi, K)
    if(same_cl(hcl_at_K, clusters_Xphi, K)) {
      # log importance weight: log target density (chi with q df, scaled)
      # minus log proposal density (normal around stat)
      log_survives[j] <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - lgamma(q/2) - log(scale_factor) -
        stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE)
    }
  }

  # Trim down to only survives
  phi <- phi[!is.na(log_survives)]
  log_survives <- log_survives[!is.na(log_survives)]

  survives <- length(log_survives)

  # Return nothing if nothing survives
  if(survives == 0) {
    warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.")
    return(list(stat=stat, pval=NA, stderr=NA))
  }

  # Approximate p-values (shift log weights by the max for numerical stability)
  log_survives_shift <- log_survives - max(log_survives)
  props <- exp(log_survives_shift)/sum(exp(log_survives_shift))
  pval <- sum(props[phi >= stat])

  var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2)

  return(list(stat=stat, pval=pval, stderr=sqrt(var_pval)))
}

#' Monte Carlo significance test for any clustering method
#'
#' This function performs a user-specified clustering method \code{cl_fun} on the rows of a
#' data matrix to obtain \code{K} clusters, and tests the null hypothesis of no difference in means
#' between clusters \code{k1} and \code{k2}.
#'
#' In order to account for the fact that the clusters have been estimated from the data,
#' the p-values are computed conditional on the fact that those clusters were estimated.
313 | #' This function approximates p-values via importance sampling. 314 | #' 315 | #' This function assumes that \code{cl_fun} takes a \eqn{n \times p} numeric data matrix as input 316 | #' and outputs integer assignments to clusters 1 through \code{K}. 317 | #' 318 | #' Thank you to August Guang for providing code to speed-up the function by 319 | #' parallelizing via the \code{future} package. 320 | #' 321 | #' @export 322 | #' 323 | #' @param X \eqn{n} by \eqn{p} matrix containing numeric data. 324 | #' @param k1,k2 Integers selecting the clusters to test. 325 | #' @param iso Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not. 326 | #' @param sig Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}. 327 | #' @param SigInv Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}. 328 | #' @param ndraws Integer selecting the number of importance samples, default of 2000. 329 | #' @param cl_fun Function returning assignments to clusters 1 through \code{K}. 330 | #' @param cl Optionally pass in the results of calling \code{cl_fun} on your data. This is for 331 | #' efficiency and reproducibility (when the clustering function is non-deterministic). 
#'
#' @return
#' \item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} }
#' \item{pval}{the approximate p-value}
#' \item{stderr}{standard error of the p-value estimate}
#' \item{clusters}{the estimated cluster assignments}
#'
#' @examples
#' # Simulates a 100 x 2 data set with three clusters
#' set.seed(123)
#' dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] +
#'   matrix(0.2*rnorm(200), 100, 2)
#'
#' # Function to run k-means clustering w/ k = 3 and 50 random starts
#' km_cluster <- function(X) {
#'   km <- kmeans(X, 3, nstart=50)
#'   return(km$cluster)
#' }
#'
#' # Cluster data using k-means
#' clusters <- km_cluster(dat)
#' table(rep(1:3, length=100), clusters)
#'
#' # Tests for a difference in means between clusters 1 and 2,
#' # passing in the k-means clustering results computed earlier
#' results <- test_clusters_approx(dat, k1=1, k2=2, cl_fun=km_cluster, ndraws=500, cl=clusters)
#' results$stat
#' results$pval
#' results$stderr
#'
#' @references Lucy L. Gao et al. "Selective inference for hierarchical clustering".
test_clusters_approx <- function(X, k1, k2, iso=TRUE, sig=NULL, SigInv=NULL, ndraws=2000, cl_fun, cl=NULL) {
  # --- error checking ---
  if(!is.matrix(X)) stop("X should be a matrix")

  n <- nrow(X)
  q <- ncol(X)

  # cluster the data unless the caller supplied an existing clustering
  if(is.null(cl)) cl <- cl_fun(X)
  K <- length(unique(cl))

  if(!is_integer_between_a_b(K, 2, n)) stop("number of clusters (K) should be between 2 and n")
  # scalar conditions: use short-circuiting ||/&& rather than vectorized |/&
  if(!is_integer_between_a_b(k1, 1, K) || !is_integer_between_a_b(k2, 1, K)) stop("cluster indices should be between 1 and K")
  if((iso != TRUE) && (iso != FALSE)) stop("iso should be TRUE or FALSE")

  n1 <- sum(cl == k1)
  n2 <- sum(cl == k2)
  squared_norm_nu <- 1/n1 + 1/n2
  diff_means <- colMeans(X[cl == k1, , drop=FALSE]) - colMeans(X[cl == k2, , drop=FALSE])

  prop_k2 <- n2/(n1+n2)

  if(iso) {
    # isotropic model: estimate sigma from the centered data if not supplied
    if(is.null(sig)) {
      sig <- sqrt(sum(scale(X, scale=FALSE)^2)/(n*q - q))
    }

    scale_factor <- squared_norm_nu*sig^2
    # test statistic: Euclidean distance between the two cluster means
    stat <- norm_vec(diff_means)
  } else {
    # general covariance model: estimate Sigma^{-1} if not supplied
    if(is.null(SigInv)) {
      Sig <- stats::cov(scale(X, scale=FALSE))
      SigInv <- solve(Sig)
    }

    scale_factor <- squared_norm_nu

    # test statistic: Mahalanobis-type distance between the two cluster means
    stat <- sqrt(as.numeric(t(diff_means)%*%SigInv%*%diff_means))
  }

  scale_factor <- sqrt(scale_factor)
  # importance samples of the test statistic, proposed around the observed value
  phi <- stats::rnorm(ndraws)*scale_factor + stat

  k1_constant <- prop_k2*diff_means/stat
  k2_constant <- (prop_k2 - 1)*diff_means/stat
  # drop=FALSE keeps these as q x n_k matrices even when a cluster contains a
  # single observation; without it, X[cl == k1, ] collapses to a vector and t()
  # yields a 1 x q matrix, so the perturbation below recycles incorrectly
  orig_k1 <- t(X[cl == k1, , drop=FALSE])
  orig_k2 <- t(X[cl == k2, , drop=FALSE])

  # parallelized over draws via the future framework; each worker builds its
  # own perturbed copy of X, so no shared state is mutated
  log_survives <- unlist(future.apply::future_lapply(X = seq_len(ndraws), FUN = function(j) {
    # negative draws cannot occur for a distance statistic; skip them
    if(phi[j] < 0) return(NA)

    # Compute perturbed data set
    Xphi <- X
    Xphi[cl == k1, ] <- t(orig_k1 + (phi[j] - stat)*k1_constant)
    Xphi[cl == k2, ] <- t(orig_k2 + (phi[j] - stat)*k2_constant)

    # Recluster the perturbed data set
    cl_Xphi <- cl_fun(Xphi)
    if(preserve_cl(cl, cl_Xphi, k1, k2)) {
      # log importance weight: log of the scaled chi_q target density at phi[j]
      # minus the log of the Gaussian proposal density
      log_survives <- -(phi[j]/scale_factor)^2/2 + (q-1)*log(phi[j]/scale_factor) - (q/2 - 1)*log(2) - lgamma(q/2) - log(scale_factor) -
        stats::dnorm(phi[j], mean=stat, sd=scale_factor, log=TRUE)
      return(log_survives)
    }

    return(NA)

  }, future.seed=TRUE))

  # Trim down to only the draws that preserved the clustering
  phi <- phi[!is.na(log_survives)]
  log_survives <- log_survives[!is.na(log_survives)]

  survives <- length(log_survives)

  # Return nothing if nothing survives
  if(survives == 0) {
    warning("Oops - we didn't generate any samples that preserved the clusters! Try re-running with a larger value of ndraws.")
    return(list(stat=stat, pval=NA, stderr=NA, clusters=cl))
  }

  # Approximate p-value via self-normalized importance sampling;
  # shift by the max log-weight before exponentiating for numerical stability
  log_survives_shift <- log_survives - max(log_survives)
  props <- exp(log_survives_shift)/sum(exp(log_survives_shift))
  pval <- sum(props[phi >= stat])

  var_pval <- (1 - pval)^2*sum(props[phi >= stat]^2) + pval^2*sum(props[phi < stat]^2)

  return(list(stat=stat, pval=pval, stderr=sqrt(var_pval), clusters=cl))
}