├── data └── EVnormal.rda ├── docs ├── pkgdown.yml ├── reference │ ├── pkgdown.yml │ ├── energy-deprecated.html │ └── energy-defunct.html ├── link.svg ├── docsearch.js ├── pkgdown.js ├── 404.html └── authors.html ├── R ├── pdcor.R ├── energy-defunct.R ├── indep-deprecated.R ├── Ecluster.R ├── centering.R ├── mutual-indep.R ├── dcovu.R ├── RcppExports.R ├── energy-deprecated.R ├── mvI.R ├── kgroups.R ├── dcorT.R ├── edist.R ├── pdcov-test.R ├── util.R ├── Epoisson.R ├── Eeqdist.R ├── dcov.R ├── Emvnorm.R ├── dcov2d.R └── disco.R ├── src ├── U-product.cpp ├── Rcpp-utilities.cpp ├── dcovU.cpp ├── projection.cpp ├── poissonM.cpp ├── centering.cpp ├── energy_init.c ├── partial-dcor.cpp ├── mvI.cpp ├── B-tree.cpp ├── kgroups.cpp ├── dcov.c ├── RcppExports.cpp └── utilities.c ├── README.md ├── man ├── energy-defunct.Rd ├── energy-deprecated.Rd ├── sortrank.Rd ├── eigen.Rd ├── U_product.Rd ├── energy-package.Rd ├── dmatrix.Rd ├── mutualIndep.Rd ├── pdcor.Rd ├── centering.Rd ├── dcovU_stats.Rd ├── dcovu.Rd ├── dcorT.Rd ├── normalGOF.Rd ├── mvnorm-test.Rd ├── dcov2d.Rd ├── kgroups.Rd ├── poisson.Rd ├── indep-deprecated.Rd ├── edist.Rd ├── mvI.test.Rd ├── dcov.Rd ├── disco.Rd ├── dcov.test.Rd ├── energy.hclust.Rd └── eqdist.etest.Rd ├── NAMESPACE └── DESCRIPTION /data/EVnormal.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mariarizzo/energy/HEAD/data/EVnormal.rda -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: '3.3' 2 | pkgdown: 2.1.0 3 | pkgdown_sha: ~ 4 | articles: {} 5 | last_built: 2024-08-27T19:42Z 6 | -------------------------------------------------------------------------------- /docs/reference/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: '3.3' 2 | pkgdown: 2.1.0 3 | pkgdown_sha: ~ 4 | articles: {} 5 | last_built: 2024-08-25T21:46Z 6 | -------------------------------------------------------------------------------- /R/pdcor.R: -------------------------------------------------------------------------------- 1 | ## pdcor.R 2 | ## 3 | ## 4 | 5 | pdcor <- function(x, y, z) { 6 | x <- .arg2dist.matrix(x) 7 | y <- .arg2dist.matrix(y) 8 | z <- .arg2dist.matrix(z) 9 | partial_dcor(x, y, z)["pdcor"] 10 | } 11 | 12 | pdcov <- function(x, y, z) { 13 | x <- .arg2dist.matrix(x) 14 | y <- .arg2dist.matrix(y) 15 | z <- .arg2dist.matrix(z) 16 | partial_dcov(x, y, z) 17 | } 18 | 19 | -------------------------------------------------------------------------------- /R/energy-defunct.R: -------------------------------------------------------------------------------- 1 | ## defunct functions from the energy package 2 | 3 | dcor.ttest <- function(x, y, distance=FALSE) { 4 | .Defunct(new = "dcorT.test", package = "energy", 5 | msg = "dcort.ttest replaced by dcorT.test") 6 | } 7 | 8 | dcor.t <- function(x, y, distance=FALSE) { 9 | .Deprecated(new = "dcorT", package = "energy", 10 | msg = "dcor.t replaced by dcorT") 11 | } 12 | 13 | 14 | -------------------------------------------------------------------------------- /R/indep-deprecated.R: -------------------------------------------------------------------------------- 1 | # deprecated independence test 2 | 3 | indep.test<- 4 | function(x, y, method = c("dcov","mvI"), index = 1, R) { 5 | # two energy tests for multivariate independence 6 | .Deprecated(new = "dcov.test", package = "energy", 7 | 
msg = "indep.test is deprecated, 8 | replaced by dcov.test or mvI.test") 9 | 10 | type <- match.arg(method) 11 | if (type == "dcov") 12 | return(dcov.test(x, y, index, R)) else 13 | if (type == "mvI") 14 | return(mvI.test(x, y, R)) 15 | } 16 | -------------------------------------------------------------------------------- /src/U-product.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | /* 5 | Author: Maria L. Rizzo 6 | energy package 7 | github.com/mariarizzo/energy 8 | */ 9 | 10 | 11 | // [[Rcpp::export]] 12 | double U_product(NumericMatrix U, NumericMatrix V) { 13 | // U and V are U-centered dissimilarity matrices of the two samples 14 | int n = U.nrow(); 15 | int i, j; 16 | double sums = 0.0; 17 | 18 | for (i = 0; i < n; i++) 19 | for (j=0; j 2) 10 | warning("Exponent alpha should be in (0,2]") 11 | if (alpha < 0) 12 | stop("Cannot use negative exponent on distance.") 13 | d <- d^alpha 14 | } 15 | ## heights of hclust are half of energy; otherwise equivalent 16 | return(hclust(d, method = "ward.D")) 17 | } 18 | 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # energy 2 | energy package for R 3 | 4 | The energy package for R implements several methods in multivariate analysis 5 | and multivariate inference based on the energy distance, which characterizes 6 | equality of distributions. 7 | 8 | Distance correlation (multivariate independence), disco (nonparametric extension 9 | of ANOVA), and goodness-of-fit tests are examples of some of the methods included. 10 | 11 | energy is named based on the analogy with potential energy in physics. See 12 | the references in the manual for more details. 13 | 14 | 15 | [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/energy)](https://cran.r-project.org/package=energy) 16 | 17 | -------------------------------------------------------------------------------- /man/energy-defunct.Rd: -------------------------------------------------------------------------------- 1 | \name{dcor.ttest} 2 | \alias{dcor.ttest} 3 | \alias{dcor.t} 4 | \title{ Distance Correlation t-test for High Dimensions} 5 | \description{Defunct: use \code{dcorT.test} and \code{dcorT}.} 6 | \usage{ 7 | dcor.t(x, y, distance = FALSE) 8 | dcor.ttest(x, y, distance = FALSE) 9 | } 10 | \arguments{ 11 | \item{x}{ data or distances of first sample} 12 | \item{y}{ data or distances of second sample} 13 | \item{distance}{ TRUE if x and y are distances, otherwise FALSE} 14 | } 15 | \details{ 16 | See \code{\link{dcorT}}. 17 | } 18 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 19 | Gabor J. Szekely 20 | } 21 | \keyword{ htest } 22 | \keyword{ multivariate } 23 | \concept{ energy statistics } 24 | 25 | -------------------------------------------------------------------------------- /src/Rcpp-utilities.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | /* 5 | Author: Maria L. 
Rizzo 6 | energy package 7 | github.com/mariarizzo/energy 8 | */ 9 | 10 | 11 | // [[Rcpp::export]] 12 | NumericMatrix calc_dist(NumericMatrix x) { 13 | int n = x.nrow(), d = x.ncol(), i, j, k; 14 | double dsum, dk; 15 | NumericMatrix Dx(n, n); 16 | for (i = 0; i < n; i++) { 17 | for (j = i; j < n; j++) { 18 | if (i == j) { 19 | Dx(i, i) = 0.0; 20 | } else { 21 | dsum = 0.0; 22 | for (k = 0; k < d; k++) { 23 | dk = x(i,k) - x(j,k); 24 | dsum += dk * dk; 25 | } 26 | Dx(i, j) = sqrt(dsum); 27 | Dx(j, i) = Dx(i, j); 28 | } 29 | } 30 | } 31 | return Dx; 32 | } 33 | 34 | 35 | -------------------------------------------------------------------------------- /man/energy-deprecated.Rd: -------------------------------------------------------------------------------- 1 | \name{energy-deprecated} 2 | \alias{DCOR} 3 | \title{ Deprecated Functions} 4 | \description{ These deprecated functions have been replaced by revised functions and will be removed in future releases of the energy package.} 5 | \usage{ 6 | DCOR(x, y, index=1.0) 7 | } 8 | \arguments{ 9 | \item{x}{ data or distances of first sample} 10 | \item{y}{ data or distances of second sample} 11 | \item{index}{ exponent on Euclidean distance in (0, 2)} 12 | } 13 | \details{ 14 | DCOR is an R version replaced by faster compiled code. 15 | } 16 | \keyword{ multivariate } 17 | \keyword{ nonparametric } 18 | \concept{ independence } 19 | \concept{ multivariate } 20 | \concept{ distance correlation } 21 | \concept{ distance covariance } 22 | \concept{ energy statistics } 23 | 24 | -------------------------------------------------------------------------------- /R/centering.R: -------------------------------------------------------------------------------- 1 | ## use the Rcpp exported function U_center or D_center 2 | ## the utilities in this file are provided for reference and historical reasons 3 | 4 | Dcenter <- function(x) { 5 | ## x is a dist object or data matrix 6 | if (!inherits(x, "dist")) x <- dist(x) 7 | d <- as.matrix(x) 8 | n <- nrow(d) 9 | m <- rowSums(d) 10 | M <- sum(m) / n^2 11 | m <- m / n 12 | a <- sweep(d, 1, m) 13 | b <- sweep(a, 2, m) 14 | B <- b + M 15 | } 16 | 17 | Ucenter <- function(x) { 18 | ## x is a dist object or data matrix 19 | if (!inherits(x, "dist")) x <- dist(x) 20 | d <- as.matrix(x) 21 | n <- nrow(d) 22 | m <- rowSums(d) 23 | M <- sum(m) / ((n-1)*(n-2)) 24 | m <- m / (n-2) 25 | a <- sweep(d, 1, m) 26 | b <- sweep(a, 2, m) 27 | B <- b + M 28 | diag(B) <- 0 29 | B 30 | } 31 | 32 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /R/mutual-indep.R: -------------------------------------------------------------------------------- 1 | mutualIndep.test <- 2 | function(x, R) { 3 | if (NCOL(x) < 2) { 4 | stop("Expecting two or more samples") 5 | } 6 | bootfn <- function(x, i) { 7 | d <- ncol(x) 8 | dc <- numeric(d-1) 9 | for (k in 1:(d-1)) { 10 | dc[k] <- energy::bcdcor(x[i,k], x[,(k+1):d]) 11 | } 12 | return (dc) 13 | } 14 | 15 | b <- boot::boot(x, bootfn, sim="permutation", R=R) 16 | t0 <- sum(b$t0) 17 | tp <- rowSums(b$t) 18 | pval <- (1 + sum(tp > t0)) / (R + 1) 19 | estimate <- round(b$t0, 3) 20 | names(t0) <- "Sum(R*)" 21 | names(estimate) <- paste0("R*", 1:length(b$t0)) 22 | method <- paste("Energy Test of Mutual Independence") 23 | call <- match.call() 24 | NOTE <- 
"statistic=sum(bcdcor); permutation test" 25 | rval <- list(statistic = t0, p.value = pval, call = call, 26 | data.name=paste(deparse(substitute(x))," dim ", paste(dim(x), collapse=",")), 27 | estimate=estimate, method=method, note=NOTE) 28 | class(rval) <- "power.htest" 29 | return(rval) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/sortrank.Rd: -------------------------------------------------------------------------------- 1 | \name{sortrank} 2 | \alias{sortrank} 3 | \title{ Sort, order and rank a vector } 4 | \description{ 5 | A utility that returns a list with the components 6 | equivalent to sort(x), order(x), rank(x, ties.method = "first"). 7 | } 8 | \usage{ 9 | sortrank(x) 10 | } 11 | \arguments{ 12 | \item{x}{ vector compatible with sort(x)} 13 | } 14 | \details{ 15 | This utility exists to save a little time on large vectors when two or all three of the sort(), order(), rank() results are required. In case of ties, the ranks component matches \code{rank(x, ties.method = "first")}. 16 | } 17 | \value{ 18 | A list with components 19 | \item{x}{the sorted input vector x} 20 | \item{ix}{the permutation = order(x) which rearranges x into ascending order} 21 | \item{r}{the ranks of x} 22 | } 23 | \note{ 24 | This function was benchmarked faster than the combined calls to \code{sort} and \code{rank}. 25 | } 26 | \examples{ 27 | sortrank(rnorm(5)) 28 | } 29 | \references{ 30 | See \code{\link{sort}}. 31 | } 32 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} 33 | } 34 | 35 | -------------------------------------------------------------------------------- /R/dcovu.R: -------------------------------------------------------------------------------- 1 | ## dcovu.R 2 | ## unbiased dcov^2 and bias-corrected dcor^2 3 | ## 4 | 5 | 6 | bcdcor <- function(x, y) { 7 | ## compute bias corrected distance correlation 8 | dcorU(x, y) 9 | } 10 | 11 | dcovU <- 12 | function(x, y) { 13 | ## unbiased dcov^2 14 | if (!inherits(x, "dist")) x <- dist(x) 15 | if (!inherits(y, "dist")) y <- dist(y) 16 | x <- as.matrix(x) 17 | y <- as.matrix(y) 18 | n <- nrow(x) 19 | m <- nrow(y) 20 | if (n != m) stop("sample sizes must agree") 21 | if (! (all(is.finite(c(x, y))))) 22 | stop("data contains missing or infinite values") 23 | 24 | estimates <- dcovU_stats(x, y) #RcppExports 25 | return (estimates[1]) 26 | } 27 | 28 | dcorU <- 29 | function(x, y) { 30 | ## unbiased dcov^2 31 | x <- .arg2dist.matrix(x) 32 | y <- .arg2dist.matrix(y) 33 | n <- nrow(x) 34 | m <- nrow(y) 35 | if (n != m) stop("sample sizes must agree") 36 | if (! (all(is.finite(c(x, y))))) 37 | stop("data contains missing or infinite values") 38 | 39 | estimates <- dcovU_stats(x, y) #RcppExports 40 | return (estimates[2]) 41 | } 42 | -------------------------------------------------------------------------------- /man/eigen.Rd: -------------------------------------------------------------------------------- 1 | \name{EVnormal} 2 | \docType{data} 3 | \alias{EVnormal} 4 | \alias{eigenvalues} 5 | \title{Eigenvalues for the energy Test of Univariate Normality} 6 | \description{ 7 | Pre-computed eigenvalues corresponding to the asymptotic sampling 8 | distribution of the energy test statistic for univariate 9 | normality, under the null hypothesis. Four Cases are computed: 10 | \enumerate{ 11 | \item Simple hypothesis, known parameters. 12 | \item Estimated mean, known variance. 13 | \item Known mean, estimated variance. 14 | \item Composite hypothesis, estimated parameters. 
15 | } 16 | Case 4 eigenvalues are used in the test function \code{normal.test} 17 | when \code{method=="limit"}. 18 | } 19 | \usage{data(EVnormal)} 20 | \format{Numeric matrix with 125 rows and 5 columns; 21 | column 1 is the index, and columns 2-5 are 22 | the eigenvalues of Cases 1-4.} 23 | \source{Computed} 24 | \references{ 25 | Szekely, G. J. and Rizzo, M. L. (2005) A New Test for 26 | Multivariate Normality, \emph{Journal of Multivariate Analysis}, 27 | 93/1, 58-80, 28 | \doi{10.1016/j.jmva.2003.12.002}. 29 | } 30 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(energy, .registration=TRUE) 2 | 3 | importFrom(Rcpp, evalCpp) 4 | importFrom("stats", "as.dist", "dist", "dnorm", "hclust", "model.matrix", 5 | "pnorm", "ppois", "pt", "rnorm", "rpois", "sd", "var") 6 | importFrom(boot, boot) 7 | importFrom(gsl, hyperg_1F1) 8 | 9 | export( 10 | bcdcor, 11 | calc_dist, 12 | D_center, 13 | Dcenter, 14 | dcor, 15 | dcor2d, 16 | DCOR, 17 | dcor.test, 18 | dcorT, 19 | dcorT.test, 20 | dcov, 21 | dcov2d, 22 | dcov.test, 23 | dcovU, 24 | dcovU_stats, 25 | disco, 26 | disco.between, 27 | edist, 28 | energy.hclust, 29 | eqdist.e, 30 | eqdist.etest, 31 | indep.test, 32 | is.dmatrix, 33 | kgroups, 34 | ksample.e, 35 | mutualIndep.test, 36 | mvI, 37 | mvI.test, 38 | mvnorm.e, 39 | mvnorm.etest, 40 | mvnorm.test, 41 | normal.e, 42 | normal.test, 43 | pdcor, 44 | pdcor.test, 45 | pdcov, 46 | pdcov.test, 47 | poisson.e, 48 | poisson.etest, 49 | poisson.m, 50 | poisson.mtest, 51 | poisson.tests, 52 | sortrank, 53 | U_center, 54 | U_product, 55 | Ucenter 56 | ) 57 | 58 | S3method(print, disco) 59 | S3method(print, kgroups) 60 | S3method(fitted, kgroups) 61 | -------------------------------------------------------------------------------- /src/dcovU.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | 5 | // Author: Maria L. Rizzo 6 | // energy package 7 | // github.com/mariarizzo/energy 8 | 9 | NumericMatrix U_center(NumericMatrix); 10 | 11 | //[[Rcpp::export]] 12 | NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy) { 13 | // x and y must be square distance matrices 14 | NumericMatrix A = U_center(Dx); 15 | NumericMatrix B = U_center(Dy); 16 | double ab = 0.0, aa = 0.0, bb = 0.0; 17 | double V, dcorU = 0.0; 18 | double eps = std::numeric_limits::epsilon(); //machine epsilon 19 | int n = Dx.nrow(); 20 | int n2 = n * (n - 3); 21 | 22 | for (int i=0; i eps) 34 | dcorU = ab / sqrt(V); 35 | 36 | return NumericVector::create( 37 | _["dCovU"] = ab, 38 | _["bcdcor"] = dcorU, 39 | _["dVarXU"] = aa, 40 | _["dVarYU"] = bb 41 | ); 42 | } -------------------------------------------------------------------------------- /src/projection.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | /* 5 | Author: Maria L. 
Rizzo 6 | energy package 7 | github.com/mariarizzo/energy 8 | */ 9 | 10 | 11 | NumericMatrix U_center(NumericMatrix); 12 | double U_product(NumericMatrix, NumericMatrix); 13 | 14 | // [[Rcpp::export]] 15 | NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz) { 16 | /* 17 | returns the projection of A(x) distance matrix Dx onto the 18 | orthogonal complement of C(z) distance matrix; 19 | both Dx and Dz are n by n distance or dissimilarity matrices 20 | the projection is an n by n matrix 21 | */ 22 | int n = Dx.nrow(); 23 | int i, j; 24 | NumericMatrix A(n, n), C(n, n), P(n, n); 25 | double AC, CC, c1; 26 | double eps = std::numeric_limits::epsilon(); //machine epsilon 27 | 28 | A = U_center(Dx); // U-centering to get A^U etc. 29 | C = U_center(Dz); 30 | AC = U_product(A, C); // (A,C) = dcov^U 31 | CC = U_product(C, C); 32 | c1 = 0.0; 33 | // if (C,C)==0 then C==0 so c1=(A,C)=0 34 | if (fabs(CC) > eps) 35 | c1 = AC / CC; 36 | for (i=0; i= 0.12.6), stats, boot, gsl 16 | LinkingTo: Rcpp 17 | Suggests: 18 | MASS, 19 | CompQuadForm, 20 | knitr, 21 | rmarkdown 22 | Depends: R (>= 3.1) 23 | URL: https://github.com/mariarizzo/energy 24 | License: GPL (>= 2) 25 | LazyData: true 26 | NeedsCompilation: yes 27 | Repository: CRAN 28 | RoxygenNote: 7.2.3 29 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | Btree_sum <- function(y, z) { 5 | .Call(`_energy_Btree_sum`, y, z) 6 | } 7 | 8 | calc_dist <- function(x) { 9 | .Call(`_energy_calc_dist`, x) 10 | } 11 | 12 | U_product <- function(U, V) { 13 | .Call(`_energy_U_product`, U, V) 14 | } 15 | 16 | D_center <- function(Dx) { 17 | .Call(`_energy_D_center`, Dx) 18 | } 19 | 20 | U_center <- function(Dx) { 21 | .Call(`_energy_U_center`, Dx) 22 | } 23 | 24 | dcovU_stats <- function(Dx, Dy) { 25 | .Call(`_energy_dcovU_stats`, Dx, Dy) 26 | } 27 | 28 | kgroups_start <- function(x, k, clus, iter_max, distance) { 29 | .Call(`_energy_kgroups_start`, x, k, clus, iter_max, distance) 30 | } 31 | 32 | Istat <- function(Dx, Dy) { 33 | .Call(`_energy_Istat`, Dx, Dy) 34 | } 35 | 36 | Istats <- function(Dx, Dy, R) { 37 | .Call(`_energy_Istats`, Dx, Dy, R) 38 | } 39 | 40 | partial_dcor <- function(Dx, Dy, Dz) { 41 | .Call(`_energy_partial_dcor`, Dx, Dy, Dz) 42 | } 43 | 44 | partial_dcov <- function(Dx, Dy, Dz) { 45 | .Call(`_energy_partial_dcov`, Dx, Dy, Dz) 46 | } 47 | 48 | .poisMstat <- function(x) { 49 | .Call(`_energy_poisMstat`, x) 50 | } 51 | 52 | projection <- function(Dx, Dz) { 53 | .Call(`_energy_projection`, Dx, Dz) 54 | } 55 | 56 | -------------------------------------------------------------------------------- /R/energy-deprecated.R: -------------------------------------------------------------------------------- 1 | ## deprecated functions in energy package 2 | 3 | 4 | DCOR <- 5 | function(x, y, index=1.0) { 6 | # distance covariance and correlation statistics 7 | # alternate method, implemented in R without .C call 8 | # this method is usually slower than the C version 9 | 10 | 11 | .Deprecated(new = "dcor", package = "energy", 12 | msg = "DCOR is deprecated, replaced by dcor or dcov") 13 | 14 | if (!inherits(x, "dist")) x <- dist(x) 15 | if (!inherits(y, "dist")) y <- dist(y) 16 | x <- as.matrix(x) 17 | y <- as.matrix(y) 18 | n <- nrow(x) 19 | m <- nrow(y) 20 | if (n != m) stop("Sample 
sizes must agree") 21 | if (! (all(is.finite(c(x, y))))) 22 | stop("Data contains missing or infinite values") 23 | if (index < 0 || index > 2) { 24 | warning("index must be in [0,2), using default index=1") 25 | index=1.0} 26 | 27 | stat <- 0 28 | dims <- c(n, ncol(x), ncol(y)) 29 | 30 | Akl <- function(x) { 31 | d <- as.matrix(x)^index 32 | m <- rowMeans(d) 33 | M <- mean(d) 34 | a <- sweep(d, 1, m) 35 | b <- sweep(a, 2, m) 36 | return(b + M) 37 | } 38 | 39 | A <- Akl(x) 40 | B <- Akl(y) 41 | dCov <- sqrt(mean(A * B)) 42 | dVarX <- sqrt(mean(A * A)) 43 | dVarY <- sqrt(mean(B * B)) 44 | V <- sqrt(dVarX * dVarY) 45 | if (V > 0) 46 | dCor <- dCov / V else dCor <- 0 47 | return(list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY)) 48 | } 49 | 50 | -------------------------------------------------------------------------------- /man/U_product.Rd: -------------------------------------------------------------------------------- 1 | \name{U_product} 2 | \alias{U_product} 3 | \title{ Inner product in the Hilbert space of U-centered 4 | distance matrices} 5 | \description{ 6 | Stand-alone function to compute the inner product in the 7 | Hilbert space of U-centered distance matrices, as in the definition of 8 | partial distance covariance. 9 | } 10 | \usage{ 11 | U_product(U, V) 12 | } 13 | \arguments{ 14 | \item{U}{ U-centered distance matrix} 15 | \item{V}{ U-centered distance matrix} 16 | } 17 | \details{ 18 | Note that \code{pdcor}, etc. functions include the centering and 19 | projection operations, so that these stand alone versions are not 20 | needed except in case one wants to check the internal computations. 21 | 22 | Exported from U_product.cpp. 23 | } 24 | \value{ 25 | \code{U_product} returns the inner product, a scalar. 26 | } 27 | \references{ 28 | Szekely, G.J. and Rizzo, M.L. (2014), 29 | Partial Distance Correlation with Methods for Dissimilarities, 30 | \emph{Annals of Statistics}, Vol. 42, No. 6, pp. 2382-2412.} 31 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 32 | Gabor J. 
Szekely 33 | } 34 | \examples{ 35 | x <- iris[1:10, 1:4] 36 | y <- iris[11:20, 1:4] 37 | M1 <- as.matrix(dist(x)) 38 | M2 <- as.matrix(dist(y)) 39 | U <- U_center(M1) 40 | V <- U_center(M2) 41 | 42 | U_product(U, V) 43 | dcovU_stats(M1, M2) 44 | } 45 | \keyword{ multivariate } 46 | \concept{ multivariate } 47 | \concept{ distance correlation } 48 | \concept{ distance covariance } 49 | \concept{ energy statistics } 50 | -------------------------------------------------------------------------------- /R/mvI.R: -------------------------------------------------------------------------------- 1 | indep.test<- 2 | function(x, y, method = c("dcov","mvI"), index = 1, R) { 3 | # two energy tests for multivariate independence 4 | type <- match.arg(method) 5 | if (type == "dcov") 6 | return(dcov.test(x, y, index, R)) else 7 | if (type == "mvI") 8 | return(mvI.test(x, y, R)) 9 | } 10 | 11 | mvI <- 12 | function(x, y) { 13 | # energy statistic for multivariate independence 14 | # returns dependence coefficient I_n 15 | n <- NROW(x) 16 | m <- NROW(y) 17 | Dx <- .arg2dist.matrix(x) 18 | Dy <- .arg2dist.matrix(y) 19 | return(Istat(Dx, Dy)) #Rcpp 20 | } 21 | 22 | mvI.test<- 23 | function(x, y, R) { 24 | # an energy test for multivariate independence 25 | # not based on dCov or dCor 26 | n <- NROW(x) 27 | m <- NROW(y) 28 | if (n != m || n < 2) stop("Sample sizes must agree") 29 | Dx <- .arg2dist.matrix(x) 30 | Dy <- .arg2dist.matrix(y) 31 | stats <- Istats(Dx, Dy, R) 32 | stat <- n * stats[1]^2 33 | est <- stats[1] 34 | names(est) <- "I" 35 | names(stat) <- "n I^2" 36 | dataname <- paste("x (",n," by ",ncol(x), "), y(",n," by ", ncol(y), "), replicates ", R, sep="") 37 | if (R > 0) { 38 | p.value = (1 + sum(stats[-1] > est)) / (R+1) 39 | } else { 40 | p.value = NA 41 | } 42 | e <- list( 43 | method = "mvI energy test of independence", 44 | statistic = stat, 45 | estimate = est, 46 | replicates = stats[-1], 47 | p.value = p.value, 48 | data.name = dataname) 49 | class(e) <- "htest" 50 | e 51 | } 52 | -------------------------------------------------------------------------------- /src/poissonM.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | /* 5 | Author: Maria L. 
Rizzo 6 | energy package 7 | github.com/mariarizzo/energy 8 | */ 9 | 10 | 11 | // [[Rcpp::export(.poisMstat)]] 12 | NumericVector poisMstat(IntegerVector x) 13 | { 14 | /* computes the Poisson mean distance statistic */ 15 | int i, j, k, n=x.size(); 16 | double eps=1.0e-10; 17 | double ad, cvm, d, lambda, m, q; 18 | double Mcdf1, Mcdf0, Mpdf1, cdf1, cdf0; 19 | NumericVector stats(2); 20 | 21 | lambda = mean(x); 22 | q = R::qpois(1.0-eps, lambda, TRUE, FALSE) + 1; 23 | 24 | m = 0.0; 25 | for (j=0; j 1) Mcdf1 = 1.0; 45 | 46 | cdf1 = R::ppois(i, lambda, TRUE, FALSE); /* MLE of F(i) */ 47 | d = Mcdf1 - cdf1; 48 | cvm += d * d * (cdf1 - cdf0); 49 | ad += d * d * (cdf1 - cdf0) / (cdf1 * (1-cdf1)); 50 | 51 | cdf0 = cdf1; 52 | Mcdf0 = Mcdf1; 53 | } 54 | cvm *= n; 55 | ad *= n; 56 | stats(0) = cvm; 57 | stats(1) = ad; 58 | return stats; 59 | } 60 | -------------------------------------------------------------------------------- /man/energy-package.Rd: -------------------------------------------------------------------------------- 1 | \name{energy-package} 2 | \alias{energy-package} 3 | \alias{energy} 4 | \docType{package} 5 | \title{ 6 | E-statistics: Multivariate Inference via the Energy of Data 7 | } 8 | \description{ 9 | Description: E-statistics (energy) tests and statistics for multivariate and univariate inference, 10 | including distance correlation, one-sample, two-sample, and multi-sample tests for 11 | comparing multivariate distributions, are implemented. Measuring and testing 12 | multivariate independence based on distance correlation, partial distance 13 | correlation, multivariate goodness-of-fit tests, clustering based on energy distance, 14 | testing for multivariate normality, distance components (disco) for non-parametric 15 | analysis of structured data, and other energy statistics/methods are implemented. 16 | } 17 | \author{ 18 | Maria L. Rizzo and Gabor J. Szekely 19 | } 20 | \references{ 21 | G. J. Szekely and M. L. Rizzo (2013). Energy statistics: 22 | A class of statistics based on distances, \emph{Journal of 23 | Statistical Planning and Inference}. 24 | 25 | M. L. Rizzo and G. J. Szekely (2016). Energy Distance, 26 | \emph{WIRES Computational Statistics}, Wiley, Volume 8 Issue 1, 27-38. 27 | Available online Dec., 2015, \doi{10.1002/wics.1375}. 28 | 29 | G. J. Szekely and M. L. Rizzo (2017). The Energy of Data. 30 | \emph{The Annual Review of Statistics and Its Application} 31 | 4:447-79. 32 | 33 | G. J. Szekely and M. L. Rizzo (2023). \emph{The Energy of Data and Distance Correlation}. Chapman & Hall/CRC Monographs on Statistics and Applied Probability. ISBN 9781482242744. 34 | \url{https://www.routledge.com/The-Energy-of-Data-and-Distance-Correlation/Szekely-Rizzo/p/book/9781482242744}. 35 | } 36 | \keyword{ package } 37 | \keyword{ multivariate } 38 | -------------------------------------------------------------------------------- /man/dmatrix.Rd: -------------------------------------------------------------------------------- 1 | \name{Distance Matrix} 2 | \alias{is.dmatrix} 3 | \alias{calc_dist} 4 | \title{ Distance Matrices } 5 | \description{ 6 | Utilities for working with distance matrices. 7 | \code{is.dmatrix} is a utility that checks whether the argument is a distance or dissimilarity matrix; is it square symmetric, non-negative, with zero diagonal? \code{calc_dist} computes a distance matrix directly from a data matrix. 
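As a minimal sketch of how these two utilities fit together (using only base R and the exported functions \code{calc_dist} and \code{is.dmatrix} described here; the specific data are arbitrary):

library(energy)
x <- matrix(rnorm(30), nrow = 10, ncol = 3)   # 10 observations in rows
D1 <- calc_dist(x)                            # Euclidean distance matrix (Rcpp routine)
D2 <- as.matrix(dist(x))                      # the same distances via base R
all.equal(D1, D2, check.attributes = FALSE)   # TRUE, up to attributes
is.dmatrix(D1)                                # TRUE: square, symmetric, zero diagonal
is.dmatrix(dist(x))                           # FALSE: a "dist" object is not a matrix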
8 | } 9 | \usage{ 10 | is.dmatrix(x, tol = 100 * .Machine$double.eps) 11 | calc_dist(x) 12 | } 13 | \arguments{ 14 | \item{x}{ numeric matrix} 15 | \item{tol}{ tolerance for checking required conditions} 16 | } 17 | \details{ 18 | Energy functions work with the distance matrices of samples. The \code{is.dmatrix} function is used internally when converting arguments to distance matrices. The default \code{tol} is the same as default tolerance of \code{isSymmetric}. 19 | 20 | \code{calc_dist} is an exported Rcpp function that returns a Euclidean distance matrix from the input data matrix. 21 | } 22 | \value{ 23 | \code{is.dmatrix} returns TRUE if (within tolerance) \code{x} is a distance/dissimilarity matrix; otherwise FALSE. It will return FALSE if \code{x} is a class \code{dist} object. 24 | 25 | \code{calc_dist} returns the Euclidean distance matrix for the data matrix \code{x}, which has observations in rows. 26 | } 27 | \note{ 28 | In practice, if \code{dist(x)} is not yet computed, \code{calc_dist(x)} will be faster than \code{as.matrix(dist(x))}. 29 | 30 | On working with non-Euclidean dissimilarities, see the references. 31 | } 32 | \examples{ 33 | x <- matrix(rnorm(20), 10, 2) 34 | D <- calc_dist(x) 35 | is.dmatrix(D) 36 | is.dmatrix(cov(x)) 37 | } 38 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} 39 | } 40 | \references{ 41 | Szekely, G.J. and Rizzo, M.L. (2014), 42 | Partial Distance Correlation with Methods for Dissimilarities. 43 | \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. 44 | } 45 | -------------------------------------------------------------------------------- /src/centering.cpp: -------------------------------------------------------------------------------- 1 | // double centering utilities for the energy package 2 | // 3 | 4 | // Author: Maria L. Rizzo 5 | // energy package 6 | // github.com/mariarizzo/energy 7 | 8 | 9 | #include 10 | using namespace Rcpp; 11 | 12 | NumericMatrix D_center(NumericMatrix Dx); 13 | NumericMatrix U_center(NumericMatrix Dx); 14 | 15 | // [[Rcpp::export]] 16 | NumericMatrix D_center(NumericMatrix Dx) { 17 | /* 18 | computes the double centered distance matrix for distance matrix Dx 19 | for dCov, dCor, etc. 20 | a_{ij} - a_{i.}/n - a_{.j}/n + a_{..}/n^2, all i, j 21 | */ 22 | int j, k; 23 | int n = Dx.nrow(); 24 | NumericVector akbar(n); 25 | NumericMatrix A(n, n); 26 | double abar = 0.0; 27 | 28 | for (k=0; k 1) { 22 | objective <- rep(0, nstart) 23 | objective[1] <- value$W 24 | values <- vector("list", nstart) 25 | values[[1]] <- value 26 | for (j in 2:nstart) { 27 | ## random initialization of cluster labels 28 | cluster <- sample(0:(k-1), size = n, replace = TRUE) 29 | values[[j]] <- kgroups_start(x, k, cluster, iter.max, distance = distance) 30 | 31 | objective[j] <- values[[j]]$W 32 | } 33 | best <- which.min(objective) 34 | value <- values[[best]] 35 | } 36 | 37 | obj <- structure(list( 38 | call = match.call(), 39 | cluster = value$cluster + 1, 40 | sizes = value$sizes, 41 | within = value$within, 42 | W = sum(value$within), 43 | count = value$count, 44 | iterations = value$it, 45 | k = k), 46 | class = "kgroups") 47 | return (obj) 48 | } 49 | 50 | 51 | print.kgroups <- function(x, ...) { 52 | cat("\n"); print(x$call) 53 | cat("\nK-groups cluster analysis\n") 54 | cat(x$k, " groups of size ", x$sizes, "\n") 55 | cat("Within cluster distances:\n", x$within) 56 | cat("\nIterations: ", x$iterations, " Count: ", x$count, "\n") 57 | } 58 | 59 | fitted.kgroups <- function(object, method = c("labels", "groups"), ...) 
{ 60 | method = match.arg(method) 61 | if (method == "groups") { 62 | k <- object$k 63 | CList <- vector("list", k) 64 | for (i in 1:k) 65 | CList[[i]] <- which(object$cluster == i) 66 | return (CList) 67 | } 68 | return (object$cluster) 69 | } 70 | -------------------------------------------------------------------------------- /man/mutualIndep.Rd: -------------------------------------------------------------------------------- 1 | \name{mutual independence} 2 | \alias{mutualIndep.test} 3 | \title{ Energy Test of Mutual Independence} 4 | \description{ 5 | The test statistic is the sum of the d-1 bias-corrected squared dcor statistics, where d is the number of variables. Implementation is by permutation test. 6 | } 7 | \usage{ 8 | mutualIndep.test(x, R) 9 | } 10 | \arguments{ 11 | \item{x}{ data matrix or data frame} 12 | \item{R}{ number of permutation replicates} 13 | } 14 | \details{ 15 | A population coefficient for mutual independence of d random variables, \eqn{d \geq 2}, is 16 | \deqn{ 17 | \sum_{k=1}^{d-1} \mathcal R^2(X_k, [X_{k+1},\dots,X_d]), 18 | } 19 | which is non-negative and equals zero iff mutual independence holds. 20 | For example, if d=4 the population coefficient is 21 | \deqn{ 22 | \mathcal R^2(X_1, [X_2,X_3,X_4]) + 23 | \mathcal R^2(X_2, [X_3,X_4]) + 24 | \mathcal R^2(X_3, X_4). 25 | } 26 | A permutation test is implemented based on the corresponding sample coefficient. 27 | To test mutual independence of \eqn{X_1,\dots,X_d} the test statistic is the sum of the d-1 28 | statistics (bias-corrected \eqn{dcor^2} statistics): 29 | \deqn{\sum_{k=1}^{d-1} \mathcal R_n^*(X_k, [X_{k+1},\dots,X_d])}. 30 | } 31 | \value{ 32 | \code{mutualIndep.test} returns an object of class \code{power.htest}. 33 | } 34 | \note{ 35 | See Szekely and Rizzo (2014) for details on unbiased \eqn{dCov^2} and bias-corrected \eqn{dCor^2} (\code{bcdcor}) statistics. 36 | } 37 | \seealso{ 38 | \code{\link{bcdcor}}, \code{\link{dcovU_stats}} 39 | } 40 | \references{ 41 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), 42 | Measuring and Testing Dependence by Correlation of Distances, 43 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 44 | \cr \doi{10.1214/009053607000000505} 45 | 46 | Szekely, G.J. and Rizzo, M.L. (2014), 47 | Partial Distance Correlation with Methods for Dissimilarities. 48 | \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. 49 | } 50 | 51 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 52 | Gabor J. Szekely 53 | } 54 | \examples{ 55 | x <- matrix(rnorm(100), nrow=20, ncol=5) 56 | mutualIndep.test(x, 199) 57 | } 58 | \keyword{ multivariate } 59 | \concept{ independence } 60 | \concept{ distance correlation } 61 | \concept{ distance covariance } 62 | \concept{ energy statistics } 63 | 64 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e.
"?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /man/pdcor.Rd: -------------------------------------------------------------------------------- 1 | \name{pdcor} 2 | \alias{pdcor} 3 | \alias{pdcov} 4 | \alias{pdcor.test} 5 | \alias{pdcov.test} 6 | \title{ 7 | Partial distance correlation and covariance 8 | } 9 | \description{Partial distance correlation pdcor, pdcov, and tests.} 10 | \usage{ 11 | pdcov.test(x, y, z, R) 12 | pdcor.test(x, y, z, R) 13 | pdcor(x, y, z) 14 | pdcov(x, y, z) 15 | } 16 | \arguments{ 17 | \item{x}{ data or dist object of first sample} 18 | \item{y}{ data or dist object of second sample} 19 | \item{z}{ data or dist object of third sample} 20 | \item{R}{ replicates for permutation test} 21 | } 22 | \details{ 23 | \code{pdcor(x, y, z)} and \code{pdcov(x, y, z)} compute the partial distance 24 | correlation and partial distance covariance, respectively, 25 | of x and y removing z. 26 | 27 | A test for zero partial distance correlation (or zero partial distance covariance) is implemented in \code{pdcor.test}, and \code{pdcov.test}. 28 | 29 | Argument types supported are numeric data matrix, data.frame, tibble, numeric vector, class "dist" object, or factor. For unordered factors a 0-1 distance matrix is computed. 30 | } 31 | \value{ 32 | Each test returns an object of class \code{htest}. 33 | } 34 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 35 | Gabor J. Szekely 36 | } 37 | \references{ 38 | Szekely, G.J. and Rizzo, M.L. 
(2014), 39 | Partial Distance Correlation with Methods for Dissimilarities. 40 | \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. 41 | } 42 | \examples{ 43 | n = 30 44 | R <- 199 45 | 46 | ## mutually independent standard normal vectors 47 | x <- rnorm(n) 48 | y <- rnorm(n) 49 | z <- rnorm(n) 50 | 51 | pdcor(x, y, z) 52 | pdcov(x, y, z) 53 | set.seed(1) 54 | pdcov.test(x, y, z, R=R) 55 | set.seed(1) 56 | pdcor.test(x, y, z, R=R) 57 | 58 | \donttest{ 59 | if (require(MASS)) { 60 | p = 4 61 | mu <- rep(0, p) 62 | Sigma <- diag(p) 63 | 64 | ## linear dependence 65 | y <- mvrnorm(n, mu, Sigma) + x 66 | print(pdcov.test(x, y, z, R=R)) 67 | 68 | ## non-linear dependence 69 | y <- mvrnorm(n, mu, Sigma) * x 70 | print(pdcov.test(x, y, z, R=R)) 71 | } 72 | } 73 | } 74 | \keyword{ htest } 75 | \keyword{ multivariate } 76 | \keyword{ nonparametric } 77 | \concept{ independence } 78 | \concept{ multivariate } 79 | \concept{ distance correlation } 80 | \concept{ distance covariance } 81 | \concept{ energy statistics } 82 | -------------------------------------------------------------------------------- /R/dcorT.R: -------------------------------------------------------------------------------- 1 | ### dcorT.R 2 | ### implementation of the distance correlation t-test 3 | ### for high dimension 4 | 5 | Astar <- function(d) { 6 | ## d is a distance matrix or distance object 7 | ## modified or corrected doubly centered distance matrices 8 | ## denoted A* (or B*) in JMVA t-test paper (2013) 9 | if (inherits(d, "dist")) 10 | d <- as.matrix(d) 11 | n <- nrow(d) 12 | if (n != ncol(d)) stop("Argument d should be distance") 13 | m <- rowMeans(d) 14 | M <- mean(d) 15 | a <- sweep(d, 1, m) 16 | b <- sweep(a, 2, m) 17 | A <- b + M #same as plain A 18 | #correction to get A^* 19 | A <- A - d/n 20 | diag(A) <- m - M 21 | (n / (n-1)) * A 22 | } 23 | 24 | BCDCOR <- function(x, y) { 25 | ## compute bias corrected distance correlation 26 | ## internal function not in NAMESPACE (external: use bcdcor) 27 | ## revised version from v. 
1.7-7 28 | if (!inherits(x, "dist")) { 29 | x <- as.matrix(dist(x)) 30 | } else { 31 | x <- as.matrix(x) 32 | } 33 | if (!inherits(y, "dist")) { 34 | y <- as.matrix(dist(y)) 35 | } else { 36 | y <- as.matrix(y) 37 | } 38 | 39 | n <- NROW(x) 40 | AA <- Astar(x) 41 | BB <- Astar(y) 42 | XY <- sum(AA*BB) - (n/(n-2)) * sum(diag(AA*BB)) 43 | XX <- sum(AA*AA) - (n/(n-2)) * sum(diag(AA*AA)) 44 | YY <- sum(BB*BB) - (n/(n-2)) * sum(diag(BB*BB)) 45 | list(bcR=XY / sqrt(XX*YY), XY=XY/n^2, XX=XX/n^2, YY=YY/n^2, n=n) 46 | } 47 | 48 | 49 | 50 | dcorT <- function(x, y) { 51 | # computes the t statistic for corrected high-dim dCor 52 | # should be approximately student T 53 | # x and y are observed samples or distance objects 54 | r <- BCDCOR(x, y) 55 | Cn <- r$bcR 56 | n <- r$n 57 | M <- n*(n-3)/2 58 | sqrt(M-1) * Cn / sqrt(1-Cn^2) 59 | } 60 | 61 | dcorT.test <- function(x, y) { 62 | # x and y are observed samples or distance objects 63 | dname <- paste(deparse(substitute(x)),"and", 64 | deparse(substitute(y))) 65 | stats <- BCDCOR(x, y) 66 | bcR <- stats$bcR 67 | n <- stats$n 68 | M <- n * (n-3) / 2 69 | df <- M - 1 70 | names(df) <- "df" 71 | tstat <- sqrt(M-1) * bcR / sqrt(1-bcR^2) 72 | names(tstat) <- "T" 73 | estimate <- bcR 74 | names(estimate) <- "Bias corrected dcor" 75 | pval <- 1 - pt(tstat, df=df) 76 | method <- "dcor t-test of independence for high dimension" 77 | rval <- list(statistic = tstat, parameter = df, p.value = pval, 78 | estimate=estimate, method=method, data.name=dname) 79 | class(rval) <- "htest" 80 | return(rval) 81 | } 82 | 83 | -------------------------------------------------------------------------------- /man/centering.Rd: -------------------------------------------------------------------------------- 1 | \name{centering distance matrices} 2 | \alias{Ucenter} 3 | \alias{Dcenter} 4 | \alias{U_center} 5 | \alias{D_center} 6 | \title{ Double centering and U-centering } 7 | \description{ 8 | Stand-alone double centering and U-centering functions 9 | that are applied in unbiased distance covariance, bias 10 | corrected distance correlation, and partial distance correlation. 11 | } 12 | \usage{ 13 | Dcenter(x) 14 | Ucenter(x) 15 | U_center(Dx) 16 | D_center(Dx) 17 | } 18 | \arguments{ 19 | \item{x}{ dist object or data matrix} 20 | \item{Dx}{ distance or dissimilarity matrix} 21 | } 22 | \details{ 23 | In \code{Dcenter} and \code{Ucenter}, \code{x} must be 24 | a \code{dist} object or a data matrix. Both functions return 25 | a doubly centered distance matrix. 26 | 27 | Note that \code{pdcor}, etc. functions include the 28 | centering operations (in C), so that these stand alone versions 29 | of centering functions are not needed except in case one 30 | wants to compute just a double-centered or U-centered matrix. 31 | 32 | \code{U_center} is the Rcpp export of the cpp function. 33 | \code{D_center} is the Rcpp export of the cpp function. 34 | } 35 | \value{ 36 | All functions return a square symmetric matrix. 37 | 38 | \code{Dcenter} returns a matrix 39 | \deqn{A_{ij}=a_{ij} - \bar a_{i.} - \bar a_{.j} + \bar a_{..}} 40 | as in classical multidimensional scaling. \code{Ucenter} 41 | returns a matrix 42 | \deqn{\tilde A_{ij}=a_{ij} - \frac{a_{i.}}{n-2} 43 | - \frac{a_{.j}}{n-2} + \frac{a_{..}}{(n-1)(n-2)},\quad i \neq j,} 44 | with zero diagonal, 45 | and this is the double centering applied in \code{pdcov} and 46 | \code{pdcor} as well as the unbiased dCov and bias corrected 47 | dCor statistics. 
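A minimal numerical sketch of that connection, using only the exported functions \code{Ucenter}, \code{U_center}, \code{U_product}, and \code{dcovU} (the data below are arbitrary):

library(energy)
x <- iris[1:10, 1:4]
y <- iris[11:20, 1:4]
Dx <- as.matrix(dist(x))
Dy <- as.matrix(dist(y))
# R and C++ U-centering agree
all.equal(Ucenter(x), U_center(Dx), check.attributes = FALSE)
# the inner product of the U-centered matrices is the unbiased dcov^2
all.equal(U_product(U_center(Dx), U_center(Dy)), dcovU(x, y),
          check.attributes = FALSE)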
48 | } 49 | \note{ 50 | The C++ versions \code{D_center} and \code{U_center} are typically 51 | faster; the R versions are retained for reference and historical reasons. 52 | } 53 | \references{ 54 | Szekely, G.J. and Rizzo, M.L. (2014), 55 | Partial Distance Correlation with Methods for Dissimilarities, 56 | \emph{Annals of Statistics}, Vol. 42, No. 6, pp. 2382-2412. 57 | } 58 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 59 | Gabor J. Szekely 60 | } 61 | \examples{ 62 | x <- iris[1:10, 1:4] 63 | dx <- dist(x) 64 | Dx <- as.matrix(dx) 65 | M <- U_center(Dx) 66 | 67 | all.equal(M, U_center(M)) #idempotence 68 | all.equal(M, D_center(M)) #invariance 69 | } 70 | \keyword{ multivariate } 71 | \concept{ multivariate } 72 | \concept{ distance correlation } 73 | \concept{ distance covariance } 74 | \concept{ energy statistics } 75 | -------------------------------------------------------------------------------- /man/dcovU_stats.Rd: -------------------------------------------------------------------------------- 1 | \name{dcovU_stats} 2 | \alias{dcovU_stats} 3 | \title{Unbiased distance covariance statistics} 4 | \description{ 5 | This function computes unbiased estimators of squared distance 6 | covariance, distance variance, and a bias-corrected
estimator of 7 | (squared) distance correlation. 8 | } 9 | \usage{ 10 | dcovU_stats(Dx, Dy) 11 | } 12 | \arguments{ 13 | \item{Dx}{ distance matrix of first sample} 14 | \item{Dy}{ distance matrix of second sample} 15 | } 16 | \details{ 17 | The unbiased (squared) dcov is inner product definition of 18 | dCov, in the Hilbert space of U-centered distance matrices. 19 | 20 | The sample sizes (number of rows) of the two samples must 21 | agree, and samples must not contain missing values. The 22 | arguments must be square symmetric matrices. 23 | } 24 | \value{ 25 | \code{dcovU_stats} returns a vector of the components of bias-corrected 26 | dcor: [dCovU, bcdcor, dVarXU, dVarYU]. 27 | } 28 | \note{ 29 | Unbiased distance covariance (SR2014) corresponds to the biased 30 | (original) \eqn{\mathrm{dCov^2}}{dCov^2}. Since \code{dcovU} is an 31 | unbiased statistic, it is signed and we do not take the square root. 32 | For the original distance covariance test of independence (SRB2007, 33 | SR2009), the distance covariance test statistic is the V-statistic 34 | \eqn{\mathrm{n\, dCov^2} = n \mathcal{V}_n^2}{n V_n^2} (not dCov). 35 | Similarly, \code{bcdcor} is bias-corrected, so we do not take the 36 | square root as with dCor. 37 | } 38 | \references{ 39 | Szekely, G.J. and Rizzo, M.L. (2014), 40 | Partial Distance Correlation with Methods for Dissimilarities. 41 | \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. 42 | 43 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), 44 | Measuring and Testing Dependence by Correlation of Distances, 45 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 46 | \cr \doi{10.1214/009053607000000505} 47 | 48 | Szekely, G.J. and Rizzo, M.L. (2009), 49 | Brownian Distance Covariance, 50 | \emph{Annals of Applied Statistics}, 51 | Vol. 3, No. 4, 1236-1265. 52 | \cr \doi{10.1214/09-AOAS312} 53 | } 54 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 55 | Gabor J. Szekely 56 | } 57 | \examples{ 58 | x <- iris[1:50, 1:4] 59 | y <- iris[51:100, 1:4] 60 | Dx <- as.matrix(dist(x)) 61 | Dy <- as.matrix(dist(y)) 62 | dcovU_stats(Dx, Dy) 63 | } 64 | \keyword{ multivariate } 65 | \keyword{ nonparametric } 66 | \concept{ independence } 67 | \concept{ multivariate } 68 | \concept{ distance correlation } 69 | \concept{ distance covariance } 70 | \concept{ energy statistics } 71 | -------------------------------------------------------------------------------- /R/pdcov-test.R: -------------------------------------------------------------------------------- 1 | pdcov.test <- function(x, y, z, R) { 2 | if (missing(R)) R <- 0 3 | 4 | Dx <- .arg2dist.matrix(x) 5 | Dy <- .arg2dist.matrix(y) 6 | Dz <- .arg2dist.matrix(z) 7 | n <- nrow(Dx) 8 | Pxz <- projection(Dx, Dz) #U-center and compute projections 9 | Pyz <- projection(Dy, Dz) 10 | 11 | #PxzU <- U_center(Pxz) #not necessary, because of invariance 12 | #PyzU <- U_center(Pyz) 13 | 14 | teststat <- n * U_product(Pxz, Pyz) 15 | ## calc. 
pdcor 16 | den <- sqrt(U_product(Pxz, Pxz) * U_product(Pyz, Pyz)) 17 | if (den > 0.0) { 18 | estimate <- teststat / (n * den) 19 | } else estimate <- 0.0 20 | bootfn <- function(Pxz, i, Pyz) { 21 | # generate the permutation replicates of dcovU(Pxz, Pyz) 22 | # PxzU and PyzU are the U-centered matrices 23 | U_product(Pxz[i, i], Pyz) #RcppExports 24 | } 25 | 26 | if (R > 0 && den > 0.0) { 27 | reps <- replicate(R, expr= { 28 | i <- sample(1:n) 29 | bootfn(Pxz, i, Pyz=Pyz) 30 | }) 31 | 32 | replicates <- n * reps 33 | pval <- (1 + sum(replicates > teststat)) / (1 + R) 34 | #df <- n * (n-3) / 2 - 2 35 | } else { 36 | pval <- NA 37 | replicates <- NA 38 | } 39 | 40 | dataname <- paste("replicates ", R, sep="") 41 | if (! R>0) 42 | dataname <- "Specify R>0 replicates for a test" 43 | condition <- (den > 0.0) 44 | names(estimate) <- "pdcor" 45 | names(teststat) <- "n V^*" 46 | e <- list( 47 | call = match.call(), 48 | method = paste("pdcov test", sep = ""), 49 | statistic = teststat, 50 | estimate = estimate, 51 | p.value = pval, 52 | n = n, 53 | replicates = replicates, 54 | condition = condition, 55 | data.name = dataname) 56 | class(e) <- "htest" 57 | return(e) 58 | } 59 | 60 | 61 | pdcor.test <- function(x, y, z, R) { 62 | ## x, y, z must be dist. objects or data matrices (no dist matrix) 63 | ## all required calc. done in pdcov.test 64 | if (missing(R)) R <- 0 65 | result <- pdcov.test(x, y, z, R=R) 66 | 67 | if (result$condition) { 68 | ## if (A*A)(B*B) > 0 69 | nRootV <- result$statistic / result$estimate 70 | pdcor_reps <- result$replicates / nRootV 71 | } else pdcor_reps <- NA 72 | 73 | e <- list( 74 | call = match.call(), 75 | method = paste("pdcor test", sep = ""), 76 | statistic = result$estimate, 77 | estimate = result$estimate, 78 | p.value = result$p.value, 79 | n = result$n, 80 | replicates = pdcor_reps, 81 | condition = result$condition, 82 | data.name = result$data.name) 83 | class(e) <- "htest" 84 | return(e) 85 | } 86 | 87 | -------------------------------------------------------------------------------- /src/energy_init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* 7 | Author: Maria L. 
Rizzo 8 | energy package 9 | github.com/mariarizzo/energy 10 | */ 11 | 12 | /* declarations to register native routines in this package */ 13 | 14 | /* .C calls */ 15 | extern void dCOV(void *, void *, void *, void *); 16 | extern void dCOVtest(void *, void *, void *, void *, void *, void *, void *); 17 | extern void ksampleEtest(void *, void *, void *, void *, void *, void *, void *, void *, void *); 18 | extern void permute_check(void *, void *); 19 | 20 | /* .Call calls */ 21 | extern SEXP _energy_D_center(SEXP); 22 | extern SEXP _energy_dcovU_stats(SEXP, SEXP); 23 | extern SEXP _energy_partial_dcor(SEXP, SEXP, SEXP); 24 | extern SEXP _energy_partial_dcov(SEXP, SEXP, SEXP); 25 | extern SEXP _energy_poisMstat(SEXP); 26 | extern SEXP _energy_projection(SEXP, SEXP); 27 | extern SEXP _energy_U_center(SEXP); 28 | extern SEXP _energy_U_product(SEXP, SEXP); 29 | extern SEXP _energy_Btree_sum(SEXP, SEXP); 30 | extern SEXP _energy_kgroups_start(SEXP, SEXP, SEXP, SEXP, SEXP); 31 | extern SEXP _energy_calc_dist(SEXP); 32 | extern SEXP _energy_Istat(SEXP, SEXP); 33 | extern SEXP _energy_Istats(SEXP, SEXP, SEXP); 34 | 35 | static const R_CMethodDef CEntries[] = { 36 | {"dCOV", (DL_FUNC) &dCOV, 4}, 37 | {"dCOVtest", (DL_FUNC) &dCOVtest, 7}, 38 | {"ksampleEtest", (DL_FUNC) &ksampleEtest, 9}, 39 | {"permute_check",(DL_FUNC) &permute_check,2}, 40 | {NULL, NULL, 0} 41 | }; 42 | 43 | static const R_CallMethodDef CallEntries[] = { 44 | {"_energy_D_center", (DL_FUNC) &_energy_D_center, 1}, 45 | {"_energy_dcovU_stats", (DL_FUNC) &_energy_dcovU_stats, 2}, 46 | {"_energy_Istat", (DL_FUNC) &_energy_Istat, 2}, 47 | {"_energy_Istats", (DL_FUNC) &_energy_Istats, 3}, 48 | {"_energy_partial_dcor", (DL_FUNC) &_energy_partial_dcor, 3}, 49 | {"_energy_partial_dcov", (DL_FUNC) &_energy_partial_dcov, 3}, 50 | {"_energy_poisMstat", (DL_FUNC) &_energy_poisMstat, 1}, 51 | {"_energy_projection", (DL_FUNC) &_energy_projection, 2}, 52 | {"_energy_U_center", (DL_FUNC) &_energy_U_center, 1}, 53 | {"_energy_U_product", (DL_FUNC) &_energy_U_product, 2}, 54 | {"_energy_Btree_sum", (DL_FUNC) &_energy_Btree_sum, 2}, 55 | {"_energy_kgroups_start", (DL_FUNC) &_energy_kgroups_start, 5}, 56 | {"_energy_calc_dist", (DL_FUNC) &_energy_calc_dist, 1}, 57 | {NULL, NULL, 0} 58 | }; 59 | 60 | void R_init_energy(DllInfo *dll) 61 | { 62 | R_registerRoutines(dll, CEntries, CallEntries, NULL, NULL); 63 | R_useDynamicSymbols(dll, FALSE); 64 | } 65 | -------------------------------------------------------------------------------- /man/dcovu.Rd: -------------------------------------------------------------------------------- 1 | \name{Unbiased distance covariance} 2 | \alias{bcdcor} 3 | \alias{dcovU} 4 | \title{Unbiased dcov and bias-corrected dcor statistics} 5 | \description{ 6 | These functions compute unbiased estimators of squared distance 7 | covariance and a bias-corrected estimator of 8 | (squared) distance correlation. 9 | } 10 | \usage{ 11 | bcdcor(x, y) 12 | dcovU(x, y) 13 | } 14 | \arguments{ 15 | \item{x}{ data or dist object of first sample} 16 | \item{y}{ data or dist object of second sample} 17 | } 18 | \details{ 19 | The unbiased (squared) dcov is inner product definition of 20 | dCov, in the Hilbert space of U-centered distance matrices. 21 | 22 | The sample sizes (number of rows) of the two samples must 23 | agree, and samples must not contain missing values. 24 | 25 | Argument types supported are 26 | numeric data matrix, data.frame, or tibble, with observations in rows; 27 | numeric vector; ordered or unordered factors. 
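For instance, mixed argument types can be passed directly, as in this minimal sketch (it uses only the base R \code{iris} data and the exported functions documented here):

library(energy)
dcovU(iris$Sepal.Length, iris$Petal.Length)   # two numeric vectors
bcdcor(iris[, 1:4], iris$Species)             # numeric data frame vs. unordered factor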
In case of unordered factors 28 | a 0-1 distance matrix is computed. 29 | } 30 | \value{ 31 | \code{dcovU} returns the unbiased estimator of squared dcov. 32 | \code{bcdcor} returns a bias-corrected estimator of squared dcor. 33 | } 34 | \note{ 35 | Unbiased distance covariance (SR2014) corresponds to the biased 36 | (original) \eqn{\mathrm{dCov^2}}{dCov^2}. Since \code{dcovU} is an 37 | unbiased statistic, it is signed and we do not take the square root. 38 | For the original distance covariance test of independence (SRB2007, 39 | SR2009), the distance covariance test statistic is the V-statistic 40 | \eqn{\mathrm{n\, dCov^2} = n \mathcal{V}_n^2}{n V_n^2} (not dCov). 41 | Similarly, \code{bcdcor} is bias-corrected, so we do not take the 42 | square root as with dCor. 43 | } 44 | \references{ 45 | Szekely, G.J. and Rizzo, M.L. (2014), 46 | Partial Distance Correlation with Methods for Dissimilarities. 47 | \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. 48 | 49 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), 50 | Measuring and Testing Dependence by Correlation of Distances, 51 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 52 | \cr \doi{10.1214/009053607000000505} 53 | 54 | Szekely, G.J. and Rizzo, M.L. (2009), 55 | Brownian Distance Covariance, 56 | \emph{Annals of Applied Statistics}, 57 | Vol. 3, No. 4, 1236-1265. 58 | \cr \doi{10.1214/09-AOAS312} 59 | } 60 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 61 | Gabor J. Szekely 62 | } 63 | \examples{ 64 | x <- iris[1:50, 1:4] 65 | y <- iris[51:100, 1:4] 66 | dcovU(x, y) 67 | bcdcor(x, y) 68 | } 69 | \keyword{ multivariate } 70 | \keyword{ nonparametric } 71 | \concept{ independence } 72 | \concept{ multivariate } 73 | \concept{ distance correlation } 74 | \concept{ distance covariance } 75 | \concept{ energy statistics } 76 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | ## util.R 2 | ## 3 | ## utilities for the energy package 4 | ## Author: Maria Rizzo 5 | ## github.com/mariarizzo/energy 6 | ## 7 | 8 | 9 | .arg2dist.matrix <- function(x) { 10 | ## argument check and conversion for energy functions 11 | ## that take optionally data or distance object arguments 12 | ## check type of argument, return a distance matrix 13 | ## supported argument types: matrix, vector, data.frame, tibble, factor, dist 14 | 15 | if (anyNA(x)) warning("missing values not supported") 16 | 17 | if (inherits(x, "dist")) { 18 | Dx <- as.matrix(x) 19 | return(Dx) 20 | } 21 | 22 | if (is.factor(x)) { 23 | z <- as.matrix(as.integer(x)) 24 | Dx <- calc_dist(z) 25 | if (!is.ordered(x) && nlevels(x) > 2) { 26 | # need a 0-1 matrix 27 | Dx <- matrix(as.integer(Dx > 0), nrow=nrow(Dx)) 28 | } 29 | return(Dx) 30 | } 31 | 32 | 33 | if (is.vector(x) || is.data.frame(x)) { 34 | ## also for tibble 35 | Dx <- calc_dist(as.matrix(x)) 36 | } 37 | 38 | if (is.matrix(x)) { 39 | if (is.dmatrix(x)) { 40 | Dx <- x 41 | } else { 42 | ## should be data matrix 43 | Dx <- calc_dist(x) 44 | } 45 | } 46 | return(Dx) 47 | 48 | ## if here, arg type is not supported 49 | stop(paste("cannot compute distances for", class(x))) 50 | return(NA) 51 | } 52 | 53 | is.dmatrix <- function(x, tol = 100 * .Machine$double.eps) { 54 | ## check if zero diagonal, symmetric, non-negative square matrix 55 | ## i.e., distance matrix or dissimilarity matrix 56 | value <- FALSE 57 | if (is.matrix(x)) { 58 | if (nrow(x) == ncol(x)) { 59 | if 
(max(abs(diag(x)) < tol) && (max(abs(x - t(x)) < tol))) { 60 | if (! any(x < 0.0)) value <- TRUE 61 | } 62 | } 63 | } 64 | return (value) 65 | } 66 | 67 | perm.matrix <- function(n, R) { 68 | ## Generate the same matrix as boot.array with 69 | ## sim="permutation" and default other arguments 70 | ## with same seed we get boot.array(boot.out, indices=T) 71 | 72 | pfn <- function(x, n) x[sample.int(n)] 73 | perms <- matrix(1:n, n, R) 74 | perms <- t(apply(perms, 2, pfn, n=n)) 75 | } 76 | 77 | 78 | permutation <- function(n) { 79 | ## call the internal permute() function using permute_check() 80 | J <- 1:n 81 | a <- .C("permute_check", 82 | J = as.integer(J), 83 | n = as.integer(n), 84 | PACKAGE = "energy") 85 | return (a$J) 86 | } 87 | 88 | sortrank <- function(x) { 89 | ## sort and rank data with one call to order() 90 | ## faster than calling sort and rank separately 91 | ## returns an object identical to: 92 | ## list(x=sort(x), ix=order(x), r=rank(x, ties.method = "first")) 93 | o <- order(x) 94 | n <- length(o) 95 | N <- 1:n 96 | N[o] <- N 97 | return(list(x=x[o], ix=o, r=N)) 98 | } 99 | -------------------------------------------------------------------------------- /man/dcorT.Rd: -------------------------------------------------------------------------------- 1 | \name{dcorT} 2 | \alias{dcorT.test} 3 | \alias{dcorT} 4 | \title{ Distance Correlation t-Test} 5 | \description{ 6 | Distance correlation t-test of multivariate independence for high dimension.} 7 | \usage{ 8 | dcorT.test(x, y) 9 | dcorT(x, y) 10 | } 11 | \arguments{ 12 | \item{x}{ data or distances of first sample} 13 | \item{y}{ data or distances of second sample} 14 | } 15 | \details{ 16 | \code{dcorT.test} performs a nonparametric t-test of 17 | multivariate independence in high dimension (dimension is close to 18 | or larger than sample size). As dimension goes to infinity, the 19 | asymptotic distribution of the test statistic is approximately Student t with \eqn{n(n-3)/2-1} degrees of freedom and for \eqn{n \geq 10} the statistic is approximately distributed as standard normal. 20 | 21 | The sample sizes (number of rows) of the two samples must 22 | agree, and samples must not contain missing values. 23 | 24 | The t statistic (dcorT) is a transformation of a bias corrected 25 | version of distance correlation (see SR 2013 for details). 26 | 27 | Large values (upper tail) of the dcorT statistic are significant. 28 | } 29 | \note{ 30 | \code{dcor.t} and \code{dcor.ttest} are deprecated. 31 | } 32 | \value{ 33 | \code{dcorT} returns the dcor t statistic, and 34 | \code{dcorT.test} returns a list with class \code{htest} containing 35 | \item{ method}{ description of test} 36 | \item{ statistic}{ observed value of the test statistic} 37 | \item{ parameter}{ degrees of freedom} 38 | \item{ estimate}{ (bias corrected) squared dCor(x,y)} 39 | \item{ p.value}{ p-value of the t-test} 40 | \item{ data.name}{ description of data} 41 | } 42 | \seealso{ 43 | \code{\link{bcdcor}} \code{\link{dcov.test}} \code{\link{dcor}} \code{\link{DCOR}} 44 | } 45 | 46 | \references{ 47 | Szekely, G.J. and Rizzo, M.L. (2013). The distance correlation t-test of independence in high dimension. \emph{Journal of Multivariate Analysis}, Volume 117, pp. 193-213. \cr 48 | \doi{10.1016/j.jmva.2013.02.012} 49 | 50 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), 51 | Measuring and Testing Dependence by Correlation of Distances, 52 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 
53 | \cr \doi{10.1214/009053607000000505} 54 | 55 | Szekely, G.J. and Rizzo, M.L. (2009), 56 | Brownian Distance Covariance, 57 | \emph{Annals of Applied Statistics}, 58 | Vol. 3, No. 4, 1236-1265. 59 | \cr \doi{10.1214/09-AOAS312} 60 | } 61 | \author{ 62 | Maria L. Rizzo \email{mrizzo@bgsu.edu} and 63 | Gabor J. Szekely 64 | } 65 | \examples{ 66 | x <- matrix(rnorm(100), 10, 10) 67 | y <- matrix(runif(100), 10, 10) 68 | dcorT(x, y) 69 | dcorT.test(x, y) 70 | } 71 | 72 | 73 | \keyword{ htest } 74 | \keyword{ multivariate } 75 | \keyword{ nonparametric } 76 | \concept{ independence } 77 | \concept{ multivariate } 78 | \concept{ distance correlation } 79 | \concept{ distance covariance } 80 | \concept{ energy statistics } 81 | 82 | -------------------------------------------------------------------------------- /man/normalGOF.Rd: -------------------------------------------------------------------------------- 1 | \name{normal.test} 2 | \alias{normal.test} 3 | \alias{normal.e} 4 | \title{Energy Test of Univariate Normality} 5 | \description{ 6 | Performs the energy test of univariate normality 7 | for the composite hypothesis Case 4, estimated parameters. 8 | } 9 | \usage{ 10 | normal.test(x, method=c("mc","limit"), R) 11 | normal.e(x) 12 | } 13 | \arguments{ 14 | \item{x}{ univariate data vector} 15 | \item{method}{ method for p-value} 16 | \item{R}{ number of replications if Monte Carlo method} 17 | } 18 | \details{ 19 | If \code{method="mc"} this test function applies the parametric 20 | bootstrap method implemented in \code{\link{mvnorm.test}}. 21 | 22 | If \code{method="limit"}, the p-value of the test is computed from 23 | the asymptotic distribution of the test statistic under the null 24 | hypothesis. The asymptotic 25 | distribution is a quadratic form of centered Gaussian random variables, 26 | which has the form 27 | \deqn{\sum_{k=1}^\infty \lambda_k Z_k^2,} 28 | where \eqn{\lambda_k} are positive constants (eigenvalues) and 29 | \eqn{Z_k} are iid standard normal variables. Eigenvalues are 30 | pre-computed and stored internally. 31 | A p-value is computed using Imhof's method as implemented in the 32 | \pkg{CompQuadForm} package. 33 | 34 | Note that the "limit" method is intended for moderately large 35 | samples because it applies the asymptotic distribution. 36 | 37 | The energy test of normality was proposed 38 | and implemented by Szekely and Rizzo (2005). 39 | See \code{\link{mvnorm.test}} 40 | for more details. 41 | } 42 | 43 | \value{ 44 | \code{normal.e} returns the energy goodness-of-fit statistic for 45 | a univariate sample. 46 | 47 | \code{normal.test} returns a list with class \code{htest} containing 48 | \item{statistic}{observed value of the test statistic} 49 | \item{p.value}{p-value of the test} 50 | \item{estimate}{sample estimates: mean, sd} 51 | \item{data.name}{description of data} 52 | } 53 | \seealso{ 54 | \code{\link{mvnorm.test}} and \code{\link{mvnorm.e}} for the 55 | energy test of multivariate normality and the test statistic 56 | for multivariate samples. 57 | } 58 | \references{ 59 | Szekely, G. J. and Rizzo, M. L. (2005) A New Test for 60 | Multivariate Normality, \emph{Journal of Multivariate Analysis}, 61 | 93/1, 58-80, 62 | \doi{10.1016/j.jmva.2003.12.002}. 63 | 64 | Mori, T. F., Szekely, G. J. and Rizzo, M. L. "On energy tests of normality." Journal of Statistical Planning and Inference 213 (2021): 1-15. 65 | 66 | Rizzo, M. L. (2002). A New Rotation Invariant Goodness-of-Fit Test, 67 | Ph.D. dissertation, Bowling Green State University. 
68 | 69 | J. P. Imhof (1961). Computing the Distribution of Quadratic Forms in 70 | Normal Variables, \emph{Biometrika}, Volume 48, Issue 3/4, 71 | 419-426. 72 | } 73 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 74 | Gabor J. Szekely 75 | } 76 | \examples{ 77 | x <- iris[1:50, 1] 78 | normal.e(x) 79 | normal.test(x, R=199) 80 | normal.test(x, method="limit") 81 | } 82 | \keyword{ htest } 83 | \concept{ goodness-of-fit} 84 | \concept{ normal distribution} 85 | \concept{ energy statistics } 86 | -------------------------------------------------------------------------------- /src/partial-dcor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | /* 5 | Author: Maria L. Rizzo 6 | energy package 7 | github.com/mariarizzo/energy 8 | */ 9 | 10 | 11 | NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); 12 | double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); 13 | 14 | NumericMatrix U_center(NumericMatrix); 15 | double U_product(NumericMatrix U, NumericMatrix V); 16 | NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); 17 | 18 | // [[Rcpp::export]] 19 | NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { 20 | /* partial distance correlation, second formulation 21 | Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals 22 | partial_dcor : vector length 4, partial_dcor[0] is pdcor 23 | partial_dcor returns vector [Rxyz, Rxy, Rxz, Ryz] starred versions 24 | */ 25 | int n = Dx.nrow(); 26 | NumericMatrix A(n, n), B(n, n), C(n, n); 27 | double Rxy=0.0, Rxz=0.0, Ryz=0.0, Rxyz=0.0, den; 28 | double AB, AC, BC, AA, BB, CC, pDCOV; 29 | double eps = std::numeric_limits::epsilon(); //machine epsilon 30 | 31 | A = U_center(Dx); /* U-centering to get A^U etc. */ 32 | B = U_center(Dy); 33 | C = U_center(Dz); 34 | 35 | AB = U_product(A, B); 36 | AC = U_product(A, C); 37 | BC = U_product(B, C); 38 | AA = U_product(A, A); 39 | BB = U_product(B, B); 40 | CC = U_product(C, C); 41 | pDCOV = U_product(projection(Dx, Dz), projection(Dy, Dz)); 42 | 43 | den = sqrt(AA*BB); 44 | if (den > eps) 45 | Rxy = AB / den; 46 | den = sqrt(AA*CC); 47 | if (den > eps) 48 | Rxz = AC / den; 49 | den = sqrt(BB*CC); 50 | if (den > eps) 51 | Ryz = BC / den; 52 | den = sqrt(1 - Rxz*Rxz) * sqrt(1 - Ryz * Ryz); 53 | 54 | if (den > eps) 55 | Rxyz = (Rxy - Rxz * Ryz) / den; 56 | else { 57 | Rxyz = 0.0; 58 | } 59 | 60 | return NumericVector::create( 61 | _["pdcor"] = Rxyz, 62 | _["pdcov"] = pDCOV, 63 | _["Rxy"] = Rxy, 64 | _["Rxz"] = Rxz, 65 | _["Ryz"] = Ryz 66 | ); 67 | } 68 | 69 | 70 | //[[Rcpp::export]] 71 | double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { 72 | /* pdcov following the definition via projections 73 | Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals 74 | returns pdcov sample coefficient 75 | */ 76 | int n = Dx.nrow(); 77 | int i, j; 78 | NumericMatrix A(n, n), B(n, n), C(n, n), Pxz(n, n), Pyz(n, n); 79 | double AC, BC, CC, c1, c2; 80 | double eps = std::numeric_limits::epsilon(); //machine epsilon 81 | 82 | A = U_center(Dx); /* U-centering to get A^U etc. 
*/ 83 | B = U_center(Dy); 84 | C = U_center(Dz); 85 | 86 | AC = U_product(A, C); 87 | BC = U_product(B, C); 88 | CC = U_product(C, C); 89 | 90 | c1 = c2 = 0.0; 91 | // if (C,C)==0 then C=0 and both (A,C)=0 and (B,C)=0 92 | if (fabs(CC) > eps) { 93 | c1 = AC / CC; 94 | c2 = BC / CC; 95 | } 96 | 97 | for (i=0; i max_length) { 29 | max_length = length; 30 | pos = i; 31 | } 32 | } 33 | 34 | // Add class to parent
<li>, and enclosing
  • if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $("div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | clipboardBtnCopies.on('error', function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /man/mvnorm-test.Rd: -------------------------------------------------------------------------------- 1 | \name{mvnorm.test} 2 | \alias{mvnorm.test} 3 | \alias{mvnorm.etest} 4 | \alias{mvnorm.e} 5 | \title{E-statistic (Energy) Test of Multivariate Normality} 6 | \description{ 7 | Performs the E-statistic (energy) test of multivariate or univariate normality. 8 | } 9 | \usage{ 10 | mvnorm.test(x, R) 11 | mvnorm.etest(x, R) 12 | mvnorm.e(x) 13 | } 14 | \arguments{ 15 | \item{x}{ data matrix of multivariate sample, or univariate data vector} 16 | \item{R}{ number of bootstrap replicates } 17 | } 18 | \details{ 19 | If \code{x} is a matrix, each row is a multivariate observation. The 20 | data will be standardized to zero mean and identity covariance matrix 21 | using the sample mean vector and sample covariance matrix. If \code{x} 22 | is a vector, \code{mvnorm.e} returns the univariate statistic 23 | \code{normal.e(x)}. 24 | If the data contains missing values or the sample covariance matrix is 25 | singular, \code{mvnorm.e} returns NA. 26 | 27 | The \eqn{\mathcal{E}}{E}-test of multivariate normality was proposed 28 | and implemented by Szekely and Rizzo (2005). 
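Because the data are standardized internally, the statistic is unaffected by a change of location and scale; a hedged sketch (assuming the energy package, plus the gsl package used by mvnorm.e, are available):

x <- as.matrix(iris[1:50, 1:4])
y <- 10 * x + 100                # change units and location
c(mvnorm.e(x), mvnorm.e(y))      # statistics agree after standardization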
The test statistic for 29 | d-variate normality is given by 30 | \deqn{\mathcal{E} = n (\frac{2}{n} \sum_{i=1}^n E\|y_i-Z\| - 31 | E\|Z-Z'\| - \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n \|y_i-y_j\|), 32 | }{E = n((2/n) sum[1:n] E||y_i-Z|| - E||Z-Z'|| - (1/n^2) sum[1:n,1:n] 33 | ||y_i-y_j||),} 34 | where \eqn{y_1,\ldots,y_n} is the standardized sample, 35 | \eqn{Z, Z'} are iid standard d-variate normal, and 36 | \eqn{\| \cdot \|}{|| ||} denotes Euclidean norm. 37 | 38 | The \eqn{\mathcal{E}}{E}-test of multivariate (univariate) normality 39 | is implemented by parametric bootstrap with \code{R} replicates. 40 | } 41 | \value{ 42 | The value of the \eqn{\mathcal{E}}{E}-statistic for multivariate 43 | normality is returned by \code{mvnorm.e}. 44 | 45 | \code{mvnorm.test} returns a list with class \code{htest} containing 46 | \item{method}{description of test} 47 | \item{statistic}{observed value of the test statistic} 48 | \item{p.value}{approximate p-value of the test} 49 | \item{data.name}{description of data} 50 | 51 | \code{mvnorm.etest} is replaced by \code{mvnorm.test}. 52 | } 53 | \seealso{ 54 | \code{\link{normal.test}} for the energy test of univariate 55 | normality and \code{\link{normal.e}} for the statistic. 56 | } 57 | \note{If the data is univariate, the test statistic is formally 58 | the same as the multivariate case, but a more efficient computational 59 | formula is applied in \code{\link{normal.e}}. 60 | 61 | \code{\link{normal.test}} also provides an optional method for the 62 | test based on the asymptotic sampling distribution of the test 63 | statistic. 64 | } 65 | \references{ 66 | Szekely, G. J. and Rizzo, M. L. (2005) A New Test for 67 | Multivariate Normality, \emph{Journal of Multivariate Analysis}, 68 | 93/1, 58-80, 69 | \doi{10.1016/j.jmva.2003.12.002}. 70 | 71 | Mori, T. F., Szekely, G. J. and Rizzo, M. L. "On energy tests of normality." Journal of Statistical Planning and Inference 213 (2021): 1-15. 72 | 73 | Rizzo, M. L. (2002). A New Rotation Invariant Goodness-of-Fit Test, Ph.D. dissertation, Bowling Green State University. 74 | 75 | Szekely, G. J. (1989) Potential and Kinetic Energy in Statistics, 76 | Lecture Notes, Budapest Institute of Technology (Technical University). 77 | } 78 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 79 | Gabor J. Szekely 80 | } 81 | \examples{ 82 | ## compute normality test statistic for iris Setosa data 83 | data(iris) 84 | mvnorm.e(iris[1:50, 1:4]) 85 | 86 | ## test if the iris Setosa data has multivariate normal distribution 87 | mvnorm.test(iris[1:50,1:4], R = 199) 88 | } 89 | \keyword{ multivariate } 90 | \keyword{ htest } 91 | \concept{ goodness-of-fit} 92 | \concept{ normal distribution} 93 | \concept{ energy statistics } 94 | -------------------------------------------------------------------------------- /src/mvI.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | NumericMatrix Dxi(NumericMatrix Dx, IntegerVector ix); 5 | 6 | // [[Rcpp::export]] 7 | double Istat(NumericMatrix Dx, NumericMatrix Dy) { 8 | // compute independence coefficient I_n (the square root) 9 | // Dx and Dy are the Euclidean distance matrices 10 | 11 | int n = Dx.nrow(); 12 | int i, j, k, m; 13 | double n2 = n*n, n3 = n*n2, n4 = n2*n2; 14 | double Cx, Cy, zd, zbar, z; 15 | IntegerVector ix(n), iy(n); 16 | NumericMatrix Dx2(n, n), Dy2(n, n); 17 | 18 | Cx = 0.0; Cy = 0.0; z = 0.0; 19 | for (i=0; i 2 | using namespace Rcpp; 3 | 4 | // Author: Maria L. 
Rizzo 5 | // energy package 6 | // github.com/mariarizzo/energy 7 | 8 | // compute partial sum using binary search algorithm like AVL 9 | // pre-compute powers of two to save repeated calculations 10 | 11 | 12 | IntegerVector containerNodes (int y, IntegerVector pwr2, IntegerVector psum); 13 | NumericVector gamma1_direct(IntegerVector y, NumericVector z); 14 | IntegerVector p2sum(IntegerVector pwr2); 15 | IntegerVector powers2 (int L); 16 | NumericVector rowsumsDist(NumericVector x, NumericVector sorted, IntegerVector ranks); 17 | IntegerVector subNodes (int y, IntegerVector pwr2, IntegerVector psum); 18 | 19 | 20 | 21 | // [[Rcpp::export]] 22 | NumericVector Btree_sum (IntegerVector y, NumericVector z) { 23 | // 24 | // y is a permutation of the integers 1:n 25 | // z is a numeric vector of length n 26 | // compute gamma1(i) = sum(j 0) 48 | gamma1(i) += sums(node); 49 | } 50 | } 51 | return gamma1; 52 | } 53 | 54 | IntegerVector containerNodes (int y, IntegerVector pwr2, IntegerVector psum) { 55 | /* 56 | * get the indices of all nodes of binary tree whose closed 57 | * intervals contain integer y 58 | */ 59 | int i, L = pwr2.length(); 60 | IntegerVector nodes(L); 61 | 62 | nodes(0) = y; 63 | for (i = 0; i < L-1; i++) { 64 | nodes(i+1) = ceil((double) y / pwr2(i)) + psum(i); 65 | } 66 | return nodes; 67 | } 68 | 69 | 70 | IntegerVector subNodes (int y, IntegerVector pwr2, IntegerVector psum) { 71 | /* 72 | * get indices of nodes whose intervals disjoint union is 1:y 73 | */ 74 | int L = psum.length(); 75 | int idx, k, level, p2; 76 | IntegerVector nodes(L); 77 | 78 | std::fill(nodes.begin(), nodes.end(), -1L); 79 | 80 | k = y; 81 | for (level = L - 1; level > 0; level --) { 82 | p2 = pwr2(level - 1); 83 | if (k >= p2) { 84 | // at index of left node plus an offset 85 | idx = psum(level - 1) + (y / p2); 86 | nodes(L - level - 1) = idx; 87 | k -= p2; 88 | } 89 | } 90 | if (k > 0) 91 | nodes(L - 1) = y; 92 | return nodes; 93 | } 94 | 95 | 96 | IntegerVector powers2 (int L) { 97 | // (2, 4, 8, ..., 2^L, 2^(L+1)) 98 | int k; 99 | IntegerVector pwr2(L); 100 | 101 | pwr2(0) = 2; 102 | for (k = 1; k < L; k++) 103 | pwr2(k) = pwr2(k-1) * 2; 104 | return pwr2; 105 | } 106 | 107 | IntegerVector p2sum(IntegerVector pwr2) { 108 | // computes the cumsum of 2^L, 2^(L-1), ..., 2^2, 2 109 | int i, L = pwr2.length(); 110 | IntegerVector psum(L); 111 | 112 | std::fill(psum.begin(), psum.end(), pwr2(L-1)); 113 | for (i = 1; i < L; i++) 114 | psum(i) = psum(i-1) + pwr2(L-i-1); 115 | return psum; 116 | } 117 | 118 | 119 | NumericVector gamma1_direct(IntegerVector y, NumericVector z) { 120 | // utility: direct computation of the sum gamm1 121 | // for the purpose of testing and benchmarks 122 | 123 | int n = y.length(); 124 | int i, j; 125 | NumericVector gamma1(n); 126 | 127 | for (i = 1; i < n; i++) { 128 | for (j = 0; j < i; j++) { 129 | if (y(j) < y(i)) { 130 | gamma1(i) += z(j); 131 | } 132 | } 133 | } 134 | return gamma1; 135 | } 136 | -------------------------------------------------------------------------------- /R/Epoisson.R: -------------------------------------------------------------------------------- 1 | poisson.tests <- 2 | function(x, R, test="all") { 3 | # parametric bootstrap tests of Poisson distribution 4 | # poisson.e is the energy GOF statistic 5 | # poisson.m is the mean distance statistic 6 | # (not related to the test stats::poisson.test) 7 | if (!is.integer(x) || any(x < 0)) { 8 | warning("sample must be non-negative integers") 9 | return(NULL) 10 | } 11 | test <- tolower(test) 12 | 
poisson.stats <- function(x) { 13 | c(poisson.m(x), poisson.e(x)) 14 | } 15 | stat <- switch(test, 16 | "m" = poisson.m, 17 | "e" = poisson.e, 18 | poisson.stats) 19 | 20 | method <- switch(test, 21 | m=c("M-CvM","M-AD"), 22 | e="Energy", 23 | c("M-CvM","M-AD","Energy")) 24 | method <- paste(method, " test", sep="") 25 | n <- length(x) 26 | lambda <- mean(x) 27 | if (missing(R) || is.null(R)) { 28 | R <- 0 29 | message("Specify R > 0 replicates for MC test") 30 | } 31 | 32 | bootobj <- boot::boot(x, statistic = stat, R = R, 33 | sim = "parametric", 34 | ran.gen = function(x, y) {rpois(n, lambda)}) 35 | 36 | N <- length(bootobj$t0) 37 | p <- rep(NA, times=N) 38 | if (R > 0) { 39 | for (i in 1:N) { 40 | p[i] <- 1 - mean(bootobj$t[,i] < bootobj$t0[i]) 41 | } 42 | } 43 | 44 | # a data frame, not an htest object 45 | # comparable to broom::tidy on an htest object 46 | RVAL <- data.frame(estimate=lambda, statistic=bootobj$t0, 47 | p.value=p, method=method) 48 | return(RVAL) 49 | } 50 | 51 | poisson.mtest <- 52 | function(x, R=NULL) { 53 | if (is.null(R)) R <- 0 54 | rval <- poisson.tests(x, R, test="M") 55 | DNAME <- paste(deparse1(substitute(x)), "replicates: ", R) 56 | stat <- rval$statistic[1] 57 | names(stat) <- "M-CvM" 58 | e <- list( 59 | method = paste("Poisson M-test", sep = ""), 60 | statistic = stat, 61 | p.value = rval$p.value[1], 62 | data.name = DNAME, 63 | estimate = rval$estimate[1]) 64 | class(e) <- "htest" 65 | e 66 | } 67 | 68 | poisson.etest <- 69 | function(x, R=NULL) { 70 | if (is.null(R)) R <- 0 71 | rval <- poisson.tests(x, R, test="E") 72 | DNAME <- paste(deparse1(substitute(x)), "replicates: ", R) 73 | stat <- rval$statistic 74 | names(stat) <- "E" 75 | e <- list( 76 | method = paste("Poisson E-test", sep = ""), 77 | statistic = stat, 78 | p.value = rval$p.value, 79 | data.name = paste("replicates: ", R, sep=""), 80 | estimate = rval$estimate) 81 | class(e) <- "htest" 82 | e 83 | } 84 | 85 | poisson.m <- 86 | function(x) { 87 | # mean distance statistic for Poissonity 88 | if (any(!is.integer(x)) || any(x < 0)) { 89 | warning("sample must be non-negative integers") 90 | return(NULL) 91 | } 92 | stats <- .poisMstat(x) 93 | names(stats) <- c("M-CvM", "M-AD") 94 | return(stats) 95 | } 96 | 97 | poisson.e <- 98 | function(x) { 99 | # energy GOF statistic for Poissonity 100 | if (any(!is.integer(x)) || any(x < 0)) { 101 | warning("sample must be non-negative integers") 102 | return(NULL) 103 | } 104 | lambda <- mean(x) 105 | n <- length(x) 106 | 107 | ## E|y-X| for X Poisson(lambda) (vectorized) 108 | Px <- ppois(x, lambda) 109 | Px1 <- ppois(x-1, lambda) 110 | meanvec <- 2*x*Px - 2*lambda*Px1 + lambda - x 111 | 112 | ## second mean E|X-X'| 113 | a <- 2 * lambda 114 | EXX <- a * exp(-a) * (besselI(a, 0) + besselI(a, 1)) 115 | 116 | ## third mean = sum_{i,j} |x_i - x_j| / n^2 117 | K <- seq(1 - n, n - 1, 2) 118 | y <- sort(x) 119 | meanxx <- 2 * sum(K * y) / n^2 120 | stat <- n * (2 * mean(meanvec) - EXX - meanxx) 121 | names(stat) <- "E" 122 | return(stat) 123 | } 124 | 125 | 126 | -------------------------------------------------------------------------------- /man/dcov2d.Rd: -------------------------------------------------------------------------------- 1 | \name{dcov2d} 2 | \alias{dcor2d} 3 | \alias{dcov2d} 4 | \title{Fast dCor and dCov for bivariate data only} 5 | \description{ 6 | For bivariate data only, these are fast O(n log n) implementations of distance 7 | correlation and distance covariance statistics. 
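A hedged sketch of the large-sample use case (illustrative only; timings depend on the machine):

n <- 20000
x <- rnorm(n); y <- rnorm(n)
system.time(dcov2d(x, y))   # no n x n distance matrix is formed
## dcov(x, y) would need an n x n distance matrix (roughly 3 GB of
## doubles at this n), so it is impractical at this sample size.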
The U-statistic for dcov^2 is unbiased; 8 | the V-statistic is the original definition in SRB 2007. These algorithms do not 9 | store the distance matrices, so they are suitable for large samples. 10 | } 11 | \usage{ 12 | dcor2d(x, y, type = c("V", "U")) 13 | dcov2d(x, y, type = c("V", "U"), all.stats = FALSE) 14 | } 15 | \arguments{ 16 | \item{x}{ numeric vector} 17 | \item{y}{ numeric vector} 18 | \item{type}{ "V" or "U", for V- or U-statistics} 19 | \item{all.stats}{ logical} 20 | } 21 | \details{ 22 | The unbiased (squared) dcov is documented in \code{dcovU}, for multivariate data in arbitrary, not necessarily equal dimensions. \code{dcov2d} and \code{dcor2d} provide a faster O(n log n) algorithm for bivariate (x, y) only (X and Y are real-valued random vectors). The O(n log n) algorithm was proposed by Huo and Szekely (2016). The algorithm is faster above a certain sample size n. It does not store the distance matrix so the sample size can be very large. 23 | } 24 | \value{ 25 | By default, \code{dcov2d} returns the V-statistic \eqn{V_n = dCov_n^2(x, y)}{V_n = dCov_n^2(x, y)}, and if type="U", it returns the U-statistic, unbiased for \eqn{dCov^2(X, Y)}{dCov^2(X,Y)}. The argument all.stats=TRUE is used internally when the function is called from \code{dcor2d}. 26 | 27 | By default, \code{dcor2d} returns \eqn{dCor_n^2(x, y)}{dCor_n^2(x, y)}, and if type="U", it returns a bias-corrected estimator of squared dcor equivalent to \code{bcdcor}. 28 | 29 | These functions do not store the distance matrices so they are helpful when sample size is large and the data is bivariate. 30 | } 31 | \note{ 32 | The U-statistic \eqn{U_n}{U_n} can be negative in the lower tail so 33 | the square root of the U-statistic is not applied. 34 | Similarly, \code{dcor2d(x, y, "U")} is bias-corrected and can be 35 | negative in the lower tail, so we do not take the 36 | square root. The original definitions of dCov and dCor 37 | (SRB2007, SR2009) were based on V-statistics, which are non-negative, 38 | and defined using the square root of V-statistics. 39 | 40 | It has been suggested that instead of taking the square root of the U-statistic, one could take the root of \eqn{|U_n|}{|U_n|} before applying the sign, but that introduces more bias than the original dCor, and should never be used. 41 | } 42 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 43 | Gabor J. Szekely 44 | } 45 | \seealso{ 46 | \code{\link{dcov}} \code{\link{dcov.test}} \code{\link{dcor}} \code{\link{dcor.test}} (multivariate statistics and permutation test) 47 | } 48 | \references{ 49 | Huo, X. and Szekely, G.J. (2016). Fast computing for 50 | distance covariance. Technometrics, 58(4), 435-447. 51 | 52 | Szekely, G.J. and Rizzo, M.L. (2014), 53 | Partial Distance Correlation with Methods for Dissimilarities. 54 | \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. 55 | 56 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), 57 | Measuring and Testing Dependence by Correlation of Distances, 58 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 
59 | \cr \doi{10.1214/009053607000000505} 60 | } 61 | \examples{ 62 | \donttest{ 63 | ## these are equivalent, but 2d is faster for n > 50 64 | n <- 100 65 | x <- rnorm(100) 66 | y <- rnorm(100) 67 | all.equal(dcov(x, y)^2, dcov2d(x, y), check.attributes = FALSE) 68 | all.equal(bcdcor(x, y), dcor2d(x, y, "U"), check.attributes = FALSE) 69 | 70 | x <- rlnorm(400) 71 | y <- rexp(400) 72 | dcov.test(x, y, R=199) #permutation test 73 | dcor.test(x, y, R=199) 74 | } 75 | } 76 | \keyword{ htest } 77 | \keyword{ nonparametric } 78 | \concept{ independence } 79 | \concept{ distance correlation } 80 | \concept{ distance covariance } 81 | \concept{ energy statistics } 82 | -------------------------------------------------------------------------------- /man/kgroups.Rd: -------------------------------------------------------------------------------- 1 | \name{kgroups} 2 | \alias{kgroups} 3 | 4 | \title{ 5 | K-Groups Clustering 6 | } 7 | \description{ 8 | Perform k-groups clustering by energy distance. 9 | } 10 | \usage{ 11 | kgroups(x, k, iter.max = 10, nstart = 1, cluster = NULL) 12 | } 13 | \arguments{ 14 | \item{x}{Data frame or data matrix or distance object} 15 | \item{k}{number of clusters} 16 | \item{iter.max}{maximum number of iterations} 17 | \item{nstart}{number of restarts} 18 | \item{cluster}{initial clustering vector} 19 | } 20 | 21 | \details{ 22 | K-groups is based on the multisample energy distance for comparing distributions. 23 | Based on the disco decomposition of total dispersion (a Gini type mean distance) the objective function should either maximize the total between cluster energy distance, or equivalently, minimize the total within cluster energy distance. It is more computationally efficient to minimize within distances, and that makes it possible to use a modified version of the Hartigan-Wong algorithm (1979) to implement K-groups clustering. 24 | 25 | The within cluster Gini mean distance is 26 | \deqn{G(C_j) = \frac{1}{n_j^2} \sum_{i,m=1}^{n_j} |x_{i,j} - x_{m,j}|} 27 | and the K-groups within cluster distance is 28 | \deqn{W_j = \frac{n_j}{2}G(C_j) = \frac{1}{2 n_j} \sum_{i,m=1}^{n_j} |x_{i,j} - x_{m,j}.} 29 | If z is the data matrix for cluster \eqn{C_j}, then \eqn{W_j} could be computed as 30 | \code{sum(dist(z)) / nrow(z)}. 31 | 32 | If cluster is not NULL, the clusters are initialized by this vector (can be a factor or integer vector). Otherwise clusters are initialized with random labels in k approximately equal size clusters. 33 | 34 | If \code{x} is not a distance object (class(x) == "dist") then \code{x} is converted to a data matrix for analysis. 35 | 36 | Run up to \code{iter.max} complete passes through the data set until a local min is reached. If \code{nstart > 1}, on second and later starts, clusters are initialized at random, and the best result is returned. 37 | } 38 | 39 | \value{ 40 | An object of class \code{kgroups} containing the components 41 | \item{call}{the function call} 42 | \item{cluster}{vector of cluster indices} 43 | \item{sizes}{cluster sizes} 44 | \item{within}{vector of Gini within cluster distances} 45 | \item{W}{sum of within cluster distances} 46 | \item{count}{number of moves} 47 | \item{iterations}{number of iterations} 48 | \item{k}{number of clusters} 49 | 50 | \code{cluster} is a vector containing the group labels, 1 to k. \code{print.kgroups} 51 | prints some of the components of the kgroups object. 52 | 53 | Expect that count is 0 if the algorithm converged to a local min (that is, 0 moves happened on the last iteration). 
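A hedged check of both quantities (a sketch assuming the iris data, and that the within component stores the per-cluster W_j described in Details):

x <- as.matrix(iris[, 1:4])
set.seed(123)
kg <- kgroups(x, k = 3, iter.max = 10, nstart = 2)
z <- x[kg$cluster == 1, , drop = FALSE]
c(kg$within[1], sum(dist(z)) / nrow(z))   # two routes to the same W_1
kg$count                                  # expected to be 0 at a local minimum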
If iterations equals iter.max and count is positive, then the algorithm did not converge to a local min. 54 | } 55 | \author{ 56 | Maria Rizzo and Songzi Li 57 | } 58 | 59 | \references{ 60 | Li, Songzi (2015). 61 | "K-groups: A Generalization of K-means by Energy Distance." 62 | Ph.D. thesis, Bowling Green State University. 63 | 64 | Li, S. and Rizzo, M. L. (2017). 65 | "K-groups: A Generalization of K-means Clustering". 66 | ArXiv e-print 1711.04359. https://arxiv.org/abs/1711.04359 67 | 68 | Szekely, G. J., and M. L. Rizzo. "Testing for equal distributions in high dimension." InterStat 5, no. 16.10 (2004). 69 | 70 | Rizzo, M. L., and G. J. Szekely. "Disco analysis: A nonparametric extension of analysis of variance." The Annals of Applied Statistics (2010): 1034-1055. 71 | 72 | Hartigan, J. A. and Wong, M. A. (1979). "Algorithm AS 136: A K-means clustering algorithm." Applied Statistics, 28, 100-108. doi: 10.2307/2346830. 73 | } 74 | 75 | \examples{ 76 | x <- as.matrix(iris[ ,1:4]) 77 | set.seed(123) 78 | kg <- kgroups(x, k = 3, iter.max = 5, nstart = 2) 79 | kg 80 | fitted(kg) 81 | 82 | \donttest{ 83 | d <- dist(x) 84 | set.seed(123) 85 | kg <- kgroups(d, k = 3, iter.max = 5, nstart = 2) 86 | kg 87 | 88 | kg$cluster 89 | 90 | fitted(kg) 91 | fitted(kg, method = "groups") 92 | } 93 | } 94 | 95 | \keyword{ cluster } 96 | \keyword{ multivariate } 97 | -------------------------------------------------------------------------------- /R/Eeqdist.R: -------------------------------------------------------------------------------- 1 | eqdist.e <- 2 | function(x, sizes, distance = FALSE, method = c("original","discoB","discoF")) 3 | { 4 | ## multivariate E-statistic for testing equal distributions 5 | ## x: matrix of pooled sample or distance matrix 6 | ## sizes: vector of sample sizes 7 | ## distance: logical, TRUE if x is a distance matrix, otherwise false 8 | ## method: original (default) or disco between components, or disco F ratio 9 | 10 | method <-match.arg(method) 11 | if (method=="discoB") { 12 | g <- as.factor(rep(1:length(sizes), sizes)) 13 | RVAL <- disco(x, factors=g, distance=distance, R=0, method=method) 14 | } else { 15 | RVAL <- eqdist.etest(x, sizes, distance = distance, R=0, method=method)$statistic 16 | } 17 | RVAL 18 | } 19 | 20 | eqdist.etest <- 21 | function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), R) 22 | { 23 | ## multivariate E-test of the multisample hypothesis of equal distributions 24 | ## x: matrix of pooled sample or distance matrix 25 | ## sizes: vector of sample sizes 26 | ## distance: logical, TRUE if x is a distance matrix, otherwise false 27 | ## method: original (default) or disco components 28 | ## R: number of replicates 29 | ## 30 | 31 | method <-match.arg(method) 32 | 33 | if (method=="discoB" || method=="discoF") { 34 | g <- as.factor(rep(1:length(sizes), sizes)) 35 | # for other index use disco() function directly 36 | return(disco(x, factors=g, distance=distance, index=1.0, R=R, method=method)) 37 | } 38 | 39 | nsamples <- length(sizes) 40 | if (nsamples < 2) return (NA) 41 | if (min(sizes) < 1) return (NA) 42 | if (!is.null(attr(x, "Size"))) distance <- TRUE 43 | 44 | x <- as.matrix(x) 45 | if (NROW(x) != sum(sizes)) stop("nrow(x) should equal sum(sizes)") 46 | if (distance == FALSE && nrow(x) == ncol(x)) 47 | warning("square data matrix with distance==FALSE") 48 | d <- NCOL(x) 49 | if (distance == TRUE) d <- 0 50 | str <- "Multivariate " 51 | if (d == 1) str <- "Univariate " 52 | if (d == 0) str <- "" 53 | 54 | e0 <- 0.0 55 
| repl <- rep(0, R) 56 | pval <- 1.0 57 | b <- .C("ksampleEtest", 58 | x = as.double(t(x)), 59 | byrow = as.integer(1), 60 | nsamples = as.integer(nsamples), 61 | sizes = as.integer(sizes), 62 | dim = as.integer(d), 63 | R = as.integer(R), 64 | e0 = as.double(e0), 65 | e = as.double(repl), 66 | pval = as.double(pval), 67 | PACKAGE = "energy") 68 | 69 | names(b$e0) <- "E-statistic" 70 | sz <- paste(sizes, collapse = " ", sep = "") 71 | methodname <- paste(str, length(sizes), 72 | "-sample E-test of equal distributions", sep = "") 73 | dataname <- paste("sample sizes ", sz, ", replicates ", R, sep="") 74 | e <- list( 75 | call = match.call(), 76 | method = methodname, 77 | statistic = b$e0, 78 | p.value = b$pval, 79 | data.name = dataname) 80 | 81 | class(e) <- "htest" 82 | e 83 | } 84 | 85 | ksample.e <- 86 | function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), 87 | ix = 1:sum(sizes)) 88 | { 89 | ## computes k-sample E-statistics for equal distributions 90 | ## retained for backward compatibility or use with boot 91 | ## (this function simply passes arguments to eqdist.e) 92 | ## 93 | ## x: pooled sample or distance matrix 94 | ## sizes: vector of sample sizes 95 | ## distance: TRUE if x is a distance matrix, otherwise FALSE 96 | ## method: default (original) or disco between components or disco F ratio 97 | ## ix: a permutation of row indices of x 98 | ## 99 | x <- as.matrix(x) 100 | method <- match.arg(method) 101 | eqdist.e(x[ix,], sizes=sizes, distance=distance, method=method) 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/kgroups.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | /* 5 | Author: Maria L. 
Rizzo 6 | energy package 7 | github.com/mariarizzo/energy 8 | */ 9 | 10 | 11 | int kgroups_update(NumericMatrix x, int k, IntegerVector clus, 12 | IntegerVector sizes, NumericVector within, bool distance); 13 | List kgroups_start(NumericMatrix x, int k, IntegerVector clus, 14 | int iter_max, bool distance); 15 | 16 | int kgroups_update(NumericMatrix x, int k, IntegerVector clus, 17 | IntegerVector sizes, NumericVector w, bool distance) { 18 | /* 19 | * k-groups one pass through sample moving one point at a time 20 | * x: data matrix or distance 21 | * k: number of clusters 22 | * clus: clustering vector clus(i)==j ==> x_i is in cluster j 23 | * sizes: cluster sizes 24 | * within: vector of within cluster dispersions 25 | * distance: true if x is distance matrix 26 | * update clus, sizes, and withins 27 | * return count = number of points moved 28 | */ 29 | 30 | int n = x.nrow(), d = x.ncol(); 31 | int i, j, I, J, ix, nI, nJ; 32 | NumericVector rowdst(k), e(k); 33 | int best, count = 0; 34 | double dsum, dif; 35 | 36 | for (ix = 0; ix < n; ix++) { 37 | I = clus(ix); 38 | nI = sizes(I); 39 | if (nI > 1) { 40 | // calculate the E-distances of this point to each cluster 41 | rowdst.fill(0.0); 42 | for (i = 0; i < n; i++) { 43 | J = clus(i); 44 | if (distance == true) { 45 | rowdst(J) += x(ix, i); 46 | } else { 47 | dsum = 0.0; 48 | for (j = 0; j < d; j++) { 49 | dif = x(ix, j) - x(i, j); 50 | dsum += dif * dif; 51 | } 52 | rowdst(J) += sqrt(dsum); 53 | } 54 | } 55 | 56 | for (J = 0; J < k; J++) { 57 | nJ = sizes(J); 58 | e(J) = (2.0 / (double) nJ) * (rowdst(J) - w(J)); 59 | } 60 | 61 | best = Rcpp::which_min(e); 62 | if (best != I) { 63 | // move this point and update 64 | nI = sizes(I); 65 | nJ = sizes(best); 66 | w(best) = (((double) nJ) * w(best) + rowdst(best)) / ((double) (nJ + 1)); 67 | w(I) = (((double) nI) * w(I) - rowdst(I)) / ((double) (nI - 1)); 68 | clus(ix) = best; 69 | sizes(I) = nI - 1; 70 | sizes(best) = nJ + 1; 71 | count ++; // number of moves 72 | } 73 | } 74 | } 75 | 76 | return count; 77 | } 78 | 79 | 80 | 81 | // [[Rcpp::export]] 82 | List kgroups_start(NumericMatrix x, int k, IntegerVector clus, 83 | int iter_max, bool distance) { 84 | // k-groups clustering with initial clustering vector clus 85 | // up to iter_max iterations of n possible moves each 86 | // distance: true if x is distance matrix 87 | NumericVector within(k, 0.0); 88 | IntegerVector sizes(k, 0); 89 | double dif, dsum; 90 | int I, J, h, i, j; 91 | int n = x.nrow(), d = x.ncol(); 92 | 93 | for (i = 0; i < n; i++) { 94 | I = clus(i); 95 | sizes(I)++; 96 | for (j = 0; j < i; j++) { 97 | J = clus(j); 98 | if (I == J) { 99 | if (distance == true) { 100 | within(I) += x(i, j); 101 | } else { 102 | dsum = 0.0; 103 | for (h = 0; h < d; h++) { 104 | dif = x(i, h) - x(j, h); 105 | dsum += dif * dif; 106 | } 107 | within(I) += sqrt(dsum); 108 | } 109 | } 110 | } 111 | } 112 | for (I = 0; I < k; I++) 113 | within(I) /= ((double) sizes(I)); 114 | 115 | int it = 1, count = 1; 116 | count = kgroups_update(x, k, clus, sizes, within, distance); 117 | 118 | while (it < iter_max && count > 0) { 119 | count = kgroups_update(x, k, clus, sizes, within, distance); 120 | it++; 121 | } 122 | double W = Rcpp::sum(within); 123 | 124 | return List::create( 125 | _["within"] = within, 126 | _["W"] = W, 127 | _["sizes"] = sizes, 128 | _["cluster"] = clus, 129 | _["iterations"] = it, 130 | _["count"] = count); 131 | } 132 | 133 | 134 | -------------------------------------------------------------------------------- /man/poisson.Rd: 
-------------------------------------------------------------------------------- 1 | \name{Poisson Tests} 2 | \alias{poisson.tests} 3 | \alias{poisson.e} 4 | \alias{poisson.etest} 5 | \alias{poisson.m} 6 | \alias{poisson.mtest} 7 | \title{ Goodness-of-Fit Tests for Poisson Distribution} 8 | \description{ 9 | Performs the mean distance goodness-of-fit test and the energy goodness-of-fit test of Poisson distribution with unknown parameter. 10 | } 11 | \usage{ 12 | poisson.e(x) 13 | poisson.m(x) 14 | poisson.etest(x, R) 15 | poisson.mtest(x, R) 16 | poisson.tests(x, R, test="all") 17 | } 18 | \arguments{ 19 | \item{x}{ vector of nonnegative integers, the sample data } 20 | \item{R}{ number of bootstrap replicates } 21 | \item{test}{ name of test(s) } 22 | } 23 | \details{ 24 | Two distance-based tests of Poissonity are applied in \code{poisson.tests}, "M" and "E". The default is to 25 | do all tests and return results in a data frame. 26 | Valid choices for \code{test} are "M", "E", or "all" with 27 | default "all". 28 | 29 | If "all" tests, all tests are performed by a single parametric bootstrap computing all test statistics on each sample. 30 | 31 | The "M" choice is two tests, one based on a Cramer-von Mises distance and the other an Anderson-Darling distance. The "E" choice is the energy goodness-of-fit test. 32 | 33 | \code{R} must be a positive integer for a test. If \code{R} is missing or 0, a warning is printed but test statistics are computed (without testing). 34 | 35 | The mean distance test of Poissonity (M-test) is based on the result that the sequence 36 | of expected values E|X-j|, j=0,1,2,... characterizes the distribution of 37 | the random variable X. As an application of this characterization one can 38 | get an estimator \eqn{\hat F(j)} of the CDF. The test statistic 39 | (see \code{\link{poisson.m}}) is a Cramer-von Mises type of distance, with 40 | M-estimates replacing the usual EDF estimates of the CDF: 41 | \deqn{M_n = n\sum_{j=0}^\infty (\hat F(j) - F(j\;; \hat \lambda))^2 42 | f(j\;; \hat \lambda).}{M_n = n sum [j>=0] (\hat F(j) - F(j; \hat \lambda))^2 43 | f(j; \hat \lambda).} 44 | 45 | In \code{poisson.tests}, an Anderson-Darling type of weight is also applied when \code{test="M"} or \code{test="all"}. 46 | 47 | The tests are implemented by parametric bootstrap with 48 | \code{R} replicates. 49 | 50 | An energy goodness-of-fit test (E) is based on the test statistic 51 | \deqn{Q_n = n (\frac{2}{n} \sum_{i=1}^n E|x_i - X| - E|X-X'| - \frac{1}{n^2} \sum_{i,j=1}^n |x_i - x_j|, 52 | }{Q_n = n((2/n) sum[1:n] E|x_i-X| - E|X-X'| - (1/n^2) sum[1:n,1:n] 53 | |x_i-x_j|),} 54 | where X and X' are iid with the hypothesized null distribution. For a test of H: X ~ Poisson(\eqn{\lambda}), we can express E|X-X'| in terms of Bessel functions, and E|x_i - X| in terms of the CDF of Poisson(\eqn{\lambda}). 55 | 56 | If test=="all" or not specified, all tests are run with a single parametric bootstrap. \code{poisson.mtest} implements only the Poisson M-test with Cramer-von Mises type distance. \code{poisson.etest} implements only the Poisson energy test. 57 | } 58 | \value{ 59 | The functions \code{poisson.m} and \code{poisson.e} return the test statistics. 
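The last term of Q_n is computed in poisson.e from the sorted sample using an O(n log n) identity; a hedged illustration of that identity:

x <- rpois(200, 2)
n <- length(x)
direct <- mean(abs(outer(x, x, "-")))   # (1/n^2) sum_{i,j} |x_i - x_j|
K <- seq(1 - n, n - 1, 2)
fast <- 2 * sum(K * sort(x)) / n^2      # same quantity from the sorted sample
all.equal(direct, fast)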
The function 60 | \code{poisson.mtest} or \code{poisson.etest} return an \code{htest} object containing 61 | \item{method}{Description of test} 62 | \item{statistic}{observed value of the test statistic} 63 | \item{p.value}{approximate p-value of the test} 64 | \item{data.name}{replicates R} 65 | \item{estimate}{sample mean} 66 | 67 | \code{poisson.tests} returns "M-CvM test", "M-AD test" and "Energy test" results in a data frame with columns 68 | \item{estimate}{sample mean} 69 | \item{statistic}{observed value of the test statistic} 70 | \item{p.value}{approximate p-value of the test} 71 | \item{method}{Description of test} 72 | which can be coerced to a \code{tibble}. 73 | } 74 | \note{The running time of the M test is much faster than the E-test.} 75 | \references{ 76 | Szekely, G. J. and Rizzo, M. L. (2004) Mean Distance Test of Poisson Distribution, \emph{Statistics and Probability Letters}, 77 | 67/3, 241-247. \doi{10.1016/j.spl.2004.01.005}. 78 | 79 | Szekely, G. J. and Rizzo, M. L. (2005) A New Test for 80 | Multivariate Normality, \emph{Journal of Multivariate Analysis}, 81 | 93/1, 58-80, 82 | \doi{10.1016/j.jmva.2003.12.002}. 83 | } 84 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 85 | Gabor J. Szekely 86 | } 87 | \examples{ 88 | x <- rpois(50, 2) 89 | poisson.m(x) 90 | poisson.e(x) 91 | \donttest{ 92 | poisson.etest(x, R=199) 93 | poisson.mtest(x, R=199) 94 | poisson.tests(x, R=199) 95 | } 96 | } 97 | \keyword{ htest } 98 | \keyword{ energy } 99 | -------------------------------------------------------------------------------- /man/indep-deprecated.Rd: -------------------------------------------------------------------------------- 1 | \name{indep.test} 2 | \alias{indep.test} 3 | \title{ Energy-tests of Independence} 4 | \description{ 5 | Computes a multivariate nonparametric test of independence. 6 | The default method implements the distance covariance test 7 | \code{\link{dcov.test}}. 8 | } 9 | \usage{ 10 | indep.test(x, y, method = c("dcov","mvI"), index = 1, R) 11 | } 12 | \arguments{ 13 | \item{x}{ matrix: first sample, observations in rows} 14 | \item{y}{ matrix: second sample, observations in rows} 15 | \item{method}{ a character string giving the name of the test} 16 | \item{index}{ exponent on Euclidean distances} 17 | \item{R}{ number of replicates} 18 | } 19 | \details{ 20 | \code{indep.test} with the default \code{method = "dcov"} computes 21 | the distance 22 | covariance test of independence. \code{index} is an exponent on 23 | the Euclidean distances. Valid choices for \code{index} are in (0,2], 24 | with default value 1 (Euclidean distance). The arguments are passed 25 | to the \code{dcov.test} function. See the help topic \code{\link{dcov.test}} for 26 | the description and documentation and also see the references below. 27 | 28 | \code{indep.test} with \code{method = "mvI"} 29 | computes the coefficient \eqn{\mathcal I_n}{I_n} and performs a nonparametric 30 | \eqn{\mathcal E}{E}-test of independence. The arguments are passed to 31 | \code{mvI.test}. The 32 | \code{index} argument is ignored (\code{index = 1} is applied). 33 | See the help topic \code{\link{mvI.test}} and also 34 | see the reference (2006) below for details. 35 | 36 | The test decision is obtained via 37 | bootstrap, with \code{R} replicates. 38 | The sample sizes (number of rows) of the two samples must agree, and 39 | samples must not contain missing values. 40 | 41 | These energy tests of independence are based on related theoretical 42 | results, but different test statistics. 
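Because the arguments are passed through, the default method should reproduce the dcov.test statistic; a hedged sketch:

set.seed(1)
x <- matrix(rnorm(60), 20, 3)
y <- matrix(rnorm(40), 20, 2)
it <- indep.test(x, y, method = "dcov", R = 99)
dt <- dcov.test(x, y, index = 1, R = 99)
c(it$statistic, dt$statistic)   # both are the nV^2 statistic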
43 | The \code{dcov} method is faster than \code{mvI} method by 44 | approximately a factor of O(n). 45 | } 46 | \value{ 47 | \code{indep.test} returns a list with class 48 | \code{htest} containing 49 | \item{ method}{description of test} 50 | \item{ statistic}{observed value of the 51 | test statistic \eqn{n \mathcal V_n^2}{n V_n^2} 52 | or \eqn{n \mathcal I_n^2}{n I_n^2}} 53 | \item{ estimate}{ \eqn{\mathcal V_n}{V_n} or \eqn{\mathcal I_n}{I_n}} 54 | \item{ estimates}{ a vector [dCov(x,y), dCor(x,y), dVar(x), dVar(y)] 55 | (method dcov)} 56 | \item{ replicates}{ replicates of the test statistic} 57 | \item{ p.value}{approximate p-value of the test} 58 | \item{ data.name}{description of data} 59 | } 60 | \note{As of energy-1.1-0, 61 | \code{indep.etest} is deprecated and replaced by \code{indep.test}, which 62 | has methods for two different energy tests of independence. \code{indep.test} applies 63 | the distance covariance test (see \code{dcov.test}) by default (\code{method = "dcov"}). 64 | The original \code{indep.etest} applied the independence coefficient 65 | \eqn{\mathcal I_n}{I_n}, which is now obtained by \code{method = "mvI"}. 66 | } 67 | \seealso{ 68 | \code{ \link{dcov.test} } 69 | \code{ \link{mvI.test} } 70 | \code{ \link{dcov} } 71 | \code{ \link{mvI} } 72 | } 73 | \references{ 74 | Szekely, G.J. and Rizzo, M.L. (2009), 75 | Brownian Distance Covariance, 76 | \emph{Annals of Applied Statistics}, Vol. 3 No. 4, pp. 77 | 1236-1265. (Also see discussion and rejoinder.) 78 | \cr \doi{10.1214/09-AOAS312} 79 | 80 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), 81 | Measuring and Testing Dependence by Correlation of Distances, 82 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 83 | \cr \doi{10.1214/009053607000000505} 84 | 85 | Bakirov, N.K., Rizzo, M.L., and Szekely, G.J. (2006), A Multivariate 86 | Nonparametric Test of Independence, \emph{Journal of Multivariate Analysis} 87 | 93/1, 58-80, \cr 88 | \doi{10.1016/j.jmva.2005.10.005} 89 | } 90 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 91 | Gabor J. Szekely 92 | } 93 | \examples{ 94 | \donttest{ 95 | ## independent multivariate data 96 | x <- matrix(rnorm(60), nrow=20, ncol=3) 97 | y <- matrix(rnorm(40), nrow=20, ncol=2) 98 | indep.test(x, y, method = "dcov", R = 99) 99 | indep.test(x, y, method = "mvI", R = 99) 100 | 101 | ## dependent multivariate data 102 | if (require(MASS)) { 103 | Sigma <- matrix(c(1, .1, 0, 0 , 1, 0, 0 ,.1, 1), 3, 3) 104 | x <- mvrnorm(30, c(0, 0, 0), diag(3)) 105 | y <- mvrnorm(30, c(0, 0, 0), Sigma) * x 106 | indep.test(x, y, R = 99) #dcov method 107 | indep.test(x, y, method = "mvI", R = 99) 108 | } 109 | } 110 | } 111 | 112 | \keyword{ htest } 113 | \keyword{ multivariate } 114 | \keyword{ nonparametric } 115 | \concept{ independence } 116 | \concept{ energy statistics } 117 | 118 | -------------------------------------------------------------------------------- /R/dcov.R: -------------------------------------------------------------------------------- 1 | dcov.test <- 2 | function(x, y, index=1.0, R=NULL) { 3 | ## check for valid number of replicates R 4 | method <- "Specify the number of replicates R (R > 0) for an independence test" 5 | if (! 
is.null(R)) { 6 | R <- floor(R) 7 | if (R < 1) R <- 0 8 | if (R > 0) 9 | method <- "dCov independence test (permutation test)" 10 | } else { 11 | R <- 0 12 | } 13 | 14 | Dx <- .arg2dist.matrix(x) 15 | Dy <- .arg2dist.matrix(y) 16 | if (!isTRUE(all.equal(index, 1.0))) { 17 | Dx <- Dx^index 18 | Dy <- Dy^index 19 | } 20 | 21 | n <- nrow(Dx) 22 | m <- nrow(Dy) 23 | if (n != m) stop("Sample sizes must agree") 24 | 25 | stat <- dcorr <- reps <- 0 26 | dcov <- rep(0, 4) 27 | if (R > 0) reps <- rep(0, R) 28 | pval <- 1 29 | dims <- c(n, ncol(Dx), ncol(Dy), R) 30 | 31 | # dcov = [dCov,dCor,dVar(x),dVar(y)] 32 | a <- .C("dCOVtest", 33 | x = as.double(t(Dx)), 34 | y = as.double(t(Dy)), 35 | nrow = as.integer(nrow(Dx)), 36 | nreps = as.integer(R), 37 | reps = as.double(reps), 38 | DCOV = as.double(dcov), 39 | pval = as.double(pval), 40 | PACKAGE = "energy") 41 | # test statistic is n times the square of dCov statistic 42 | stat <- n * a$DCOV[1]^2 43 | dcorr <- a$DCOV 44 | V <- dcorr[[1]] 45 | names(stat) <- "nV^2" 46 | names(V) <- "dCov" 47 | dataname <- paste("index ", index, ", replicates ", R, sep="") 48 | pval <- ifelse (R < 1, NA, a$pval) 49 | e <- list( 50 | statistic = stat, 51 | method = method, 52 | estimate = V, 53 | estimates = dcorr, 54 | p.value = pval, 55 | replicates = n* a$reps^2, 56 | n = n, 57 | data.name = dataname) 58 | class(e) <- "htest" 59 | return(e) 60 | } 61 | 62 | dcor.test <- 63 | function(x, y, index=1.0, R) { 64 | # distance correlation test for multivariate independence 65 | # like dcov.test but using dcor as the test statistic 66 | if (missing(R)) R <- 0 67 | R <- ifelse(R > 0, floor(R), 0) 68 | RESULT <- dcov.test(x, y, index=index, R) 69 | # this test statistic is n times the square of dCov statistic 70 | DCOVteststat <- RESULT$statistic 71 | DCOVreplicates <- RESULT$replicates 72 | 73 | # RESULT$estimates = [dCov,dCor,dVar(x),dVar(y)] 74 | # dVar are invariant under permutation of sample indices 75 | estimates = RESULT$estimates 76 | names(estimates) <- c("dCov", "dCor", "dVar(X)", "dVar(Y)") 77 | 78 | DCORteststat <- RESULT$estimates[2] 79 | dvarX <- RESULT$estimates[3] 80 | dvarY <- RESULT$estimates[4] 81 | n <- RESULT$n 82 | 83 | if (R > 0) { 84 | DCORreps <- sqrt(DCOVreplicates / n) / sqrt(dvarX * dvarY) 85 | p.value <- (1 + sum(DCORreps >= DCORteststat)) / (1 + R) 86 | } else { 87 | p.value <- NA 88 | DCORreps <- NA 89 | } 90 | 91 | names(DCORteststat) <- "dCor" 92 | dataname <- paste("index ", index, ", replicates ", R, sep="") 93 | method <- ifelse(R > 0, "dCor independence test (permutation test)", 94 | "Specify the number of replicates R>0 for an independence test") 95 | e <- list( 96 | method = method, 97 | statistic = DCORteststat, 98 | estimates = estimates, 99 | p.value = p.value, 100 | replicates = DCORreps, 101 | n = n, 102 | data.name = dataname) 103 | class(e) <- "htest" 104 | return(e) 105 | } 106 | 107 | 108 | .dcov <- 109 | function(x, y, index=1.0) { 110 | # distance covariance statistic for independence 111 | # dcov = [dCov,dCor,dVar(x),dVar(y)] (vector) 112 | # this function provides the fast method for computing dCov 113 | # it is called by the dcov and dcor functions 114 | 115 | Dx <- .arg2dist.matrix(x) 116 | Dy <- .arg2dist.matrix(y) 117 | if (!isTRUE(all.equal(index, 1.0))) { 118 | Dx <- Dx^index 119 | Dy <- Dy^index 120 | } 121 | 122 | n <- nrow(Dx) 123 | m <- nrow(Dy) 124 | if (n != m) stop("Sample sizes must agree") 125 | dims <- c(n, ncol(Dx), ncol(Dy)) 126 | idx <- 1:dims[1] 127 | DCOV <- numeric(4) 128 | a <- .C("dCOV", 129 | x = 
as.double(t(Dx)), 130 | y = as.double(t(Dy)), 131 | nrow = as.integer(n), 132 | DCOV = as.double(DCOV), 133 | PACKAGE = "energy") 134 | return(a$DCOV) 135 | } 136 | 137 | dcov <- 138 | function(x, y, index=1.0) { 139 | # distance correlation statistic for independence 140 | return(.dcov(x, y, index)[1]) 141 | } 142 | 143 | dcor <- 144 | function(x, y, index=1.0) { 145 | # distance correlation statistic for independence 146 | return(.dcov(x, y, index)[2]) 147 | } 148 | -------------------------------------------------------------------------------- /R/Emvnorm.R: -------------------------------------------------------------------------------- 1 | mvnorm.test <- function(x, R) { 2 | # parametric bootstrap E-test for multivariate normality 3 | if (missing(R)) { 4 | method = "Energy test of multivariate normality: (Specify R > 0 for MC test)" 5 | R <- 0 6 | } else { 7 | method = "Energy test of multivariate normality: estimated parameters" 8 | } 9 | 10 | if (is.vector(x) || NCOL(x)==1) { 11 | n <- NROW(x) 12 | d <- 1 13 | bootobj <- boot::boot(x, statistic = normal.e, R = R, sim = "parametric", 14 | ran.gen = function(x, y) { 15 | return(rnorm(n)) 16 | }) 17 | } else { 18 | n <- nrow(x) 19 | d <- ncol(x) 20 | bootobj <- boot::boot(x, statistic = mvnorm.e, R = R, sim = "parametric", 21 | ran.gen = function(x, y) { 22 | return(matrix(rnorm(n * d), nrow = n, ncol = d)) 23 | }) 24 | } 25 | if (R > 0) 26 | p <- 1 - mean(bootobj$t < bootobj$t0) else p <- NA 27 | 28 | names(bootobj$t0) <- "E-statistic" 29 | e <- list(statistic = bootobj$t0, p.value = p, 30 | method = method, 31 | data.name = paste("x, sample size ", n, ", dimension ", d, ", replicates ", 32 | R, sep = "")) 33 | class(e) <- "htest" 34 | e 35 | } 36 | 37 | mvnorm.etest <- function(x, R) { 38 | return(mvnorm.test(x, R)) 39 | } 40 | 41 | mvnorm.e <- function(x) { 42 | # E-statistic for multivariate normality 43 | if (is.vector(x) || NCOL(x)==1) 44 | return(normal.e(x)) 45 | n <- nrow(x) 46 | d <- ncol(x) 47 | if (n < 2) { 48 | warning("sample size must be at least 2") 49 | return(NA) 50 | } 51 | # subtract column means and compute S^(-1/2) 52 | z <- scale(x, scale = FALSE) 53 | ev <- eigen(var(x), symmetric = TRUE) 54 | P <- ev$vectors 55 | lambda <- ev$values 56 | D <- diag(d) 57 | diag(D) <- 1 / sqrt(lambda) 58 | y <- z %*% (P %*% D %*% t(P)) 59 | if (any(!is.finite(y))) { 60 | warning("missing or non-finite y") 61 | return(NA) 62 | } 63 | 64 | if (requireNamespace("gsl", quietly=TRUE)) { 65 | const <- exp(lgamma((d+1)/2) - lgamma(d/2)) 66 | mean2 <- 2*const 67 | ysq <- rowSums(y^2) 68 | mean1 <- sqrt(2) * const * 69 | mean(gsl::hyperg_1F1(-1/2, d/2, -ysq/2)) 70 | mean3 <- 2*sum(dist(y)) / n^2 71 | return(n * (2*mean1 - mean2 - mean3)) 72 | } else { 73 | warning("package gsl required but not found") 74 | return (NA) 75 | } 76 | } 77 | 78 | normal.e <- function(x) { 79 | ## Case 4: unknown parameters 80 | x <- as.vector(x) 81 | n <- length(x) 82 | s <- sd(x) 83 | if (!is.finite(s) || !(s > 0)) { 84 | warning("sd(x)>0 required") 85 | return(NA) 86 | } 87 | y <- (x - mean(x)) / sd(x) 88 | y <- sort(y) 89 | K <- seq(1 - n, n - 1, 2) 90 | return(2 * (sum(2 * y * pnorm(y) + 2 * dnorm(y)) - 91 | n/sqrt(pi) - mean(K * y))) 92 | } 93 | 94 | normal.test <- function(x, method=c("mc", "limit"), R) { 95 | ## implements the test for for d=1 96 | ## Case 4: composite hypothesis 97 | method <- match.arg(method) 98 | estimate <- c(mean(x), sd(x)) 99 | names(estimate) <- c("mean", "sd") 100 | 101 | if (method == "mc") { 102 | ## Monte Carlo approach 103 
| if (missing(R)) R <- 0 104 | e <- energy::mvnorm.etest(x, R=R) 105 | e$method <- "Energy test of normality" 106 | e$method <- ifelse(R > 0, 107 | paste0(e$method,": estimated parameters"), 108 | paste0(e$method, " (Specify R > 0 for MC test)")) 109 | e$estimate <- estimate 110 | return(e) 111 | } 112 | 113 | ## implement test using asymptotic distribution for p-value 114 | if (!is.numeric(x) || (!is.vector(x) && NCOL(x) > 1)) { 115 | warning("x must be a numeric vector") 116 | return (NA) 117 | } else { 118 | x <- as.vector(x, mode="numeric") 119 | } 120 | 121 | n <- length(x) 122 | t0 <- normal.e(x) 123 | names(t0) <- "statistic" 124 | 125 | ## load pre-computed eigenvalues 126 | ev <- energy::EVnormal[, "Case4"] 127 | 128 | if (requireNamespace("CompQuadForm", quietly=TRUE)) { 129 | p <- CompQuadForm::imhof(t0, ev)$Qq 130 | } else { 131 | warning("limit distribution method requires CompQuadForm package for p-value") 132 | p <- NA 133 | } 134 | estimate <- c(mean(x), sd(x)) 135 | names(estimate) <- c("mean", "sd") 136 | e <- list(statistic = t0, p.value = p, 137 | method = paste("Energy test of normality: limit distribution"), 138 | estimate = estimate, 139 | data.name = "Case 4: composite hypothesis, estimated parameters") 140 | class(e) <- "htest" 141 | e 142 | } 143 | -------------------------------------------------------------------------------- /man/edist.Rd: -------------------------------------------------------------------------------- 1 | \name{edist} 2 | \alias{edist} 3 | \title{E-distance} 4 | \description{ 5 | Returns the E-distances (energy statistics) between clusters. 6 | } 7 | \usage{ 8 | edist(x, sizes, distance = FALSE, ix = 1:sum(sizes), alpha = 1, 9 | method = c("cluster","discoB")) 10 | } 11 | \arguments{ 12 | \item{x}{ data matrix of pooled sample or Euclidean distances} 13 | \item{sizes}{ vector of sample sizes} 14 | \item{distance}{ logical: if TRUE, x is a distance matrix} 15 | \item{ix}{ a permutation of the row indices of x } 16 | \item{alpha}{ distance exponent in (0,2]} 17 | \item{method}{ how to weight the statistics } 18 | } 19 | \details{ 20 | A vector containing the pairwise two-sample multivariate 21 | \eqn{\mathcal{E}}{E}-statistics for comparing clusters or samples is returned. 22 | The e-distance between clusters is computed from the original pooled data, 23 | stacked in matrix \code{x} where each row is a multivariate observation, or 24 | from the distance matrix \code{x} of the original data, or distance object 25 | returned by \code{dist}. The first \code{sizes[1]} rows of the original data 26 | matrix are the first sample, the next \code{sizes[2]} rows are the second 27 | sample, etc. The permutation vector \code{ix} may be used to obtain 28 | e-distances corresponding to a clustering solution at a given level in 29 | the hierarchy. 30 | 31 | The default method \code{cluster} summarizes the e-distances between 32 | clusters in a table. 
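(Brief aside on R/Emvnorm.R above, before the edist() details continue: a minimal usage sketch of the normality tests, not part of the package sources; the simulated data and replicate count are arbitrary.)

library(energy)
set.seed(1)
x <- rnorm(30)                                   # univariate sample
normal.test(x, method = "limit")                 # p-value from the limit distribution
mvnorm.test(matrix(rnorm(90), 30, 3), R = 199)   # parametric bootstrap with R replicates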
33 | The e-distance between two clusters \eqn{C_i, C_j} 34 | of size \eqn{n_i, n_j} 35 | proposed by Szekely and Rizzo (2005) 36 | is the e-distance \eqn{e(C_i,C_j)}, defined by 37 | \deqn{e(C_i,C_j)=\frac{n_i n_j}{n_i+n_j}[2M_{ij}-M_{ii}-M_{jj}], 38 | }{e(C_i, C_j) = (n_i n_j)/(n_i+n_j)[2M_(ij)-M_(ii)-M_(jj)],} 39 | where 40 | \deqn{M_{ij}=\frac{1}{n_i n_j}\sum_{p=1}^{n_i} \sum_{q=1}^{n_j} 41 | \|X_{ip}-X_{jq}\|^\alpha,}{ 42 | M_{ij} = 1/(n_i n_j) sum[1:n_i, 1:n_j] ||X_(ip) - X_(jq)||^a,} 43 | \eqn{\|\cdot\|}{|| ||} denotes Euclidean norm, \eqn{\alpha=}{a=} 44 | \code{alpha}, and \eqn{X_{ip}}{ 45 | X_(ip)} denotes the p-th observation in the i-th cluster. The 46 | exponent \code{alpha} should be in the interval (0,2]. 47 | 48 | The coefficient \eqn{\frac{n_i n_j}{n_i+n_j}}{(n_i n_j)/(n_i+n_j)} 49 | is one-half of the harmonic mean of the sample sizes. The 50 | \code{discoB} method is related but with 51 | different ways of summarizing the pairwise differences between samples. 52 | The \code{disco} methods apply the coefficient 53 | \eqn{\frac{n_i n_j}{2N}}{(n_i n_j)/(2N)} where N is the total number 54 | of observations. This weights each (i,j) statistic by sample size 55 | relative to N. See the \code{disco} topic for more details. 56 | } 57 | \value{ 58 | An object of class \code{dist} containing the lower triangle of the 59 | e-distance matrix of cluster distances corresponding to the permutation 60 | of indices \code{ix} is returned. The \code{method} attribute of the 61 | distance object is assigned a value of type, index. 62 | } 63 | \references{ 64 | Szekely, G. J. and Rizzo, M. L. (2005) Hierarchical Clustering 65 | via Joint Between-Within Distances: Extending Ward's Minimum 66 | Variance Method, \emph{Journal of Classification} 22(2) 151-183. 67 | \cr \doi{10.1007/s00357-005-0012-9} 68 | 69 | M. L. Rizzo and G. J. Szekely (2010). 70 | DISCO Analysis: A Nonparametric Extension of 71 | Analysis of Variance, Annals of Applied Statistics, 72 | Vol. 4, No. 2, 1034-1055. 73 | \cr \doi{10.1214/09-AOAS245} 74 | 75 | Szekely, G. J. and Rizzo, M. L. (2004) Testing for Equal 76 | Distributions in High Dimension, InterStat, November (5). 77 | 78 | Szekely, G. J. (2000) Technical Report 03-05, 79 | \eqn{\mathcal{E}}{E}-statistics: Energy of 80 | Statistical Samples, Department of Mathematics and Statistics, 81 | Bowling Green State University. 82 | } 83 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 84 | Gabor J.
Szekely 85 | } 86 | \seealso{ 87 | \code{\link{energy.hclust}} 88 | \code{\link{eqdist.etest}} 89 | \code{\link{ksample.e}} 90 | \code{\link{disco}} 91 | } 92 | \examples{ 93 | ## compute cluster e-distances for 3 samples of iris data 94 | data(iris) 95 | edist(iris[,1:4], c(50,50,50)) 96 | 97 | ## pairwise disco statistics 98 | edist(iris[,1:4], c(50,50,50), method="discoB") 99 | 100 | ## compute e-distances from a distance object 101 | data(iris) 102 | edist(dist(iris[,1:4]), c(50, 50, 50), distance=TRUE, alpha = 1) 103 | 104 | ## compute e-distances from a distance matrix 105 | data(iris) 106 | d <- as.matrix(dist(iris[,1:4])) 107 | edist(d, c(50, 50, 50), distance=TRUE, alpha = 1) 108 | 109 | } 110 | \keyword{ multivariate } 111 | \keyword{ cluster } 112 | \keyword{ nonparametric } 113 | \concept{ energy statistics } 114 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Page not found (404) • energy 9 | 10 | 11 | 12 | 13 | 14 | 15 | 19 | 20 | 21 | 22 | 23 |
Content not found. Please use links in the navbar.
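To make the e-distance formula in the edist() documentation above concrete, here is a minimal sketch (not part of the package sources) that recomputes e(C_1, C_2) directly from the definition and compares it with edist(); the helper M() is introduced only for this illustration.

library(energy)
x <- as.matrix(iris[1:100, 1:4])    # two clusters: rows 1-50 and rows 51-100
A <- x[1:50, ]; B <- x[51:100, ]
M <- function(u, v)                 # M_ij: mean pairwise distance between clusters
  mean(as.matrix(dist(rbind(u, v)))[1:nrow(u), nrow(u) + 1:nrow(v)])
e12 <- (50 * 50 / 100) * (2 * M(A, B) - M(A, A) - M(B, B))
e12
edist(x, sizes = c(50, 50))         # should agree up to floating-point rounding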
    96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /man/mvI.test.Rd: -------------------------------------------------------------------------------- 1 | \name{mvI.test} 2 | \alias{mvI.test} 3 | \alias{mvI} 4 | \title{ Independence Coefficient and Test} 5 | \description{ 6 | Computes a type of multivariate nonparametric E-statistic and test of independence 7 | based on independence coefficient \eqn{\mathcal I_n}{I_n}. This coefficient pre-dates and is different from distance covariance or distance correlation.} 8 | \usage{ 9 | mvI.test(x, y, R) 10 | mvI(x, y) 11 | } 12 | \arguments{ 13 | \item{x}{ matrix: first sample, observations in rows} 14 | \item{y}{ matrix: second sample, observations in rows} 15 | \item{R}{ number of replicates} 16 | } 17 | \details{ 18 | \code{mvI} computes the coefficient \eqn{\mathcal I_n}{I_n} and \code{mvI.test} performs a nonparametric test of independence. The test decision is obtained via permutation 19 | bootstrap, with \code{R} replicates. 20 | The sample sizes (number of rows) of the two samples must agree, and 21 | samples must not contain missing values. 22 | 23 | Historically this is the first energy test of independence. The 24 | distance covariance test \code{\link{dcov.test}}, distance correlation \code{\link{dcor}}, and related methods are more recent (2007, 2009). 25 | 26 | The distance covariance test \code{\link{dcov.test}} and distance correlation test \code{\link{dcor.test}} are much faster and have different properties than \code{mvI.test}. All are based on a population independence coefficient that characterizes independence and all of these tests are statistically consistent. However, dCor is scale invariant while \eqn{\mathcal I_n}{I_n} is not. In applications \code{\link{dcor.test}} or \code{\link{dcov.test}} are the recommended tests. 27 | 28 | Computing formula from Bakirov, Rizzo, and Szekely (2006), equation (2): 29 | 30 | Suppose the two samples are \eqn{X_1,\dots,X_n \in R^p} and \eqn{Y_1,\dots,Y_n \in R^q}. Define \eqn{Z_{kl} = (X_k, Y_l) \in R^{p+q}.} 31 | 32 | The independence coefficient \eqn{\mathcal I_n}{I_n} is defined 33 | \deqn{ 34 | \mathcal I_n = \sqrt{\frac{2\bar z - z_d - z}{x + y - z}}, 35 | } 36 | where 37 | \deqn{z_d= \frac{1}{n^2} \sum_{k,l=1}^n |Z_{kk}-Z_{ll}|_{p+q},} 38 | \deqn{z= \frac{1}{n^4} \sum_{k,l=1}^n \sum_{i,j=1}^n |Z_{kl}-Z_{ij}|_{p+q},} 39 | \deqn{\bar z= \frac{1}{n^3} \sum_{k=1}^n \sum_{i,j=1}^n |Z_{kk}-Z_{ij}|_{p+q},} 40 | \deqn{x= \frac{1}{n^2} \sum_{k,l=1}^n |X_{k}-X_{l}|_p,} 41 | \deqn{y= \frac{1}{n^2} \sum_{k,l=1}^n |Y_{k}-Y_{l}|_q.} 42 | 43 | Some properties: 44 | \itemize{ 45 | \item 46 | \eqn{0 \leq \mathcal I_n \leq 1} (Theorem 1). 47 | \item 48 | Large values of \eqn{n \mathcal I_n^2} (or \eqn{\mathcal I_n}) support the alternative hypothesis that the sampled random variables are dependent. 49 | \item \eqn{\mathcal I_n} is invariant to shifts and orthogonal transformations of X and Y. 50 | \item \eqn{\sqrt{n} \, \mathcal I_n} determines a statistically consistent test of independence against all fixed dependent alternatives (Corollary 1). 51 | \item The population independence coefficient \eqn{\mathcal I} is a normalized distance between the joint characteristic function and the product of the marginal characteristic functions. \eqn{\mathcal I_n} converges almost surely to \eqn{\mathcal I} as \eqn{n \to \infty}. X and Y are independent if and only if \eqn{\mathcal I(X, Y) = 0}. 
52 | See the 2006 reference below for more details. 53 | }} 54 | \value{ 55 | \code{mvI} returns the statistic. \code{mvI.test} returns 56 | a list with class 57 | \code{htest} containing 58 | \item{ method}{ description of test} 59 | \item{ statistic}{ observed value of the test statistic \eqn{n\mathcal I_n^2}{n I_n^2}} 60 | \item{ estimate}{ \eqn{\mathcal I_n}{I_n}} 61 | \item{ replicates}{ permutation replicates} 62 | \item{ p.value}{ p-value of the test} 63 | \item{ data.name}{ description of data} 64 | } 65 | \references{ 66 | Bakirov, N.K., Rizzo, M.L., and Szekely, G.J. (2006), A Multivariate 67 | Nonparametric Test of Independence, \emph{Journal of Multivariate Analysis} 93/1, 58-80. 68 | 69 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), 70 | Measuring and Testing Dependence by Correlation of Distances, 71 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 72 | 73 | Szekely, G.J. and Rizzo, M.L. (2009), 74 | Brownian Distance Covariance, 75 | \emph{Annals of Applied Statistics}, 76 | Vol. 3, No. 4, 1236-1265. 77 | } 78 | \note{ 79 | On scale invariance: Distance correlation (\code{\link{dcor}}) has the property that if we change the scale of X from e.g., meters to kilometers, and the scale of Y from e.g. grams to ounces, the statistic and the test are not changed. \eqn{\mathcal I_n}{I_n} does not have this property; it is invariant only under a common rescaling of X and Y by the same constant. Thus, if the units of measurement change for either or both variables, dCor is invariant, but \eqn{\mathcal I_n}{I_n} and possibly the \code{mvI.test} decision changes. 80 | } 81 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 82 | Gabor J. Szekely 83 | } 84 | \examples{ 85 | mvI(iris[1:25, 1], iris[1:25, 2]) 86 | \donttest{ 87 | mvI.test(iris[1:25, 1], iris[1:25, 2], R=99) 88 | } 89 | } 90 | \seealso{ 91 | \code{ \link{dcov.test} } 92 | \code{ \link{dcov} } 93 | \code{ \link{dcor.test} } 94 | \code{ \link{dcor} } 95 | \code{ \link{dcov2d} } 96 | \code{ \link{dcor2d} } 97 | } 98 | \keyword{ htest } 99 | \keyword{ multivariate } 100 | \keyword{ nonparametric } 101 | \concept{ independence } 102 | \concept{ energy statistics } 103 | 104 | -------------------------------------------------------------------------------- /man/dcov.Rd: -------------------------------------------------------------------------------- 1 | \name{distance correlation} 2 | \alias{dcor} 3 | \alias{dcov} 4 | \title{ Distance Correlation and Covariance Statistics} 5 | \description{ 6 | Computes distance covariance and distance correlation statistics, 7 | which are multivariate measures of dependence. 8 | } 9 | \usage{ 10 | dcov(x, y, index = 1.0) 11 | dcor(x, y, index = 1.0) 12 | } 13 | \arguments{ 14 | \item{x}{ data or distances of first sample} 15 | \item{y}{ data or distances of second sample} 16 | \item{index}{ exponent on Euclidean distance, in (0,2]} 17 | } 18 | \details{ 19 | \code{dcov} and \code{dcor} compute distance 20 | covariance and distance correlation statistics. 21 | 22 | The sample sizes (number of rows) of the two samples must 23 | agree, and samples must not contain missing values. 24 | 25 | The \code{index} is an optional exponent on Euclidean distance. 26 | Valid exponents for energy are in (0, 2) excluding 2. 27 | 28 | Argument types supported are 29 | numeric data matrix, data.frame, or tibble, with observations in rows; 30 | numeric vector; ordered or unordered factors. In case of unordered factors 31 | a 0-1 distance matrix is computed. 
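As an illustration of the argument types just listed (a sketch, not taken from the package examples), dcor() can be called directly on a data frame and an unordered factor:

library(energy)
x <- iris[, 1:4]                  # numeric data.frame, observations in rows
g <- iris$Species                 # unordered factor: a 0-1 distance matrix is used
dcor(x, g)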
32 | 33 | Optionally pre-computed distances can be input as class "dist" objects or as distance matrices. 34 | For data types of arguments, distance matrices are computed internally. 35 | 36 | Distance correlation is a new measure of dependence between random 37 | vectors introduced by Szekely, Rizzo, and Bakirov (2007). 38 | For all distributions with finite first moments, distance 39 | correlation \eqn{\mathcal R}{R} generalizes the idea of correlation in two 40 | fundamental ways: 41 | (1) \eqn{\mathcal R(X,Y)}{R(X,Y)} is defined for \eqn{X} and \eqn{Y} in arbitrary dimension. 42 | (2) \eqn{\mathcal R(X,Y)=0}{R(X,Y)=0} characterizes independence of \eqn{X} and 43 | \eqn{Y}. 44 | 45 | Distance correlation satisfies \eqn{0 \le \mathcal R \le 1}{0 \le R \le 1}, and 46 | \eqn{\mathcal R = 0}{R = 0} only if \eqn{X} and \eqn{Y} are independent. Distance 47 | covariance \eqn{\mathcal V}{V} provides a new approach to the problem of 48 | testing the joint independence of random vectors. The formal 49 | definitions of the population coefficients \eqn{\mathcal V}{V} and 50 | \eqn{\mathcal R}{R} are given in (SRB 2007). The definitions of the 51 | empirical coefficients are as follows. 52 | 53 | The empirical distance covariance \eqn{\mathcal{V}_n(\mathbf{X,Y})}{V_n(X,Y)} 54 | with index 1 is 55 | the nonnegative number defined by 56 | \deqn{ 57 | \mathcal{V}^2_n (\mathbf{X,Y}) = \frac{1}{n^2} \sum_{k,\,l=1}^n 58 | A_{kl}B_{kl} 59 | }{ 60 | V^2_n (X,Y) = (1/n^2) sum_{k,l=1:n} 61 | A_{kl}B_{kl} 62 | } 63 | where \eqn{A_{kl}} and \eqn{B_{kl}} are 64 | \deqn{ 65 | A_{kl} = a_{kl}-\bar a_{k.}- \bar a_{.l} + \bar a_{..} 66 | } 67 | \deqn{ 68 | B_{kl} = b_{kl}-\bar b_{k.}- \bar b_{.l} + \bar b_{..}. 69 | } 70 | Here 71 | \deqn{ 72 | a_{kl} = \|X_k - X_l\|_p, \quad b_{kl} = \|Y_k - Y_l\|_q, \quad 73 | k,l=1,\dots,n, 74 | }{ 75 | a_{kl} = ||X_k - X_l||_p, b_{kl} = ||Y_k - Y_l||_q, 76 | k,l=1,\dots,n, 77 | } 78 | and the subscript \code{.} denotes that the mean is computed for the 79 | index that it replaces. Similarly, 80 | \eqn{\mathcal{V}_n(\mathbf{X})}{V_n(X)} is the nonnegative number defined by 81 | \deqn{ 82 | \mathcal{V}^2_n (\mathbf{X}) = \mathcal{V}^2_n (\mathbf{X,X}) = 83 | \frac{1}{n^2} \sum_{k,\,l=1}^n 84 | A_{kl}^2. 85 | }{ 86 | V^2_n (X) = V^2_n (X,X) = 87 | (1/n^2) sum_{k,l=1:n} 88 | A_{kl}^2. 89 | } 90 | 91 | The empirical distance correlation \eqn{\mathcal{R}_n(\mathbf{X,Y})}{R(\mathbf{X,Y})} is 92 | the square root of 93 | \deqn{ 94 | \mathcal{R}^2_n(\mathbf{X,Y})= 95 | \frac {\mathcal{V}^2_n(\mathbf{X,Y})} 96 | {\sqrt{ \mathcal{V}^2_n (\mathbf{X}) \mathcal{V}^2_n(\mathbf{Y})}}. 97 | }{ 98 | R^2_n(X,Y)= 99 | V^2_n(X,Y) / sqrt(V^2_n (X) V^2_n(Y)). 100 | } 101 | See \code{\link{dcov.test}} for a test of multivariate independence 102 | based on the distance covariance statistic. 103 | } 104 | \value{ 105 | \code{dcov} returns the sample distance covariance and 106 | \code{dcor} returns the sample distance correlation. 107 | } 108 | \note{ 109 | Note that it is inefficient to compute dCor by: 110 | 111 | square root of 112 | \code{dcov(x,y)/sqrt(dcov(x,x)*dcov(y,y))} 113 | 114 | because the individual 115 | calls to \code{dcov} involve unnecessary repetition of calculations. 116 | } 117 | \seealso{ 118 | \code{\link{dcov2d}} \code{\link{dcor2d}} 119 | \code{\link{bcdcor}} \code{\link{dcovU}} \code{\link{pdcor}} 120 | \code{\link{dcov.test}} \code{\link{dcor.test}} \code{\link{pdcor.test}} 121 | } 122 | \references{ 123 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. 
(2007), 124 | Measuring and Testing Dependence by Correlation of Distances, 125 | \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. 126 | \cr \doi{10.1214/009053607000000505} 127 | 128 | Szekely, G.J. and Rizzo, M.L. (2009), 129 | Brownian Distance Covariance, 130 | \emph{Annals of Applied Statistics}, 131 | Vol. 3, No. 4, 1236-1265. 132 | \cr \doi{10.1214/09-AOAS312} 133 | 134 | Szekely, G.J. and Rizzo, M.L. (2009), 135 | Rejoinder: Brownian Distance Covariance, 136 | \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1303-1308. 137 | } 138 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and 139 | Gabor J. Szekely 140 | } 141 | \examples{ 142 | x <- iris[1:50, 1:4] 143 | y <- iris[51:100, 1:4] 144 | dcov(x, y) 145 | dcov(dist(x), dist(y)) #same thing 146 | } 147 | \keyword{ multivariate } 148 | \concept{ independence } 149 | \concept{ distance correlation } 150 | \concept{ distance covariance } 151 | \concept{ energy statistics } 152 | 153 | -------------------------------------------------------------------------------- /man/disco.Rd: -------------------------------------------------------------------------------- 1 | \name{disco} 2 | \alias{disco} 3 | \alias{disco.between} 4 | \alias{print.disco} 5 | \title{ distance components (DISCO)} 6 | \description{ 7 | E-statistics DIStance COmponents and tests, analogous to variance components 8 | and anova. 9 | } 10 | \usage{ 11 | disco(x, factors, distance, index=1.0, R, method=c("disco","discoB","discoF")) 12 | disco.between(x, factors, distance, index=1.0, R) 13 | } 14 | \arguments{ 15 | \item{x}{ data matrix or distance matrix or dist object} 16 | \item{factors}{ matrix or data frame of factor labels or integers (not design matrix)} 17 | \item{distance}{ logical, TRUE if x is distance matrix} 18 | \item{index}{ exponent on Euclidean distance in (0,2]} 19 | \item{R}{ number of replicates for a permutation test} 20 | \item{method}{ test statistic } 21 | } 22 | \details{ 23 | \code{disco} calculates the distance components decomposition of 24 | total dispersion and if R > 0 tests for significance using the test statistic 25 | disco "F" ratio (default \code{method="disco"}), 26 | or using the between component statistic (\code{method="discoB"}), 27 | each implemented by permutation test. 28 | 29 | If \code{x} is a \code{dist} object, argument \code{distance} is 30 | ignored. If \code{x} is a distance matrix, set \code{distance=TRUE}. 31 | 32 | In the current release \code{disco} computes the decomposition for one-way models 33 | only. 34 | } 35 | \value{ 36 | When \code{method="discoF"}, \code{disco} returns a list similar to the 37 | return value from \code{anova.lm}, and the \code{print.disco} method is 38 | provided to format the output into a similar table. 
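(Aside on the distance covariance definition documented above, before the disco value details continue: a small sketch, not part of the package sources, that checks V_n^2(X,Y) = mean(A_kl B_kl) against dcov(); the double-centering helper dc() is written out here only for illustration.)

library(energy)
x <- as.matrix(iris[1:50, 1:2]); y <- as.matrix(iris[1:50, 3:4])
dc <- function(d)                  # A_kl = a_kl - rowmean_k - colmean_l + grandmean
  d - matrix(rowMeans(d), nrow(d), ncol(d)) -
      matrix(colMeans(d), nrow(d), ncol(d), byrow = TRUE) + mean(d)
A <- dc(as.matrix(dist(x))); B <- dc(as.matrix(dist(y)))
sqrt(mean(A * B))                  # empirical V_n(X, Y)
dcov(x, y)                         # should agree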
Details: 39 | 40 | \code{disco} returns a class \code{disco} object, which is a list containing 41 | \item{call}{call} 42 | \item{method}{method} 43 | \item{statistic}{vector of observed statistics} 44 | \item{p.value}{vector of p-values} 45 | \item{k}{number of factors} 46 | \item{N}{number of observations} 47 | \item{between}{between-sample distance components} 48 | \item{withins}{one-way within-sample distance components} 49 | \item{within}{within-sample distance component} 50 | \item{total}{total dispersion} 51 | \item{Df.trt}{degrees of freedom for treatments} 52 | \item{Df.e}{degrees of freedom for error} 53 | \item{index}{index (exponent on distance)} 54 | \item{factor.names}{factor names} 55 | \item{factor.levels}{factor levels} 56 | \item{sample.sizes}{sample sizes} 57 | \item{stats}{matrix containing decomposition} 58 | 59 | When \code{method="discoB"}, \code{disco} passes the arguments to 60 | \code{disco.between}, which returns a class \code{htest} object. 61 | 62 | \code{disco.between} returns a class \code{htest} object, where the test 63 | statistic is the between-sample statistic (proportional to the numerator of the F ratio 64 | of the \code{disco} test. 65 | } 66 | \references{ 67 | M. L. Rizzo and G. J. Szekely (2010). 68 | DISCO Analysis: A Nonparametric Extension of 69 | Analysis of Variance, Annals of Applied Statistics, 70 | Vol. 4, No. 2, 1034-1055. 71 | \cr \doi{10.1214/09-AOAS245} 72 | } 73 | \note{ 74 | The current version does all calculations via matrix arithmetic and 75 | boot function. Support for more general additive models 76 | and a formula interface is under development. 77 | 78 | \code{disco} methods have been added to the cluster distance summary 79 | function \code{edist}, and energy tests for equality of distribution 80 | (see \code{eqdist.etest}). 81 | } 82 | \seealso{ 83 | \code{ \link{edist} } 84 | \code{ \link{eqdist.e} } 85 | \code{ \link{eqdist.etest} } 86 | \code{ \link{ksample.e} } 87 | } 88 | \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely 89 | } 90 | \examples{ 91 | ## warpbreaks one-way decompositions 92 | data(warpbreaks) 93 | attach(warpbreaks) 94 | disco(breaks, factors=wool, R=99) 95 | 96 | ## warpbreaks two-way wool+tension 97 | disco(breaks, factors=data.frame(wool, tension), R=0) 98 | 99 | ## warpbreaks two-way wool*tension 100 | disco(breaks, factors=data.frame(wool, tension, wool:tension), R=0) 101 | 102 | ## When index=2 for univariate data, we get ANOVA decomposition 103 | disco(breaks, factors=tension, index=2.0, R=99) 104 | aov(breaks ~ tension) 105 | 106 | ## Multivariate response 107 | ## Example on producing plastic film from Krzanowski (1998, p. 
381) 108 | tear <- c(6.5, 6.2, 5.8, 6.5, 6.5, 6.9, 7.2, 6.9, 6.1, 6.3, 109 | 6.7, 6.6, 7.2, 7.1, 6.8, 7.1, 7.0, 7.2, 7.5, 7.6) 110 | gloss <- c(9.5, 9.9, 9.6, 9.6, 9.2, 9.1, 10.0, 9.9, 9.5, 9.4, 111 | 9.1, 9.3, 8.3, 8.4, 8.5, 9.2, 8.8, 9.7, 10.1, 9.2) 112 | opacity <- c(4.4, 6.4, 3.0, 4.1, 0.8, 5.7, 2.0, 3.9, 1.9, 5.7, 113 | 2.8, 4.1, 3.8, 1.6, 3.4, 8.4, 5.2, 6.9, 2.7, 1.9) 114 | Y <- cbind(tear, gloss, opacity) 115 | rate <- factor(gl(2,10), labels=c("Low", "High")) 116 | 117 | ## test for equal distributions by rate 118 | disco(Y, factors=rate, R=99) 119 | disco(Y, factors=rate, R=99, method="discoB") 120 | 121 | ## Just extract the decomposition table 122 | disco(Y, factors=rate, R=0)$stats 123 | 124 | ## Compare eqdist.e methods for rate 125 | ## disco between stat is half of original when sample sizes equal 126 | eqdist.e(Y, sizes=c(10, 10), method="original") 127 | eqdist.e(Y, sizes=c(10, 10), method="discoB") 128 | 129 | ## The between-sample distance component 130 | disco.between(Y, factors=rate, R=0) 131 | } 132 | \keyword{ htest } 133 | \keyword{ multivariate } 134 | 135 | 136 | -------------------------------------------------------------------------------- /R/dcov2d.R: -------------------------------------------------------------------------------- 1 | dcor2d<- function(x, y, type = c("V", "U")) { 2 | ## computes dcor^2 or bias-corrected dcor^2 by O(n log n) algorithm 3 | ## bivariate data only: (x,y) in R^2 4 | ## should be faster than direct calc. for big n 5 | type <- match.arg(type) 6 | ## argument checking in dcov2d 7 | stat <- dcov2d(x, y, type, all.stats=TRUE) 8 | dvarX <- stat[2] 9 | dvarY <- stat[3] 10 | R2 <- 0.0 11 | if (abs(dvarX*dvarY > 10*.Machine$double.eps)) 12 | R2 <- stat[1] / sqrt(dvarX*dvarY) 13 | return (R2) 14 | } 15 | 16 | dcov2d<- function(x, y, type=c("V", "U"), all.stats=FALSE) { 17 | ## O(n log n) computation of dcovU or dcov^2 (V^2) for (x, y) in R^2 only 18 | type <- match.arg(type) 19 | if (!is.vector(x) || !is.vector(y)) { 20 | if (NCOL(x) > 1 || NCOL(y) > 1) 21 | stop("this method is only for univariate x and y") 22 | } 23 | x <- as.vector(x) 24 | y <- as.vector(y) 25 | n <- length(x) 26 | if (n != length(y)) 27 | stop("sample sizes must agree") 28 | 29 | Sums <- .dcovSums2d(x, y, all.sums=all.stats) 30 | if (type =="V") { 31 | d1 <- n^2 32 | d2 <- n^3 33 | d3 <- n^4 34 | } else { 35 | d1 <- n * (n - 3) 36 | d2 <- d1 * (n - 2) 37 | d3 <- d2 * (n - 1) 38 | } 39 | dCov2d <- Sums$S1/d1 - 2*Sums$S2/d2 + Sums$S3/d3 40 | if (all.stats) { 41 | dvarX <- Sums$S1a/d1 - 2*Sums$S2a/d2 + Sums$S3a/d3 42 | dvarY <- Sums$S1b/d1 - 2*Sums$S2b/d2 + Sums$S3b/d3 43 | } 44 | rval <- ifelse(type=="V", c(V=dCov2d), c(U=dCov2d)) 45 | if (all.stats) 46 | rval <- c(rval, dvarX=dvarX, dvarY=dvarY) 47 | return (rval) 48 | } 49 | 50 | 51 | .dcovSums2d <- function(x, y, all.sums = FALSE) { 52 | ## compute the sums S1, S2, S3 of distances for dcov^2 53 | ## dCov^2 <- S1/d1 - 2 * S2/d2 + S3/d3 54 | ## denominators differ for U-statistic, V-statisic 55 | ## if all.sums==TRUE, also return sums for dVar and kernel 56 | if (is.matrix(x) || is.matrix(y)) { 57 | if (ncol(x) > 1 || ncol(y) > 1) 58 | stop("Found multivariate (x,y) in .dcovSums2d, expecting bivariate") 59 | } 60 | n <- length(x) 61 | SRx <- sortrank(x) 62 | SRy <- sortrank(y) 63 | ## compute the rowSums of the distance matrices 64 | a. <- .rowSumsDist1(x, SRx) 65 | b. <- .rowSumsDist1(y, SRy) 66 | S2 <- sum(a. * b.) 67 | a.. <- sum(a.) 68 | b.. <- sum(b.) 69 | S3 <- sum(a.) * sum(b.) 
70 | 71 | ## also need order and rank for y[order(x)] in gamma1() 72 | x1 <- SRx$x 73 | y1 <- y[SRx$ix] 74 | SRy1 <- sortrank(y1) 75 | ones <- rep(1, n) 76 | g_1 <- .gamma1(x1=x1, y1=y1, z1=ones, SRx=SRx, SRy1=SRy1) 77 | g_x <- .gamma1(x1=x1, y1=y1, z1=x1, SRx=SRx, SRy1=SRy1) 78 | g_y <- .gamma1(x1=x1, y1=y1, z1=y1, SRx=SRx, SRy1=SRy1) 79 | g_xy <- .gamma1(x1=x1, y1=y1, z1=x1*y1, SRx=SRx, SRy1=SRy1) 80 | S1 <- sum(x * y * g_1 + g_xy - x * g_y - y * g_x) 81 | 82 | L <- list(S1=S1, S2=S2, S3=S3, 83 | S1a=NA, S1b=NA, S2a=NA, S2b=NA, S3a=NA, S3b=NA, 84 | rowsumsA=NA, rowsumsB=NA, sumA=NA, sumB=NA) 85 | if (all.sums) { 86 | L$S1a <- 2 * n * (n-1) * var(x) 87 | L$S1b <- 2 * n * (n-1) * var(y) 88 | L$S2a <- sum(a.^2) 89 | L$S2b <- sum(b.^2) 90 | L$S3a <- a..^2 91 | L$S3b <- b..^2 92 | L$rowsumsA <- a. 93 | L$rowsumsB <- b. 94 | L$sumA <- a.. 95 | L$sumB <- b.. 96 | } 97 | return (L); 98 | } 99 | 100 | .dvarU2 <- function(x, SRx = NULL) { 101 | ## O(n log n) computation of dvarU for univariate x only 102 | ## this is an internal function that will do a stand-alone dVar calc. 103 | ## but it is not faster than dcovU2(x, x) unless we supply 104 | ## the precomputed sort + rank results in SRx 105 | n <- length(x) 106 | ## compute the rowSums of the distance matrices 107 | if (is.null(SRx)) 108 | SRx <- sortrank(x) 109 | a. <- .rowSumsDist1(x, SRx) 110 | S2 <- sum(a. * a.) 111 | S3 <- sum(a.)^2 112 | 113 | ## also need order and rank for y[order(x)] in gamma1() 114 | x1 <- SRx$x 115 | x2 <- x1 116 | SRx1 <- sortrank(x1) 117 | ones <- rep(1, n) 118 | g_1 <- .gamma1(x1=x1, y1=x2, z1=ones, SRx, SRx1) 119 | g_x <- .gamma1(x1=x1, y1=x2, z1=x1, SRx, SRx1) 120 | g_xx <- .gamma1(x1=x1, y1=x2, z1=x1*x2, SRx, SRx1) 121 | S1 <- sum(x^2 * g_1 + g_xx - 2 * x * g_x) 122 | d1 <- n * (n - 3) 123 | d2 <- d1 * (n - 2) 124 | d3 <- d2 * (n - 1) 125 | dVar <- S1/d1 - 2 * S2/d2 + S3/d3 126 | return(dVar) 127 | } 128 | 129 | .gamma1 <- function(x1, y1, z1, SRx, SRy1) { 130 | # computes the terms of the sum (ab) in dcovU 131 | # original sample (x_i, y_i, z_i) 132 | # triples (x1_i, y1_i, z1_i) are sorted by ix=order(x) 133 | # SRx is the result of sortrank(x), original order 134 | # SRy1 is the result of sortrank(y1), y1=y[order(x)] 135 | # pre-compute SRx, SRy1 to avoid repeated sort and rank 136 | # 137 | n <- length(x1) 138 | ix <- SRx$ix #order(x) 139 | rankx <- SRx$r #ranks of original sample x 140 | 141 | ## ranks and order vector for this permutation of sample y1 142 | iy1 <- SRy1$ix #order(y1) 143 | ranky1 <- SRy1$r #rank(y1) 144 | 145 | ## the partial sums in the formula g_1 146 | psumsy1 <- (cumsum(as.numeric(z1[iy1])) - z1[iy1])[ranky1] 147 | psumsx1 <- cumsum(as.numeric(z1)) - z1 148 | 149 | gamma1 <- Btree_sum(y=ranky1, z=z1) #y1 replaced by rank(y1) 150 | g <- sum(z1) - z1 - 2 * psumsx1 - 2 * psumsy1 + 4 * gamma1 151 | g <- g[rankx] 152 | } 153 | 154 | .rowSumsDist1 <- function(x, Sx = NULL) { 155 | ## for univariate samples, equivalent to rowSums(as.matrix(dist(x))) 156 | ## but much faster 157 | ## Sx is a sortrank object usually pre-computed here 158 | ## x is the data vector, Sx$x is sort(x) 159 | if (is.null(Sx)) 160 | Sx <- sortrank(x) 161 | n <- length(x) 162 | r <- Sx$r #ranks 163 | z <- Sx$x #ordered sample x 164 | psums1 <- (cumsum(as.numeric(z)) - z)[r] 165 | (2*(r-1)-n)*x + sum(x) - 2*psums1 166 | } 167 | -------------------------------------------------------------------------------- /man/dcov.test.Rd: -------------------------------------------------------------------------------- 1 | 
\name{dcov.test} 2 | \alias{distance covariance} 3 | \alias{dcov.test} 4 | \alias{dcor.test} 5 | \title{ Distance Covariance Test and Distance Correlation test} 6 | \description{ 7 | Distance covariance test and distance correlation test of multivariate independence. 8 | Distance covariance and distance correlation are 9 | multivariate measures of dependence.} 10 | \usage{ 11 | dcov.test(x, y, index = 1.0, R = NULL) 12 | dcor.test(x, y, index = 1.0, R) 13 | } 14 | \arguments{ 15 | \item{x}{ data or distances of first sample} 16 | \item{y}{ data or distances of second sample} 17 | \item{R}{ number of replicates} 18 | \item{index}{ exponent on Euclidean distance, in (0,2]} 19 | } 20 | \details{ 21 | \code{dcov.test} and \code{dcor.test} are nonparametric 22 | tests of multivariate independence. The test decision is 23 | obtained via permutation bootstrap, with \code{R} replicates. 24 | 25 | The sample sizes (number of rows) of the two samples must 26 | agree, and samples must not contain missing values. 27 | 28 | The \code{index} is an optional exponent on Euclidean distance. 29 | Valid exponents for energy are in (0, 2) excluding 2. 30 | 31 | Argument types supported are 32 | numeric data matrix, data.frame, or tibble, with observations in rows; 33 | numeric vector; ordered or unordered factors. In case of unordered factors 34 | a 0-1 distance matrix is computed. 35 | 36 | Optionally pre-computed distances can be input as class "dist" objects or as distance matrices. 37 | For data types of arguments, 38 | distance matrices are computed internally. 39 | 40 | The \code{dcov} test statistic is 41 | \eqn{n \mathcal V_n^2}{nV_n^2} where 42 | \eqn{\mathcal V_n(x,y)}{V_n(x,y)} = dcov(x,y), 43 | which is based on interpoint Euclidean distances 44 | \eqn{\|x_{i}-x_{j}\|}{||x_{i}-x_{j}||}. The \code{index} 45 | is an optional exponent on Euclidean distance. 46 | 47 | Similarly, the \code{dcor} test statistic is based on the normalized 48 | coefficient, the distance correlation. (See the manual page for \code{dcor}.) 49 | 50 | Distance correlation is a new measure of dependence between random 51 | vectors introduced by Szekely, Rizzo, and Bakirov (2007). 52 | For all distributions with finite first moments, distance 53 | correlation \eqn{\mathcal R}{R} generalizes the idea of correlation in two 54 | fundamental ways: 55 | 56 | (1) \eqn{\mathcal R(X,Y)}{R(X,Y)} is defined for \eqn{X} and \eqn{Y} in arbitrary dimension. 57 | 58 | (2) \eqn{\mathcal R(X,Y)=0}{R(X,Y)=0} characterizes independence of \eqn{X} and 59 | \eqn{Y}. 60 | 61 | Characterization (2) also holds for powers of Euclidean distance \eqn{\|x_i-x_j\|^s}{|x_i-x_j|^s}, where \eqn{0 2 | Authors and Citation • energy 6 | 7 | 8 |
Maria Rizzo. Author, maintainer.
Gabor Szekely. Author.

Citation

Source: DESCRIPTION

Rizzo M, Szekely G (2024). energy: E-Statistics: Multivariate Inference via the Energy of Data. R package version 1.7-12, https://CRAN.R-project.org/package=energy.

@Manual{,
  title = {energy: E-Statistics: Multivariate Inference via the Energy of Data},
  author = {Maria Rizzo and Gabor Szekely},
  year = {2024},
  note = {R package version 1.7-12},
  url = {https://CRAN.R-project.org/package=energy},
}
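Returning to the dcov.test()/dcor.test() interface documented above, a minimal usage sketch (illustrative only; the simulated data and replicate count are arbitrary):

library(energy)
set.seed(1)
x <- matrix(rnorm(100), 50, 2)
y <- x + matrix(rnorm(100), 50, 2)      # dependent by construction
dcov.test(x, y, R = 199)                # permutation test of independence
dcor.test(dist(x), dist(y), R = 199)    # same test from pre-computed distances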
    95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /src/dcov.c: -------------------------------------------------------------------------------- 1 | /* 2 | dcov.c: distance correlation and covariance statistics 3 | and dCov test for multivariate independence 4 | 5 | Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007) 6 | "Measuring and testing dependence by correlation of distances" 7 | Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794. 8 | 9 | Author: Maria L. Rizzo 10 | energy package 11 | github.com/mariarizzo/energy 12 | 13 | */ 14 | 15 | #include 16 | #include 17 | 18 | void dCOVtest(double *x, double *y, int *nrow, int *nreps, 19 | double *reps, double *DCOV, double *pval); 20 | void dCOV(double *x, double *y, int *nrow, double *DCOV); 21 | double Akl(double **akl, double **A, int n); 22 | 23 | /* functions in utilities.c */ 24 | extern double **alloc_matrix(int r, int c); 25 | extern int **alloc_int_matrix(int r, int c); 26 | extern void free_matrix(double **matrix, int r, int c); 27 | extern void free_int_matrix(int **matrix, int r, int c); 28 | extern void permute(int *J, int n); 29 | extern void roworder(double *x, int *byrow, int r, int c); 30 | extern void Euclidean_distance(double *x, double **Dx, int n, int d); 31 | extern void index_distance(double **Dx, int n, double index); 32 | extern void vector2matrix(double *x, double **y, int N, int d, int isroworder); 33 | 34 | 35 | void dCOVtest(double *x, double *y, int *nrow, int *nreps, 36 | double *reps, double *DCOV, double *pval) { 37 | /* input vectors must expand to distance matrices 38 | any exponent must be pre-computed in R 39 | computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) 40 | V-statistic is n*dCov^2 where n*dCov^2 --> Q 41 | DCOV : vector [dCov, dCor, dVar(x), dVar(y), mean(A), mean(B)] 42 | */ 43 | int i, j, k, r, J, K, M; 44 | int n = nrow[0], R = nreps[0]; 45 | int* perm; 46 | double **Dx, **Dy, **A, **B; 47 | double dcov, V; 48 | double n2 = (double) n * n; 49 | Dx = alloc_matrix(n, n); 50 | Dy = alloc_matrix(n, n); 51 | vector2matrix(x, Dx, n, n, 1); 52 | vector2matrix(y, Dy, n, n, 1); 53 | 54 | A = alloc_matrix(n, n); 55 | B = alloc_matrix(n, n); 56 | Akl(Dx, A, n); 57 | Akl(Dy, B, n); 58 | free_matrix(Dx, n, n); 59 | free_matrix(Dy, n, n); 60 | 61 | /* compute dCov(x,y), dVar(x), dVar(y) */ 62 | for (k=0; k<4; k++) 63 | DCOV[k] = 0.0; 64 | for (k=0; k 0) 74 | DCOV[k] = sqrt(DCOV[k]); 75 | else DCOV[k] = 0.0; 76 | } 77 | /* compute dCor(x, y) */ 78 | V = DCOV[2]*DCOV[3]; 79 | if (V > DBL_EPSILON) 80 | DCOV[1] = DCOV[0] / sqrt(V); 81 | else DCOV[1] = 0.0; 82 | 83 | if (R > 0) { 84 | /* compute the replicates */ 85 | if (DCOV[1] > 0.0) { 86 | perm = R_Calloc(n, int); 87 | M = 0; 88 | for (i=0; i= DCOV[0]) M++; 104 | } 105 | *pval = (double) (M+1) / (double) (R+1); 106 | PutRNGstate(); 107 | R_Free(perm); 108 | } else { 109 | *pval = 1.0; 110 | } 111 | } 112 | 113 | free_matrix(A, n, n); 114 | free_matrix(B, n, n); 115 | return; 116 | } 117 | 118 | void dCOV(double *x, double *y, int *nrow, double *DCOV) { 119 | /* input vectors must expand to distance matrices 120 | any exponent must be pre-computed in R 121 | computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) 122 | V-statistic is n*dCov^2 where n*dCov^2 --> Q 123 | DCOV : vector [dCov, dCor, dVar(x), dVar(y)] 124 | */ 125 | 126 | int j, k, n = nrow[0]; 127 | double **Dx, **Dy, **A, **B; 128 | double V, n2 = (double) n * n; 129 | 130 | Dx = alloc_matrix(n, n); 131 | Dy = 
alloc_matrix(n, n); 132 | vector2matrix(x, Dx, n, n, 1); 133 | vector2matrix(y, Dy, n, n, 1); 134 | 135 | A = alloc_matrix(n, n); 136 | B = alloc_matrix(n, n); 137 | Akl(Dx, A, n); 138 | Akl(Dy, B, n); 139 | free_matrix(Dx, n, n); 140 | free_matrix(Dy, n, n); 141 | 142 | n2 = ((double) n) * n; 143 | 144 | /* compute dCov(x,y), dVar(x), dVar(y) */ 145 | for (k=0; k<4; k++) 146 | DCOV[k] = 0.0; 147 | for (k=0; k 0) 157 | DCOV[k] = sqrt(DCOV[k]); 158 | else DCOV[k] = 0.0; 159 | } 160 | /* compute dCor(x, y) */ 161 | V = DCOV[2]*DCOV[3]; 162 | if (V > DBL_EPSILON) 163 | DCOV[1] = DCOV[0] / sqrt(V); 164 | else DCOV[1] = 0.0; 165 | 166 | free_matrix(A, n, n); 167 | free_matrix(B, n, n); 168 | return; 169 | } 170 | 171 | double Akl(double **akl, double **A, int n) { 172 | /* -computes the A_{kl} or B_{kl} distances from the 173 | distance matrix (a_{kl}) or (b_{kl}) for dCov, dCor, dVar 174 | dCov = mean(Akl*Bkl), dVar(X) = mean(Akl^2), etc. 175 | */ 176 | int j, k; 177 | double *akbar; 178 | double abar; 179 | 180 | akbar = R_Calloc(n, double); 181 | abar = 0.0; 182 | for (k=0; k 2 | Deprecated Functions — energy-deprecated • energy 6 | 7 | 8 |
These deprecated functions have been replaced by revised functions and will be removed in future releases of the energy package.

Usage

DCOR(x, y, index=1.0)

Arguments

x: data or distances of first sample
y: data or distances of second sample
index: exponent on Euclidean distance in (0, 2)

Details

DCOR is an R version replaced by faster compiled code.
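For reference, a sketch of the current replacements for the deprecated DCOR() (not taken from the package documentation; the iris subsets are arbitrary):

library(energy)
x <- iris[1:50, 1:2]; y <- iris[1:50, 3:4]
dcov(x, y)    # compiled replacements for the plain-R DCOR()
dcor(x, y)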
    93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /docs/reference/energy-defunct.html: -------------------------------------------------------------------------------- 1 | 2 | Distance Correlation t-test for High Dimensions — dcor.ttest • energy 6 | 7 | 8 |
Defunct: use dcorT.test and dcorT.

Usage

dcor.t(x, y, distance = FALSE)
dcor.ttest(x, y, distance = FALSE)

Arguments

x: data or distances of first sample
y: data or distances of second sample
distance: TRUE if x and y are distances, otherwise FALSE

Details

See dcorT.

Author

Maria L. Rizzo mrizzo@bgsu.edu and Gabor J. Szekely
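A sketch of the documented replacements for the defunct functions (simulated data and dimensions are arbitrary):

library(energy)
set.seed(1)
x <- matrix(rnorm(300), 30, 10)
y <- matrix(rnorm(300), 30, 10)
dcorT.test(x, y)    # replaces the defunct dcor.ttest()
dcorT(x, y)         # replaces the defunct dcor.t()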
    99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // Btree_sum 14 | NumericVector Btree_sum(IntegerVector y, NumericVector z); 15 | RcppExport SEXP _energy_Btree_sum(SEXP ySEXP, SEXP zSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< IntegerVector >::type y(ySEXP); 20 | Rcpp::traits::input_parameter< NumericVector >::type z(zSEXP); 21 | rcpp_result_gen = Rcpp::wrap(Btree_sum(y, z)); 22 | return rcpp_result_gen; 23 | END_RCPP 24 | } 25 | // calc_dist 26 | NumericMatrix calc_dist(NumericMatrix x); 27 | RcppExport SEXP _energy_calc_dist(SEXP xSEXP) { 28 | BEGIN_RCPP 29 | Rcpp::RObject rcpp_result_gen; 30 | Rcpp::RNGScope rcpp_rngScope_gen; 31 | Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP); 32 | rcpp_result_gen = Rcpp::wrap(calc_dist(x)); 33 | return rcpp_result_gen; 34 | END_RCPP 35 | } 36 | // U_product 37 | double U_product(NumericMatrix U, NumericMatrix V); 38 | RcppExport SEXP _energy_U_product(SEXP USEXP, SEXP VSEXP) { 39 | BEGIN_RCPP 40 | Rcpp::RObject rcpp_result_gen; 41 | Rcpp::RNGScope rcpp_rngScope_gen; 42 | Rcpp::traits::input_parameter< NumericMatrix >::type U(USEXP); 43 | Rcpp::traits::input_parameter< NumericMatrix >::type V(VSEXP); 44 | rcpp_result_gen = Rcpp::wrap(U_product(U, V)); 45 | return rcpp_result_gen; 46 | END_RCPP 47 | } 48 | // D_center 49 | NumericMatrix D_center(NumericMatrix Dx); 50 | RcppExport SEXP _energy_D_center(SEXP DxSEXP) { 51 | BEGIN_RCPP 52 | Rcpp::RObject rcpp_result_gen; 53 | Rcpp::RNGScope rcpp_rngScope_gen; 54 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 55 | rcpp_result_gen = Rcpp::wrap(D_center(Dx)); 56 | return rcpp_result_gen; 57 | END_RCPP 58 | } 59 | // U_center 60 | NumericMatrix U_center(NumericMatrix Dx); 61 | RcppExport SEXP _energy_U_center(SEXP DxSEXP) { 62 | BEGIN_RCPP 63 | Rcpp::RObject rcpp_result_gen; 64 | Rcpp::RNGScope rcpp_rngScope_gen; 65 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 66 | rcpp_result_gen = Rcpp::wrap(U_center(Dx)); 67 | return rcpp_result_gen; 68 | END_RCPP 69 | } 70 | // dcovU_stats 71 | NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy); 72 | RcppExport SEXP _energy_dcovU_stats(SEXP DxSEXP, SEXP DySEXP) { 73 | BEGIN_RCPP 74 | Rcpp::RObject rcpp_result_gen; 75 | Rcpp::RNGScope rcpp_rngScope_gen; 76 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 77 | Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); 78 | rcpp_result_gen = Rcpp::wrap(dcovU_stats(Dx, Dy)); 79 | return rcpp_result_gen; 80 | END_RCPP 81 | } 82 | // kgroups_start 83 | List kgroups_start(NumericMatrix x, int k, IntegerVector clus, int iter_max, bool distance); 84 | RcppExport SEXP _energy_kgroups_start(SEXP xSEXP, SEXP kSEXP, SEXP clusSEXP, SEXP iter_maxSEXP, SEXP distanceSEXP) { 85 | BEGIN_RCPP 86 | Rcpp::RObject rcpp_result_gen; 87 | Rcpp::RNGScope rcpp_rngScope_gen; 88 | Rcpp::traits::input_parameter< NumericMatrix 
>::type x(xSEXP); 89 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 90 | Rcpp::traits::input_parameter< IntegerVector >::type clus(clusSEXP); 91 | Rcpp::traits::input_parameter< int >::type iter_max(iter_maxSEXP); 92 | Rcpp::traits::input_parameter< bool >::type distance(distanceSEXP); 93 | rcpp_result_gen = Rcpp::wrap(kgroups_start(x, k, clus, iter_max, distance)); 94 | return rcpp_result_gen; 95 | END_RCPP 96 | } 97 | // Istat 98 | double Istat(NumericMatrix Dx, NumericMatrix Dy); 99 | RcppExport SEXP _energy_Istat(SEXP DxSEXP, SEXP DySEXP) { 100 | BEGIN_RCPP 101 | Rcpp::RObject rcpp_result_gen; 102 | Rcpp::RNGScope rcpp_rngScope_gen; 103 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 104 | Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); 105 | rcpp_result_gen = Rcpp::wrap(Istat(Dx, Dy)); 106 | return rcpp_result_gen; 107 | END_RCPP 108 | } 109 | // Istats 110 | NumericVector Istats(NumericMatrix Dx, NumericMatrix Dy, int R); 111 | RcppExport SEXP _energy_Istats(SEXP DxSEXP, SEXP DySEXP, SEXP RSEXP) { 112 | BEGIN_RCPP 113 | Rcpp::RObject rcpp_result_gen; 114 | Rcpp::RNGScope rcpp_rngScope_gen; 115 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 116 | Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); 117 | Rcpp::traits::input_parameter< int >::type R(RSEXP); 118 | rcpp_result_gen = Rcpp::wrap(Istats(Dx, Dy, R)); 119 | return rcpp_result_gen; 120 | END_RCPP 121 | } 122 | // partial_dcor 123 | NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); 124 | RcppExport SEXP _energy_partial_dcor(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { 125 | BEGIN_RCPP 126 | Rcpp::RObject rcpp_result_gen; 127 | Rcpp::RNGScope rcpp_rngScope_gen; 128 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 129 | Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); 130 | Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); 131 | rcpp_result_gen = Rcpp::wrap(partial_dcor(Dx, Dy, Dz)); 132 | return rcpp_result_gen; 133 | END_RCPP 134 | } 135 | // partial_dcov 136 | double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); 137 | RcppExport SEXP _energy_partial_dcov(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { 138 | BEGIN_RCPP 139 | Rcpp::RObject rcpp_result_gen; 140 | Rcpp::RNGScope rcpp_rngScope_gen; 141 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 142 | Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); 143 | Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); 144 | rcpp_result_gen = Rcpp::wrap(partial_dcov(Dx, Dy, Dz)); 145 | return rcpp_result_gen; 146 | END_RCPP 147 | } 148 | // poisMstat 149 | NumericVector poisMstat(IntegerVector x); 150 | RcppExport SEXP _energy_poisMstat(SEXP xSEXP) { 151 | BEGIN_RCPP 152 | Rcpp::RObject rcpp_result_gen; 153 | Rcpp::RNGScope rcpp_rngScope_gen; 154 | Rcpp::traits::input_parameter< IntegerVector >::type x(xSEXP); 155 | rcpp_result_gen = Rcpp::wrap(poisMstat(x)); 156 | return rcpp_result_gen; 157 | END_RCPP 158 | } 159 | // projection 160 | NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); 161 | RcppExport SEXP _energy_projection(SEXP DxSEXP, SEXP DzSEXP) { 162 | BEGIN_RCPP 163 | Rcpp::RObject rcpp_result_gen; 164 | Rcpp::RNGScope rcpp_rngScope_gen; 165 | Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); 166 | Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); 167 | rcpp_result_gen = Rcpp::wrap(projection(Dx, Dz)); 168 | return 
rcpp_result_gen; 169 | END_RCPP 170 | } 171 | -------------------------------------------------------------------------------- /R/disco.R: -------------------------------------------------------------------------------- 1 | ### disco tests - implementation of DIStance COmponents methods in: 2 | ### 3 | ### Rizzo, M.L. and Szekely, G.J. (2010) "DISCO Analysis: A Nonparametric 4 | ### Extension of Analysis of Variance, Annals of Applied Statistics 5 | ### Vol. 4, No. 2, 1034-1055. 6 | ### 7 | ### disco: computes the decomposition and test using F ratio 8 | ### disco.between: statistic and test using between component 9 | ### .disco1: internal computations for one factor 10 | ### .disco1stat, .disco1Bstat: internal for boot function 11 | ### 12 | ### 13 | 14 | 15 | 16 | disco <- function(x, factors, distance = FALSE, index = 1, R, 17 | method = c("disco", "discoB", "discoF")) { 18 | ## x is response or Euclidean distance matrix or dist() object factors 19 | ## is a matrix or data frame of group labels distance=TRUE if x is 20 | ## distance, otherwise FALSE index is the exponent on distance, in (0,2] 21 | ## R is number of replicates for test method: use F ratio (default) or 22 | ## between component (discoB) disco method is currently alias for discoF 23 | 24 | 25 | method <- match.arg(method) 26 | factors <- data.frame(factors) 27 | if (inherits(x, "dist")) distance <- TRUE 28 | if (method == "discoB") 29 | return(disco.between(x, factors = factors, distance = distance, 30 | index = index, R = R)) 31 | nfactors <- NCOL(factors) 32 | if (distance || inherits(x, "dist")) 33 | dst <- as.matrix(x) else dst <- as.matrix(dist(x)) 34 | N <- NROW(dst) 35 | if (NCOL(dst) != N) 36 | stop("distance==TRUE but first argument is not distance") 37 | if (!isTRUE(all.equal(index, 1))) 38 | dst <- dst^index 39 | 40 | stats <- matrix(0, nfactors, 6) 41 | colnames(stats) <- c("Trt", "Within", "df1", "df2", "Stat", "p-value") 42 | 43 | for (j in 1:nfactors) { 44 | trt <- factors[, j] 45 | stats[j, 1:4] <- .disco1(trt = trt, dst = dst) 46 | if (R > 0) { 47 | b <- boot::boot(data = dst, statistic = .disco1stat, sim = "permutation", 48 | R = R, trt = trt) 49 | stats[j, 5] <- b$t0 50 | stats[j, 6] <- (sum(b$t > b$t0) + 1)/(R + 1) 51 | } else { 52 | stats[j, 5] <- .disco1stat(dst, i = 1:nrow(dst), trt = trt) 53 | stats[j, 6] <- NA 54 | } 55 | } 56 | 57 | methodname <- "DISCO (F ratio)" 58 | dataname <- deparse(substitute(x)) 59 | total <- sum(stats[1, 1:2]) 60 | within <- total - sum(stats[, 1]) 61 | Df.trt <- stats[, 3] 62 | factor.names <- names(factors) 63 | factor.levels <- sapply(factors, nlevels) 64 | sizes <- sapply(factors, tabulate) 65 | e <- list(call = match.call(), method = methodname, 66 | statistic = stats[, 5], 67 | p.value = stats[, 6], 68 | k = nfactors, 69 | N = N, 70 | between = stats[, 1], 71 | withins = stats[, 2], 72 | within = within, 73 | total = total, 74 | Df.trt = Df.trt, 75 | Df.e = nrow(dst) - sum(Df.trt) - 1, 76 | index = index, factor.names = factor.names, 77 | factor.levels = factor.levels, 78 | sample.sizes = sizes, stats = stats) 79 | class(e) <- "disco" 80 | e 81 | } 82 | 83 | disco.between <- function(x, factors, distance = FALSE, index = 1, R) { 84 | ## disco test based on the between-sample component similar to disco 85 | ## except that 'disco' test is based on the F ratio disco.between test 86 | ## for one factor (balanced) is asymptotically equivalent to k-sample E 87 | ## test (test statistics are proportional in that case but not in 88 | ## general). 
x is response or Euclidean distance matrix or dist() 89 | ## object factors is a matrix or data frame of group labels 90 | ## distance=TRUE if x is distance, otherwise FALSE index is the exponent 91 | ## on distance, in (0,2] 92 | 93 | factors <- data.frame(factors) 94 | nfactors <- NCOL(factors) 95 | if (nfactors > 1) 96 | stop("More than one factor is not implemented in disco.between") 97 | if (distance || inherits(x, "dist")) 98 | dst <- as.matrix(x) else dst <- as.matrix(dist(x)) 99 | N <- NROW(dst) 100 | if (NCOL(dst) != N) 101 | stop("distance==TRUE but first argument is not distance") 102 | if (!isTRUE(all.equal(index, 1))) 103 | dst <- dst^index 104 | 105 | trt <- factors[, 1] 106 | if (R > 0) { 107 | b <- boot::boot(data = dst, statistic = .disco1Bstat, sim = "permutation", 108 | R = R, trt = trt) 109 | between <- b$t0 110 | reps <- b$t 111 | pval <- (1+sum(reps > between)) / (R+1) 112 | } else { 113 | between <- .disco1Bstat(dst, i = 1:nrow(dst), trt = trt) 114 | pval <- NA 115 | } 116 | if (R == 0) 117 | return(between) 118 | 119 | methodname <- "DISCO (Between-sample)" 120 | dataname <- deparse(substitute(x)) 121 | 122 | names(between) <- "DISCO between statistic" 123 | e <- list(call = match.call(), method = methodname, statistic = between, 124 | p.value = pval, data.name = dataname) 125 | 126 | class(e) <- "htest" 127 | e 128 | } 129 | 130 | .disco1 <- function(trt, dst) { 131 | ## dst is Euclidean distance matrix or power of it trt is the treatment, 132 | ## a factor 133 | 134 | trt <- factor(trt) 135 | k <- nlevels(trt) 136 | n <- tabulate(trt) 137 | N <- sum(n) 138 | total <- sum(dst)/(2 * N) 139 | y <- as.vector(dst[, 1]) 140 | M <- model.matrix(y ~ 0 + trt) 141 | G <- t(M) %*% dst %*% M 142 | withins <- diag(G)/(2 * n) 143 | W <- sum(withins) 144 | B <- total - W 145 | c(B, W, k - 1, N - k) 146 | } 147 | 148 | .disco1stat <- function(dst, i, trt) { 149 | ## i is permuation vector supplied by bootstrap dst is Euclidean 150 | ## distance matrix or power of it trt is the treatment, a factor returns 151 | ## the disco 'F' ratio 152 | idx <- 1:nrow(dst) 153 | d <- .disco1(trt = trt[idx[i]], dst = dst) 154 | statistic <- (d[1]/d[3])/(d[2]/d[4]) 155 | } 156 | 157 | .disco1Bstat <- function(dst, i, trt) { 158 | ## i is permuation vector supplied by bootstrap dst is Euclidean 159 | ## distance matrix or power of it trt is the treatment, a factor returns 160 | ## the between-sample component (for one factor) 161 | idx <- 1:nrow(dst) 162 | .disco1(trt = trt[idx[i]], dst = dst)[1] 163 | } 164 | 165 | print.disco <- function(x, ...) 
{ 166 | k <- x$k 167 | md1 <- x$between/x$Df.trt 168 | md2 <- x$within/x$Df.e 169 | f0 <- x$statistic 170 | print(x$call) 171 | cat(sprintf("\nDistance Components: index %5.2f\n", x$index)) 172 | cat(sprintf("%-15s %4s %10s %10s %9s %9s\n", "Source", "Df", "Sum Dist", 173 | "Mean Dist", "F-ratio", "p-value")) 174 | fabb <- abbreviate(x$factor.names, minlength=12) 175 | for (i in 1:k) { 176 | fname <- fabb[i] 177 | cat(sprintf("%-15s %4d %10.5f %10.5f %9.3f %9s\n", fname, x$Df.trt[i], 178 | x$between[i], md1[i], f0[i], format.pval(x$p.value[i]))) 179 | } 180 | cat(sprintf("%-15s %4d %10.5f %10.5f\n", "Within", x$Df.e, x$within, 181 | md2)) 182 | cat(sprintf("%-15s %4d %10.5f\n", "Total", x$N - 1, x$total)) 183 | } 184 | -------------------------------------------------------------------------------- /src/utilities.c: -------------------------------------------------------------------------------- 1 | /* 2 | utilities.c: some utilities for the energy package 3 | 4 | Author: Maria L. Rizzo 5 | github.com/mariarizzo/energy 6 | 7 | alloc_matrix, alloc_int_matrix, free_matrix, free_int_matrix: 8 | use R_Calloc, R_Free instead of Calloc, Free for memory management 9 | 10 | permute permutes the first n elements of an integer vector 11 | row_order converts arg from column order to row order 12 | vector2matrix copies double* arg into double** arg 13 | distance computes Euclidean distance matrix from double** 14 | Euclidean_distance computes Euclidean distance matrix from double* 15 | index_distance computes Euclidean distance matrix D then D^index 16 | sumdist sums the distance matrix without creating the matrix 17 | 18 | Notes: 19 | 1. index_distance (declaration and body of the function) revised in 20 | energy 1.3-0, 2/2011. 21 | */ 22 | 23 | #include 24 | #include 25 | 26 | double **alloc_matrix(int r, int c); 27 | int **alloc_int_matrix(int r, int c); 28 | void free_matrix(double **matrix, int r, int c); 29 | void free_int_matrix(int **matrix, int r, int c); 30 | 31 | void permute(int *J, int n); 32 | void permute_check(int *J, int *N); 33 | void roworder(double *x, int *byrow, int r, int c); 34 | void vector2matrix(double *x, double **y, int N, int d, int isroworder); 35 | 36 | void distance(double **bxy, double **D, int N, int d); 37 | void Euclidean_distance(double *x, double **Dx, int n, int d); 38 | void index_distance(double **Dx, int n, double index); 39 | void sumdist(double *x, int *byrow, int *nrow, int *ncol, double *lowersum); 40 | 41 | 42 | 43 | double **alloc_matrix(int r, int c) 44 | { 45 | /* allocate a matrix with r rows and c columns */ 46 | int i; 47 | double **matrix; 48 | matrix = R_Calloc(r, double *); 49 | for (i = 0; i < r; i++) 50 | matrix[i] = R_Calloc(c, double); 51 | return matrix; 52 | } 53 | 54 | 55 | int **alloc_int_matrix(int r, int c) 56 | { 57 | /* allocate an integer matrix with r rows and c columns */ 58 | int i; 59 | int **matrix; 60 | matrix = R_Calloc(r, int *); 61 | for (i = 0; i < r; i++) 62 | matrix[i] = R_Calloc(c, int); 63 | return matrix; 64 | } 65 | 66 | void free_matrix(double **matrix, int r, int c) 67 | { 68 | /* free a matrix with r rows and c columns */ 69 | int i; 70 | for (i = 0; i < r; i++) R_Free(matrix[i]); 71 | R_Free(matrix); 72 | } 73 | 74 | void free_int_matrix(int **matrix, int r, int c) 75 | { 76 | /* free an integer matrix with r rows and c columns */ 77 | int i; 78 | for (i = 0; i < r; i++) R_Free(matrix[i]); 79 | R_Free(matrix); 80 | } 81 | 82 | void permute(int *J, int n) 83 | { 84 | /* 85 | permute the first n integers of J 86 | 
if n is length(J), returns a permutation vector 87 | equal to rev(Rcpp::sample(n, n, false)) 88 | */ 89 | int i, j, j0, m=n; 90 | for (i=0; i DBL_EPSILON) { 206 | for (i=0; i