├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ └── test-coverage.yaml ├── .gitignore ├── AUTHORS ├── CRAN-SUBMISSION ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── RcppExports.R ├── aricode-package.R └── aricode.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── aricode.Rproj ├── cran-comments.md ├── inst ├── WORDLIST └── check_speed.R ├── man ├── AMI.Rd ├── ARI.Rd ├── Chi2.Rd ├── MARI.Rd ├── MARIraw.Rd ├── NID.Rd ├── NMI.Rd ├── NVI.Rd ├── RI.Rd ├── aricode-package.Rd ├── clustComp.Rd ├── entropy.Rd ├── figures │ └── timings_plot-1.png └── sortPairs.Rd ├── src ├── .gitignore ├── RcppExports.cpp └── aricode.cpp └── tests ├── spelling.R ├── testthat.R └── testthat ├── test_coherence.R └── test_input.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | cran-comments.md 4 | figure 5 | README.md 6 | README.rmd 7 | .travis.yml 8 | README_cache 9 | ^CRAN-RELEASE$ 10 | ^\.github$ 11 | ^_pkgdown\.yml$ 12 | ^pkgdown$ 13 | ^docs 14 | ^docs/*.* 15 | ^docs/* 16 | ^CRAN-SUBMISSION$ 17 | ^revdep 18 | AUTHORS 19 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: macOS-latest, r: 'oldrel-1'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: windows-latest, r: 'oldrel-1'} 25 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 26 | - {os: ubuntu-latest, r: 'release'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - uses: r-lib/actions/setup-r-dependencies@v2 44 | with: 45 | extra-packages: any::rcmdcheck 46 | needs: check 47 | 48 | - uses: r-lib/actions/check-r-package@v2 49 | with: 50 | upload-snapshots: true 51 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@4.1.4 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: covr::codecov(quiet = FALSE) 31 | shell: Rscript {0} 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # web files 25 | docs/* 26 | 27 | # doc files 28 | inst/doc/* 29 | 30 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 31 | .httr-oauth 32 | 33 | # knitr and R markdown default cache directories 34 | /*_cache/ 35 | /cache/ 36 | 37 | # Temporary files created by R markdown 38 | *.utf8.md 39 | *.knit.md 40 | .Rproj.user 41 | 42 | # dyn librairies 43 | src/*.o 44 | src/*.so 45 | src/*.dll 46 | 47 | # revdep testing 48 | revdep/ -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # Contributors 2 | * Julien Chiquet, Maintainer, Developer 3 | * Guillem Rigaill, Developer 4 | * Martina Sundqvist, Developer 5 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 1.0.3 2 | Date: 2023-10-19 10:43:22 UTC 3 | SHA: 110fd17dc3613b6cb5fee0bfeebcaf3e0faf077b 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: aricode 2 | Type: Package 3 | Title: Efficient Computations of Standard Clustering Comparison Measures 4 | Version: 1.0.3 5 | Authors@R: c( 6 | person("Julien", "Chiquet", role = c("aut", "cre"), email = "julien.chiquet@inrae.fr", 7 | comment = c(ORCID = "0000-0002-3629-3429")), 8 | person("Guillem", "Rigaill", role = "aut", email = "guillem.rigaill@inrae.fr"), 9 | person("Martina", "Sundqvist", role = "aut", email = "martina.sundqvist@agroparistech.fr"), 10 | person("Valentin", "Dervieux", role = "ctb", email = "valentin.dervieux@gmail.com"), 11 | person("Florent", "Bersani", role = "ctb", email = "florent@bersani.org") 12 | ) 13 | Maintainer: Julien Chiquet 14 | Description: Implements an efficient O(n) algorithm based on bucket-sorting for 15 | fast computation of standard clustering comparison measures. Available measures 16 | include adjusted Rand index (ARI), normalized information distance (NID), 17 | normalized mutual information (NMI), adjusted mutual information (AMI), 18 | normalized variation information (NVI) and entropy, as described in Vinh et al (2009) 19 | . Include AMI (Adjusted Mutual Information) since version 0.1.2, 20 | a modified version of ARI (MARI), as described in Sundqvist et al. 21 | and simple Chi-square distance since version 1.0.0. 22 | License: GPL (>=3) 23 | URL: https://github.com/jchiquet/aricode 24 | BugReports: https://github.com/jchiquet/aricode/issues 25 | Encoding: UTF-8 26 | Imports: 27 | Matrix, 28 | Rcpp 29 | Suggests: 30 | testthat, 31 | spelling 32 | LinkingTo: Rcpp 33 | RoxygenNote: 7.2.3 34 | Language: en-US 35 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(AMI) 4 | export(ARI) 5 | export(Chi2) 6 | export(MARI) 7 | export(MARIraw) 8 | export(NID) 9 | export(NMI) 10 | export(NVI) 11 | export(RI) 12 | export(clustComp) 13 | export(entropy) 14 | export(sortPairs) 15 | import(Matrix) 16 | importFrom(Rcpp,sourceCpp) 17 | useDynLib(aricode) 18 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## aricode 1.0-3 (2023-10-19) 2 | 3 | - minor fix in package documentation due to evolution of roxygen2 7.0.0 . 4 | 5 | ## aricode 1.0-2 (2022-12-14) 6 | 7 | - fix a serious bug in the AMI function pointed by Florent Bersani 8 | 9 | ## aricode 1.0-1 (2022-09-04) 10 | 11 | - fix warnings in C++ 12 | - fix documentation for to comply with CRAN policy and HTML5 validation 13 | 14 | ## aricode 1.0-0 (2020-06-23) 15 | 16 | - added the Modified adjusted Rand Index 17 | - added the Chi-Square statistics 18 | - bug fixed in SortPairs 19 | 20 | ## aricode 0.1-2 (2019-06-28) 21 | 22 | - added the Adjusted Mutual information 23 | 24 | ## aricode 0.1-1 (2018-04-30) 25 | 26 | - first submission to CRAN 27 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | countPairs <- function(classi1, classi2, order) { 5 | .Call('_aricode_countPairs', PACKAGE = 'aricode', classi1, classi2, order) 6 | } 7 | 8 | expected_MI <- function(ni_, n_j) { 9 | .Call('_aricode_expected_MI', PACKAGE = 'aricode', ni_, n_j) 10 | } 11 | 12 | getRank <- function(classi) { 13 | .Call('_aricode_getRank', PACKAGE = 'aricode', classi) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /R/aricode-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | #' aricode 5 | #' 6 | #' A package for efficient computations of standard clustering comparison measures. Most of the available measures are described in the paper of Vinh et al, JMLR, 2009 (see reference below). 7 | #' 8 | #' Traditional implementations (e.g., function \code{adjustedRandIndex} of package \code{mclust}) are in Omega(n + u v) where n is the size of the vectors the classifications of which are to be compared, u and v are the respective number of classes in each vectors. Here, the implementation is in Theta(n), plus the gain of speed due to the C++ code. 9 | #' 10 | #' The functions included in aricode are: 11 | #' 12 | #' * ARI: computes the adjusted rand index 13 | #' * Chi2: computes the Chi-square statistic 14 | #' * MARI: computes the modified adjusted rand index (Sundqvist et al, in preparation) 15 | #' * MARIraw: computes the raw version of the modified adjusted rand index 16 | #' * RI: computes the rand index 17 | #' * NVI: computes the normalized variation information 18 | #' * NID: computes the normalized information distance 19 | #' * NMI: computes the normalized mutual information 20 | #' * AMI: computes the adjusted mutual information 21 | #' * entropy: computes the conditional and joint entropies 22 | #' * clustComp: computes all clustering comparison measures at once 23 | #' 24 | #' @author Julien Chiquet \email{julien.chiquet@@inrae.fr} 25 | #' @author Guillem Rigaill \email{guillem.rigaill@@inrae.fr} 26 | #' @author Martina Sundqvist \email{martina.sundqvist@@agroparistech.fr} 27 | #' @references 28 | #' * Nguyen Xuan Vinh, Julien Epps, and James Bailey. "Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance." Journal of Machine Learning Research 11.Oct (2010): 2837-2854. as described in Vinh et al (2009) 29 | #' * Sundqvist, Martina, Julien Chiquet, and Guillem Rigaill. "Adjusting the adjusted Rand Index: A multinomial story." Computational Statistics 38.1 (2023): 327-347. 30 | #' @seealso \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{AMI}}, \code{\link{NMI}}, \code{\link{entropy}}, \code{\link{clustComp}} 31 | #' @name aricode-package 32 | 33 | # The following block is used by usethis to automatically manage 34 | # roxygen namespace tags. Modify with care! 35 | ## usethis namespace: start 36 | #' @useDynLib aricode 37 | #' @importFrom Rcpp sourceCpp 38 | ## usethis namespace: end 39 | NULL 40 | -------------------------------------------------------------------------------- /R/aricode.R: -------------------------------------------------------------------------------- 1 | #' Sort Pairs 2 | #' 3 | #' A function to sort pairs of integers or factors and identify the pairs 4 | #' 5 | #' @param c1 a vector of length n with value between 0 and N1 < n 6 | #' @param c2 a vector of length n with value between 0 and N2 < n 7 | #' @param spMat logical: send back the contingency table as sparsely encoded (cost more than the algorithm itself). Default is FALSE 8 | #' @import Matrix 9 | #' @export 10 | sortPairs <- function(c1, c2, spMat=FALSE){ 11 | if (anyNA(c1) | anyNA(c2)) 12 | stop("NA are not supported.") 13 | 14 | if (((!is.vector(c1) & !is.factor(c1)) | is.list(c1)) | ((!is.vector(c2) & !is.factor(c2)) | is.list(c2))) 15 | stop("c1 and c2 must be vectors or factors but not lists.") 16 | 17 | if (length(c1) != length(c2)) 18 | stop("the two vectors must have the same length.") 19 | 20 | n <- length(c1) 21 | 22 | ## if c1 and c2 are integer 23 | if (is.integer(c1) & is.integer(c2)) { 24 | ## getRank is O(n) if max(c1)-min(c1) and max(c2)-min(c2) is of order length(c1)=length(c2) 25 | ## NOTE: getRank does not assume c1 and c2 are between 0 and n 26 | res1 <- getRank(c1) 27 | res2 <- getRank(c2) 28 | mylevels <- list(c1=res1$index, c2=res2$index) 29 | c1 <- res1$translated # here ranks are in [0, n) 30 | c2 <- res2$translated # here ranks are in [0, n) 31 | } else if (is.factor(c1) & is.factor(c2)) { 32 | mylevels <- list(c1 = levels(c1), c2 = levels(c2)) 33 | c1 <- as.integer(c1) - 1L 34 | c2 <- as.integer(c2) - 1L 35 | } else { 36 | ## if neither a factor nor an integer or different of types force to factor then integer 37 | mylevels <- list(c1 = unique(c1), c2 = unique(c2)) 38 | c1 <- as.integer(factor(c1, levels = mylevels$c1)) - 1L 39 | c2 <- as.integer(factor(c2, levels = mylevels$c2)) - 1L 40 | } 41 | 42 | 43 | i_order <- order(c1, c2, method="radix") - 1L 44 | out <- countPairs(c1, c2, i_order) 45 | 46 | if (spMat) { 47 | spOut <- sparseMatrix(i=out$pair_c1, 48 | j=out$pair_c2, 49 | x=out$pair_nb, 50 | dims=sapply(mylevels,length), 51 | dimnames = mylevels, index1=FALSE) 52 | } else { 53 | spOut <- NULL 54 | } 55 | 56 | res <- list(spMat = spOut, 57 | levels = mylevels, 58 | nij = out$pair_nb, 59 | ni. = out$c1_nb, 60 | n.j = out$c2_nb, 61 | pair_c1 = out$pair_c1, 62 | pair_c2 = out$pair_c2 63 | ) 64 | res 65 | } 66 | 67 | #' Adjusted Rand Index 68 | #' 69 | #' A function to compute the adjusted rand index between two classifications 70 | #' 71 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 72 | #' @param c2 a vector containing the labels of the second classification. 73 | #' @return a scalar with the adjusted rand index. 74 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 75 | #' @examples 76 | #' data(iris) 77 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 78 | #' ARI(cl,iris$Species) 79 | #' @export 80 | ARI <- function(c1, c2){ 81 | 82 | ## get pairs using C 83 | ## ensure that values of c1 and c2 are between 0 and n1 84 | res <- sortPairs(c1, c2) 85 | 86 | ## get ARI using pairs 87 | N <- length(c1) 88 | 89 | stot <- sum(choose(res$nij, 2), na.rm=TRUE) 90 | srow <- sum(choose(res$ni., 2), na.rm=TRUE) 91 | scol <- sum(choose(res$n.j, 2), na.rm=TRUE) 92 | 93 | expectedIndex <-(srow*scol)/(choose(N,2)) 94 | maximumIndex <- (srow+scol)/2 95 | 96 | if (expectedIndex == maximumIndex & stot != 0) { 97 | res <- 1 98 | } else { 99 | res <- (stot-expectedIndex)/(maximumIndex-expectedIndex) 100 | } 101 | res 102 | } 103 | 104 | #' Rand Index 105 | #' 106 | #' A function to compute the rand index between two classifications 107 | #' 108 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 109 | #' @param c2 a vector containing the labels of the second classification. 110 | #' @return a scalar with the rand index. 111 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 112 | #' @examples 113 | #' data(iris) 114 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 115 | #' RI(cl,iris$Species) 116 | #' @export 117 | RI <- function(c1, c2){ 118 | ## get pairs using C 119 | ## ensure that values of c1 and c2 are between 0 and n1 120 | res <- sortPairs(c1, c2) 121 | 122 | ## get ARI using pairs 123 | N <- length(c1) 124 | 125 | ## return the rand-index 126 | res <- 1 + (sum(res$nij^2) - (sum(res$ni.^2) + sum(res$n.j^2))/2)/choose(N,2) 127 | res 128 | } 129 | 130 | #' Modified Adjusted Rand Index 131 | #' 132 | #' A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model. 133 | #' 134 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 135 | #' @param c2 a vector containing the labels of the second classification. 136 | #' @return a scalar with the modified ARI. 137 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 138 | #' @examples 139 | #' data(iris) 140 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 141 | #' MARI(cl,iris$Species) 142 | #' @export 143 | MARI <- function(c1, c2){ 144 | ## get pairs using C 145 | ## ensure that values of c1 and c2 are between 0 and n1 146 | res <- sortPairs(c1, c2) 147 | N <- length(c1) 148 | ## 149 | 150 | stot <- sum(choose(res$nij, 2), na.rm=TRUE) 151 | srow <- sum(choose(res$ni., 2), na.rm=TRUE) 152 | scol <- sum(choose(res$n.j, 2), na.rm=TRUE) 153 | 154 | ## using Lemma 3.3 155 | ## triplets 156 | T1 <- 2*N 157 | T2 <- sum(res$nij * res$ni.[res$pair_c1+1] * res$n.j[res$pair_c2+1], na.rm=TRUE) 158 | T3 <- -sum(res$nij^2, na.rm=TRUE) - sum(res$ni.^2, na.rm=TRUE) - sum(res$n.j^2, na.rm=TRUE) 159 | 160 | ## quadruplets (and division by 6 choose(N, 4) 161 | expectedIndex <- (srow*scol - stot - (T1+T2+T3)) / (6 *choose(N, 4)) 162 | 163 | ## return the rand-index 164 | expectedIndex <- expectedIndex * choose(N, 2) ## RESCALE SO THAT THE CODE IS EQUIVALENT TO THE ARI 165 | maximumIndex <- (srow+scol)/2 166 | if (expectedIndex == maximumIndex & stot != 0) { 167 | res <- 1 168 | } else { 169 | res <- (stot-expectedIndex)/(maximumIndex-expectedIndex) 170 | } 171 | res 172 | ## return the adjusted (and divided) rand-index 173 | res 174 | } 175 | 176 | #' raw Modified Adjusted Rand Index 177 | #' 178 | #' A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model. Raw means, that the index is not divided by the (maximum - expected) value. 179 | #' 180 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 181 | #' @param c2 a vector containing the labels of the second classification. 182 | #' @return a scalar with the modified ARI without the division by the (maximum - expected) 183 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 184 | #' @examples 185 | #' data(iris) 186 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 187 | #' MARIraw(cl,iris$Species) 188 | #' @export 189 | MARIraw <- function(c1, c2){ 190 | ## get pairs using C 191 | ## ensure that values of c1 and c2 are between 0 and n1 192 | res <- sortPairs(c1, c2) 193 | N <- length(c1) 194 | 195 | stot <- sum(choose(res$nij, 2), na.rm=TRUE) 196 | srow <- sum(choose(res$ni., 2), na.rm=TRUE) 197 | scol <- sum(choose(res$n.j, 2), na.rm=TRUE) 198 | 199 | ## using Lemma 3.3 200 | ## triplets 201 | T1 <- 2*N 202 | T2 <- sum(res$nij * res$ni.[res$pair_c1+1] * res$n.j[res$pair_c2+1], na.rm=TRUE) 203 | T3 <- -sum(res$nij^2, na.rm=TRUE) - sum(res$ni.^2, na.rm=TRUE) - sum(res$n.j^2, na.rm=TRUE) 204 | 205 | ## quadruplets (and division by 6 choose(N, 4) 206 | expectedIndex <- (srow*scol - stot - (T1+T2+T3)) / (6 *choose(N, 4)) 207 | 208 | ## return the rand-index 209 | res <- (stot / choose(N, 2)) - expectedIndex 210 | res 211 | } 212 | 213 | #' Chi-square statistics 214 | #' 215 | #' A function to compute the Chi-2 statistics 216 | #' 217 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 218 | #' @param c2 a vector containing the labels of the second classification. 219 | #' @return a scalar with the chi-square statistics. 220 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 221 | #' @examples 222 | #' data(iris) 223 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 224 | #' Chi2(cl,iris$Species) 225 | #' @export 226 | Chi2 <- function(c1, c2){ 227 | ## get pairs using C 228 | ## ensure that values of c1 and c2 are between 0 and n1 229 | res <- sortPairs(c1, c2) 230 | N <- length(c1) 231 | 232 | res <- N* sum(res$nij^2 / (res$ni.[res$pair_c1+1] * res$n.j[res$pair_c2+1]) ) 233 | res <- res - N 234 | res 235 | } 236 | 237 | 238 | #' Entropy 239 | #' 240 | #' A function to compute the empirical entropy for two vectors of classification and the joint entropy 241 | #' 242 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 243 | #' @param c2 a vector containing the labels of the second classification. 244 | #' @return a list with the two conditional entropies, the joint entropy and output of sortPairs. 245 | #' @examples 246 | #' data(iris) 247 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 248 | #' entropy(cl,iris$Species) 249 | #' @export 250 | entropy <- function(c1, c2){ 251 | res <- sortPairs(c1, c2) 252 | 253 | N <- length(c1) 254 | 255 | H.UV <- - sum(res$nij * log(res$nij))/N + log(N) 256 | H.U <- - sum(res$ni. * log(res$ni.))/N + log(N) 257 | H.V <- - sum(res$n.j * log(res$n.j))/N + log(N) 258 | 259 | res <- list(UV = H.UV, U = H.U, V = H.V, sortPairs = res) 260 | res 261 | } 262 | 263 | #' Measures of similarity between two classification 264 | #' 265 | #' A function various measures of similarity between two classifications 266 | #' 267 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 268 | #' @param c2 a vector containing the labels of the second classification. 269 | #' @return a list with the RI, ARI, NMI, NVI and NID. 270 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{ARI}} 271 | #' @examples 272 | #' data(iris) 273 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 274 | #' clustComp(cl,iris$Species) 275 | #' @export 276 | clustComp <- function(c1, c2) { 277 | 278 | H <- entropy(c1,c2) 279 | 280 | MI <- - H$UV + H$U + H$V 281 | VI <- H$UV - MI 282 | NVI <- 1 - MI/H$UV 283 | ID <- max(H$U, H$V) - MI 284 | NID <- 1 - MI / max(H$U, H$V) 285 | NMI <- MI / max(H$U, H$V) 286 | EMI <- expected_MI(as.integer(H$sortPairs$ni.), as.integer(H$sortPairs$n.j)) 287 | 288 | res <- list(RI = RI(c1,c2) , 289 | ARI = ARI(c1,c2) , 290 | MI = - H$UV + H$U + H$V , 291 | AMI = (- H$UV + H$U + H$V - EMI) / (max(H$U,H$V) - EMI), 292 | VI = H$UV - MI , 293 | NVI = 1 - MI/H$UV , 294 | ID = max(H$U, H$V) - MI , 295 | NID = 1 - MI / max(H$U, H$V), 296 | NMI = MI / max(H$U, H$V) , 297 | Chi2 = Chi2(c1,c2) , 298 | MARI = MARI(c1,c2) , 299 | MARIraw = MARIraw(c1,c2) 300 | ) 301 | res 302 | res 303 | } 304 | 305 | #' Adjusted Mutual Information 306 | #' 307 | #' A function to compute the adjusted mutual information between two classifications 308 | #' 309 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 310 | #' @param c2 a vector containing the labels of the second classification. 311 | #' @return a scalar with the adjusted rand index. 312 | #' @seealso \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 313 | #' @examples 314 | #' data(iris) 315 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 316 | #' AMI(cl,iris$Species) 317 | #' @export 318 | AMI <- function(c1, c2){ 319 | 320 | H <- entropy(c1,c2) 321 | MI <- - H$UV + H$U + H$V 322 | EMI <- expected_MI(as.integer(H$sortPairs$ni.), as.integer(H$sortPairs$n.j)) 323 | 324 | res <- (MI - EMI) / (max(H$U,H$V) - EMI) 325 | res 326 | } 327 | 328 | #' Normalized mutual information (NMI) 329 | #' 330 | #' A function to compute the NMI between two classifications 331 | #' 332 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 333 | #' @param c2 a vector containing the labels of the second classification. 334 | #' @param variant a string in ("max", "min", "sqrt", "sum", "joint"): different variants of NMI. Default use "max". 335 | #' @return a scalar with the normalized mutual information . 336 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}} 337 | #' @examples 338 | #' data(iris) 339 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 340 | #' NMI(cl,iris$Species) 341 | #' @export 342 | NMI <- function(c1, c2, variant = c("max", "min", "sqrt", "sum", "joint")) { 343 | 344 | variant <- match.arg(variant) 345 | 346 | H <- entropy(c1,c2) 347 | 348 | MI <- - H$UV + H$U + H$V 349 | 350 | D <- switch(variant, 351 | "max" = max(H$U, H$V), 352 | "sqrt" = sqrt(H$U * H$V), 353 | "min" = min(H$U, H$V), 354 | "sum" = .5*(H$U + H$V), 355 | "joint" = H$UV) 356 | res <- MI / D 357 | res 358 | } 359 | 360 | #' Normalized information distance (NID) 361 | #' 362 | #' A function to compute the NID between two classifications 363 | #' 364 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 365 | #' @param c2 a vector containing the labels of the second classification. 366 | #' @return a scalar with the normalized information distance . 367 | #' @seealso \code{\link{RI}}, \code{\link{NMI}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}} 368 | #' @examples 369 | #' data(iris) 370 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 371 | #' NID(cl,iris$Species) 372 | #' @export 373 | NID <- function(c1, c2) { 374 | 375 | H <- entropy(c1,c2) 376 | 377 | MI <- - H$UV + H$U + H$V 378 | res <- 1 - MI / max(H$U, H$V) 379 | res 380 | } 381 | 382 | #' Normalized variation of information (NVI) 383 | #' 384 | #' A function to compute the NVI between two classifications 385 | #' 386 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list. 387 | #' @param c2 a vector containing the labels of the second classification. 388 | #' @return a scalar with the normalized variation of information. 389 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NMI}}, \code{\link{ARI}}, \code{\link{clustComp}} 390 | #' @examples 391 | #' data(iris) 392 | #' cl <- cutree(hclust(dist(iris[,-5])), 4) 393 | #' NVI(cl,iris$Species) 394 | #' @export 395 | NVI <- function(c1, c2) { 396 | 397 | H <- entropy(c1,c2) 398 | MI <- - H$UV + H$U + H$V 399 | res <- 1 - MI/H$UV 400 | res 401 | } 402 | 403 | 404 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r setup, include=FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | fig.path = "man/figures/" 9 | ) 10 | ``` 11 | 12 | # aricode 13 | 14 | 15 | [![R-CMD-check](https://github.com/jchiquet/aricode/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/jchiquet/aricode/actions/workflows/R-CMD-check.yaml) 16 | [![CRAN Status](https://www.r-pkg.org/badges/version/aricode)](https://CRAN.R-project.org/package=aricode) 17 | [![Coverage status](https://codecov.io/gh/jchiquet/aricode/branch/master/graph/badge.svg)](https://codecov.io/gh/jchiquet/aricode) 18 | [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-blue.svg)](https://www.tidyverse.org/lifecycle/#stable) 19 | [![](https://img.shields.io/github/last-commit/jchiquet/aricode.svg)](https://github.com/jchiquet/aricode/commits/master) 20 | 21 | 22 | A package for efficient computations of standard clustering comparison measures 23 | 24 | ## Installation 25 | 26 | Stable version on the [CRAN](https://cran.rstudio.com/web/packages/aricode/). 27 | 28 | ```{r install_cran, eval = FALSE} 29 | install.packages("aricode") 30 | ``` 31 | 32 | The development version is available via: 33 | 34 | ```{r install_github, eval = FALSE} 35 | devtools::install_github("jchiquet/aricode") 36 | ``` 37 | 38 | ## Description 39 | 40 | Computation of measures for clustering comparison (ARI, AMI, NID and even the $\chi^2$ distance) are usually based on the contingency table. Traditional implementations (e.g., function `adjustedRandIndex` of package **mclust**) are in $\Omega(n + u v)$ where 41 | 42 | - $n$ is the size of the vectors the classifications of which are to be compared, 43 | - $u$ and $v$ are the respective number of classes in each vectors. 44 | 45 | In **aricode** we propose an implementation, based on radix sort, that is in $\Theta(n)$ in time and space. 46 | Importantly, the complexity does not depends on $u$ and $v$. 47 | Our implementation of the ARI for instance is one or two order of magnitude faster than some standard implementation in `R`. 48 | 49 | ## Available measures and functions 50 | 51 | The functions included in aricode are: 52 | 53 | - `ARI`: computes the adjusted rand index 54 | - `Chi2`: computes the Chi-square statistics 55 | - `MARI/MARIraw`: computes the modified adjusted rand index (Sundqvist et al, in preparation) 56 | - `NVI`: computes the the normalized variation information 57 | - `NID`: computes the normalized information distance 58 | - `NMI`: computes the normalized mutual information 59 | - `AMI`: computes the adjusted mutual information 60 | - `expected_MI`: computes the expected mutual information 61 | - `entropy`: computes the conditional and joint entropies 62 | - `clustComp`: computes all clustering comparison measures at once 63 | 64 | ## Timings 65 | 66 | Here are some timings to compare the cost of computing the adjusted Rand Index with **aricode** or with the commonly used function `adjustedRandIndex` of the *mclust* package: the cost of the latter can be prohibitive for large vectors: 67 | 68 | ```{r timings_function, echo=FALSE, message=FALSE, warning=FALSE} 69 | library(aricode) 70 | library(mclust) 71 | library(ggplot2) 72 | 73 | time.aricode <- function(times, c1, c2){ 74 | replicate(times, system.time(ARI(c1, c2))[3]) 75 | } 76 | 77 | time.mclust <- function(times, c1, c2){ 78 | replicate(times, system.time(mclust::adjustedRandIndex(c1, c2))[3]) 79 | } 80 | 81 | time.method <- function(times, c1, c2, n){ 82 | rbind( 83 | data.frame(time = time.aricode(times, c1, c2), expr = "aricode", n = n), 84 | data.frame(time = time.mclust(times, c1, c2), expr = "mclust", n = n) 85 | ) 86 | } 87 | 88 | # with similar classif, number of classes grows with n 89 | sim.timings <- function(n, times = 10) { 90 | c1 <- sample(1:(n/200), n, replace=TRUE);c2 <- c1; 91 | i_change <- sample(1:n, n/50, replace=FALSE) 92 | c2[i_change] <- c2[rev(i_change)] 93 | out <- time.method(times, c1, c2, n) 94 | data.frame(time=out$time, method=out$expr, n = n) 95 | } 96 | ``` 97 | 98 | ```{r timings_run, echo=FALSE, message=FALSE, warning=FALSE, cache=TRUE} 99 | # with similar classif, number of classes grows with n 100 | ns <- sort(c(200 * 2^(3:14), 150 * 2^(3:15))) 101 | timings <- do.call("rbind", lapply(ns, sim.timings)) 102 | ``` 103 | 104 | ```{r timings_plot, echo=FALSE, message=FALSE, warning=FALSE} 105 | p.timings <- ggplot(timings, aes(x=n, y=time, colour=method)) + 106 | geom_smooth(data = dplyr::filter(timings, n > 1e4), method = "lm") + geom_point(size=0.25, alpha=0.9) + labs(y="time (sec.)") + 107 | scale_x_log10( 108 | breaks = scales::trans_breaks("log10", function(x) 10^x), 109 | labels = scales::trans_format("log10", scales::math_format(10^.x)) 110 | ) + 111 | scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), 112 | labels = scales::trans_format("log10", scales::math_format(10^.x))) + 113 | annotation_logticks() 114 | 115 | p.timings + ggtitle("number of classes grows with n") + theme_bw() 116 | ``` 117 | 118 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # aricode 3 | 4 | 5 | ![R-CMD-check](https://github.com/jchiquet/aricode/workflows/R-CMD-check/badge.svg?branch=master) 6 | [![CRAN 7 | Status](https://www.r-pkg.org/badges/version/aricode)](https://CRAN.R-project.org/package=aricode) 8 | [![Coverage 9 | status](https://codecov.io/gh/jchiquet/aricode/branch/master/graph/badge.svg)](https://codecov.io/gh/jchiquet/aricode) 10 | [![Lifecycle: 11 | stable](https://img.shields.io/badge/lifecycle-stable-blue.svg)](https://www.tidyverse.org/lifecycle/#stable) 12 | [![](https://img.shields.io/github/last-commit/jchiquet/aricode.svg)](https://github.com/jchiquet/aricode/commits/master) 13 | 14 | 15 | A package for efficient computations of standard clustering comparison 16 | measures 17 | 18 | ## Installation 19 | 20 | Stable version on the 21 | [CRAN](https://cran.rstudio.com/web/packages/aricode/). 22 | 23 | ``` r 24 | install.packages("aricode") 25 | ``` 26 | 27 | The development version is available via: 28 | 29 | ``` r 30 | devtools::install_github("jchiquet/aricode") 31 | ``` 32 | 33 | ## Description 34 | 35 | Computation of measures for clustering comparison (ARI, AMI, NID and 36 | even the \(\chi^2\) distance) are usually based on the contingency 37 | table. Traditional implementations (e.g., function `adjustedRandIndex` 38 | of package **mclust**) are in \(\Omega(n + u v)\) where 39 | 40 | - \(n\) is the size of the vectors the classifications of which are to 41 | be compared, 42 | - \(u\) and \(v\) are the respective number of classes in each 43 | vectors. 44 | 45 | In **aricode** we propose an implementation, based on radix sort, that 46 | is in \(\Theta(n)\) in time and space. 47 | Importantly, the complexity does not depends on \(u\) and \(v\). Our 48 | implementation of the ARI for instance is one or two order of magnitude 49 | faster than some standard implementation in `R`. 50 | 51 | ## Available measures and functions 52 | 53 | The functions included in aricode are: 54 | 55 | - `ARI`: computes the adjusted rand index 56 | - `Chi2`: computes the Chi-square statistics 57 | - `MARI/MARIraw`: computes the modified adjusted rand index (Sundqvist 58 | et al, in preparation) 59 | - `NVI`: computes the the normalized variation information 60 | - `NID`: computes the normalized information distance 61 | - `NMI`: computes the normalized mutual information 62 | - `AMI`: computes the adjusted mutual information 63 | - `expected_MI`: computes the expected mutual information 64 | - `entropy`: computes the conditional and joint entropies 65 | - `clustComp`: computes all clustering comparison measures at once 66 | 67 | ## Timings 68 | 69 | Here are some timings to compare the cost of computing the adjusted Rand 70 | Index with **aricode** or with the commonly used function 71 | `adjustedRandIndex` of the *mclust* package: the cost of the latter can 72 | be prohibitive for large vectors: 73 | 74 | ![](man/figures/timings_plot-1.png) 75 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | templates: 2 | params: 3 | bootswatch: readable 4 | 5 | navbar: 6 | title: "aricode" 7 | type: inverse 8 | left: 9 | - icon: fa-home 10 | - text: "Home" 11 | href: index.html 12 | - text: "Reference" 13 | href: "reference/index.html" 14 | - text: "Changelog" 15 | href: news/index.html 16 | right: 17 | - icon: fa-github 18 | href: https://github.com/jchiquet/aricode 19 | 20 | -------------------------------------------------------------------------------- /aricode.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Yes 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | This submission follows the reporting of a bug in the calculation of the AMI. 2 | 3 | ## aricode 1.0.3 4 | 5 | - minor fix in package documentation due to evolution of roxygen2 7.0.0 . 6 | 7 | ## Tested environments 8 | 9 | * tested locally on Ubuntu Linux 22.04.1 LTS, R-release, GCC 10 | 11 | * tested remotely with github-action 12 | 13 | - Linux ubuntu 22.04, R-release (github-action) 14 | - Linux ubuntu 22.04, R-oldrel (github-action) 15 | - Linux ubuntu 22.04, R-devel (github-action) 16 | - Windows Server 2022, R-release, 64 bit 17 | - macOS 12, R-release (github action) 18 | - Linux ubuntu 22.04, R-release, gcc + unit test with sanitizers (github-action) 19 | - Linux ubuntu 22.04, R-release, clang + unit test with sanitizers (github-action) 20 | 21 | * tested remotely with win-builder (R version 4.3.1, R unstable, R version 4.2.3) 22 | 23 | all status OK 24 | 25 | ## Local R CMD check results 26 | 27 | ── R CMD check results ──────────────────────────────────────────────── aricode 1.0.3 ──── 28 | Duration: 43.1s 29 | 30 | 0 errors ✔ | 0 warnings ✔ | 0 notes ✔ 31 | 32 | R CMD check succeeded 33 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | AMI 2 | ARI 3 | CMD 4 | Epps 5 | JMLR 6 | Lifecycle 7 | MARIraw 8 | NID 9 | NMI 10 | NVI 11 | ORCID 12 | SortPairs 13 | Vinh 14 | Xuan 15 | al 16 | clustComp 17 | doi 18 | entropies 19 | et 20 | mclust 21 | radix 22 | roxygen 23 | sortPairs 24 | -------------------------------------------------------------------------------- /inst/check_speed.R: -------------------------------------------------------------------------------- 1 | library(microbenchmark) 2 | library(aricode) 3 | library(mclust) 4 | 5 | n <- 10^5 6 | c1 <- sample.int(100, size = n, replace=TRUE) 7 | c2 <- sample.int(100, size = n, replace=TRUE) 8 | 9 | ## define the ARI as in the mclust package 10 | adjustedRandIndex <- function (x, y) 11 | { 12 | x <- as.vector(x) 13 | y <- as.vector(y) 14 | if (length(x) != length(y)) 15 | stop("arguments must be vectors of the same length") 16 | tab <- table(x, y) 17 | if (all(dim(tab) == c(1, 1))) 18 | return(1) 19 | a <- sum(choose(tab, 2)) 20 | b <- sum(choose(rowSums(tab), 2)) - a 21 | c <- sum(choose(colSums(tab), 2)) - a 22 | d <- choose(sum(tab), 2) - a - b - c 23 | ARI <- (a - (a + b) * (a + c)/(a + b + c + d))/((a + b + a + c)/2 - (a + b) * (a + c)/(a + b + c + d)) 24 | return(ARI) 25 | } 26 | 27 | res <- microbenchmark(aricode = ARI(c1, c2), 28 | R = adjustedRandIndex(c1, c2), 29 | mclust = mclust::adjustedRandIndex(c1, c2), times=100L) 30 | ggplot2::autoplot(res) 31 | 32 | -------------------------------------------------------------------------------- /man/AMI.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{AMI} 4 | \alias{AMI} 5 | \title{Adjusted Mutual Information} 6 | \usage{ 7 | AMI(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the adjusted rand index. 16 | } 17 | \description{ 18 | A function to compute the adjusted mutual information between two classifications 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | AMI(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/ARI.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{ARI} 4 | \alias{ARI} 5 | \title{Adjusted Rand Index} 6 | \usage{ 7 | ARI(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the adjusted rand index. 16 | } 17 | \description{ 18 | A function to compute the adjusted rand index between two classifications 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | ARI(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/Chi2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{Chi2} 4 | \alias{Chi2} 5 | \title{Chi-square statistics} 6 | \usage{ 7 | Chi2(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the chi-square statistics. 16 | } 17 | \description{ 18 | A function to compute the Chi-2 statistics 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | Chi2(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/MARI.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{MARI} 4 | \alias{MARI} 5 | \title{Modified Adjusted Rand Index} 6 | \usage{ 7 | MARI(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the modified ARI. 16 | } 17 | \description{ 18 | A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model. 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | MARI(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/MARIraw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{MARIraw} 4 | \alias{MARIraw} 5 | \title{raw Modified Adjusted Rand Index} 6 | \usage{ 7 | MARIraw(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the modified ARI without the division by the (maximum - expected) 16 | } 17 | \description{ 18 | A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model. Raw means, that the index is not divided by the (maximum - expected) value. 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | MARIraw(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/NID.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{NID} 4 | \alias{NID} 5 | \title{Normalized information distance (NID)} 6 | \usage{ 7 | NID(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the normalized information distance . 16 | } 17 | \description{ 18 | A function to compute the NID between two classifications 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | NID(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{RI}}, \code{\link{NMI}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/NMI.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{NMI} 4 | \alias{NMI} 5 | \title{Normalized mutual information (NMI)} 6 | \usage{ 7 | NMI(c1, c2, variant = c("max", "min", "sqrt", "sum", "joint")) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | 14 | \item{variant}{a string in ("max", "min", "sqrt", "sum", "joint"): different variants of NMI. Default use "max".} 15 | } 16 | \value{ 17 | a scalar with the normalized mutual information . 18 | } 19 | \description{ 20 | A function to compute the NMI between two classifications 21 | } 22 | \examples{ 23 | data(iris) 24 | cl <- cutree(hclust(dist(iris[,-5])), 4) 25 | NMI(cl,iris$Species) 26 | } 27 | \seealso{ 28 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}} 29 | } 30 | -------------------------------------------------------------------------------- /man/NVI.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{NVI} 4 | \alias{NVI} 5 | \title{Normalized variation of information (NVI)} 6 | \usage{ 7 | NVI(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the normalized variation of information. 16 | } 17 | \description{ 18 | A function to compute the NVI between two classifications 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | NVI(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NMI}}, \code{\link{ARI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/RI.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{RI} 4 | \alias{RI} 5 | \title{Rand Index} 6 | \usage{ 7 | RI(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a scalar with the rand index. 16 | } 17 | \description{ 18 | A function to compute the rand index between two classifications 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | RI(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}} 27 | } 28 | -------------------------------------------------------------------------------- /man/aricode-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode-package.R 3 | \docType{package} 4 | \name{aricode-package} 5 | \alias{aricode} 6 | \alias{aricode-package} 7 | \title{aricode: Efficient Computations of Standard Clustering Comparison Measures} 8 | \description{ 9 | Implements an efficient O(n) algorithm based on bucket-sorting for fast computation of standard clustering comparison measures. Available measures include adjusted Rand index (ARI), normalized information distance (NID), normalized mutual information (NMI), adjusted mutual information (AMI), normalized variation information (NVI) and entropy, as described in Vinh et al (2009) \doi{10.1145/1553374.1553511}. Include AMI (Adjusted Mutual Information) since version 0.1.2, a modified version of ARI (MARI), as described in Sundqvist et al. \doi{10.1007/s00180-022-01230-7} and simple Chi-square distance since version 1.0.0. 10 | 11 | A package for efficient computations of standard clustering comparison measures. Most of the available measures are described in the paper of Vinh et al, JMLR, 2009 (see reference below). 12 | } 13 | \details{ 14 | Traditional implementations (e.g., function \code{adjustedRandIndex} of package \code{mclust}) are in Omega(n + u v) where n is the size of the vectors the classifications of which are to be compared, u and v are the respective number of classes in each vectors. Here, the implementation is in Theta(n), plus the gain of speed due to the C++ code. 15 | 16 | The functions included in aricode are: 17 | 18 | * ARI: computes the adjusted rand index 19 | * Chi2: computes the Chi-square statistic 20 | * MARI: computes the modified adjusted rand index (Sundqvist et al, in preparation) 21 | * MARIraw: computes the raw version of the modified adjusted rand index 22 | * RI: computes the rand index 23 | * NVI: computes the normalized variation information 24 | * NID: computes the normalized information distance 25 | * NMI: computes the normalized mutual information 26 | * AMI: computes the adjusted mutual information 27 | * entropy: computes the conditional and joint entropies 28 | * clustComp: computes all clustering comparison measures at once 29 | } 30 | \references{ 31 | * Nguyen Xuan Vinh, Julien Epps, and James Bailey. "Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance." Journal of Machine Learning Research 11.Oct (2010): 2837-2854. as described in Vinh et al (2009) 32 | * Sundqvist, Martina, Julien Chiquet, and Guillem Rigaill. "Adjusting the adjusted Rand Index: A multinomial story." Computational Statistics 38.1 (2023): 327-347. 33 | } 34 | \seealso{ 35 | Useful links: 36 | \itemize{ 37 | \item \url{https://github.com/jchiquet/aricode} 38 | \item Report bugs at \url{https://github.com/jchiquet/aricode/issues} 39 | } 40 | 41 | 42 | \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{AMI}}, \code{\link{NMI}}, \code{\link{entropy}}, \code{\link{clustComp}} 43 | } 44 | \author{ 45 | \strong{Maintainer}: Julien Chiquet \email{julien.chiquet@inrae.fr} (\href{https://orcid.org/0000-0002-3629-3429}{ORCID}) 46 | 47 | Authors: 48 | \itemize{ 49 | \item Guillem Rigaill \email{guillem.rigaill@inrae.fr} 50 | \item Martina Sundqvist \email{martina.sundqvist@agroparistech.fr} 51 | } 52 | 53 | Other contributors: 54 | \itemize{ 55 | \item Valentin Dervieux \email{valentin.dervieux@gmail.com} [contributor] 56 | \item Florent Bersani \email{florent@bersani.org} [contributor] 57 | } 58 | 59 | 60 | Julien Chiquet \email{julien.chiquet@inrae.fr} 61 | 62 | Guillem Rigaill \email{guillem.rigaill@inrae.fr} 63 | 64 | Martina Sundqvist \email{martina.sundqvist@agroparistech.fr} 65 | } 66 | \keyword{internal} 67 | -------------------------------------------------------------------------------- /man/clustComp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{clustComp} 4 | \alias{clustComp} 5 | \title{Measures of similarity between two classification} 6 | \usage{ 7 | clustComp(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a list with the RI, ARI, NMI, NVI and NID. 16 | } 17 | \description{ 18 | A function various measures of similarity between two classifications 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | clustComp(cl,iris$Species) 24 | } 25 | \seealso{ 26 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{ARI}} 27 | } 28 | -------------------------------------------------------------------------------- /man/entropy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{entropy} 4 | \alias{entropy} 5 | \title{Entropy} 6 | \usage{ 7 | entropy(c1, c2) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.} 11 | 12 | \item{c2}{a vector containing the labels of the second classification.} 13 | } 14 | \value{ 15 | a list with the two conditional entropies, the joint entropy and output of sortPairs. 16 | } 17 | \description{ 18 | A function to compute the empirical entropy for two vectors of classification and the joint entropy 19 | } 20 | \examples{ 21 | data(iris) 22 | cl <- cutree(hclust(dist(iris[,-5])), 4) 23 | entropy(cl,iris$Species) 24 | } 25 | -------------------------------------------------------------------------------- /man/figures/timings_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jchiquet/aricode/ffc98928afc202338481d9fbb53f7527bdad28da/man/figures/timings_plot-1.png -------------------------------------------------------------------------------- /man/sortPairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aricode.R 3 | \name{sortPairs} 4 | \alias{sortPairs} 5 | \title{Sort Pairs} 6 | \usage{ 7 | sortPairs(c1, c2, spMat = FALSE) 8 | } 9 | \arguments{ 10 | \item{c1}{a vector of length n with value between 0 and N1 < n} 11 | 12 | \item{c2}{a vector of length n with value between 0 and N2 < n} 13 | 14 | \item{spMat}{logical: send back the contingency table as sparsely encoded (cost more than the algorithm itself). Default is FALSE} 15 | } 16 | \description{ 17 | A function to sort pairs of integers or factors and identify the pairs 18 | } 19 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // countPairs 14 | List countPairs(IntegerVector classi1, IntegerVector classi2, IntegerVector order); 15 | RcppExport SEXP _aricode_countPairs(SEXP classi1SEXP, SEXP classi2SEXP, SEXP orderSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< IntegerVector >::type classi1(classi1SEXP); 20 | Rcpp::traits::input_parameter< IntegerVector >::type classi2(classi2SEXP); 21 | Rcpp::traits::input_parameter< IntegerVector >::type order(orderSEXP); 22 | rcpp_result_gen = Rcpp::wrap(countPairs(classi1, classi2, order)); 23 | return rcpp_result_gen; 24 | END_RCPP 25 | } 26 | // expected_MI 27 | double expected_MI(IntegerVector ni_, IntegerVector n_j); 28 | RcppExport SEXP _aricode_expected_MI(SEXP ni_SEXP, SEXP n_jSEXP) { 29 | BEGIN_RCPP 30 | Rcpp::RObject rcpp_result_gen; 31 | Rcpp::RNGScope rcpp_rngScope_gen; 32 | Rcpp::traits::input_parameter< IntegerVector >::type ni_(ni_SEXP); 33 | Rcpp::traits::input_parameter< IntegerVector >::type n_j(n_jSEXP); 34 | rcpp_result_gen = Rcpp::wrap(expected_MI(ni_, n_j)); 35 | return rcpp_result_gen; 36 | END_RCPP 37 | } 38 | // getRank 39 | List getRank(IntegerVector classi); 40 | RcppExport SEXP _aricode_getRank(SEXP classiSEXP) { 41 | BEGIN_RCPP 42 | Rcpp::RObject rcpp_result_gen; 43 | Rcpp::RNGScope rcpp_rngScope_gen; 44 | Rcpp::traits::input_parameter< IntegerVector >::type classi(classiSEXP); 45 | rcpp_result_gen = Rcpp::wrap(getRank(classi)); 46 | return rcpp_result_gen; 47 | END_RCPP 48 | } 49 | 50 | static const R_CallMethodDef CallEntries[] = { 51 | {"_aricode_countPairs", (DL_FUNC) &_aricode_countPairs, 3}, 52 | {"_aricode_expected_MI", (DL_FUNC) &_aricode_expected_MI, 2}, 53 | {"_aricode_getRank", (DL_FUNC) &_aricode_getRank, 1}, 54 | {NULL, NULL, 0} 55 | }; 56 | 57 | RcppExport void R_init_aricode(DllInfo *dll) { 58 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 59 | R_useDynamicSymbols(dll, FALSE); 60 | } 61 | -------------------------------------------------------------------------------- /src/aricode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | // [[Rcpp::export]] 5 | List countPairs(IntegerVector classi1, IntegerVector classi2, IntegerVector order) { 6 | // first path to count pairs 7 | int n = classi1.size(); 8 | 9 | // count per classification 10 | IntegerVector count1(n, 0); 11 | for(int i = 0; i < n; i++) count1[classi1[i]]++; 12 | 13 | IntegerVector count2(n, 0); 14 | for(int i = 0; i < n; i++) count2[classi2[i]]++; 15 | 16 | // count per pairs 17 | int count = 1; 18 | int class1_cur = classi1[order[0]]; 19 | int class2_cur = classi2[order[0]]; 20 | 21 | for(int i = 1; i < n; i++){ 22 | if( (class1_cur != classi1[order[i]]) || (class2_cur != classi2[order[i]]) ){ 23 | count++; 24 | class1_cur = classi1[order[i]]; 25 | class2_cur = classi2[order[i]]; 26 | } 27 | } 28 | 29 | // create output Integer Vector for pairs and initialize 30 | IntegerVector nameClassi1(count, 0); 31 | IntegerVector nameClassi2(count, 0); 32 | IntegerVector numberPair(count, 0); 33 | 34 | int current_position = 0; 35 | nameClassi1[0] = classi1[order[0]]; 36 | nameClassi2[0] = classi2[order[0]]; 37 | numberPair[0] = 1; 38 | 39 | // count pairs 40 | for(int i = 1; i < n; i++){ 41 | if( ( nameClassi1[current_position] == classi1[order[i]]) && (nameClassi2[current_position] == classi2[order[i]]) ){ 42 | numberPair[current_position]++; 43 | } else { 44 | current_position += 1; 45 | nameClassi1[current_position] = classi1[order[i]]; 46 | nameClassi2[current_position] = classi2[order[i]]; 47 | numberPair[current_position] = 1; 48 | } 49 | } 50 | 51 | // output as a list 52 | List ListOut; 53 | ListOut["pair_nb"] = numberPair; 54 | ListOut["pair_c1"] = nameClassi1; 55 | ListOut["pair_c2"] = nameClassi2; 56 | ListOut["c1_nb"] = count1[count1 > 0]; 57 | ListOut["c2_nb"] = count2[count2 > 0]; 58 | return(ListOut); 59 | } 60 | 61 | 62 | // [[Rcpp::export]] 63 | double expected_MI(IntegerVector ni_, IntegerVector n_j) { 64 | 65 | int N = sum(ni_) ; 66 | 67 | double emi = 0.0 ; 68 | 69 | NumericVector ni_f = lfactorial(ni_) ; 70 | NumericVector nj_f = lfactorial(n_j) ; 71 | NumericVector Nmni_f = lfactorial(N - ni_) ; 72 | NumericVector Nmnj_f = lfactorial(N - n_j) ; 73 | double N_f = lgamma(N + 1) ; 74 | 75 | for (int i=0; i< ni_.size(); i++) { 76 | for (int j=0; j< n_j.size(); j++) { 77 | 78 | int start_nij = std::max(1, ni_[i] + n_j[j] - N) ; 79 | int end_nij = std::min(ni_[i], n_j[j]) ; 80 | 81 | for (int nij = start_nij; nij <= end_nij; nij++ ) { 82 | 83 | double t1 = ((float) nij / (float) N) * std::log((float)(nij * N) / (float)(ni_[i]*n_j[j])) ; 84 | 85 | double t2 = std::exp((ni_f[i] + nj_f[j] + Nmni_f[i] + Nmnj_f[j] - N_f - lgamma(1 + nij) - lgamma(1 + ni_[i] - nij) - lgamma(1 + n_j[j] - nij) - lgamma(1 + N - ni_[i] - n_j[j] + nij))) ; 86 | 87 | emi += t1*t2; 88 | } 89 | } 90 | } 91 | return emi; 92 | 93 | } 94 | 95 | // [[Rcpp::export]] 96 | List getRank(IntegerVector classi){ 97 | int maxi = max(classi); 98 | int mini = min(classi); 99 | 100 | // Present 101 | LogicalVector present(maxi - mini + 1); 102 | for(int i=0; i< classi.size(); i++) present[classi[i]-mini] = TRUE; 103 | 104 | // Count 105 | IntegerVector translator(maxi - mini + 1); 106 | int nbIndex = 0; 107 | for(int i=0; i< present.size(); i++) { 108 | if(present[i]) nbIndex++; 109 | } 110 | 111 | // Translator and Index Vector 112 | IntegerVector index(nbIndex); 113 | int indexCur = 0; 114 | for(int i=0; i< present.size(); i++) { 115 | if(present[i]) { 116 | translator[i] = indexCur; 117 | index[indexCur] = i+mini; 118 | indexCur++; 119 | } else { 120 | translator[i] = NA_INTEGER; 121 | } 122 | } 123 | // Converted Vector 124 | IntegerVector translated(classi.size()); 125 | for(int i=0; i< classi.size(); i++) translated[i] = translator[classi[i] - mini]; 126 | 127 | // output as a list 128 | List ListOut; 129 | ListOut["index"] = index; 130 | ListOut["translator"] = translator; 131 | ListOut["translated"] = translated; 132 | return ListOut; 133 | } 134 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(aricode) 3 | test_check("aricode") 4 | -------------------------------------------------------------------------------- /tests/testthat/test_coherence.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(aricode) 3 | 4 | ## define the ARI as in the mclust package 5 | adjustedRandIndex <- function (x, y) 6 | { 7 | x <- as.vector(x) 8 | y <- as.vector(y) 9 | if (length(x) != length(y)) 10 | stop("arguments must be vectors of the same length") 11 | tab <- table(x, y) 12 | if (all(dim(tab) == c(1, 1))) 13 | return(1) 14 | a <- sum(choose(tab, 2)) 15 | b <- sum(choose(rowSums(tab), 2)) - a 16 | c <- sum(choose(colSums(tab), 2)) - a 17 | d <- choose(sum(tab), 2) - a - b - c 18 | ARI <- (a - (a + b) * (a + c)/(a + b + c + d))/((a + b + a + c)/2 - (a + b) * (a + c)/(a + b + c + d)) 19 | return(ARI) 20 | } 21 | 22 | EMI <- function(counts) { 23 | s_emi <- 0 24 | n <- sum(counts) 25 | s1 <- margin.table(counts, 1) 26 | s2 <- margin.table(counts, 2) 27 | for(i in 1:nrow(counts)){ 28 | for (j in 1:ncol(counts)){ 29 | ai <- s1[i] 30 | bj <- s2[j] 31 | min_nij <- max(1, ai+bj-n) 32 | max_nij <- min(ai,bj) 33 | if (min_nij >= max_nij) next 34 | n.ij <- seq(min_nij, max_nij) #sequence of consecutive numbers 35 | t1<- (n.ij / n) * log((n.ij * n) / (ai*bj)) 36 | t2 <- exp(lfactorial(ai) + lfactorial(bj) + lfactorial(n - ai) + lfactorial(n - bj) - lfactorial(n) - lfactorial(n.ij) - lfactorial(ai - n.ij) - lfactorial(bj - n.ij) - lfactorial(n - ai - bj + n.ij)) 37 | emi <- sum(t1*t2) 38 | if (is.nan(emi)) browser() 39 | s_emi <- s_emi + emi 40 | } 41 | } 42 | s_emi 43 | } 44 | 45 | Chi2_ref <- function(c1, c2) { 46 | as.numeric(chisq.test(c1, c2, correct=F)$stat[1]) 47 | } 48 | 49 | ## Martina's code 50 | ARI_estimated <- function(c1, c2) { 51 | # c1, c2, two classifications of the same observations 52 | n_kl <- table(c1,c2) 53 | 54 | n <- sum(n_kl) 55 | n_k <- rowSums(n_kl) 56 | n_l <- colSums(n_kl) 57 | 58 | T1 <- sum(choose(n_kl,2)) 59 | T2 <- 2*n + sum(outer(n_k, n_l) * n_kl) - sum(n_kl^2) - sum(n_k^2) - sum(n_l^2) 60 | T3 <- 1/(6*choose(n,4))*(sum(outer(n_k^2, n_l^2)) - 4*T1 - 4*T2 - n*2*(sum(choose(n_k,2)) + sum(choose(n_l,2))) - n^2)/4 61 | 62 | RI_estim <- (1/choose(n, 2))*T1 63 | ARI_estim <- RI_estim-T3 64 | ARI_estim 65 | } 66 | 67 | test_that("Testing coherence of the adjusted Rand Index", { 68 | 69 | ## "\n-large random vectors- 70 | n <- 1e5 71 | c1 <- as.numeric(sample(1:(n/100), n, replace=TRUE)) 72 | c2 <- as.numeric(sample(1:(n/100), n, replace=TRUE)) 73 | expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2)) 74 | 75 | ## "\n-real data- 76 | data(iris) 77 | cl <- cutree(hclust(dist(iris[,-5])), 4) 78 | expect_equal(ARI(cl,iris$Species), adjustedRandIndex(cl,iris$Species)) 79 | 80 | ## "\n-completely equal vectors with no groups-") 81 | c1 <- 1:100 82 | c2 <- 1:100 83 | expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2)) 84 | 85 | ## "\n-completely equal vectors with one groups-") 86 | c1 <- rep(1,100) 87 | c2 <- rep(2,100) 88 | expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2)) 89 | 90 | ## "\n-completely different vectors with one groups-") 91 | c1 <- c(rep(0,99),1) 92 | c2 <- rep(1,100) 93 | expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2)) 94 | }) 95 | 96 | test_that("Testing coherence of the expected mutual information", { 97 | 98 | ## "\n-real data- 99 | data(iris) 100 | cl <- cutree(hclust(dist(iris[,-5])), 4) 101 | counts <- table(cl,iris$Species) 102 | ni. <- rowSums(counts) 103 | n.j <- colSums(counts) 104 | expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts)) 105 | 106 | ## "\n-completely equal vectors with one groups-") 107 | c1 <- rep(1,100) 108 | c2 <- rep(2,100) 109 | counts <- table(c1, c2) 110 | ni. <- rowSums(counts) 111 | n.j <- colSums(counts) 112 | expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts)) 113 | 114 | ## "\n-completely different vectors with one groups-") 115 | c1 <- c(rep(0,99),1) 116 | c2 <- rep(1,100) 117 | counts <- table(c1, c2) 118 | ni. <- rowSums(counts) 119 | n.j <- colSums(counts) 120 | expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts)) 121 | }) 122 | 123 | 124 | test_that("Testing coherence of the Chi2 statistics information", { 125 | 126 | ## "\n-large random vectors- 127 | n <- 1e5 128 | c1 <- as.numeric(sample(1:(n/100), n, replace=TRUE)) 129 | c2 <- as.numeric(sample(1:(n/100), n, replace=TRUE)) 130 | expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2)) 131 | 132 | ## "\n-moderate random- 133 | n <- rpois(lambda=100, n=1) + 3 134 | k1 <- rpois(lambda=5, n=1)+2; k2 <- rpois(lambda=5, n=1)+2 135 | c1 <- sample(1:k1, n, replace=T) 136 | c2 <- sample(1:k2, n, replace=T) 137 | expect_equal(aricode::Chi2(c1,c2), Chi2_ref(c1,c2)) 138 | 139 | ## "\n-real data- 140 | data(iris) 141 | cl <- cutree(hclust(dist(iris[,-5])), 4) 142 | expect_equal(aricode::Chi2(cl,iris$Species), Chi2_ref(cl,iris$Species)) 143 | 144 | ## "\n-completely equal vectors with no groups-") 145 | c1 <- 1:100 146 | c2 <- 1:100 147 | expect_equal(aricode::Chi2(c1,c2), Chi2_ref(c1,c2)) 148 | }) 149 | 150 | test_that("Testing coherence of the MARI", { 151 | 152 | ## "\n-large random vectors- 153 | n <- 1e5 154 | c1 <- as.numeric(sample(1:(n/100), n, replace=TRUE)) 155 | c2 <- as.numeric(sample(1:(n/100), n, replace=TRUE)) 156 | expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2)) 157 | 158 | ## "\n-moderate random- 159 | n <- rpois(lambda=100, n=1) + 3 160 | k1 <- rpois(lambda=5, n=1)+2; k2 <- rpois(lambda=5, n=1)+2 161 | c1 <- sample(1:k1, n, replace=T) 162 | c2 <- sample(1:k2, n, replace=T) 163 | expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2)) 164 | 165 | ## "\n-real data- 166 | data(iris) 167 | cl <- cutree(hclust(dist(iris[,-5])), 4) 168 | expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2)) 169 | 170 | ## "\n-completely equal vectors with no groups-") 171 | c1 <- 1:100 172 | c2 <- 1:100 173 | expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2)) 174 | 175 | ## "\n-completely equal vectors with one groups-") 176 | c1 <- rep(1,100) 177 | c2 <- rep(2,100) 178 | expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2)) 179 | 180 | ## "\n-completely different vectors with one groups-") 181 | c1 <- c(rep(0,99),1) 182 | c2 <- rep(1,100) 183 | expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2)) 184 | 185 | }) 186 | 187 | test_that("Testing coherence of the MARI with ARI : in case of independance their values should be close", { 188 | n <- rpois(lambda=100, n=1) + 3 189 | k1 <- rpois(lambda=5, n=1)+2; k2 <- rpois(lambda=5, n=1)+2 190 | class1 <- sample(1:k1, n, replace=T) 191 | class2 <- sample(1:k2, n, replace=T) 192 | ## expect_equal or almost equal in test_that 193 | expect_equal(ARI(class1, class2), MARI(class1, class2), tolerance = 1e-3) 194 | }) 195 | -------------------------------------------------------------------------------- /tests/testthat/test_input.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(aricode) 3 | 4 | test_that("Testing supposedly handled input types", { 5 | 6 | c1 <- rep(1:3,each=3) * 1.5 7 | c2 <- rep(4:6,each=3) * 2.7 8 | 9 | cat("\n-numeric type-") 10 | c1.numeric <- c1 11 | c2.numeric <- c2 12 | ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.numeric), c2 = unique(c2.numeric)), nij = rep(3,3), ni. = rep(3,3), n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2) 13 | expect_that(sortPairs(c1.numeric,c2.numeric), equals(ref.object)) 14 | 15 | cat("\n-integer type-") 16 | c1.integer <- as.integer(c1) 17 | c2.integer <- as.integer(c2) 18 | ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.integer), c2 = unique(c2.integer)), nij = rep(3,3), ni. = rep(3,3), n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2) 19 | expect_that(sortPairs(c1.integer,c2.integer), equals(ref.object)) 20 | 21 | cat("\n-character type-") 22 | c1.char <- as.character(c1) 23 | c2.char <- as.character(c2) 24 | ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.char), c2 = unique(c2.char)), nij = rep(3,3), ni. = rep(3,3), n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2) 25 | expect_that(sortPairs(c1.char,c2.char), equals(ref.object)) 26 | 27 | cat("\n-factor type-") 28 | c1.factor <- as.factor(c1.char) 29 | c2.factor <- as.factor(c2.char) 30 | ref.object <- list(spMat = NULL, levels = list(c1 = levels(c1.factor), c2 = levels(c2.factor)), nij = rep(3,3), ni. = rep(3,3), n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2) 31 | expect_that(sortPairs(c1.factor,c2.factor), equals(ref.object)) 32 | 33 | cat("\n-different types-") 34 | ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.char), c2 = unique(c2.factor)), nij = rep(3,3), ni. = rep(3,3), n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2) 35 | expect_that(sortPairs(c1.char,c2.factor), equals(ref.object)) 36 | }) 37 | 38 | test_that("Testing error for not handled input types", { 39 | 40 | c1 <- rep(1:3,each=3) * 1.5 41 | c2 <- rep(4:6,each=3) * 2.7 42 | 43 | cat("\n-NA-") 44 | c2NA <- c2; c2NA[1] <- NA 45 | expect_error(sortPairs(c1,c2NA)) 46 | 47 | cat("\n-different sizes-") 48 | expect_error(sortPairs(c1,c2[-1])) 49 | 50 | cat("\n-list type-") 51 | expect_error(sortPairs(as.list(c1),as.list(c2))) 52 | 53 | cat("\n-matrix type-") 54 | expect_error(sortPairs(as.matrix(c1),as.matrix(c2))) 55 | 56 | cat("\n-data.frame type-") 57 | expect_error(sortPairs(as.data.frame(c1),as.data.frame(c2))) 58 | 59 | }) 60 | --------------------------------------------------------------------------------