├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   ├── R-CMD-check.yaml
    │   ├── pkgdown.yaml
    │   └── test-coverage.yaml
├── .gitignore
├── AUTHORS
├── CRAN-SUBMISSION
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
    ├── RcppExports.R
    ├── aricode-package.R
    └── aricode.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── aricode.Rproj
├── cran-comments.md
├── inst
    ├── WORDLIST
    └── check_speed.R
├── man
    ├── AMI.Rd
    ├── ARI.Rd
    ├── Chi2.Rd
    ├── MARI.Rd
    ├── MARIraw.Rd
    ├── NID.Rd
    ├── NMI.Rd
    ├── NVI.Rd
    ├── RI.Rd
    ├── aricode-package.Rd
    ├── clustComp.Rd
    ├── entropy.Rd
    ├── figures
    │   └── timings_plot-1.png
    └── sortPairs.Rd
├── src
    ├── .gitignore
    ├── RcppExports.cpp
    └── aricode.cpp
└── tests
    ├── spelling.R
    ├── testthat.R
    └── testthat
        ├── test_coherence.R
        └── test_input.R


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | cran-comments.md
 4 | figure
 5 | README.md
 6 | README.rmd
 7 | .travis.yml
 8 | README_cache
 9 | ^CRAN-RELEASE$
10 | ^\.github$
11 | ^_pkgdown\.yml$
12 | ^pkgdown$
13 | ^docs
14 | ^docs/*.*
15 | ^docs/*
16 | ^CRAN-SUBMISSION$
17 | ^revdep
18 | AUTHORS
19 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         config:
21 |           - {os: macOS-latest,   r: 'release'}
22 |           - {os: macOS-latest,   r: 'oldrel-1'}
23 |           - {os: windows-latest, r: 'release'}
24 |           - {os: windows-latest,   r: 'oldrel-1'}
25 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
26 |           - {os: ubuntu-latest,   r: 'release'}
27 | 
28 |     env:
29 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
30 |       R_KEEP_PKG_SOURCE: yes
31 | 
32 |     steps:
33 |       - uses: actions/checkout@v4
34 | 
35 |       - uses: r-lib/actions/setup-pandoc@v2
36 | 
37 |       - uses: r-lib/actions/setup-r@v2
38 |         with:
39 |           r-version: ${{ matrix.config.r }}
40 |           http-user-agent: ${{ matrix.config.http-user-agent }}
41 |           use-public-rspm: true
42 | 
43 |       - uses: r-lib/actions/setup-r-dependencies@v2
44 |         with:
45 |           extra-packages: any::rcmdcheck
46 |           needs: check
47 | 
48 |       - uses: r-lib/actions/check-r-package@v2
49 |         with:
50 |           upload-snapshots: true
51 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 |   release:
 9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown
13 | 
14 | jobs:
15 |   pkgdown:
16 |     runs-on: ubuntu-latest
17 |     # Only restrict concurrency for non-PR jobs
18 |     concurrency:
19 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
20 |     env:
21 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 |     steps:
23 |       - uses: actions/checkout@v2
24 | 
25 |       - uses: r-lib/actions/setup-pandoc@v2
26 | 
27 |       - uses: r-lib/actions/setup-r@v2
28 |         with:
29 |           use-public-rspm: true
30 | 
31 |       - uses: r-lib/actions/setup-r-dependencies@v2
32 |         with:
33 |           extra-packages: any::pkgdown, local::.
34 |           needs: website
35 | 
36 |       - name: Build site
37 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
38 |         shell: Rscript {0}
39 | 
40 |       - name: Deploy to GitHub pages 🚀
41 |         if: github.event_name != 'pull_request'
42 |         uses: JamesIves/github-pages-deploy-action@4.1.4
43 |         with:
44 |           clean: false
45 |           branch: gh-pages
46 |           folder: docs
47 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: test-coverage
10 | 
11 | jobs:
12 |   test-coverage:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v2
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::covr
27 |           needs: coverage
28 | 
29 |       - name: Test coverage
30 |         run: covr::codecov(quiet = FALSE)
31 |         shell: Rscript {0}
32 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # Example code in package build process
 9 | *-Ex.R
10 | 
11 | # Output files from R CMD build
12 | /*.tar.gz
13 | 
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 | 
17 | # RStudio files
18 | .Rproj.user/
19 | 
20 | # produced vignettes
21 | vignettes/*.html
22 | vignettes/*.pdf
23 | 
24 | # web files
25 | docs/*
26 | 
27 | # doc files
28 | inst/doc/*
29 | 
30 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
31 | .httr-oauth
32 | 
33 | # knitr and R markdown default cache directories
34 | /*_cache/
35 | /cache/
36 | 
37 | # Temporary files created by R markdown
38 | *.utf8.md
39 | *.knit.md
40 | .Rproj.user
41 | 
42 | # dyn librairies
43 | src/*.o
44 | src/*.so
45 | src/*.dll
46 | 
47 | # revdep testing
48 | revdep/


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | # Contributors
2 | * Julien Chiquet, Maintainer, Developer
3 | * Guillem Rigaill, Developer
4 | * Martina Sundqvist, Developer
5 | 


--------------------------------------------------------------------------------
/CRAN-SUBMISSION:
--------------------------------------------------------------------------------
1 | Version: 1.0.3
2 | Date: 2023-10-19 10:43:22 UTC
3 | SHA: 110fd17dc3613b6cb5fee0bfeebcaf3e0faf077b
4 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: aricode
 2 | Type: Package
 3 | Title: Efficient Computations of Standard Clustering Comparison Measures
 4 | Version: 1.0.3
 5 | Authors@R: c(
 6 |     person("Julien", "Chiquet", role = c("aut", "cre"), email = "julien.chiquet@inrae.fr",
 7 |       comment = c(ORCID = "0000-0002-3629-3429")),
 8 |     person("Guillem", "Rigaill", role = "aut", email = "guillem.rigaill@inrae.fr"),
 9 |     person("Martina", "Sundqvist", role = "aut", email = "martina.sundqvist@agroparistech.fr"),
10 |     person("Valentin", "Dervieux", role = "ctb", email = "valentin.dervieux@gmail.com"),
11 |     person("Florent", "Bersani", role = "ctb", email = "florent@bersani.org")
12 |   )
13 | Maintainer: Julien Chiquet <julien.chiquet@inrae.fr>
14 | Description: Implements an efficient O(n) algorithm based on bucket-sorting for 
15 |     fast computation of standard clustering comparison measures. Available measures
16 |     include adjusted Rand index (ARI), normalized information distance (NID), 
17 |     normalized mutual information (NMI), adjusted mutual information (AMI), 
18 |     normalized variation information (NVI) and entropy, as described in Vinh et al (2009) 
19 |     <doi:10.1145/1553374.1553511>. Include AMI (Adjusted Mutual Information) since version 0.1.2, 
20 |     a modified version of ARI (MARI), as described in Sundqvist et al. <doi:10.1007/s00180-022-01230-7> 
21 |     and simple Chi-square distance since version 1.0.0.
22 | License: GPL (>=3)
23 | URL: https://github.com/jchiquet/aricode
24 | BugReports: https://github.com/jchiquet/aricode/issues
25 | Encoding: UTF-8
26 | Imports:
27 |     Matrix,
28 |     Rcpp
29 | Suggests: 
30 |     testthat,
31 |     spelling
32 | LinkingTo: Rcpp
33 | RoxygenNote: 7.2.3
34 | Language: en-US
35 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(AMI)
 4 | export(ARI)
 5 | export(Chi2)
 6 | export(MARI)
 7 | export(MARIraw)
 8 | export(NID)
 9 | export(NMI)
10 | export(NVI)
11 | export(RI)
12 | export(clustComp)
13 | export(entropy)
14 | export(sortPairs)
15 | import(Matrix)
16 | importFrom(Rcpp,sourceCpp)
17 | useDynLib(aricode)
18 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | ## aricode 1.0-3 (2023-10-19)
 2 | 
 3 | - minor fix in package documentation due to evolution of roxygen2 7.0.0 <https://github.com/r-lib/roxygen2/issues/1491>.
 4 | 
 5 | ## aricode 1.0-2 (2022-12-14)
 6 | 
 7 | - fix a serious bug in the AMI function pointed by Florent Bersani
 8 | 
 9 | ## aricode 1.0-1 (2022-09-04)
10 | 
11 | - fix warnings in C++
12 | - fix documentation for to comply with CRAN policy and HTML5 validation
13 | 
14 | ## aricode 1.0-0 (2020-06-23)
15 | 
16 |   -	added the Modified adjusted Rand Index
17 |   -	added the Chi-Square statistics
18 |   - bug fixed in SortPairs
19 | 
20 | ## aricode 0.1-2	(2019-06-28)
21 |   
22 |   -	added the Adjusted Mutual information
23 | 
24 | ## aricode 0.1-1	(2018-04-30)
25 | 
26 |   -	first submission to CRAN
27 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | countPairs <- function(classi1, classi2, order) {
 5 |     .Call('_aricode_countPairs', PACKAGE = 'aricode', classi1, classi2, order)
 6 | }
 7 | 
 8 | expected_MI <- function(ni_, n_j) {
 9 |     .Call('_aricode_expected_MI', PACKAGE = 'aricode', ni_, n_j)
10 | }
11 | 
12 | getRank <- function(classi) {
13 |     .Call('_aricode_getRank', PACKAGE = 'aricode', classi)
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/R/aricode-package.R:
--------------------------------------------------------------------------------
 1 | #' @keywords internal
 2 | "_PACKAGE"
 3 | 
 4 | #' aricode
 5 | #'
 6 | #' A package for efficient computations of standard clustering comparison measures. Most of the available measures are described in the paper of Vinh et al, JMLR, 2009 (see reference below).
 7 | #'
 8 | #' Traditional implementations (e.g., function \code{adjustedRandIndex} of package \code{mclust}) are in Omega(n + u v) where n is the size of the vectors the classifications of which are to be compared, u and v are the respective number of classes in each vectors. Here, the implementation is in Theta(n), plus the gain of speed due to the C++ code.
 9 | #'
10 | #' The functions included in aricode are:
11 | #'
12 | #' * ARI: computes the adjusted rand index
13 | #' * Chi2: computes the Chi-square statistic
14 | #' * MARI: computes the modified adjusted rand index (Sundqvist et al, in preparation)
15 | #' * MARIraw: computes the raw version of the modified adjusted rand index
16 | #' * RI: computes the rand index
17 | #' * NVI: computes the normalized variation information
18 | #' * NID: computes the normalized information distance
19 | #' * NMI: computes the normalized mutual information
20 | #' * AMI: computes the adjusted mutual information
21 | #' * entropy: computes the conditional and joint entropies
22 | #' * clustComp: computes all clustering comparison measures at once
23 | #'
24 | #' @author Julien Chiquet \email{julien.chiquet@@inrae.fr}
25 | #' @author Guillem Rigaill \email{guillem.rigaill@@inrae.fr}
26 | #' @author Martina Sundqvist \email{martina.sundqvist@@agroparistech.fr}
27 | #' @references
28 | #'  * Nguyen Xuan Vinh, Julien Epps, and James Bailey. "Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance." Journal of Machine Learning Research 11.Oct (2010): 2837-2854. as described in Vinh et al (2009)
29 | #'  * Sundqvist, Martina, Julien Chiquet, and Guillem Rigaill. "Adjusting the adjusted Rand Index: A multinomial story." Computational Statistics 38.1 (2023): 327-347.
30 | #' @seealso \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{AMI}}, \code{\link{NMI}}, \code{\link{entropy}}, \code{\link{clustComp}}
31 | #' @name aricode-package
32 | 
33 | # The following block is used by usethis to automatically manage
34 | # roxygen namespace tags. Modify with care!
35 | ## usethis namespace: start
36 | #' @useDynLib aricode
37 | #' @importFrom Rcpp sourceCpp
38 | ## usethis namespace: end
39 | NULL
40 | 


--------------------------------------------------------------------------------
/R/aricode.R:
--------------------------------------------------------------------------------
  1 | #' Sort Pairs
  2 | #'
  3 | #' A function to sort pairs of integers or factors and identify the pairs
  4 | #'
  5 | #' @param c1 a vector of length n with value between 0 and N1 < n
  6 | #' @param c2 a vector of length n with value between 0 and N2 < n
  7 | #' @param spMat logical: send back the contingency table as sparsely encoded (cost more than the algorithm itself). Default is FALSE
  8 | #' @import Matrix
  9 | #' @export
 10 | sortPairs <- function(c1, c2, spMat=FALSE){
 11 |   if (anyNA(c1) | anyNA(c2))
 12 |     stop("NA are not supported.")
 13 | 
 14 |   if (((!is.vector(c1) & !is.factor(c1)) | is.list(c1)) | ((!is.vector(c2) & !is.factor(c2)) | is.list(c2)))
 15 |     stop("c1 and c2 must be vectors or factors but not lists.")
 16 | 
 17 |   if (length(c1) != length(c2))
 18 |     stop("the two vectors must have the same length.")
 19 | 
 20 |   n <- length(c1)
 21 | 
 22 |   ## if c1 and c2 are integer
 23 |   if (is.integer(c1) & is.integer(c2)) {
 24 |     ## getRank is O(n) if max(c1)-min(c1) and max(c2)-min(c2) is of order length(c1)=length(c2)
 25 |     ## NOTE: getRank does not assume c1 and c2 are between 0 and n
 26 |     res1 <- getRank(c1)
 27 |     res2 <- getRank(c2)
 28 |     mylevels <- list(c1=res1$index, c2=res2$index)
 29 |     c1 <- res1$translated  # here ranks are in [0, n)
 30 |     c2 <- res2$translated  # here ranks are in [0, n)
 31 |   } else if (is.factor(c1) & is.factor(c2)) {
 32 |     mylevels <- list(c1 = levels(c1), c2 = levels(c2))
 33 |     c1 <- as.integer(c1) - 1L
 34 |     c2 <- as.integer(c2) - 1L
 35 |   } else {
 36 |     ## if neither a factor nor an integer or different of types force to factor then integer
 37 |     mylevels <- list(c1 = unique(c1), c2 = unique(c2))
 38 |     c1 <- as.integer(factor(c1, levels = mylevels$c1)) - 1L
 39 |     c2 <- as.integer(factor(c2, levels = mylevels$c2)) - 1L
 40 |   }
 41 | 
 42 | 
 43 |   i_order <- order(c1, c2, method="radix") - 1L
 44 |   out <- countPairs(c1, c2, i_order)
 45 | 
 46 |   if (spMat) {
 47 |     spOut <- sparseMatrix(i=out$pair_c1,
 48 |                           j=out$pair_c2,
 49 |                           x=out$pair_nb,
 50 |                           dims=sapply(mylevels,length),
 51 |                           dimnames = mylevels, index1=FALSE)
 52 |   } else {
 53 |     spOut <- NULL
 54 |   }
 55 | 
 56 |   res <- list(spMat = spOut,
 57 |               levels = mylevels,
 58 |               nij = out$pair_nb,
 59 |               ni. = out$c1_nb,
 60 |               n.j = out$c2_nb,
 61 |               pair_c1 = out$pair_c1,
 62 |               pair_c2 = out$pair_c2
 63 |   )
 64 |   res
 65 | }
 66 | 
 67 | #' Adjusted Rand Index
 68 | #'
 69 | #' A function to compute the adjusted rand index between two classifications
 70 | #'
 71 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
 72 | #' @param c2 a vector containing the labels of the second classification.
 73 | #' @return a scalar with the adjusted rand index.
 74 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
 75 | #' @examples
 76 | #' data(iris)
 77 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
 78 | #' ARI(cl,iris$Species)
 79 | #' @export
 80 | ARI <- function(c1, c2){
 81 | 
 82 |     ## get pairs using C
 83 |     ## ensure that values of c1 and c2 are between 0 and n1
 84 |     res <- sortPairs(c1, c2)
 85 | 
 86 |     ## get ARI using pairs
 87 |     N <- length(c1)
 88 | 
 89 |     stot <- sum(choose(res$nij, 2), na.rm=TRUE)
 90 |     srow <- sum(choose(res$ni., 2), na.rm=TRUE)
 91 |     scol <- sum(choose(res$n.j, 2), na.rm=TRUE)
 92 | 
 93 |     expectedIndex <-(srow*scol)/(choose(N,2))
 94 |     maximumIndex <- (srow+scol)/2
 95 | 
 96 |     if (expectedIndex == maximumIndex & stot != 0) {
 97 |       res <- 1
 98 |     } else {
 99 |       res <- (stot-expectedIndex)/(maximumIndex-expectedIndex)
100 |     }
101 |     res
102 | }
103 | 
104 | #' Rand Index
105 | #'
106 | #' A function to compute the rand index between two classifications
107 | #'
108 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
109 | #' @param c2 a vector containing the labels of the second classification.
110 | #' @return a scalar with the rand index.
111 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
112 | #' @examples
113 | #' data(iris)
114 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
115 | #' RI(cl,iris$Species)
116 | #' @export
117 | RI <- function(c1, c2){
118 |   ## get pairs using C
119 |   ## ensure that values of c1 and c2 are between 0 and n1
120 |   res <- sortPairs(c1, c2)
121 | 
122 |   ## get ARI using pairs
123 |   N <- length(c1)
124 | 
125 |   ## return the rand-index
126 |   res <- 1 + (sum(res$nij^2) - (sum(res$ni.^2) + sum(res$n.j^2))/2)/choose(N,2)
127 |   res
128 | }
129 | 
130 | #' Modified Adjusted Rand Index
131 | #'
132 | #' A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model.
133 | #'
134 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
135 | #' @param c2 a vector containing the labels of the second classification.
136 | #' @return a scalar with the modified ARI.
137 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
138 | #' @examples
139 | #' data(iris)
140 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
141 | #' MARI(cl,iris$Species)
142 | #' @export
143 | MARI <- function(c1, c2){
144 |   ## get pairs using C
145 |   ## ensure that values of c1 and c2 are between 0 and n1
146 |   res <- sortPairs(c1, c2)
147 |   N <- length(c1)
148 |   ##
149 | 
150 |   stot <- sum(choose(res$nij, 2), na.rm=TRUE)
151 |   srow <- sum(choose(res$ni., 2), na.rm=TRUE)
152 |   scol <- sum(choose(res$n.j, 2), na.rm=TRUE)
153 | 
154 |   ## using Lemma 3.3
155 |   ## triplets
156 |   T1 <- 2*N
157 |   T2 <- sum(res$nij * res$ni.[res$pair_c1+1] * res$n.j[res$pair_c2+1], na.rm=TRUE)
158 |   T3 <- -sum(res$nij^2, na.rm=TRUE) - sum(res$ni.^2, na.rm=TRUE) - sum(res$n.j^2, na.rm=TRUE)
159 | 
160 |   ## quadruplets (and division by 6 choose(N, 4)
161 |   expectedIndex <- (srow*scol - stot - (T1+T2+T3)) / (6 *choose(N, 4))
162 | 
163 |   ## return the rand-index
164 |   expectedIndex <- expectedIndex * choose(N, 2) ## RESCALE SO THAT THE CODE IS EQUIVALENT TO THE ARI
165 |   maximumIndex <- (srow+scol)/2
166 |   if (expectedIndex == maximumIndex & stot != 0) {
167 |     res <- 1
168 |   } else {
169 |     res <- (stot-expectedIndex)/(maximumIndex-expectedIndex)
170 |   }
171 |     res
172 |   ## return the adjusted (and divided) rand-index
173 |   res
174 | }
175 | 
176 | #' raw Modified Adjusted Rand Index
177 | #'
178 | #' A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model. Raw means, that the index is not divided by the (maximum - expected) value.
179 | #'
180 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
181 | #' @param c2 a vector containing the labels of the second classification.
182 | #' @return a scalar with the modified ARI without the division by the (maximum - expected)
183 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
184 | #' @examples
185 | #' data(iris)
186 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
187 | #' MARIraw(cl,iris$Species)
188 | #' @export
189 | MARIraw <- function(c1, c2){
190 |   ## get pairs using C
191 |   ## ensure that values of c1 and c2 are between 0 and n1
192 |   res <- sortPairs(c1, c2)
193 |   N <- length(c1)
194 | 
195 |   stot <- sum(choose(res$nij, 2), na.rm=TRUE)
196 |   srow <- sum(choose(res$ni., 2), na.rm=TRUE)
197 |   scol <- sum(choose(res$n.j, 2), na.rm=TRUE)
198 | 
199 |   ## using Lemma 3.3
200 |   ## triplets
201 |   T1 <- 2*N
202 |   T2 <- sum(res$nij * res$ni.[res$pair_c1+1] * res$n.j[res$pair_c2+1], na.rm=TRUE)
203 |   T3 <- -sum(res$nij^2, na.rm=TRUE) - sum(res$ni.^2, na.rm=TRUE) - sum(res$n.j^2, na.rm=TRUE)
204 | 
205 |   ## quadruplets (and division by 6 choose(N, 4)
206 |   expectedIndex <- (srow*scol - stot - (T1+T2+T3)) / (6 *choose(N, 4))
207 | 
208 |   ## return the rand-index
209 |   res <- (stot / choose(N, 2)) - expectedIndex
210 |   res
211 | }
212 | 
213 | #' Chi-square statistics
214 | #'
215 | #' A function to compute the Chi-2 statistics
216 | #'
217 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
218 | #' @param c2 a vector containing the labels of the second classification.
219 | #' @return a scalar with the chi-square statistics.
220 | #' @seealso \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
221 | #' @examples
222 | #' data(iris)
223 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
224 | #' Chi2(cl,iris$Species)
225 | #' @export
226 | Chi2 <- function(c1, c2){
227 |   ## get pairs using C
228 |   ## ensure that values of c1 and c2 are between 0 and n1
229 |   res <- sortPairs(c1, c2)
230 |   N <- length(c1)
231 | 
232 |   res <- N* sum(res$nij^2 / (res$ni.[res$pair_c1+1] * res$n.j[res$pair_c2+1]) )
233 |   res <- res - N
234 |   res
235 | }
236 | 
237 | 
238 | #' Entropy
239 | #'
240 | #' A function to compute the empirical entropy for two vectors of classification and the joint entropy
241 | #'
242 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
243 | #' @param c2 a vector containing the labels of the second classification.
244 | #' @return a list with the two conditional entropies, the joint entropy and output of sortPairs.
245 | #' @examples
246 | #' data(iris)
247 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
248 | #' entropy(cl,iris$Species)
249 | #' @export
250 | entropy <- function(c1, c2){
251 |   res <- sortPairs(c1, c2)
252 | 
253 |   N <- length(c1)
254 | 
255 |   H.UV <- - sum(res$nij * log(res$nij))/N + log(N)
256 |   H.U  <- - sum(res$ni. * log(res$ni.))/N + log(N)
257 |   H.V  <- - sum(res$n.j * log(res$n.j))/N + log(N)
258 | 
259 |   res <- list(UV = H.UV, U = H.U, V = H.V, sortPairs = res)
260 |   res
261 | }
262 | 
263 | #' Measures of similarity between two classification
264 | #'
265 | #' A function various measures of similarity between two classifications
266 | #'
267 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
268 | #' @param c2 a vector containing the labels of the second classification.
269 | #' @return a list with the RI, ARI, NMI, NVI and NID.
270 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{ARI}}
271 | #' @examples
272 | #' data(iris)
273 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
274 | #' clustComp(cl,iris$Species)
275 | #' @export
276 | clustComp <- function(c1, c2) {
277 | 
278 |   H   <- entropy(c1,c2)
279 | 
280 |   MI  <- - H$UV + H$U + H$V
281 |   VI  <- H$UV - MI
282 |   NVI <- 1 - MI/H$UV
283 |   ID  <- max(H$U, H$V) - MI
284 |   NID <- 1 - MI / max(H$U, H$V)
285 |   NMI <- MI / max(H$U, H$V)
286 |   EMI <- expected_MI(as.integer(H$sortPairs$ni.), as.integer(H$sortPairs$n.j))
287 | 
288 |   res <- list(RI  = RI(c1,c2)             ,
289 |               ARI = ARI(c1,c2)            ,
290 |               MI  = - H$UV + H$U + H$V    ,
291 |               AMI = (- H$UV + H$U + H$V - EMI) / (max(H$U,H$V) - EMI),
292 |               VI  = H$UV - MI             ,
293 |               NVI = 1 - MI/H$UV           ,
294 |               ID  = max(H$U, H$V) - MI    ,
295 |               NID = 1 - MI / max(H$U, H$V),
296 |               NMI = MI / max(H$U, H$V)    ,
297 |       	      Chi2 = Chi2(c1,c2)          ,
298 |   	          MARI = MARI(c1,c2)          ,
299 | 	         MARIraw = MARIraw(c1,c2)
300 |   )
301 |   res
302 |   res
303 | }
304 | 
305 | #' Adjusted Mutual Information
306 | #'
307 | #' A function to compute the adjusted mutual information between two classifications
308 | #'
309 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
310 | #' @param c2 a vector containing the labels of the second classification.
311 | #' @return a scalar with the adjusted rand index.
312 | #' @seealso \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
313 | #' @examples
314 | #' data(iris)
315 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
316 | #' AMI(cl,iris$Species)
317 | #' @export
318 | AMI <- function(c1, c2){
319 | 
320 |   H   <- entropy(c1,c2)
321 |   MI  <- - H$UV + H$U + H$V
322 |   EMI <- expected_MI(as.integer(H$sortPairs$ni.), as.integer(H$sortPairs$n.j))
323 | 
324 |   res <- (MI - EMI) / (max(H$U,H$V) - EMI)
325 |   res
326 | }
327 | 
328 | #' Normalized mutual information (NMI)
329 | #'
330 | #' A function to compute the NMI between two classifications
331 | #'
332 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
333 | #' @param c2 a vector containing the labels of the second classification.
334 | #' @param variant a string in ("max", "min", "sqrt", "sum", "joint"): different variants of NMI. Default use "max".
335 | #' @return a scalar with the normalized mutual information .
336 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}}
337 | #' @examples
338 | #' data(iris)
339 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
340 | #' NMI(cl,iris$Species)
341 | #' @export
342 | NMI <- function(c1, c2, variant = c("max", "min", "sqrt", "sum", "joint")) {
343 | 
344 |   variant <- match.arg(variant)
345 | 
346 |   H   <- entropy(c1,c2)
347 | 
348 |   MI  <- - H$UV + H$U + H$V
349 | 
350 |   D <- switch(variant,
351 |           "max" = max(H$U, H$V),
352 |           "sqrt" = sqrt(H$U * H$V),
353 |           "min" = min(H$U, H$V),
354 |           "sum" = .5*(H$U + H$V),
355 |          "joint" = H$UV)
356 |   res <- MI / D
357 |   res
358 | }
359 | 
360 | #' Normalized information distance (NID)
361 | #'
362 | #' A function to compute the NID between two classifications
363 | #'
364 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
365 | #' @param c2 a vector containing the labels of the second classification.
366 | #' @return a scalar with the normalized information distance .
367 | #' @seealso \code{\link{RI}}, \code{\link{NMI}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}}
368 | #' @examples
369 | #' data(iris)
370 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
371 | #' NID(cl,iris$Species)
372 | #' @export
373 | NID <- function(c1, c2) {
374 | 
375 |   H   <- entropy(c1,c2)
376 | 
377 |   MI  <- - H$UV + H$U + H$V
378 |   res <- 1 - MI / max(H$U, H$V)
379 |   res
380 | }
381 | 
382 | #' Normalized variation of information (NVI)
383 | #'
384 | #' A function to compute the NVI between two classifications
385 | #'
386 | #' @param c1 a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.
387 | #' @param c2 a vector containing the labels of the second classification.
388 | #' @return a scalar with the normalized variation of information.
389 | #' @seealso \code{\link{RI}}, \code{\link{NID}}, \code{\link{NMI}}, \code{\link{ARI}}, \code{\link{clustComp}}
390 | #' @examples
391 | #' data(iris)
392 | #' cl <- cutree(hclust(dist(iris[,-5])), 4)
393 | #' NVI(cl,iris$Species)
394 | #' @export
395 | NVI <- function(c1, c2) {
396 | 
397 |   H   <- entropy(c1,c2)
398 |   MI  <- - H$UV + H$U + H$V
399 |   res <- 1 - MI/H$UV
400 |   res
401 | }
402 | 
403 | 
404 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | ```{r setup, include=FALSE}
  6 | knitr::opts_chunk$set(
  7 |   collapse = TRUE,
  8 |   fig.path = "man/figures/"
  9 | )
 10 | ```
 11 | 
 12 | # aricode
 13 | 
 14 |  <!-- badges: start -->
 15 | [![R-CMD-check](https://github.com/jchiquet/aricode/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/jchiquet/aricode/actions/workflows/R-CMD-check.yaml)
 16 | [![CRAN Status](https://www.r-pkg.org/badges/version/aricode)](https://CRAN.R-project.org/package=aricode)
 17 | [![Coverage status](https://codecov.io/gh/jchiquet/aricode/branch/master/graph/badge.svg)](https://codecov.io/gh/jchiquet/aricode)
 18 | [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-blue.svg)](https://www.tidyverse.org/lifecycle/#stable)
 19 | [![](https://img.shields.io/github/last-commit/jchiquet/aricode.svg)](https://github.com/jchiquet/aricode/commits/master)
 20 |   <!-- badges: end -->
 21 | 
 22 | A package for efficient computations of standard clustering comparison measures
 23 | 
 24 | ## Installation
 25 | 
 26 | Stable version on the [CRAN](https://cran.rstudio.com/web/packages/aricode/).
 27 | 
 28 | ```{r install_cran, eval = FALSE}
 29 | install.packages("aricode")
 30 | ```
 31 | 
 32 | The development version is available via:
 33 | 
 34 | ```{r install_github, eval = FALSE}
 35 | devtools::install_github("jchiquet/aricode")
 36 | ```
 37 | 
 38 | ## Description
 39 | 
 40 | Computation of measures for clustering comparison (ARI, AMI, NID and even the $\chi^2$ distance) are usually based on the contingency table. Traditional implementations (e.g., function `adjustedRandIndex` of package **mclust**) are in $\Omega(n + u v)$ where 
 41 | 
 42 | - $n$ is the size of the vectors the classifications of which are to be compared,
 43 | - $u$ and $v$ are the respective number of classes in each vectors. 
 44 | 
 45 | In **aricode** we propose an implementation, based on radix sort, that is in $\Theta(n)$ in time and space.  
 46 | Importantly, the complexity does not depends on $u$ and $v$.
 47 | Our implementation of the ARI for instance is one or two order of magnitude faster than some standard implementation in `R`.
 48 | 
 49 | ## Available measures and functions
 50 | 
 51 | The functions included in aricode are:
 52 | 
 53 | - `ARI`: computes the adjusted rand index
 54 | - `Chi2`: computes the Chi-square statistics
 55 | - `MARI/MARIraw`: computes the modified adjusted rand index (Sundqvist et al, in preparation)
 56 | - `NVI`: computes the the normalized variation information
 57 | - `NID`: computes the normalized information distance
 58 | - `NMI`: computes the normalized mutual information
 59 | - `AMI`: computes the adjusted mutual information
 60 | - `expected_MI`: computes the expected mutual information
 61 | - `entropy`: computes the conditional and joint entropies
 62 | - `clustComp`: computes all clustering comparison measures at once
 63 | 
 64 | ## Timings
 65 | 
 66 | Here are some timings to compare the cost of computing the adjusted Rand Index with **aricode** or with the commonly used function `adjustedRandIndex` of the *mclust* package: the cost of the latter can be prohibitive for large vectors: 
 67 | 
 68 | ```{r timings_function, echo=FALSE, message=FALSE, warning=FALSE}
 69 | library(aricode)
 70 | library(mclust)
 71 | library(ggplot2)
 72 | 
 73 | time.aricode <- function(times, c1, c2){
 74 |   replicate(times, system.time(ARI(c1, c2))[3])
 75 | }
 76 | 
 77 | time.mclust <- function(times, c1, c2){
 78 |   replicate(times, system.time(mclust::adjustedRandIndex(c1, c2))[3])
 79 | }
 80 | 
 81 | time.method <- function(times, c1, c2, n){
 82 |   rbind(
 83 |     data.frame(time = time.aricode(times, c1, c2), expr = "aricode", n = n),
 84 |     data.frame(time = time.mclust(times, c1, c2), expr = "mclust", n = n)
 85 |   )
 86 | }
 87 | 
 88 | # with similar classif, number of classes grows with n
 89 | sim.timings <- function(n, times = 10) {
 90 |     c1 <- sample(1:(n/200), n, replace=TRUE);c2 <- c1;
 91 |     i_change <- sample(1:n, n/50, replace=FALSE)
 92 |     c2[i_change] <- c2[rev(i_change)]
 93 |     out <- time.method(times, c1, c2, n)
 94 |     data.frame(time=out$time, method=out$expr, n = n)
 95 | }
 96 | ```
 97 | 
 98 | ```{r timings_run, echo=FALSE, message=FALSE, warning=FALSE, cache=TRUE}
 99 | # with similar classif, number of classes grows with n
100 | ns <- sort(c(200 * 2^(3:14), 150 * 2^(3:15)))
101 | timings <- do.call("rbind", lapply(ns, sim.timings))
102 | ```
103 | 
104 | ```{r timings_plot, echo=FALSE, message=FALSE, warning=FALSE}
105 | p.timings <- ggplot(timings, aes(x=n, y=time, colour=method)) +
106 |   geom_smooth(data = dplyr::filter(timings, n > 1e4), method = "lm") + geom_point(size=0.25, alpha=0.9) + labs(y="time (sec.)") +
107 |     scale_x_log10(
108 |    breaks = scales::trans_breaks("log10", function(x) 10^x),
109 |    labels = scales::trans_format("log10", scales::math_format(10^.x))
110 |  ) +
111 |  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
112 |    labels = scales::trans_format("log10", scales::math_format(10^.x))) +
113 |    annotation_logticks()                 
114 | 
115 | p.timings + ggtitle("number of classes grows with n") + theme_bw()
116 | ```
117 | 
118 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # aricode
 3 | 
 4 | <!-- badges: start -->
 5 | ![R-CMD-check](https://github.com/jchiquet/aricode/workflows/R-CMD-check/badge.svg?branch=master)
 6 | [![CRAN
 7 | Status](https://www.r-pkg.org/badges/version/aricode)](https://CRAN.R-project.org/package=aricode)
 8 | [![Coverage
 9 | status](https://codecov.io/gh/jchiquet/aricode/branch/master/graph/badge.svg)](https://codecov.io/gh/jchiquet/aricode)
10 | [![Lifecycle:
11 | stable](https://img.shields.io/badge/lifecycle-stable-blue.svg)](https://www.tidyverse.org/lifecycle/#stable)
12 | [![](https://img.shields.io/github/last-commit/jchiquet/aricode.svg)](https://github.com/jchiquet/aricode/commits/master)
13 | <!-- badges: end -->
14 | 
15 | A package for efficient computations of standard clustering comparison
16 | measures
17 | 
18 | ## Installation
19 | 
20 | Stable version on the
21 | [CRAN](https://cran.rstudio.com/web/packages/aricode/).
22 | 
23 | ``` r
24 | install.packages("aricode")
25 | ```
26 | 
27 | The development version is available via:
28 | 
29 | ``` r
30 | devtools::install_github("jchiquet/aricode")
31 | ```
32 | 
33 | ## Description
34 | 
35 | Computation of measures for clustering comparison (ARI, AMI, NID and
36 | even the \(\chi^2\) distance) are usually based on the contingency
37 | table. Traditional implementations (e.g., function `adjustedRandIndex`
38 | of package **mclust**) are in \(\Omega(n + u v)\) where
39 | 
40 |   - \(n\) is the size of the vectors the classifications of which are to
41 |     be compared,
42 |   - \(u\) and \(v\) are the respective number of classes in each
43 |     vectors.
44 | 
45 | In **aricode** we propose an implementation, based on radix sort, that
46 | is in \(\Theta(n)\) in time and space.  
47 | Importantly, the complexity does not depends on \(u\) and \(v\). Our
48 | implementation of the ARI for instance is one or two order of magnitude
49 | faster than some standard implementation in `R`.
50 | 
51 | ## Available measures and functions
52 | 
53 | The functions included in aricode are:
54 | 
55 |   - `ARI`: computes the adjusted rand index
56 |   - `Chi2`: computes the Chi-square statistics
57 |   - `MARI/MARIraw`: computes the modified adjusted rand index (Sundqvist
58 |     et al, in preparation)
59 |   - `NVI`: computes the the normalized variation information
60 |   - `NID`: computes the normalized information distance
61 |   - `NMI`: computes the normalized mutual information
62 |   - `AMI`: computes the adjusted mutual information
63 |   - `expected_MI`: computes the expected mutual information
64 |   - `entropy`: computes the conditional and joint entropies
65 |   - `clustComp`: computes all clustering comparison measures at once
66 | 
67 | ## Timings
68 | 
69 | Here are some timings to compare the cost of computing the adjusted Rand
70 | Index with **aricode** or with the commonly used function
71 | `adjustedRandIndex` of the *mclust* package: the cost of the latter can
72 | be prohibitive for large vectors:
73 | 
74 | ![](man/figures/timings_plot-1.png)<!-- -->
75 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | templates:
 2 |   params:
 3 |     bootswatch: readable
 4 | 
 5 | navbar:
 6 |   title: "aricode"
 7 |   type: inverse
 8 |   left:
 9 |     - icon: fa-home
10 |     - text: "Home"
11 |       href: index.html
12 |     - text: "Reference"
13 |       href: "reference/index.html"
14 |     - text: "Changelog"
15 |       href: news/index.html
16 |   right:
17 |     - icon: fa-github
18 |       href: https://github.com/jchiquet/aricode
19 | 
20 | 


--------------------------------------------------------------------------------
/aricode.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Yes
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | This submission follows the reporting of a bug in the calculation of the AMI.
 2 | 
 3 | ## aricode 1.0.3
 4 | 
 5 | - minor fix in package documentation due to evolution of roxygen2 7.0.0 <https://github.com/r-lib/roxygen2/issues/1491>.
 6 | 
 7 | ## Tested environments
 8 | 
 9 | * tested locally on Ubuntu Linux 22.04.1 LTS, R-release, GCC
10 | 
11 | * tested remotely with github-action
12 | 
13 | - Linux ubuntu 22.04, R-release (github-action)
14 | - Linux ubuntu 22.04, R-oldrel (github-action)
15 | - Linux ubuntu 22.04, R-devel (github-action)
16 | - Windows Server 2022, R-release, 64 bit
17 | - macOS 12, R-release (github action)
18 | - Linux ubuntu 22.04, R-release, gcc + unit test with sanitizers (github-action)
19 | - Linux ubuntu 22.04, R-release, clang + unit test with sanitizers (github-action)
20 | 
21 | * tested remotely with win-builder (R version 4.3.1, R unstable, R version 4.2.3)
22 | 
23 | all status OK
24 | 
25 | ## Local R CMD check results
26 | 
27 | ── R CMD check results ──────────────────────────────────────────────── aricode 1.0.3 ────
28 | Duration: 43.1s
29 | 
30 | 0 errors ✔ | 0 warnings ✔ | 0 notes ✔
31 | 
32 | R CMD check succeeded
33 | 


--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
 1 | AMI
 2 | ARI
 3 | CMD
 4 | Epps
 5 | JMLR
 6 | Lifecycle
 7 | MARIraw
 8 | NID
 9 | NMI
10 | NVI
11 | ORCID
12 | SortPairs
13 | Vinh
14 | Xuan
15 | al
16 | clustComp
17 | doi
18 | entropies
19 | et
20 | mclust
21 | radix
22 | roxygen
23 | sortPairs
24 | 


--------------------------------------------------------------------------------
/inst/check_speed.R:
--------------------------------------------------------------------------------
 1 | library(microbenchmark)
 2 | library(aricode)
 3 | library(mclust)
 4 | 
 5 | n <- 10^5
 6 | c1 <- sample.int(100, size = n, replace=TRUE)
 7 | c2 <- sample.int(100, size = n, replace=TRUE)
 8 | 
 9 | ## define the ARI as in the mclust package
10 | adjustedRandIndex <- function (x, y)
11 | {
12 |   x <- as.vector(x)
13 |   y <- as.vector(y)
14 |   if (length(x) != length(y))
15 |     stop("arguments must be vectors of the same length")
16 |   tab <- table(x, y)
17 |   if (all(dim(tab) == c(1, 1)))
18 |     return(1)
19 |   a <- sum(choose(tab, 2))
20 |   b <- sum(choose(rowSums(tab), 2)) - a
21 |   c <- sum(choose(colSums(tab), 2)) - a
22 |   d <- choose(sum(tab), 2) - a - b - c
23 |   ARI <- (a - (a + b) * (a + c)/(a + b + c + d))/((a + b + a + c)/2 - (a + b) * (a + c)/(a + b + c + d))
24 |   return(ARI)
25 | }
26 | 
27 | res <- microbenchmark(aricode = ARI(c1, c2),
28 |                       R = adjustedRandIndex(c1, c2),
29 |                       mclust = mclust::adjustedRandIndex(c1, c2), times=100L)
30 | ggplot2::autoplot(res)
31 | 
32 | 


--------------------------------------------------------------------------------
/man/AMI.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{AMI}
 4 | \alias{AMI}
 5 | \title{Adjusted Mutual Information}
 6 | \usage{
 7 | AMI(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the adjusted rand index.
16 | }
17 | \description{
18 | A function to compute the adjusted mutual information between two classifications
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | AMI(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/ARI.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{ARI}
 4 | \alias{ARI}
 5 | \title{Adjusted Rand Index}
 6 | \usage{
 7 | ARI(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the adjusted rand index.
16 | }
17 | \description{
18 | A function to compute the adjusted rand index between two classifications
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | ARI(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/Chi2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{Chi2}
 4 | \alias{Chi2}
 5 | \title{Chi-square statistics}
 6 | \usage{
 7 | Chi2(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the chi-square statistics.
16 | }
17 | \description{
18 | A function to compute the Chi-2 statistics
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | Chi2(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/MARI.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{MARI}
 4 | \alias{MARI}
 5 | \title{Modified Adjusted Rand Index}
 6 | \usage{
 7 | MARI(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the modified ARI.
16 | }
17 | \description{
18 | A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model.
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | MARI(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/MARIraw.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{MARIraw}
 4 | \alias{MARIraw}
 5 | \title{raw Modified Adjusted Rand Index}
 6 | \usage{
 7 | MARIraw(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the modified ARI without the division by the (maximum - expected)
16 | }
17 | \description{
18 | A function to compute a modified adjusted rand index between two classifications as proposed by Sundqvist et al. in prep, based on a multinomial model. Raw means, that the index is not divided by the (maximum - expected) value.
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | MARIraw(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/NID.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{NID}
 4 | \alias{NID}
 5 | \title{Normalized information distance (NID)}
 6 | \usage{
 7 | NID(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the normalized information distance .
16 | }
17 | \description{
18 | A function to compute the NID between two classifications
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | NID(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{RI}}, \code{\link{NMI}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/NMI.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{NMI}
 4 | \alias{NMI}
 5 | \title{Normalized mutual information (NMI)}
 6 | \usage{
 7 | NMI(c1, c2, variant = c("max", "min", "sqrt", "sum", "joint"))
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | 
14 | \item{variant}{a string in ("max", "min", "sqrt", "sum", "joint"): different variants of NMI. Default use "max".}
15 | }
16 | \value{
17 | a scalar with the normalized mutual information .
18 | }
19 | \description{
20 | A function to compute the NMI between two classifications
21 | }
22 | \examples{
23 | data(iris)
24 | cl <- cutree(hclust(dist(iris[,-5])), 4)
25 | NMI(cl,iris$Species)
26 | }
27 | \seealso{
28 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{ARI}}, \code{\link{clustComp}}
29 | }
30 | 


--------------------------------------------------------------------------------
/man/NVI.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{NVI}
 4 | \alias{NVI}
 5 | \title{Normalized variation of information (NVI)}
 6 | \usage{
 7 | NVI(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the normalized variation of information.
16 | }
17 | \description{
18 | A function to compute the NVI between two classifications
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | NVI(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NMI}}, \code{\link{ARI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/RI.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{RI}
 4 | \alias{RI}
 5 | \title{Rand Index}
 6 | \usage{
 7 | RI(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a scalar with the rand index.
16 | }
17 | \description{
18 | A function to compute the rand index between two classifications
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | RI(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{ARI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{clustComp}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/aricode-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode-package.R
 3 | \docType{package}
 4 | \name{aricode-package}
 5 | \alias{aricode}
 6 | \alias{aricode-package}
 7 | \title{aricode: Efficient Computations of Standard Clustering Comparison Measures}
 8 | \description{
 9 | Implements an efficient O(n) algorithm based on bucket-sorting for fast computation of standard clustering comparison measures. Available measures include adjusted Rand index (ARI), normalized information distance (NID), normalized mutual information (NMI), adjusted mutual information (AMI), normalized variation information (NVI) and entropy, as described in Vinh et al (2009) \doi{10.1145/1553374.1553511}. Include AMI (Adjusted Mutual Information) since version 0.1.2, a modified version of ARI (MARI), as described in Sundqvist et al. \doi{10.1007/s00180-022-01230-7} and simple Chi-square distance since version 1.0.0.
10 | 
11 | A package for efficient computations of standard clustering comparison measures. Most of the available measures are described in the paper of Vinh et al, JMLR, 2009 (see reference below).
12 | }
13 | \details{
14 | Traditional implementations (e.g., function \code{adjustedRandIndex} of package \code{mclust}) are in Omega(n + u v) where n is the size of the vectors the classifications of which are to be compared, u and v are the respective number of classes in each vectors. Here, the implementation is in Theta(n), plus the gain of speed due to the C++ code.
15 | 
16 | The functions included in aricode are:
17 | 
18 | * ARI: computes the adjusted rand index
19 | * Chi2: computes the Chi-square statistic
20 | * MARI: computes the modified adjusted rand index (Sundqvist et al, in preparation)
21 | * MARIraw: computes the raw version of the modified adjusted rand index
22 | * RI: computes the rand index
23 | * NVI: computes the normalized variation information
24 | * NID: computes the normalized information distance
25 | * NMI: computes the normalized mutual information
26 | * AMI: computes the adjusted mutual information
27 | * entropy: computes the conditional and joint entropies
28 | * clustComp: computes all clustering comparison measures at once
29 | }
30 | \references{
31 | * Nguyen Xuan Vinh, Julien Epps, and James Bailey. "Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance." Journal of Machine Learning Research 11.Oct (2010): 2837-2854. as described in Vinh et al (2009)
32 |  * Sundqvist, Martina, Julien Chiquet, and Guillem Rigaill. "Adjusting the adjusted Rand Index: A multinomial story." Computational Statistics 38.1 (2023): 327-347.
33 | }
34 | \seealso{
35 | Useful links:
36 | \itemize{
37 |   \item \url{https://github.com/jchiquet/aricode}
38 |   \item Report bugs at \url{https://github.com/jchiquet/aricode/issues}
39 | }
40 | 
41 | 
42 | \code{\link{ARI}}, \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{AMI}}, \code{\link{NMI}}, \code{\link{entropy}}, \code{\link{clustComp}}
43 | }
44 | \author{
45 | \strong{Maintainer}: Julien Chiquet \email{julien.chiquet@inrae.fr} (\href{https://orcid.org/0000-0002-3629-3429}{ORCID})
46 | 
47 | Authors:
48 | \itemize{
49 |   \item Guillem Rigaill \email{guillem.rigaill@inrae.fr}
50 |   \item Martina Sundqvist \email{martina.sundqvist@agroparistech.fr}
51 | }
52 | 
53 | Other contributors:
54 | \itemize{
55 |   \item Valentin Dervieux \email{valentin.dervieux@gmail.com} [contributor]
56 |   \item Florent Bersani \email{florent@bersani.org} [contributor]
57 | }
58 | 
59 | 
60 | Julien Chiquet \email{julien.chiquet@inrae.fr}
61 | 
62 | Guillem Rigaill \email{guillem.rigaill@inrae.fr}
63 | 
64 | Martina Sundqvist \email{martina.sundqvist@agroparistech.fr}
65 | }
66 | \keyword{internal}
67 | 


--------------------------------------------------------------------------------
/man/clustComp.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{clustComp}
 4 | \alias{clustComp}
 5 | \title{Measures of similarity between two classification}
 6 | \usage{
 7 | clustComp(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a list with the RI, ARI, NMI, NVI and NID.
16 | }
17 | \description{
18 | A function various measures of similarity between two classifications
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | clustComp(cl,iris$Species)
24 | }
25 | \seealso{
26 | \code{\link{RI}}, \code{\link{NID}}, \code{\link{NVI}}, \code{\link{NMI}}, \code{\link{ARI}}
27 | }
28 | 


--------------------------------------------------------------------------------
/man/entropy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{entropy}
 4 | \alias{entropy}
 5 | \title{Entropy}
 6 | \usage{
 7 | entropy(c1, c2)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector containing the labels of the first classification. Must be a vector of characters, integers, numerics, or a factor, but not a list.}
11 | 
12 | \item{c2}{a vector containing the labels of the second classification.}
13 | }
14 | \value{
15 | a list with the two conditional entropies, the joint entropy and output of sortPairs.
16 | }
17 | \description{
18 | A function to compute the empirical entropy for two vectors of classification and the joint entropy
19 | }
20 | \examples{
21 | data(iris)
22 | cl <- cutree(hclust(dist(iris[,-5])), 4)
23 | entropy(cl,iris$Species)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/figures/timings_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jchiquet/aricode/ffc98928afc202338481d9fbb53f7527bdad28da/man/figures/timings_plot-1.png


--------------------------------------------------------------------------------
/man/sortPairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aricode.R
 3 | \name{sortPairs}
 4 | \alias{sortPairs}
 5 | \title{Sort Pairs}
 6 | \usage{
 7 | sortPairs(c1, c2, spMat = FALSE)
 8 | }
 9 | \arguments{
10 | \item{c1}{a vector of length n with value between 0 and N1 < n}
11 | 
12 | \item{c2}{a vector of length n with value between 0 and N2 < n}
13 | 
14 | \item{spMat}{logical: send back the contingency table as sparsely encoded (cost more than the algorithm itself). Default is FALSE}
15 | }
16 | \description{
17 | A function to sort pairs of integers or factors and identify the pairs
18 | }
19 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <Rcpp.h>
 5 | 
 6 | using namespace Rcpp;
 7 | 
 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
 9 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 | 
13 | // countPairs
14 | List countPairs(IntegerVector classi1, IntegerVector classi2, IntegerVector order);
15 | RcppExport SEXP _aricode_countPairs(SEXP classi1SEXP, SEXP classi2SEXP, SEXP orderSEXP) {
16 | BEGIN_RCPP
17 |     Rcpp::RObject rcpp_result_gen;
18 |     Rcpp::RNGScope rcpp_rngScope_gen;
19 |     Rcpp::traits::input_parameter< IntegerVector >::type classi1(classi1SEXP);
20 |     Rcpp::traits::input_parameter< IntegerVector >::type classi2(classi2SEXP);
21 |     Rcpp::traits::input_parameter< IntegerVector >::type order(orderSEXP);
22 |     rcpp_result_gen = Rcpp::wrap(countPairs(classi1, classi2, order));
23 |     return rcpp_result_gen;
24 | END_RCPP
25 | }
26 | // expected_MI
27 | double expected_MI(IntegerVector ni_, IntegerVector n_j);
28 | RcppExport SEXP _aricode_expected_MI(SEXP ni_SEXP, SEXP n_jSEXP) {
29 | BEGIN_RCPP
30 |     Rcpp::RObject rcpp_result_gen;
31 |     Rcpp::RNGScope rcpp_rngScope_gen;
32 |     Rcpp::traits::input_parameter< IntegerVector >::type ni_(ni_SEXP);
33 |     Rcpp::traits::input_parameter< IntegerVector >::type n_j(n_jSEXP);
34 |     rcpp_result_gen = Rcpp::wrap(expected_MI(ni_, n_j));
35 |     return rcpp_result_gen;
36 | END_RCPP
37 | }
38 | // getRank
39 | List getRank(IntegerVector classi);
40 | RcppExport SEXP _aricode_getRank(SEXP classiSEXP) {
41 | BEGIN_RCPP
42 |     Rcpp::RObject rcpp_result_gen;
43 |     Rcpp::RNGScope rcpp_rngScope_gen;
44 |     Rcpp::traits::input_parameter< IntegerVector >::type classi(classiSEXP);
45 |     rcpp_result_gen = Rcpp::wrap(getRank(classi));
46 |     return rcpp_result_gen;
47 | END_RCPP
48 | }
49 | 
50 | static const R_CallMethodDef CallEntries[] = {
51 |     {"_aricode_countPairs", (DL_FUNC) &_aricode_countPairs, 3},
52 |     {"_aricode_expected_MI", (DL_FUNC) &_aricode_expected_MI, 2},
53 |     {"_aricode_getRank", (DL_FUNC) &_aricode_getRank, 1},
54 |     {NULL, NULL, 0}
55 | };
56 | 
57 | RcppExport void R_init_aricode(DllInfo *dll) {
58 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
59 |     R_useDynamicSymbols(dll, FALSE);
60 | }
61 | 


--------------------------------------------------------------------------------
/src/aricode.cpp:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | using namespace Rcpp;
  3 | 
  4 | // [[Rcpp::export]]
  5 | List countPairs(IntegerVector classi1, IntegerVector classi2, IntegerVector order) {
  6 |   // first path to count pairs
  7 |   int n = classi1.size();
  8 | 
  9 |   // count per classification
 10 |   IntegerVector count1(n, 0);
 11 |   for(int i = 0; i < n; i++) count1[classi1[i]]++;
 12 | 
 13 |   IntegerVector count2(n, 0);
 14 |   for(int i = 0; i < n; i++) count2[classi2[i]]++;
 15 | 
 16 |   // count per pairs
 17 |   int count = 1;
 18 |   int class1_cur = classi1[order[0]];
 19 |   int class2_cur = classi2[order[0]];
 20 | 
 21 |   for(int i = 1; i < n; i++){
 22 |     if( (class1_cur != classi1[order[i]]) || (class2_cur != classi2[order[i]]) ){
 23 |       count++;
 24 |       class1_cur = classi1[order[i]];
 25 |       class2_cur = classi2[order[i]];
 26 |     }
 27 |   }
 28 | 
 29 |   // create output Integer Vector for pairs and initialize
 30 |   IntegerVector nameClassi1(count, 0);
 31 |   IntegerVector nameClassi2(count, 0);
 32 |   IntegerVector numberPair(count, 0);
 33 | 
 34 |   int current_position = 0;
 35 |   nameClassi1[0] = classi1[order[0]];
 36 |   nameClassi2[0] = classi2[order[0]];
 37 |   numberPair[0] = 1;
 38 | 
 39 |   // count pairs
 40 |   for(int i = 1; i < n; i++){
 41 |     if( ( nameClassi1[current_position] == classi1[order[i]]) && (nameClassi2[current_position] == classi2[order[i]]) ){
 42 |       numberPair[current_position]++;
 43 |     } else {
 44 |       current_position += 1;
 45 |       nameClassi1[current_position] = classi1[order[i]];
 46 |       nameClassi2[current_position] = classi2[order[i]];
 47 |       numberPair[current_position]  = 1;
 48 |     }
 49 |   }
 50 | 
 51 |   // output as a list
 52 |   List ListOut;
 53 |   ListOut["pair_nb"] = numberPair;
 54 |   ListOut["pair_c1"] = nameClassi1;
 55 |   ListOut["pair_c2"] = nameClassi2;
 56 |   ListOut["c1_nb"]   = count1[count1 > 0];
 57 |   ListOut["c2_nb"]   = count2[count2 > 0];
 58 |   return(ListOut);
 59 | }
 60 | 
 61 | 
 62 | // [[Rcpp::export]]
 63 | double expected_MI(IntegerVector ni_, IntegerVector n_j) {
 64 | 
 65 |   int N = sum(ni_) ;
 66 | 
 67 |   double emi = 0.0 ;
 68 | 
 69 |   NumericVector ni_f = lfactorial(ni_) ;
 70 |   NumericVector nj_f = lfactorial(n_j) ;
 71 |   NumericVector Nmni_f = lfactorial(N - ni_) ;
 72 |   NumericVector Nmnj_f = lfactorial(N - n_j) ;
 73 |   double N_f = lgamma(N + 1) ;
 74 | 
 75 |   for (int i=0; i< ni_.size(); i++) {
 76 |     for (int j=0; j< n_j.size(); j++) {
 77 | 
 78 |       int start_nij = std::max(1, ni_[i] + n_j[j] - N) ;
 79 |       int end_nij = std::min(ni_[i], n_j[j]) ;
 80 | 
 81 |       for (int nij = start_nij; nij <= end_nij; nij++ ) {
 82 | 
 83 |           double t1 = ((float) nij / (float) N) * std::log((float)(nij * N) / (float)(ni_[i]*n_j[j])) ;
 84 | 
 85 |           double t2 = std::exp((ni_f[i] + nj_f[j] + Nmni_f[i] + Nmnj_f[j] - N_f - lgamma(1 + nij) - lgamma(1 + ni_[i] - nij) - lgamma(1 + n_j[j] - nij) - lgamma(1 + N - ni_[i] - n_j[j] + nij))) ;
 86 | 
 87 |           emi += t1*t2;
 88 |       }
 89 |     }
 90 |   }
 91 |   return emi;
 92 | 
 93 | }
 94 | 
 95 | // [[Rcpp::export]]
 96 | List getRank(IntegerVector classi){
 97 |    int maxi = max(classi);
 98 |    int mini = min(classi);
 99 | 
100 |    // Present
101 |    LogicalVector present(maxi - mini + 1);
102 |    for(int i=0; i< classi.size(); i++) present[classi[i]-mini] = TRUE;
103 | 
104 |    // Count
105 |    IntegerVector translator(maxi - mini + 1);
106 |    int nbIndex = 0;
107 |    for(int i=0; i< present.size(); i++) {
108 |      if(present[i]) nbIndex++;
109 |    }
110 | 
111 |    // Translator and Index Vector
112 |    IntegerVector index(nbIndex);
113 |    int indexCur = 0;
114 |    for(int i=0; i< present.size(); i++) {
115 |      if(present[i]) {
116 |         translator[i] = indexCur;
117 | 	index[indexCur] = i+mini;
118 | 	indexCur++;
119 |      } else {
120 |         translator[i] = NA_INTEGER;
121 |      }
122 |    }
123 |    // Converted Vector
124 |    IntegerVector translated(classi.size());
125 |    for(int i=0; i< classi.size(); i++) translated[i] = translator[classi[i] - mini];
126 | 
127 |    // output as a list
128 |    List ListOut;
129 |    ListOut["index"] = index;
130 |    ListOut["translator"] = translator;
131 |    ListOut["translated"] = translated;
132 |    return ListOut;
133 | }
134 | 


--------------------------------------------------------------------------------
/tests/spelling.R:
--------------------------------------------------------------------------------
1 | if(requireNamespace('spelling', quietly = TRUE))
2 |   spelling::spell_check_test(vignettes = TRUE, error = FALSE,
3 |                              skip_on_cran = TRUE)
4 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(aricode)
3 | test_check("aricode")
4 | 


--------------------------------------------------------------------------------
/tests/testthat/test_coherence.R:
--------------------------------------------------------------------------------
  1 | library(testthat)
  2 | library(aricode)
  3 | 
  4 | ## define the ARI as in the mclust package
  5 | adjustedRandIndex <- function (x, y)
  6 | {
  7 |   x <- as.vector(x)
  8 |   y <- as.vector(y)
  9 |   if (length(x) != length(y))
 10 |     stop("arguments must be vectors of the same length")
 11 |   tab <- table(x, y)
 12 |   if (all(dim(tab) == c(1, 1)))
 13 |     return(1)
 14 |   a <- sum(choose(tab, 2))
 15 |   b <- sum(choose(rowSums(tab), 2)) - a
 16 |   c <- sum(choose(colSums(tab), 2)) - a
 17 |   d <- choose(sum(tab), 2) - a - b - c
 18 |   ARI <- (a - (a + b) * (a + c)/(a + b + c + d))/((a + b + a + c)/2 - (a + b) * (a + c)/(a + b + c + d))
 19 |   return(ARI)
 20 | }
 21 | 
 22 | EMI <- function(counts) {
 23 |     s_emi <- 0
 24 |     n <- sum(counts)
 25 |     s1 <- margin.table(counts, 1)
 26 |     s2 <- margin.table(counts, 2)
 27 |     for(i in 1:nrow(counts)){
 28 |         for (j in 1:ncol(counts)){
 29 |             ai <- s1[i]
 30 |             bj <- s2[j]
 31 |             min_nij <- max(1, ai+bj-n)
 32 |             max_nij <- min(ai,bj)
 33 |             if (min_nij >= max_nij) next
 34 |             n.ij <- seq(min_nij, max_nij)   #sequence of consecutive numbers
 35 |             t1<- (n.ij / n) * log((n.ij * n) / (ai*bj))
 36 |             t2 <- exp(lfactorial(ai) + lfactorial(bj) + lfactorial(n - ai) + lfactorial(n - bj) - lfactorial(n) - lfactorial(n.ij) - lfactorial(ai - n.ij) - lfactorial(bj - n.ij) - lfactorial(n - ai - bj + n.ij))
 37 |             emi <- sum(t1*t2)
 38 |             if (is.nan(emi)) browser()
 39 |             s_emi <- s_emi + emi
 40 |         }
 41 |     }
 42 |     s_emi
 43 | }
 44 | 
 45 | Chi2_ref <- function(c1, c2) {
 46 |   as.numeric(chisq.test(c1, c2, correct=F)$stat[1])
 47 | }
 48 | 
 49 | ## Martina's code
 50 | ARI_estimated <- function(c1, c2) {
 51 |   # c1, c2, two classifications of the same observations
 52 |   n_kl <- table(c1,c2)
 53 | 
 54 |   n   <- sum(n_kl)
 55 |   n_k <- rowSums(n_kl)
 56 |   n_l <- colSums(n_kl)
 57 | 
 58 |   T1 <- sum(choose(n_kl,2))
 59 |   T2 <- 2*n + sum(outer(n_k, n_l) * n_kl) - sum(n_kl^2) - sum(n_k^2) -  sum(n_l^2)
 60 |   T3 <- 1/(6*choose(n,4))*(sum(outer(n_k^2, n_l^2)) - 4*T1 - 4*T2 - n*2*(sum(choose(n_k,2)) + sum(choose(n_l,2))) - n^2)/4
 61 | 
 62 |   RI_estim <- (1/choose(n, 2))*T1
 63 |   ARI_estim <- RI_estim-T3
 64 |  ARI_estim
 65 | }
 66 | 
 67 | test_that("Testing coherence of the adjusted Rand Index", {
 68 | 
 69 |   ## "\n-large random vectors-
 70 |   n <- 1e5
 71 |   c1 <- as.numeric(sample(1:(n/100), n, replace=TRUE))
 72 |   c2 <- as.numeric(sample(1:(n/100), n, replace=TRUE))
 73 |   expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2))
 74 | 
 75 |   ## "\n-real data-
 76 |   data(iris)
 77 |   cl <- cutree(hclust(dist(iris[,-5])), 4)
 78 |   expect_equal(ARI(cl,iris$Species), adjustedRandIndex(cl,iris$Species))
 79 | 
 80 |   ## "\n-completely equal vectors with no groups-")
 81 |   c1 <- 1:100
 82 |   c2 <- 1:100
 83 |   expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2))
 84 | 
 85 |   ## "\n-completely equal vectors with one groups-")
 86 |   c1 <- rep(1,100)
 87 |   c2 <- rep(2,100)
 88 |   expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2))
 89 | 
 90 |   ## "\n-completely different vectors with one groups-")
 91 |   c1 <- c(rep(0,99),1)
 92 |   c2 <- rep(1,100)
 93 |   expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2))
 94 | })
 95 | 
 96 | test_that("Testing coherence of the expected mutual information", {
 97 | 
 98 |   ## "\n-real data-
 99 |   data(iris)
100 |   cl <- cutree(hclust(dist(iris[,-5])), 4)
101 |   counts <- table(cl,iris$Species)
102 |   ni. <- rowSums(counts)
103 |   n.j <- colSums(counts)
104 |   expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts))
105 | 
106 |   ## "\n-completely equal vectors with one groups-")
107 |   c1 <- rep(1,100)
108 |   c2 <- rep(2,100)
109 |   counts <- table(c1, c2)
110 |   ni. <- rowSums(counts)
111 |   n.j <- colSums(counts)
112 |   expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts))
113 | 
114 |   ## "\n-completely different vectors with one groups-")
115 |   c1 <- c(rep(0,99),1)
116 |   c2 <- rep(1,100)
117 |   counts <- table(c1, c2)
118 |   ni. <- rowSums(counts)
119 |   n.j <- colSums(counts)
120 |   expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts))
121 | })
122 | 
123 | 
124 | test_that("Testing coherence of the Chi2 statistics information", {
125 | 
126 |   ## "\n-large random vectors-
127 |   n <- 1e5
128 |   c1 <- as.numeric(sample(1:(n/100), n, replace=TRUE))
129 |   c2 <- as.numeric(sample(1:(n/100), n, replace=TRUE))
130 |   expect_equal(ARI(c1,c2), adjustedRandIndex(c1,c2))
131 | 
132 |   ## "\n-moderate random-
133 |   n <- rpois(lambda=100, n=1) +   3
134 |   k1 <- rpois(lambda=5, n=1)+2; k2 <- rpois(lambda=5, n=1)+2
135 |   c1 <- sample(1:k1, n, replace=T)
136 |   c2 <- sample(1:k2, n, replace=T)
137 |   expect_equal(aricode::Chi2(c1,c2), Chi2_ref(c1,c2))
138 | 
139 |   ## "\n-real data-
140 |   data(iris)
141 |   cl <- cutree(hclust(dist(iris[,-5])), 4)
142 |   expect_equal(aricode::Chi2(cl,iris$Species), Chi2_ref(cl,iris$Species))
143 | 
144 |   ## "\n-completely equal vectors with no groups-")
145 |   c1 <- 1:100
146 |   c2 <- 1:100
147 |   expect_equal(aricode::Chi2(c1,c2), Chi2_ref(c1,c2))
148 | })
149 | 
150 | test_that("Testing coherence of the MARI", {
151 | 
152 |   ## "\n-large random vectors-
153 |   n <- 1e5
154 |   c1 <- as.numeric(sample(1:(n/100), n, replace=TRUE))
155 |   c2 <- as.numeric(sample(1:(n/100), n, replace=TRUE))
156 |   expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2))
157 | 
158 |   ## "\n-moderate random-
159 |   n <- rpois(lambda=100, n=1) +   3
160 |   k1 <- rpois(lambda=5, n=1)+2; k2 <- rpois(lambda=5, n=1)+2
161 |   c1 <- sample(1:k1, n, replace=T)
162 |   c2 <- sample(1:k2, n, replace=T)
163 |   expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2))
164 | 
165 |   ## "\n-real data-
166 |   data(iris)
167 |   cl <- cutree(hclust(dist(iris[,-5])), 4)
168 |   expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2))
169 | 
170 |   ## "\n-completely equal vectors with no groups-")
171 |   c1 <- 1:100
172 |   c2 <- 1:100
173 |   expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2))
174 | 
175 |   ## "\n-completely equal vectors with one groups-")
176 |   c1 <- rep(1,100)
177 |   c2 <- rep(2,100)
178 |   expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2))
179 | 
180 |   ## "\n-completely different vectors with one groups-")
181 |   c1 <- c(rep(0,99),1)
182 |   c2 <- rep(1,100)
183 |   expect_equal(aricode::MARIraw(c1,c2), ARI_estimated(c1,c2))
184 | 
185 | })
186 | 
187 | test_that("Testing coherence of the MARI with ARI : in case of independance their values should be close", {
188 |   n <- rpois(lambda=100, n=1) +   3
189 |   k1 <- rpois(lambda=5, n=1)+2; k2 <- rpois(lambda=5, n=1)+2
190 |   class1 <- sample(1:k1, n, replace=T)
191 |   class2 <- sample(1:k2, n, replace=T)
192 |   ## expect_equal or almost equal in test_that
193 |   expect_equal(ARI(class1, class2), MARI(class1, class2), tolerance = 1e-3)
194 | })
195 | 


--------------------------------------------------------------------------------
/tests/testthat/test_input.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(aricode)
 3 | 
 4 | test_that("Testing supposedly handled input types", {
 5 | 
 6 |   c1 <- rep(1:3,each=3) * 1.5
 7 |   c2 <- rep(4:6,each=3) * 2.7
 8 | 
 9 |   cat("\n-numeric type-")
10 |   c1.numeric <- c1
11 |   c2.numeric <- c2
12 |   ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.numeric), c2 = unique(c2.numeric)), nij = rep(3,3), ni. =  rep(3,3),  n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2)
13 |   expect_that(sortPairs(c1.numeric,c2.numeric), equals(ref.object))
14 | 
15 |   cat("\n-integer type-")
16 |   c1.integer <- as.integer(c1)
17 |   c2.integer <- as.integer(c2)
18 |   ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.integer), c2 = unique(c2.integer)), nij = rep(3,3), ni. =  rep(3,3),  n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2)
19 |   expect_that(sortPairs(c1.integer,c2.integer), equals(ref.object))
20 | 
21 |   cat("\n-character type-")
22 |   c1.char <- as.character(c1)
23 |   c2.char <- as.character(c2)
24 |   ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.char), c2 = unique(c2.char)), nij = rep(3,3), ni. =  rep(3,3),  n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2)
25 |   expect_that(sortPairs(c1.char,c2.char), equals(ref.object))
26 | 
27 |   cat("\n-factor type-")
28 |   c1.factor <- as.factor(c1.char)
29 |   c2.factor <- as.factor(c2.char)
30 |   ref.object <- list(spMat = NULL, levels = list(c1 = levels(c1.factor), c2 = levels(c2.factor)), nij = rep(3,3), ni. =  rep(3,3),  n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2)
31 |   expect_that(sortPairs(c1.factor,c2.factor), equals(ref.object))
32 | 
33 |   cat("\n-different types-")
34 |   ref.object <- list(spMat = NULL, levels = list(c1 = unique(c1.char), c2 = unique(c2.factor)), nij = rep(3,3), ni. =  rep(3,3),  n.j = rep(3,3), pair_c1 = 0:2, pair_c2 = 0:2)
35 |   expect_that(sortPairs(c1.char,c2.factor), equals(ref.object))
36 | })
37 | 
38 | test_that("Testing error for not handled input types", {
39 | 
40 |   c1 <- rep(1:3,each=3) * 1.5
41 |   c2 <- rep(4:6,each=3) * 2.7
42 | 
43 |   cat("\n-NA-")
44 |   c2NA <- c2; c2NA[1] <- NA
45 |   expect_error(sortPairs(c1,c2NA))
46 | 
47 |   cat("\n-different sizes-")
48 |   expect_error(sortPairs(c1,c2[-1]))
49 | 
50 |   cat("\n-list type-")
51 |   expect_error(sortPairs(as.list(c1),as.list(c2)))
52 | 
53 |   cat("\n-matrix type-")
54 |   expect_error(sortPairs(as.matrix(c1),as.matrix(c2)))
55 | 
56 |   cat("\n-data.frame type-")
57 |   expect_error(sortPairs(as.data.frame(c1),as.data.frame(c2)))
58 | 
59 | })
60 | 


--------------------------------------------------------------------------------