├── .gitattributes
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-measures_clusterings.R
    │   ├── test-transformations.R
    │   └── test-measures_pairs.R
├── .gitignore
├── .Rbuildignore
├── R
    ├── clevr.R
    ├── RcppExports.R
    ├── transformations.R
    ├── measures_clusterings.R
    └── measures_pairs.R
├── cran-comments.md
├── NEWS.md
├── clevr.Rproj
├── src
    ├── pairs_to_membership.cpp
    └── RcppExports.cpp
├── man
    ├── clevr-package.Rd
    ├── mutual_info.Rd
    ├── contingency_table_clusters.Rd
    ├── fowlkes_mallows.Rd
    ├── eval_report_clusters.Rd
    ├── canonicalize_pairs.Rd
    ├── adj_rand_index.Rd
    ├── rand_index.Rd
    ├── precision_pairs.Rd
    ├── homogeneity.Rd
    ├── completeness.Rd
    ├── variation_info.Rd
    ├── recall_pairs.Rd
    ├── specificity_pairs.Rd
    ├── fowlkes_mallows_pairs.Rd
    ├── accuracy_pairs.Rd
    ├── f_measure_pairs.Rd
    ├── balanced_accuracy_pairs.Rd
    ├── v_measure.Rd
    ├── contingency_table_pairs.Rd
    ├── eval_report_pairs.Rd
    └── clustering_representations.Rd
├── NAMESPACE
├── DESCRIPTION
├── README.Rmd
├── README.md
└── LICENSE


/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(clevr)
3 | 
4 | test_check("clevr")
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | src/*.o
6 | src/*.so
7 | src/*.dll
8 | .bak
9 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^cran-comments\.md$
5 | ^CRAN-RELEASE$
6 | ^LICENSE$
7 | 


--------------------------------------------------------------------------------
/R/clevr.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | "_PACKAGE"
3 | 
4 | #' @import Rcpp
5 | #' @importFrom Rcpp evalCpp
6 | #' @useDynLib clevr
7 | NULL
8 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | ## Comments
 2 | 
 3 | Minor release to address incompatibility with upcoming release of Matrix package.
 4 | 
 5 | ## Test environments
 6 | * Fedora 38, R 4.3.1
 7 | * winbuilder, R 4.3.1
 8 | 
 9 | ## R CMD check results
10 | 
11 | 0 errors v | 0 warnings v | 0 notes v
12 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # clevr 0.1.2
 2 | * Address compatibility issue with Matrix 1.6-2 release
 3 | 
 4 | # clevr 0.1.1
 5 | * Fix behavior when pairs are represented using different types
 6 | * Improve documentation by adding examples
 7 | * First release to CRAN
 8 | 
 9 | # clevr 0.1.0
10 | * Initial release
11 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | pairs_to_membership_cpp <- function(pairs, num_records) {  # R-side stub for the compiled routine of the same name
5 |     .Call('_clevr_pairs_to_membership_cpp', PACKAGE = 'clevr', pairs, num_records)  # forwards both arguments unchanged
6 | }
7 | 
8 | 


--------------------------------------------------------------------------------
/clevr.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/src/pairs_to_membership.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include <boost/graph/connected_components.hpp>
 3 | #include <boost/graph/adjacency_list.hpp>
 4 | using namespace Rcpp;
 5 | 
 6 | // Label the connected components of the undirected graph defined by `pairs`.
 7 | //
 8 | // pairs        integer matrix with one edge per row; columns 0 and 1 hold the
 9 | //              0-based indices of the two endpoints
10 | // num_records  number of vertices in the graph; every index in `pairs` must
11 | //              lie in [0, num_records)
12 | //
13 | // Returns an integer vector of length `num_records` whose i-th entry is the
14 | // component label that boost::connected_components assigns to vertex i.
15 | // Raises an R error if `num_records` is negative, if `pairs` has fewer than
16 | // two columns, or if any index is NA or out of range.
17 | // [[Rcpp::export]]
18 | IntegerVector pairs_to_membership_cpp(const IntegerMatrix &pairs, int num_records) {
19 |   using namespace boost;
20 | 
21 |   typedef adjacency_list <vecS, vecS, undirectedS> Graph;
22 | 
23 |   if (num_records < 0) {
24 |     Rcpp::stop("num_records must be non-negative");
25 |   }
26 | 
27 |   const int num_pairs = pairs.nrow();
28 |   if (num_pairs > 0 && pairs.ncol() < 2) {
29 |     Rcpp::stop("pairs must have at least two columns");
30 |   }
31 | 
32 |   Graph G(num_records);
33 |   for (int i = 0; i < num_pairs; i++) {
34 |     const int u = pairs.at(i, 0);
35 |     const int v = pairs.at(i, 1);
36 |     // With a vecS vertex list, add_edge() silently grows the graph when it is
37 |     // handed an index past the end, and connected_components() would then
38 |     // write beyond `membership`. Negative values (including NA_INTEGER) would
39 |     // wrap to huge unsigned descriptors. Reject both up front.
40 |     if (u < 0 || u >= num_records || v < 0 || v >= num_records) {
41 |       Rcpp::stop("pairs contains an index outside [0, num_records) in row %d", i + 1);
42 |     }
43 |     add_edge(u, v, G);
44 |   }
45 | 
46 |   IntegerVector membership(num_records);
47 |   if (num_records > 0) {
48 |     // Skip the call for an empty graph so we never take the address of
49 |     // element 0 of a zero-length vector.
50 |     connected_components(G, &membership[0]);
51 |   }
52 | 
53 |   return membership;
54 | }
55 | 


--------------------------------------------------------------------------------
/man/clevr-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clevr.R
 3 | \docType{package}
 4 | \name{clevr-package}
 5 | \alias{clevr}
 6 | \alias{clevr-package}
 7 | \title{clevr: Clustering and Link Prediction Evaluation in R}
 8 | \description{
 9 | Tools for evaluating link prediction and clustering algorithms with respect to ground truth. Includes efficient implementations of common performance measures such as pairwise precision/recall, cluster homogeneity/completeness, variation of information, Rand index etc.
10 | }
11 | \seealso{
12 | Useful links:
13 | \itemize{
14 |   \item \url{https://github.com/cleanzr/clevr}
15 |   \item Report bugs at \url{https://github.com/cleanzr/clevr/issues}
16 | }
17 | 
18 | }
19 | \author{
20 | \strong{Maintainer}: Neil Marchant \email{ngmarchant@gmail.com}
21 | 
22 | Authors:
23 | \itemize{
24 |   \item Rebecca Steorts \email{beka@stat.duke.edu}
25 | }
26 | 
27 | }
28 | \keyword{internal}
29 | 


--------------------------------------------------------------------------------
/man/mutual_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{mutual_info}
 4 | \alias{mutual_info}
 5 | \title{Mutual Information Between Clusterings}
 6 | \usage{
 7 | mutual_info(true, pred, base = exp(1))
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | 
18 | \item{base}{base of the logarithm. Defaults to \code{exp(1)}.}
19 | }
20 | \description{
21 | Computes the mutual information between two
22 | clusterings, such as a predicted and ground truth clustering.
23 | }
24 | \details{
25 | Mutual information is an entropy-based measure of the similarity
26 | between two clusterings.
27 | }
28 | \examples{
29 | true <- c(1,1,1,2,2)  # ground truth clustering
30 | pred <- c(1,1,2,2,2)  # predicted clustering
31 | mutual_info(true, pred)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(accuracy_pairs)
 4 | export(adj_rand_index)
 5 | export(balanced_accuracy_pairs)
 6 | export(canonicalize_pairs)
 7 | export(clusters_to_membership)
 8 | export(clusters_to_pairs)
 9 | export(completeness)
10 | export(contingency_table_clusters)
11 | export(contingency_table_pairs)
12 | export(eval_report_clusters)
13 | export(eval_report_pairs)
14 | export(f_measure_pairs)
15 | export(fowlkes_mallows)
16 | export(fowlkes_mallows_pairs)
17 | export(homogeneity)
18 | export(membership_to_clusters)
19 | export(membership_to_pairs)
20 | export(mutual_info)
21 | export(pairs_to_clusters)
22 | export(pairs_to_membership)
23 | export(precision_pairs)
24 | export(rand_index)
25 | export(recall_pairs)
26 | export(sensitivity_pairs)
27 | export(specificity_pairs)
28 | export(v_measure)
29 | export(variation_info)
30 | import(Rcpp)
31 | importFrom(Matrix,colSums)
32 | importFrom(Matrix,crossprod)
33 | importFrom(Matrix,rowSums)
34 | importFrom(Matrix,which)
35 | importFrom(Rcpp,evalCpp)
36 | importFrom(stats,na.action)
37 | importFrom(stats,na.fail)
38 | importFrom(stats,na.omit)
39 | importFrom(stats,xtabs)
40 | importFrom(utils,combn)
41 | useDynLib(clevr)
42 | 


--------------------------------------------------------------------------------
/man/contingency_table_clusters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{contingency_table_clusters}
 4 | \alias{contingency_table_clusters}
 5 | \title{Contingency Table for Clusterings}
 6 | \usage{
 7 | contingency_table_clusters(true, pred)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | }
18 | \value{
19 | Returns a table \eqn{C} (stored as a sparse matrix) such that
20 | \eqn{C_{ij}}{C_ij} counts the number of elements assigned to
21 | cluster \eqn{i} in \code{pred} and cluster \eqn{j} in \code{true}.
22 | }
23 | \description{
24 | Compute the contingency table for a \emph{predicted} clustering
25 | given a \emph{ground truth} clustering.
26 | }
27 | \examples{
28 | true <- c(1,1,1,2,2)  # ground truth clustering
29 | pred <- c(1,1,2,2,2)  # predicted clustering
30 | contingency_table_clusters(true, pred)
31 | 
32 | }
33 | \seealso{
34 | \code{\link{eval_report_clusters}} computes common evaluation measures derived
35 | from the output of this function.
36 | }
37 | 


--------------------------------------------------------------------------------
/man/fowlkes_mallows.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{fowlkes_mallows}
 4 | \alias{fowlkes_mallows}
 5 | \title{Fowlkes-Mallows Index Between Clusterings}
 6 | \usage{
 7 | fowlkes_mallows(true, pred)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | }
18 | \description{
19 | Computes the Fowlkes-Mallows index between two clusterings,
20 | such as a predicted and ground truth clustering.
21 | }
22 | \details{
23 | The Fowlkes-Mallows index is defined as the geometric mean of
24 | precision and recall, computed with respect to pairs of elements.
25 | }
26 | \examples{
27 | true <- c(1,1,1,2,2)  # ground truth clustering
28 | pred <- c(1,1,2,2,2)  # predicted clustering
29 | fowlkes_mallows(true, pred)
30 | 
31 | }
32 | \references{
33 | Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical
34 | Clusterings." \emph{Journal of the American Statistical Association} \strong{78:383},
35 | 553-569, (1983). \doi{10.1080/01621459.1983.10478008}
36 | }
37 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <Rcpp.h>
 5 | 
 6 | using namespace Rcpp;
 7 | 
 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
 9 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 | 
13 | // pairs_to_membership_cpp: .Call-compatible shim around the C++ function declared below
14 | IntegerVector pairs_to_membership_cpp(const IntegerMatrix& pairs, int num_records);
15 | RcppExport SEXP _clevr_pairs_to_membership_cpp(SEXP pairsSEXP, SEXP num_recordsSEXP) {
16 | BEGIN_RCPP
17 |     Rcpp::RObject rcpp_result_gen;
18 |     Rcpp::RNGScope rcpp_rngScope_gen;
19 |     Rcpp::traits::input_parameter< const IntegerMatrix& >::type pairs(pairsSEXP);  // SEXP -> const IntegerMatrix&
20 |     Rcpp::traits::input_parameter< int >::type num_records(num_recordsSEXP);  // SEXP -> int
21 |     rcpp_result_gen = Rcpp::wrap(pairs_to_membership_cpp(pairs, num_records));  // call through, then wrap the result for R
22 |     return rcpp_result_gen;
23 | END_RCPP
24 | }
25 | 
26 | static const R_CallMethodDef CallEntries[] = {  // table of routines passed to R_registerRoutines below
27 |     {"_clevr_pairs_to_membership_cpp", (DL_FUNC) &_clevr_pairs_to_membership_cpp, 2},  // name, address, argument count
28 |     {NULL, NULL, 0}  // all-NULL sentinel entry terminating the table
29 | };
30 | 
31 | RcppExport void R_init_clevr(DllInfo *dll) {  // initialisation hook; R calls R_init_<pkg> when the shared library is loaded
32 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);  // only the .Call slot is populated
33 |     R_useDynamicSymbols(dll, FALSE);  // turn off dynamic symbol lookup for this DLL
34 | }
35 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: clevr
 2 | Type: Package
 3 | Title: Clustering and Link Prediction Evaluation in R
 4 | Version: 0.1.2
 5 | Date: 2023-09-16
 6 | Authors@R: c(
 7 |     person(given = "Neil", 
 8 |            family = "Marchant", 
 9 |            email = "ngmarchant@gmail.com", 
10 |            role = c("aut", "cre")),
11 |     person(given = "Rebecca",
12 |            family = "Steorts", 
13 |            email = "beka@stat.duke.edu",
14 |            role = c("aut")),
15 |     person(given = "Olivier",
16 |            family = "Binette",
17 |            email = "olivier.binette@gmail.com",
18 |            role = c("ctb")))
19 | Maintainer: Neil Marchant <ngmarchant@gmail.com>
20 | Description: Tools for evaluating link prediction and clustering algorithms 
21 |     with respect to ground truth. Includes efficient implementations of 
22 |     common performance measures such as pairwise precision/recall, 
23 |     cluster homogeneity/completeness, variation of information, 
24 |     Rand index etc.
25 | License: GPL-2
26 | Encoding: UTF-8
27 | Depends: R (>= 3.0.2)
28 | Imports: Rcpp (>= 1.0.5),
29 |     stats,
30 |     Matrix
31 | LinkingTo: Rcpp, BH (>= 1.69.0)
32 | RoxygenNote: 7.2.3
33 | Roxygen: list(markdown = TRUE)
34 | Suggests: testthat
35 | URL: https://github.com/cleanzr/clevr
36 | BugReports: https://github.com/cleanzr/clevr/issues
37 | Collate: 
38 |     'RcppExports.R'
39 |     'clevr.R'
40 |     'measures_clusterings.R'
41 |     'transformations.R'
42 |     'measures_pairs.R'
43 | 


--------------------------------------------------------------------------------
/man/eval_report_clusters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{eval_report_clusters}
 4 | \alias{eval_report_clusters}
 5 | \title{Evaluation Report for Clustering}
 6 | \usage{
 7 | eval_report_clusters(true, pred)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | }
18 | \value{
19 | Returns a list containing the following measures:
20 | \describe{
21 | \item{homogeneity}{see \code{\link{homogeneity}}}
22 | \item{completeness}{see \code{\link{completeness}}}
23 | \item{v_measure}{see \code{\link{v_measure}}}
24 | \item{rand_index}{see \code{\link{rand_index}}}
25 | \item{adj_rand_index}{see \code{\link{adj_rand_index}}}
26 | \item{variation_info}{see \code{\link{variation_info}}}
27 | \item{mutual_info}{see \code{\link{mutual_info}}}
28 | \item{fowlkes_mallows}{see \code{\link{fowlkes_mallows}}}
29 | }
30 | }
31 | \description{
32 | Compute various evaluation measures for a predicted
33 | clustering using a ground truth clustering as a reference.
34 | }
35 | \examples{
36 | true <- c(1,1,1,2,2)  # ground truth clustering
37 | pred <- c(1,1,2,2,2)  # predicted clustering
38 | eval_report_clusters(true, pred)
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/canonicalize_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/transformations.R
 3 | \name{canonicalize_pairs}
 4 | \alias{canonicalize_pairs}
 5 | \title{Canonicalize element pairs}
 6 | \usage{
 7 | canonicalize_pairs(pairs, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{pairs}{a matrix or data.frame of element pairs where rows correspond
11 | to element pairs and columns correspond to element identifiers.}
12 | 
13 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
14 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
15 | Defaults to FALSE, which is appropriate for clustering, undirected link
16 | prediction, record linkage etc.}
17 | }
18 | \value{
19 | Returns the element pairs in canonical form, so that:
20 | \itemize{
21 | \item the first element id precedes the second element id lexicographically
22 | if \code{ordered = FALSE}---i.e. pair (3, 2) becomes pair (2, 3);
23 | \item pairs with missing element ids are removed;
24 | \item duplicate pairs are removed; and
25 | \item the rows in the matrix/data.frame pairs are sorted lexicographically
26 | by the first element id, then by the second element id.
27 | }
28 | }
29 | \description{
30 | Coerce a collection of element pairs into canonical form. Facilitates
31 | testing of equivalence.
32 | }
33 | \examples{
34 | messy_pairs <- rbind(c(2,1), c(1,2), c(3,1), c(1,2))
35 | clean_pairs <- canonicalize_pairs(messy_pairs)
36 | all(rbind(c(1,2), c(1,3)) == clean_pairs) # duplicates removed and order fixed
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/man/adj_rand_index.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{adj_rand_index}
 4 | \alias{adj_rand_index}
 5 | \title{Adjusted Rand Index Between Clusterings}
 6 | \usage{
 7 | adj_rand_index(true, pred)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | }
18 | \description{
19 | Computes the adjusted Rand index (ARI) between two clusterings,
20 | such as a predicted and ground truth clustering.
21 | }
22 | \details{
23 | The adjusted Rand index (ARI) is a variant of the Rand index (RI)
24 | which is corrected for chance using the Permutation Model for
25 | clusterings. It is related to the RI as follows:
26 | \deqn{\frac{RI - E(RI)}{1 - E(RI)},}{(RI - E(RI))/(1 - E(RI)),}
27 | where \eqn{E(RI)} is the expected value of the RI under the Permutation
28 | Model.
29 | Unlike the RI, the ARI takes values in the range -1 to 1. A value
30 | of 1 indicates that the clusterings are identical, while a value of
31 | 0 is expected when the clusterings are drawn at random, independently
32 | of one another.
33 | }
34 | \examples{
35 | true <- c(1,1,1,2,2)  # ground truth clustering
36 | pred <- c(1,1,2,2,2)  # predicted clustering
37 | adj_rand_index(true, pred)
38 | 
39 | }
40 | \references{
41 | Hubert, L., Arabie, P. "Comparing partitions." \emph{Journal of Classification}
42 | \strong{2}, 193–218 (1985). \doi{10.1007/BF01908075}
43 | }
44 | 


--------------------------------------------------------------------------------
/man/rand_index.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{rand_index}
 4 | \alias{rand_index}
 5 | \title{Rand Index Between Clusterings}
 6 | \usage{
 7 | rand_index(true, pred)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | }
18 | \description{
19 | Computes the Rand index (RI) between two clusterings, such
20 | as a predicted and ground truth clustering.
21 | }
22 | \details{
23 | The Rand index (RI) can be expressed as:
24 | \deqn{\frac{a + b}{{n \choose 2}},}{(a + b)/binom(n, 2),}
25 | where
26 | \itemize{
27 | \item \eqn{n} is the number of elements,
28 | \item \eqn{a} is the number of pairs of elements that appear in the
29 | same cluster in both clusterings, and
30 | \item \eqn{b} is the number of pairs of elements that appear in distinct
31 | clusters in both clusterings.
32 | }
33 | 
34 | The RI takes on values between 0 and 1, where 1 denotes exact agreement
35 | between the clusterings and 0 denotes disagreement on all pairs of
36 | elements.
37 | }
38 | \examples{
39 | true <- c(1,1,1,2,2)  # ground truth clustering
40 | pred <- c(1,1,2,2,2)  # predicted clustering
41 | rand_index(true, pred)
42 | 
43 | }
44 | \references{
45 | Rand, W. M. "Objective Criteria for the Evaluation of Clustering Methods."
46 | \emph{Journal of the American Statistical Association} 66(336), 846-850 (1971).
47 | \doi{10.1080/01621459.1971.10482356}
48 | }
49 | 


--------------------------------------------------------------------------------
/man/precision_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{precision_pairs}
 4 | \alias{precision_pairs}
 5 | \title{Precision of Linked Pairs}
 6 | \usage{
 7 | precision_pairs(true_pairs, pred_pairs, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
11 | data.frame, where rows index pairs and columns index the ids of the
12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
13 | Duplicate pairs (including equivalent pairs with reversed ids) are
14 | automatically removed.}
15 | 
16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
17 | specification as \code{true_pairs}.}
18 | 
19 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
20 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
21 | Defaults to FALSE, which is appropriate for clustering, undirected link
22 | prediction, record linkage etc.}
23 | }
24 | \description{
25 | Computes the precision of a set of \emph{predicted} coreferent
26 | (linked) pairs given a set of \emph{ground truth} coreferent pairs.
27 | }
28 | \details{
29 | The precision is defined as:
30 | \deqn{\frac{|T \cap P|}{|P|}}{|T ∩ P|/|P|}
31 |   where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the
32 | set of predicted coreferent pairs.
33 | }
34 | \examples{
35 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
36 | pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
37 | # precision does not depend on the total number of pairs
38 | precision_pairs(true_pairs, pred_pairs)
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/homogeneity.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{homogeneity}
 4 | \alias{homogeneity}
 5 | \title{Homogeneity Between Clusterings}
 6 | \usage{
 7 | homogeneity(true, pred)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | }
18 | \description{
19 | Computes the homogeneity between two clusterings, such
20 | as a predicted and ground truth clustering.
21 | }
22 | \details{
23 | Homogeneity is an entropy-based measure of the similarity
24 | between two clusterings, say \eqn{t} and \eqn{p}. The homogeneity
25 | is high if clustering \eqn{t} only assigns members of a cluster to
26 | a single cluster in \eqn{p}. The homogeneity ranges between 0
27 | and 1, where 1 indicates perfect homogeneity.
28 | }
29 | \examples{
30 | true <- c(1,1,1,2,2)  # ground truth clustering
31 | pred <- c(1,1,2,2,2)  # predicted clustering
32 | homogeneity(true, pred)
33 | 
34 | }
35 | \references{
36 | Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." \emph{Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning} (EMNLP-CoNLL), (2007).
37 | }
38 | \seealso{
39 | \code{\link{completeness}} evaluates the \emph{completeness}, which is a dual
40 | measure to \emph{homogeneity}. \code{\link{v_measure}} evaluates the harmonic mean of
41 | \emph{completeness} and \emph{homogeneity}.
42 | }
43 | 


--------------------------------------------------------------------------------
/man/completeness.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{completeness}
 4 | \alias{completeness}
 5 | \title{Completeness Between Clusterings}
 6 | \usage{
 7 | completeness(true, pred)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | }
18 | \description{
19 | Computes the completeness between two clusterings, such
20 | as a predicted and ground truth clustering.
21 | }
22 | \details{
23 | Completeness is an entropy-based measure of the similarity
24 | between two clusterings, say \eqn{t} and \eqn{p}. The completeness
25 | is high if \emph{all} members of a given cluster in \eqn{t} are assigned
26 | to a single cluster in \eqn{p}. The completeness ranges between 0
27 | and 1, where 1 indicates perfect completeness.
28 | }
29 | \examples{
30 | true <- c(1,1,1,2,2)  # ground truth clustering
31 | pred <- c(1,1,2,2,2)  # predicted clustering
32 | completeness(true, pred)
33 | 
34 | }
35 | \references{
36 | Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." \emph{Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning} (EMNLP-CoNLL), (2007).
37 | }
38 | \seealso{
39 | \code{\link{homogeneity}} evaluates the \emph{homogeneity}, which is a dual
40 | measure to \emph{completeness}. \code{\link{v_measure}} evaluates the harmonic mean of
41 | \emph{completeness} and \emph{homogeneity}.
42 | }
43 | 


--------------------------------------------------------------------------------
/man/variation_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{variation_info}
 4 | \alias{variation_info}
 5 | \title{Variation of Information Between Clusterings}
 6 | \usage{
 7 | variation_info(true, pred, base = exp(1))
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | 
18 | \item{base}{base of the logarithm. Defaults to \code{exp(1)}.}
19 | }
20 | \description{
21 | Computes the variation of information between two
22 | clusterings, such as a predicted and ground truth clustering.
23 | }
24 | \details{
25 | Variation of information is an entropy-based distance metric
26 | on the space of clusterings. It is unnormalized and varies between
27 | \eqn{0} and \eqn{\log(N)}{log(N)} where \eqn{N} is the number of
28 | clustered elements. Larger values of the distance metric correspond
29 | to greater dissimilarity between the clusterings.
30 | }
31 | \examples{
32 | true <- c(1,1,1,2,2)  # ground truth clustering
33 | pred <- c(1,1,2,2,2)  # predicted clustering
34 | variation_info(true, pred)
35 | 
36 | }
37 | \references{
38 | Arabie, P. and Boorman, S. A. "Multidimensional scaling of measures of
39 | distance between partitions." \emph{Journal of Mathematical Psychology} \strong{10:2},
40 | 148-203, (1973). \doi{10.1016/0022-2496(73)90012-6}
41 | 
42 | Meilă, M. "Comparing Clusterings by the Variation of Information." In:
43 | Learning Theory and Kernel Machines, Lecture Notes in Computer Science
44 | \strong{2777}, Springer, Berlin, Heidelberg, (2003).
45 | \doi{10.1007/978-3-540-45167-9_14}
46 | }
47 | 


--------------------------------------------------------------------------------
/man/recall_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{recall_pairs}
 4 | \alias{recall_pairs}
 5 | \alias{sensitivity_pairs}
 6 | \title{Recall of Linked Pairs}
 7 | \usage{
 8 | recall_pairs(true_pairs, pred_pairs, ordered = FALSE)
 9 | 
10 | sensitivity_pairs(true_pairs, pred_pairs, ordered = FALSE)
11 | }
12 | \arguments{
13 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
14 | data.frame, where rows index pairs and columns index the ids of the
15 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
16 | Duplicate pairs (including equivalent pairs with reversed ids) are
17 | automatically removed.}
18 | 
19 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
20 | specification as \code{true_pairs}.}
21 | 
22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
24 | Defaults to FALSE, which is appropriate for clustering, undirected link
25 | prediction, record linkage etc.}
26 | }
27 | \description{
28 | Computes the recall of a set of \emph{predicted} coreferent
29 | (linked) pairs given a set of \emph{ground truth} coreferent pairs.
30 | }
31 | \details{
32 | The recall is defined as:
33 | \deqn{\frac{|T \cap P|}{|T|}}{|T ∩ P|/|T|}
34 |   where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the
35 | set of predicted coreferent pairs.
36 | }
37 | \note{
38 | \code{sensitivity_pairs} is an alias for \code{recall_pairs}.
39 | }
40 | \examples{
41 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
42 | pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
43 | num_pairs <- 3                              # assuming 3 elements
44 | recall_pairs(true_pairs, pred_pairs)
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/man/specificity_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{specificity_pairs}
 4 | \alias{specificity_pairs}
 5 | \title{Specificity of Linked Pairs}
 6 | \usage{
 7 | specificity_pairs(true_pairs, pred_pairs, num_pairs, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
11 | data.frame, where rows index pairs and columns index the ids of the
12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
13 | Duplicate pairs (including equivalent pairs with reversed ids) are
14 | automatically removed.}
15 | 
16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
17 | specification as \code{true_pairs}.}
18 | 
19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs,
20 | excluding equivalent pairs with reversed ids.}
21 | 
22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
24 | Defaults to FALSE, which is appropriate for clustering, undirected link
25 | prediction, record linkage etc.}
26 | }
27 | \description{
28 | Computes the specificity of a set of \emph{predicted} coreferent
29 | (linked) pairs given a set of \emph{ground truth} coreferent pairs.
30 | }
31 | \details{
32 | The specificity is defined as:
33 | \deqn{\frac{|P' \cap T'|}{|T'|}}{|P' ∩ T'|/|T'|}
34 |   where \eqn{T'} is the set of true non-coreferent pairs and \eqn{P'} is the
35 | set of predicted non-coreferent pairs.
36 | }
37 | \examples{
38 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
39 | pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
40 | num_pairs <- 3                              # assuming 3 elements
41 | specificity_pairs(true_pairs, pred_pairs, num_pairs)
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/man/fowlkes_mallows_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{fowlkes_mallows_pairs}
 4 | \alias{fowlkes_mallows_pairs}
 5 | \title{Fowlkes-Mallows Index of Linked Pairs}
 6 | \usage{
 7 | fowlkes_mallows_pairs(true_pairs, pred_pairs, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
11 | data.frame, where rows index pairs and columns index the ids of the
12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
13 | Duplicate pairs (including equivalent pairs with reversed ids) are
14 | automatically removed.}
15 | 
16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
17 | specification as \code{true_pairs}.}
18 | 
19 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
20 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
21 | Defaults to FALSE, which is appropriate for clustering, undirected link
22 | prediction, record linkage etc.}
23 | }
24 | \description{
25 | Computes the Fowlkes-Mallows index for a set of \emph{predicted}
26 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent pairs.
27 | }
28 | \details{
29 | The Fowlkes-Mallows index is defined as the geometric mean of
30 | precision \eqn{P} and recall \eqn{R}:
31 | \deqn{\sqrt{P R}.}{√(P·R).}
32 | }
33 | \examples{
34 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
35 | pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
36 | num_pairs <- 3                              # assuming 3 elements
37 | fowlkes_mallows_pairs(true_pairs, pred_pairs)
38 | 
39 | }
40 | \references{
41 | Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical
42 | Clusterings." \emph{Journal of the American Statistical Association} \strong{78:383},
43 | 553-569, (1983). \doi{10.1080/01621459.1983.10478008}.
44 | }
45 | 


--------------------------------------------------------------------------------
/man/accuracy_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{accuracy_pairs}
 4 | \alias{accuracy_pairs}
 5 | \title{Accuracy of Linked Pairs}
 6 | \usage{
 7 | accuracy_pairs(true_pairs, pred_pairs, num_pairs, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
11 | data.frame, where rows index pairs and columns index the ids of the
12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
13 | Duplicate pairs (including equivalent pairs with reversed ids) are
14 | automatically removed.}
15 | 
16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
17 | specification as \code{true_pairs}.}
18 | 
19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs,
20 | excluding equivalent pairs with reversed ids.}
21 | 
22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
24 | Defaults to FALSE, which is appropriate for clustering, undirected link
25 | prediction, record linkage etc.}
26 | }
27 | \description{
28 | Computes the accuracy of a set of \emph{predicted} coreferent
29 | (linked) pairs given a set of \emph{ground truth} coreferent pairs.
30 | }
31 | \details{
32 | The accuracy is defined as:
33 | \deqn{\frac{|T \cap P| + |T' \cap P'|}{N}}{(|T ∩ P| + |T' ∩ P'|)/N}
34 |   where:
35 | \itemize{
36 | \item \eqn{T} is the set of true coreferent pairs,
37 | \item \eqn{P} is the set of predicted coreferent pairs,
38 | \item \eqn{T'} is the set of true non-coreferent pairs,
39 | \item \eqn{P'} is the set of predicted non-coreferent pairs, and
40 | \item \eqn{N} is the total number of coreferent and non-coreferent pairs.
41 | }
42 | }
43 | \examples{
44 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
45 | pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
46 | num_pairs <- 3                              # assuming 3 elements
47 | accuracy_pairs(true_pairs, pred_pairs, num_pairs)
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/man/f_measure_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{f_measure_pairs}
 4 | \alias{f_measure_pairs}
 5 | \title{F-measure of Linked Pairs}
 6 | \usage{
 7 | f_measure_pairs(true_pairs, pred_pairs, beta = 1, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
11 | data.frame, where rows index pairs and columns index the ids of the
12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
13 | Duplicate pairs (including equivalent pairs with reversed ids) are
14 | automatically removed.}
15 | 
16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
17 | specification as \code{true_pairs}.}
18 | 
19 | \item{beta}{non-negative weight. A value of 0 assigns no weight to recall
20 | (i.e. the measure reduces to precision), while larger values assign
21 | increasing weight to recall. A value of 1 weights precision and recall
22 | equally.}
23 | 
24 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
25 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
26 | Defaults to FALSE, which is appropriate for clustering, undirected link
27 | prediction, record linkage etc.}
28 | }
29 | \description{
30 | Computes the F-measure (a.k.a. F-score) of a set of
31 | \emph{predicted} coreferent (linked) pairs given a set of \emph{ground truth}
32 | coreferent pairs.
33 | }
34 | \details{
35 | The \eqn{\beta}{β}-weighted F-measure is defined as the weighted
36 | harmonic mean of precision \eqn{P} and recall \eqn{R}:
37 | \deqn{(1 + \beta^2)\frac{P \cdot R}{\beta^2 \cdot P + R}.}{(1 + β^2)·P·R/(β^2·P + R).}
38 | }
39 | \examples{
40 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
41 | pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
42 | num_pairs <- 3                              # assuming 3 elements
43 | f_measure_pairs(true_pairs, pred_pairs)
44 | 
45 | }
46 | \references{
47 | Van Rijsbergen, C. J. "Information Retrieval." (2nd ed.).
48 | Butterworth-Heinemann, USA, (1979).
49 | }
50 | 


--------------------------------------------------------------------------------
/man/balanced_accuracy_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{balanced_accuracy_pairs}
 4 | \alias{balanced_accuracy_pairs}
 5 | \title{Balanced Accuracy of Linked Pairs}
 6 | \usage{
 7 | balanced_accuracy_pairs(true_pairs, pred_pairs, num_pairs, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
11 | data.frame, where rows index pairs and columns index the ids of the
12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
13 | Duplicate pairs (including equivalent pairs with reversed ids) are
14 | automatically removed.}
15 | 
16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
17 | specification as \code{true_pairs}.}
18 | 
19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs,
20 | excluding equivalent pairs with reversed ids.}
21 | 
22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
24 | Defaults to FALSE, which is appropriate for clustering, undirected link
25 | prediction, record linkage etc.}
26 | }
27 | \description{
28 | Computes the balanced accuracy of a set of \emph{predicted}
29 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent
30 | pairs.
31 | }
32 | \details{
33 | The balanced accuracy is defined as:
34 | \deqn{\frac{\frac{|T \cap P|}{|T|} + \frac{|T' \cap P'|}{|T'|}}{2}}{|T ∩ P|/(2|T|) + |T' ∩ P'|/(2|T'|)}
35 |   where:
36 | \itemize{
37 | \item \eqn{T} is the set of true coreferent pairs,
38 | \item \eqn{P} is the set of predicted coreferent pairs,
39 | \item \eqn{T'} is the set of true non-coreferent pairs, and
40 | \item \eqn{P'} is the set of predicted non-coreferent pairs.
41 | }
42 | }
43 | \examples{
44 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
45 | pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
46 | num_pairs <- 3                              # assuming 3 elements
47 | balanced_accuracy_pairs(true_pairs, pred_pairs, num_pairs)
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/man/v_measure.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_clusterings.R
 3 | \name{v_measure}
 4 | \alias{v_measure}
 5 | \title{V-measure Between Clusterings}
 6 | \usage{
 7 | v_measure(true, pred, beta = 1)
 8 | }
 9 | \arguments{
10 | \item{true}{ground truth clustering represented as a membership
11 | vector. Each entry corresponds to an element and the value identifies
12 | the assigned cluster. The specific values of the cluster identifiers
13 | are arbitrary.}
14 | 
15 | \item{pred}{predicted clustering represented as a membership
16 | vector.}
17 | 
18 | \item{beta}{non-negative weight. A value of 0 assigns no weight to
19 | completeness (i.e. the measure reduces to homogeneity), while larger
20 | values assign increasing weight to completeness. A value of 1 weights
21 | completeness and homogeneity equally.}
22 | }
23 | \description{
24 | Computes the V-measure between two clusterings, such
25 | as a predicted and ground truth clustering.
26 | }
27 | \details{
28 | V-measure is defined as the \eqn{\beta}{β}-weighted harmonic
29 | mean of homogeneity \eqn{h} and completeness \eqn{c}:
30 | \deqn{(1 + \beta)\frac{h \cdot c}{\beta \cdot h + c}.}{(1 + β)·h·c/(β·h + c).}
31 |    The range of V-measure is between 0 and 1, where 1 corresponds to a
32 | perfect match between the clusterings. It is equivalent to the
33 | normalised mutual information, when the aggregation function is the
34 | arithmetic mean.
35 | }
36 | \examples{
37 | true <- c(1,1,1,2,2)  # ground truth clustering
38 | pred <- c(1,1,2,2,2)  # predicted clustering
39 | v_measure(true, pred)
40 | 
41 | }
42 | \references{
43 | Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." \emph{Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning} (EMNLP-CoNLL), (2007).
44 | 
45 | Becker, H. "Identification and characterization of events in social media."
46 | \emph{PhD dissertation}, Columbia University, (2011).
47 | }
48 | \seealso{
49 | \code{\link{homogeneity}} and \code{\link{completeness}} evaluate the component
50 | measures upon which this measure is based.
51 | }
52 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: github_document
 3 | ---
 4 | 
 5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 6 | 
 7 | ```{r, include = FALSE}
 8 | knitr::opts_chunk$set(
 9 |   collapse = TRUE,
10 |   comment = "#>",
11 |   fig.path = "man/figures/README-",
12 |   out.width = "100%"
13 | )
14 | ```
15 | 
16 | # clevr: Clustering and Link Prediction Evaluation in R
17 | 
18 | <!-- badges: start -->
19 | <!-- badges: end -->
20 | 
21 | clevr implements functions for evaluating link prediction and clustering 
22 | algorithms in R. It includes efficient implementations of common performance 
23 | measures, such as:
24 | 
25 | * pairwise precision, recall, F-measure;
26 | * homogeneity, completeness and V-measure;
27 | * (adjusted) Rand index;
28 | * variation of information; and
29 | * mutual information.
30 | 
31 | While the current focus is on supervised (a.k.a. external) performance 
32 | measures, unsupervised (internal) measures are also in scope for future 
33 | releases.
34 | 
35 | ## Installation
36 | 
37 | You can install the latest release from [CRAN](https://CRAN.R-project.org) 
38 | by entering:
39 | 
40 | ``` r
41 | install.packages("clevr")
42 | ```
43 | 
44 | The development version can be installed from GitHub using `devtools`:
45 | 
46 | ``` r
47 | # install.packages("devtools")
48 | devtools::install_github("cleanzr/clevr")
49 | ```
50 | 
51 | ## Example
52 | 
53 | Several functions are included which transform between different clustering 
54 | representations.
55 | 
56 | ```{r example}
57 | library(clevr)
58 | # A clustering of four records represented as a membership vector
59 | pred_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 1, "Record4" = 2)
60 | 
61 | # Represent as a set of record pairs that appear in the same cluster
62 | pred_pairs <- membership_to_pairs(pred_membership)
63 | print(pred_pairs)
64 | 
65 | # Represent as a list of record clusters
66 | pred_clusters <- membership_to_clusters(pred_membership)
67 | print(pred_clusters)
68 | ```
69 | 
70 | Performance measures are available for evaluating linked pairs:
71 | 
72 | ```{r pair-measures}
73 | true_pairs <- rbind(c("Record1", "Record2"), c("Record3", "Record4"))
74 | 
75 | pr <- precision_pairs(true_pairs, pred_pairs)
76 | print(pr)
77 | 
78 | re <- recall_pairs(true_pairs, pred_pairs)
79 | print(re)
80 | ```
81 | 
82 | and for evaluating clusterings:
83 | 
84 | ```{r clust-measures}
85 | true_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 2, "Record4" = 2)
86 | 
87 | ari <- adj_rand_index(true_membership, pred_membership)
88 | print(ari)
89 | 
90 | vi <- variation_info(true_membership, pred_membership)
91 | print(vi)
92 | ```
93 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 3 | 
 4 | # clevr: Clustering and Link Prediction Evaluation in R
 5 | 
 6 | <!-- badges: start -->
 7 | 
 8 | <!-- badges: end -->
 9 | 
10 | clevr implements functions for evaluating link prediction and clustering
11 | algorithms in R. It includes efficient implementations of common
12 | performance measures, such as:
13 | 
14 |   - pairwise precision, recall, F-measure;
15 |   - homogeneity, completeness and V-measure;
16 |   - (adjusted) Rand index;
17 |   - variation of information; and
18 |   - mutual information.
19 | 
20 | While the current focus is on supervised (a.k.a. external) performance
21 | measures, unsupervised (internal) measures are also in scope for future
22 | releases.
23 | 
24 | ## Installation
25 | 
26 | You can install the latest release from
27 | [CRAN](https://CRAN.R-project.org) by entering:
28 | 
29 | ``` r
30 | install.packages("clevr")
31 | ```
32 | 
33 | The development version can be installed from GitHub using `devtools`:
34 | 
35 | ``` r
36 | # install.packages("devtools")
37 | devtools::install_github("cleanzr/clevr")
38 | ```
39 | 
40 | ## Example
41 | 
42 | Several functions are included which transform between different
43 | clustering representations.
44 | 
45 | ``` r
46 | library(clevr)
47 | # A clustering of four records represented as a membership vector
48 | pred_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 1, "Record4" = 2)
49 | 
50 | # Represent as a set of record pairs that appear in the same cluster
51 | pred_pairs <- membership_to_pairs(pred_membership)
52 | print(pred_pairs)
53 | #>      [,1]      [,2]     
54 | #> [1,] "Record1" "Record2"
55 | #> [2,] "Record1" "Record3"
56 | #> [3,] "Record2" "Record3"
57 | 
58 | # Represent as a list of record clusters
59 | pred_clusters <- membership_to_clusters(pred_membership)
60 | print(pred_clusters)
61 | #> $`1`
62 | #> [1] "Record1" "Record2" "Record3"
63 | #> 
64 | #> $`2`
65 | #> [1] "Record4"
66 | ```
67 | 
68 | Performance measures are available for evaluating linked pairs:
69 | 
70 | ``` r
71 | true_pairs <- rbind(c("Record1", "Record2"), c("Record3", "Record4"))
72 | 
73 | pr <- precision_pairs(true_pairs, pred_pairs)
74 | print(pr)
75 | #> [1] 0.3333333
76 | 
77 | re <- recall_pairs(true_pairs, pred_pairs)
78 | print(re)
79 | #> [1] 0.5
80 | ```
81 | 
82 | and for evaluating clusterings:
83 | 
84 | ``` r
85 | true_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 2, "Record4" = 2)
86 | 
87 | ari <- adj_rand_index(true_membership, pred_membership)
88 | print(ari)
89 | #> [1] 0
90 | 
91 | vi <- variation_info(true_membership, pred_membership)
92 | print(vi)
93 | #> [1] 0.8239592
94 | ```
95 | 


--------------------------------------------------------------------------------
/man/contingency_table_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{contingency_table_pairs}
 4 | \alias{contingency_table_pairs}
 5 | \title{Binary Contingency Table for Linked Pairs}
 6 | \usage{
 7 | contingency_table_pairs(
 8 |   true_pairs,
 9 |   pred_pairs,
10 |   num_pairs = NULL,
11 |   ordered = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
16 | data.frame, where rows index pairs and columns index the ids of the
17 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
18 | Duplicate pairs (including equivalent pairs with reversed ids) are
19 | automatically removed.}
20 | 
21 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
22 | specification as \code{true_pairs}.}
23 | 
24 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs,
25 | excluding equivalent pairs with reversed ids. If not provided,
26 | the true negative cell will be set to \code{NA}.}
27 | 
28 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
29 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
30 | Defaults to FALSE, which is appropriate for clustering, undirected link
31 | prediction, record linkage etc.}
32 | }
33 | \value{
34 | Returns a \eqn{2 \times 2}{2×2} contingency table of the form:
35 | \preformatted{
36 |              Truth
37 |    Prediction   TRUE  FALSE
38 |         TRUE      TP     FP
39 |         FALSE     FN     TN
40 | }
41 | }
42 | \description{
43 | Compute the binary contingency table for a set of \emph{predicted}
44 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent pairs.
45 | }
46 | \examples{
47 | ### Example where pairs/edges are undirected
48 | # ground truth is 3-clique
49 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3))
50 | # prediction misses one edge
51 | pred_pairs <- rbind(c(1,2), c(2,3))
52 | # total number of pairs assuming 3 elements
53 | num_pairs <- 3 * (3 - 1) / 2
54 | contingency_table_pairs(true_pairs, pred_pairs, num_pairs)
55 | 
56 | ### Example where pairs/edges are directed
57 | # ground truth is a 3-star
58 | true_pairs <- rbind(c(2,1), c(3,1), c(4,1))
59 | # prediction gets direction of one edge incorrect
60 | pred_pairs <- rbind(c(2,1), c(3,1), c(1,4))
61 | # total number of pairs assuming 4 elements
62 | num_pairs <- 4 * 4
63 | contingency_table_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE)
64 | 
65 | }
66 | \seealso{
67 | The \code{\link{membership_to_pairs}} and \code{\link{clusters_to_pairs}} functions can be
68 | used to transform other clustering representations into lists of pairs,
69 | as required by this function.
70 | The \code{\link{eval_report_pairs}} function computes common evaluation measures
71 | derived from binary contingency matrices, like the ones output by this
72 | function.
73 | }
74 | 


--------------------------------------------------------------------------------
/man/eval_report_pairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/measures_pairs.R
 3 | \name{eval_report_pairs}
 4 | \alias{eval_report_pairs}
 5 | \title{Evaluation Report for Linked Pairs}
 6 | \usage{
 7 | eval_report_pairs(true_pairs, pred_pairs, num_pairs = NULL, ordered = FALSE)
 8 | }
 9 | \arguments{
10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or
11 | data.frame, where rows index pairs and columns index the ids of the
12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}.
13 | Duplicate pairs (including equivalent pairs with reversed ids) are
14 | automatically removed.}
15 | 
16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same
17 | specification as \code{true_pairs}.}
18 | 
19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs,
20 | excluding equivalent pairs with reversed ids. If not provided,
21 | measures that depend on the number of true negatives will be returned
22 | as \code{NA}.}
23 | 
24 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether
25 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
26 | Defaults to FALSE, which is appropriate for clustering, undirected link
27 | prediction, record linkage etc.}
28 | }
29 | \value{
30 | Returns a list containing the following measures:
31 | \describe{
32 | \item{precision}{see \code{\link{precision_pairs}}}
33 | \item{recall}{see \code{\link{recall_pairs}}}
34 | \item{specificity}{see \code{\link{specificity_pairs}}}
35 | \item{sensitivity}{see \code{\link{sensitivity_pairs}}}
36 | \item{f1score}{see \code{\link{f_measure_pairs}}}
37 | \item{accuracy}{see \code{\link{accuracy_pairs}}}
38 | \item{balanced_accuracy}{see \code{\link{balanced_accuracy_pairs}}}
39 | \item{fowlkes_mallows}{see \code{\link{fowlkes_mallows_pairs}}}
40 | }
41 | }
42 | \description{
43 | Compute various evaluation measures for a set of \emph{predicted}
44 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent pairs.
45 | }
46 | \examples{
47 | ### Example where pairs/edges are undirected
48 | # ground truth is 3-clique
49 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3))
50 | # prediction misses one edge
51 | pred_pairs <- rbind(c(1,2), c(2,3))
52 | # total number of pairs assuming 3 elements
53 | num_pairs <- 3 * (3 - 1) / 2
54 | eval_report_pairs(true_pairs, pred_pairs, num_pairs)
55 | 
56 | ### Example where pairs/edges are directed
57 | # ground truth is a 3-star
58 | true_pairs <- rbind(c(2,1), c(3,1), c(4,1))
59 | # prediction gets direction of one edge incorrect
60 | pred_pairs <- rbind(c(2,1), c(3,1), c(1,4))
61 | # total number of pairs assuming 4 elements
62 | num_pairs <- 4 * 4
63 | eval_report_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE)
64 | 
65 | }
66 | \seealso{
67 | The \code{\link{contingency_table_pairs}} function can be used to compute
68 | the contingency table for entity resolution or record linkage problems.
69 | }
70 | 


--------------------------------------------------------------------------------
/tests/testthat/test-measures_clusterings.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # Examples to test
  3 | make_clusterings_identical <- function() {
  4 |   true <- c(1,1,1,2,2)
  5 |   pred <- c(1,1,1,2,2)
  6 |   measures <- list(
  7 |     "rand_index" = 1.0,
  8 |     "adj_rand_index" = 1.0,
  9 |     "fowlkes_mallows" = 1.0,
 10 |     "homogeneity" = 1.0,
 11 |     "completeness" = 1.0,
 12 |     "v_measure" = 1.0,
 13 |     "variation_info" = 0.0,
 14 |     "mutual_info" = 0.6730116670092563
 15 |   )
 16 |   list("true" = true, "pred" = pred, "true_measures" = measures,
 17 |        "description" = "clusterings in perfect agreement")
 18 | }
 19 | 
 20 | make_clusterings_distinct <- function() {
 21 |   true <- c(1,2,3,4,5)
 22 |   pred <- c(1,1,1,1,1)
 23 |   measures <- list(
 24 |     "rand_index" = 0.0,
 25 |     "adj_rand_index" = 0.0,
 26 |     "fowlkes_mallows" = 0.0,
 27 |     "homogeneity" = 0.0,
 28 |     "completeness" = 1.0,
 29 |     "v_measure" = 0.0,
 30 |     "variation_info" = 1.6094379124341003,
 31 |     "mutual_info" = 0.0
 32 |   )
 33 |   list("true" = true, "pred" = pred, "true_measures" = measures,
 34 |        "description" = "clusterings in complete disagreement")
 35 | }
 36 | 
 37 | make_clusterings_one_difference <- function() {
 38 |   true <- c(1,1,2,2,2)
 39 |   pred <- c(1,1,1,2,2)
 40 |   measures <- list(
 41 |     "rand_index" = 0.6,
 42 |     "adj_rand_index" = 0.16666666666666666,
 43 |     "fowlkes_mallows" = 0.5,
 44 |     "homogeneity" = 0.43253806776631243,
 45 |     "completeness" = 0.43253806776631243,
 46 |     "v_measure" = 0.43253806776631243,
 47 |     "variation_info" = 0.7638170019537754,
 48 |     "mutual_info" = 0.2911031660323686
 49 |   )
 50 |   list("true" = true, "pred" = pred, "true_measures" = measures,
 51 |        "description" = "clusterings with one difference")
 52 | }
 53 | 
 54 | make_clusterings_anticorrelated <- function() {
 55 |   true <- c(1,1,1,2,3)
 56 |   pred <- c(1,2,3,4,4)
 57 |   measures <- list(
 58 |     "rand_index" = 0.6,
 59 |     "adj_rand_index" = -0.176470588235294,
 60 |     "fowlkes_mallows" = 0.0,
 61 |     "homogeneity" = 0.7082316448032829,
 62 |     "completeness" = 0.5051961085524235,
 63 |     "v_measure" = 0.5897275217561567,
 64 |     "variation_info" = 0.936426245424844,
 65 |     "mutual_info" = 0.6730116670092563
 66 |   )
 67 |   list("true" = true, "pred" = pred, "true_measures" = measures,
 68 |        "description" = "clusterings that are anti-correlated")
 69 | }
 70 | 
 71 | examples_to_test <- list(make_clusterings_identical,
 72 |                          make_clusterings_distinct,
 73 |                          make_clusterings_one_difference,
 74 |                          make_clusterings_anticorrelated)
 75 | 
 76 | measures_to_test <- c("Rand Index" = "rand_index",
 77 |                       "Adjusted Rand Index" = "adj_rand_index",
 78 |                       "Fowlkes-Mallows Index" = "fowlkes_mallows",
 79 |                       "Homogeneity" = "homogeneity",
 80 |                       "Completeness" = "completeness",
 81 |                       "V-Measure" = "v_measure",
 82 |                       "Variation Information" = "variation_info",
 83 |                       "Mutual Information" = "mutual_info")
 84 | 
 85 | 
 86 | for (measure_name in names(measures_to_test)) {
 87 |   context(measure_name)
 88 |   measure <- measures_to_test[measure_name]
 89 |   for (example in examples_to_test) {
 90 |     example <- example()
 91 |     test_that(paste(measure_name, "is correct for", example$description), {
 92 |       true <- example$true
 93 |       pred <- example$pred
 94 |       expect_equal(eval(parse(text=paste0(measure, "(true, pred)"))),
 95 |                    example$true_measures[[measure]])
 96 |     })
 97 |   }
 98 | }
 99 | 
100 | test_that("V-Measure is correct for a simple example when beta != 1", {
101 |   true <- c(1,1,2,2,2)
102 |   pred <- c(1,1,1,2,2)
103 |   expect_equal(v_measure(true, pred, beta = 0), homogeneity(true, pred))
104 |   expect_equal(v_measure(true, pred, beta = Inf), completeness(true, pred))
105 | })
106 | 


--------------------------------------------------------------------------------
/man/clustering_representations.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/transformations.R
  3 | \name{clusters_to_membership}
  4 | \alias{clusters_to_membership}
  5 | \alias{membership_to_clusters}
  6 | \alias{clusters_to_pairs}
  7 | \alias{membership_to_pairs}
  8 | \alias{pairs_to_membership}
  9 | \alias{pairs_to_clusters}
 10 | \title{Transform Clustering Representations}
 11 | \usage{
 12 | clusters_to_membership(clusters, elem_ids = NULL, clust_ids = NULL)
 13 | 
 14 | membership_to_clusters(membership, elem_ids = NULL, clust_ids = NULL)
 15 | 
 16 | clusters_to_pairs(clusters)
 17 | 
 18 | membership_to_pairs(membership, elem_ids = NULL)
 19 | 
 20 | pairs_to_membership(pairs, elem_ids)
 21 | 
 22 | pairs_to_clusters(pairs, elem_ids)
 23 | }
 24 | \arguments{
 25 | \item{clusters}{a representation of a clustering as a list of vectors,
 26 | where the i-th vector contains the identifiers of elements assigned to the
 27 | i-th cluster. If \code{clust_ids} is specified (see below), the i-th cluster
 28 | is identified according to the corresponding entry in \code{clust_ids}.
 29 | Otherwise the i-th cluster is identified according to its name (if
 30 | \code{clusters} is a named list) or its integer index i.}
 31 | 
 32 | \item{elem_ids}{a vector specifying the complete set of identifiers for the
 33 | cluster elements in canonical order. Optional for all functions excluding
 34 | \code{pairs_to_membership} and \code{pairs_to_clusters}.}
 35 | 
 36 | \item{clust_ids}{a vector specifying the complete set of identifiers for
 37 | the clusters in canonical order. Optional for all functions.}
 38 | 
 39 | \item{membership}{a representation of a clustering as a membership vector,
 40 | where the i-th entry contains the cluster identifier for the i-th element.
 41 | If \code{elem_ids} is specified (see below), the i-th element is identified
 42 | according to the corresponding entry in \code{elem_ids}. Otherwise the i-th
 43 | element is identified according to its name (if \code{membership} is a named vector)
 44 | or its integer index i.}
 45 | 
 46 | \item{pairs}{a representation of a clustering as a matrix or data.frame
 47 | containing all pairs of elements that are co-clustered. The rows
 48 | of the matrix/data.frame index pairs and the columns index the identifiers
 49 | of the constituent elements. The \code{elem_ids} argument (see below) must be
 50 | specified in order to recover singleton clusters (containing a single
 51 | element).}
 52 | }
 53 | \value{
 54 | \code{clusters_to_membership} and \code{pairs_to_membership} both return a
 55 | membership vector representation of the clustering. The order of the
 56 | elements is taken from \code{elem_ids} if specified, otherwise the elements are
 57 | ordered lexicographically by their identifiers. For
 58 | \code{pairs_to_membership}, the cluster identifiers cannot be recovered and
 59 | are taken to be integers.
 60 | 
 61 | \code{membership_to_clusters} and \code{pairs_to_clusters} both return a
 62 | representation of the clustering as a list of vectors. The order of the
 63 | clusters is taken from \code{clust_ids} if specified, otherwise the clusters
 64 | are ordered lexicographically by their identifiers. For
 65 | \code{pairs_to_clusters}, the cluster identifiers cannot be recovered and
 66 | are taken to be integers.
 67 | 
 68 | \code{clusters_to_pairs} and \code{membership_to_pairs} both return a
 69 | representation of the clustering as a matrix of element pairs that are
 70 | co-clustered. This representation results in loss of information, as
 71 | singleton clusters (with one element) and cluster identifiers are not
 72 | represented.
 73 | }
 74 | \description{
 75 | Transform between different representations of a clustering.
 76 | }
 77 | \examples{
 78 | ## A clustering of three items represented as a membership vector
 79 | m <- c("Item1" = 1, "Item2" = 2, "Item3" = 1)
 80 | 
 81 | # Transform to list of clusters
 82 | membership_to_clusters(m)
 83 | # Specify different identifiers for the items
 84 | membership_to_clusters(m, elem_ids = c(1, 2, 3))
 85 | # Transform to array of pairs that are co-clustered
 86 | membership_to_pairs(m)
 87 | 
 88 | ## A clustering represented as a list of clusters
 89 | cl <- list("ClustA" = c(1,3), "ClustB" = c(2))
 90 | 
 91 | # Transform to membership vector representation
 92 | clusters_to_membership(cl)
 93 | # Transform to array of pairs that are co-clustered
 94 | clusters_to_pairs(cl)
 95 | 
 96 | ## A clustering (incompletely) represented as an array of pairs that
 97 | ## are co-clustered
 98 | p <- rbind(c(1,3)) # pairs of elements in the same cluster
 99 | ids <- c(1,2,3)    # necessary to specify set of all elements
100 | 
101 | # Transform to membership vector representation
102 | pairs_to_membership(p, ids)
103 | # Transform to list of clusters
104 | pairs_to_clusters(p, ids)
105 | 
106 | }
107 | 


--------------------------------------------------------------------------------
/tests/testthat/test-transformations.R:
--------------------------------------------------------------------------------
  1 | context("Clusters to membership vector")
  2 | 
  3 | test_that("un-named list of integer vectors correctly transformed to membership vector", {
  4 |   # Clusters without names are labelled by position unless `clust_ids` is given
  5 |   cl <- list(c(100L, 1L), c(2L))
  6 |   labels <- c("A", "B")
  7 |   ids <- c(1L, 2L, 100L)
  8 |   expect_equal(clusters_to_membership(cl), c("1" = 1L, "100" = 1L, "2" = 2L))
  9 |   expect_equal(clusters_to_membership(cl, elem_ids = ids),
 10 |                c("1" = 1L, "2" = 2L, "100" = 1L))
 11 |   expect_equal(clusters_to_membership(cl, clust_ids = labels),
 12 |                c("1" = "A", "100" = "A", "2" = "B"))
 13 |   expect_equal(clusters_to_membership(cl, elem_ids = ids, clust_ids = labels),
 14 |                c("1" = "A", "2" = "B", "100" = "A"))
 15 | })
 16 | 
 17 | test_that("un-named list of character vectors correctly transformed to membership vector", {
 18 |   # `elem_ids` fixes the output order; without it elements are sorted by name
 19 |   cl <- list(c("ELEM3", "ELEM1"), c("ELEM2"))
 20 |   labels <- c("A", "B")
 21 |   ids <- c("ELEM3", "ELEM2", "ELEM1")
 22 |   expect_equal(clusters_to_membership(cl), c("ELEM1" = 1L, "ELEM2" = 2L, "ELEM3" = 1L))
 23 |   expect_equal(clusters_to_membership(cl, elem_ids = ids),
 24 |                c("ELEM3" = 1L, "ELEM2" = 2L, "ELEM1" = 1L))
 25 |   expect_equal(clusters_to_membership(cl, clust_ids = labels),
 26 |                c("ELEM1" = "A", "ELEM2" = "B", "ELEM3" = "A"))
 27 |   expect_equal(clusters_to_membership(cl, elem_ids = ids, clust_ids = labels),
 28 |                c("ELEM3" = "A", "ELEM2" = "B", "ELEM1" = "A"))
 29 | })
 30 | 
 31 | test_that("named list of integer vectors correctly transformed to membership vector", {
 32 |   # The list names already act as cluster ids, so repeating them changes nothing
 33 |   cl <- list("A" = c(100L, 1L), "B" = c(2L))
 34 |   labels <- c("A", "B")
 35 |   ids <- c(1L, 2L, 100L)
 36 |   expect_equal(clusters_to_membership(cl), c("1" = "A", "100" = "A", "2" = "B"))
 37 |   expect_equal(clusters_to_membership(cl, elem_ids = ids),
 38 |                c("1" = "A", "2" = "B", "100" = "A"))
 39 |   expect_equal(clusters_to_membership(cl, clust_ids = labels),
 40 |                c("1" = "A", "100" = "A", "2" = "B"))
 41 |   expect_equal(clusters_to_membership(cl, elem_ids = ids, clust_ids = labels),
 42 |                c("1" = "A", "2" = "B", "100" = "A"))
 43 | })
 44 | 
 45 | 
 46 | context("Membership vector to clusters")
 47 | 
 48 | test_that("un-named integer membership vector correctly transformed to list of vectors", {
 49 |   # Elements without names are identified by position unless `elem_ids` is given
 50 |   memb <- c(1L, 2L, 1L)
 51 |   cluster_order <- c(2L, 1L)
 52 |   ids <- c(1L, 2L, 100L)
 53 |   expect_equal(membership_to_clusters(memb), list("1" = c(1L, 3L), "2" = 2L))
 54 |   expect_equal(membership_to_clusters(memb, elem_ids = ids),
 55 |                list("1" = c(1L, 100L), "2" = 2L))
 56 |   expect_equal(membership_to_clusters(memb, clust_ids = cluster_order),
 57 |                list("2" = 2L, "1" = c(1L, 3L)))
 58 |   expect_equal(membership_to_clusters(memb, elem_ids = ids, clust_ids = cluster_order),
 59 |                list("2" = 2L, "1" = c(1L, 100L)))
 60 | })
 61 | 
 62 | test_that("un-named character membership vector correctly transformed to list of vectors", {
 63 |   # `clust_ids` fixes the cluster order; without it clusters are sorted by name
 64 |   memb <- c("B", "A", "B")
 65 |   cluster_order <- c("B", "A")
 66 |   ids <- c(1L, 2L, 100L)
 67 |   expect_equal(membership_to_clusters(memb), list("A" = 2L, "B" = c(1L, 3L)))
 68 |   expect_equal(membership_to_clusters(memb, elem_ids = ids),
 69 |                list("A" = 2L, "B" = c(1L, 100L)))
 70 |   expect_equal(membership_to_clusters(memb, clust_ids = cluster_order),
 71 |                list("B" = c(1L, 3L), "A" = 2L))
 72 |   expect_equal(membership_to_clusters(memb, elem_ids = ids, clust_ids = cluster_order),
 73 |                list("B" = c(1L, 100L), "A" = 2L))
 74 | })
 75 | 
 76 | test_that("named character membership vector correctly transformed to list of vectors", {
 77 |   # Names serve as (character) element ids unless `elem_ids` overrides them
 78 |   memb <- c("1" = "B", "2" = "A", "100" = "B")
 79 |   cluster_order <- c("B", "A")
 80 |   ids <- c(1L, 2L, 100L)
 81 |   expect_equal(membership_to_clusters(memb), list("A" = "2", "B" = c("1", "100")))
 82 |   expect_equal(membership_to_clusters(memb, elem_ids = ids),
 83 |                list("A" = 2L, "B" = c(1L, 100L)))
 84 |   expect_equal(membership_to_clusters(memb, clust_ids = cluster_order),
 85 |                list("B" = c("1", "100"), "A" = "2"))
 86 |   expect_equal(membership_to_clusters(memb, elem_ids = ids, clust_ids = cluster_order),
 87 |                list("B" = c(1L, 100L), "A" = 2L))
 88 | })
 89 | 
 90 | 
 91 | context("Pairs to membership vector")
 92 | 
 93 | test_that("integer matrix of pairs correctly transformed to membership vector", {
 94 |   linked <- matrix(c(1L, 2L, 1L, 3L, 2L, 3L, 4L, 5L), ncol = 2, byrow = TRUE)
 95 |   expected <- c("1" = 1, "2" = 1, "3" = 1, "4" = 2, "5" = 2)
 96 |   # Elements 1-3 are linked together, as are elements 4-5
 97 |   expect_equal(pairs_to_membership(linked, 1:5), expected)
 98 | })
 99 | 
100 | test_that("special case of no pairs handled correctly", {
101 |   no_links <- matrix(0L, nrow = 0, ncol = 2)
102 |   # With nothing linked, every element is expected in a cluster of its own
103 |   expected <- c("1" = 1, "2" = 2, "3" = 3, "4" = 4, "5" = 5)
104 |   expect_equal(pairs_to_membership(no_links, 1:5), expected)
105 | })
106 | 
107 | test_that("character matrix of pairs correctly transformed to membership vector", {
108 |   linked <- matrix(c("A", "B", "B", "C", "A", "C", "D", "E"), ncol = 2, byrow = TRUE)
109 |   expected <- c("A" = 1, "B" = 1, "C" = 1, "D" = 2, "E" = 2)
110 |   # Identifiers need not be numeric
111 |   expect_equal(pairs_to_membership(linked, LETTERS[1:5]), expected)
112 | })
113 | 
114 | test_that("missing element identifiers in pairs produces a warning", {
115 |   # The first pair is incomplete
116 |   linked <- matrix(c(NA, 2L, 1L, 3L, 2L, 3L), ncol = 2, byrow = TRUE)
117 |   expect_warning(pairs_to_membership(linked, 1:5))
118 | })
119 | 
120 | test_that("missing element identifiers in `elem_ids` results in error", {
121 |   linked <- matrix(c(1L, 2L, 1L, 3L, 2L, 3L), ncol = 2, byrow = TRUE)
122 |   ids_with_gap <- c(1L, NA, 3L)
123 |   expect_error(pairs_to_membership(linked, ids_with_gap))
124 | })
125 | 
126 | test_that("passing pairs with incorrect dimensions results in error", {
127 |   linked <- matrix(c(1L, 2L, 1L, 3L, 2L, 3L), ncol = 2, byrow = TRUE)
128 |   # Selecting column 0 leaves a matrix with no columns
129 |   expect_error(pairs_to_membership(linked[, 0], c(1L, 2L, 3L)))
130 | })
131 | 
132 | 
133 | context("Canonicalize pairs")
134 | 
135 | test_that("rows are ordered lexicographically by first column then second column", {
136 |   shuffled <- matrix(c(3, 4, 1, 5, 1, 2), ncol = 2, byrow = TRUE)
137 |   canonical <- matrix(c(1, 2, 1, 5, 3, 4), ncol = 2, byrow = TRUE)
138 |   expect_equal(canonicalize_pairs(shuffled), canonical)
139 | })
140 | 
141 | test_that("identifiers are ordered lexicographically within each row", {
142 |   flipped <- matrix(c(4, 3, 1, 5, 2, 1), ncol = 2, byrow = TRUE)
143 |   canonical <- matrix(c(1, 2, 1, 5, 3, 4), ncol = 2, byrow = TRUE)
144 |   expect_equal(canonicalize_pairs(flipped), canonical)
145 | })
146 | 
147 | test_that("duplicate pairs are removed", {
148 |   repeated <- matrix(c(1, 2, 2, 1), ncol = 2, byrow = TRUE)
149 |   canonical <- matrix(c(1, 2), ncol = 2, byrow = TRUE)
150 |   expect_equal(canonicalize_pairs(repeated), canonical)
151 | })
152 | 


--------------------------------------------------------------------------------
/tests/testthat/test-measures_pairs.R:
--------------------------------------------------------------------------------
  1 | 
  2 | context("Binary Contingency Table for Linked Pairs")
  3 | 
  4 | test_that("pairwise contingency table is correct for a simple example", {
  5 |   predicted <- rbind(c(1, 2), c(1, 3), c(4, 5))
  6 |   truth <- rbind(c(1, 2), c(1, 5))
  7 |   # Rows are predictions and columns are the truth, both in TRUE/FALSE order
  8 |   outcomes <- c("TRUE", "FALSE")
  9 |   expected <- as.table(matrix(c(1, 2, 1, 21), nrow = 2, byrow = TRUE,
 10 |                               dimnames = list(Prediction = outcomes, Truth = outcomes)))
 11 |   expect_equal(contingency_table_pairs(truth, predicted, num_pairs = 25), expected)
 12 | })
 13 | 
 14 | test_that("pairwise contingency table is correct when pairs are represented using different types", {
 15 |   predicted <- rbind(c(2,17), c(16, 17), c(18, 23))
 16 |   # Same pairs as the prediction, but stored as character strings
 17 |   truth <- predicted
 18 |   storage.mode(truth) <- "character"
 19 |   outcomes <- c("TRUE", "FALSE")
 20 |   expected <- as.table(matrix(c(3, 0, 0, NA), nrow = 2, byrow = TRUE,
 21 |                               dimnames = list(Prediction = outcomes, Truth = outcomes)))
 22 |   expect_equal(contingency_table_pairs(truth, predicted), expected)
 23 | })
 24 | 
 25 | # Examples to test
 26 | make_pairs_identical <- function() {
 27 |   # Prediction reproduces the truth exactly, so every measure is expected to be 1
 28 |   linked <- rbind(c(1, 2), c(1, 3), c(2, 3), c(4, 5))
 29 |   expected <- list(
 30 |     precision_pairs = 1.0,
 31 |     recall_pairs = 1.0,
 32 |     specificity_pairs = 1.0,
 33 |     sensitivity_pairs = 1.0,
 34 |     f_measure_pairs = 1.0,
 35 |     accuracy_pairs = 1.0,
 36 |     balanced_accuracy_pairs = 1.0,
 37 |     fowlkes_mallows_pairs = 1.0
 38 |   )
 39 |   list(true = linked, pred = linked, num_pairs = 10,
 40 |        true_measures = expected,
 41 |        description = "pairs in complete agreement")
 42 | }
 43 | 
 44 | make_pairs_distinct <- function() {
 45 |   # Truth and prediction share no pair, so every measure is expected to be 0
 46 |   expected <- list(
 47 |     precision_pairs = 0.0,
 48 |     recall_pairs = 0.0,
 49 |     specificity_pairs = 0.0,
 50 |     sensitivity_pairs = 0.0,
 51 |     f_measure_pairs = 0.0,
 52 |     accuracy_pairs = 0.0,
 53 |     balanced_accuracy_pairs = 0.0,
 54 |     fowlkes_mallows_pairs = 0.0
 55 |   )
 56 |   list(true = rbind(c(1, 2), c(1, 3), c(2, 3)),
 57 |        pred = rbind(c(1, 4), c(2, 4), c(3, 4)),
 58 |        num_pairs = 6, true_measures = expected,
 59 |        description = "pairs in complete disagreement")
 60 | }
 61 | 
 62 | make_pairs_no_pred <- function() {
 63 |   # Nothing is predicted: recall, sensitivity and accuracy are expected to be
 64 |   # 0 and every other measure NaN
 65 |   expected <- list(
 66 |     precision_pairs = NaN,
 67 |     recall_pairs = 0.0,
 68 |     specificity_pairs = NaN,
 69 |     sensitivity_pairs = 0.0,
 70 |     f_measure_pairs = NaN,
 71 |     accuracy_pairs = 0.0,
 72 |     balanced_accuracy_pairs = NaN,
 73 |     fowlkes_mallows_pairs = NaN
 74 |   )
 75 |   list(true = rbind(c(1, 2), c(1, 3), c(2, 3)),
 76 |        pred = matrix(0L, nrow = 0, ncol = 2), num_pairs = 3,
 77 |        true_measures = expected, description = "pairs with zero recall")
 78 | }
 79 | 
 80 | make_pairs_one_fp <- function() {
 81 |   # The prediction holds all four true pairs plus the extra pair (1, 4)
 82 |   actual <- rbind(c(1, 2), c(1, 3), c(2, 3), c(4, 5))
 83 |   expected <- list(
 84 |     precision_pairs = 4/5,
 85 |     recall_pairs = 1.0,
 86 |     specificity_pairs = 5/6,
 87 |     sensitivity_pairs = 1.0,
 88 |     f_measure_pairs = 8/9,
 89 |     accuracy_pairs = 9/10,
 90 |     balanced_accuracy_pairs = 11/12,
 91 |     fowlkes_mallows_pairs = 2/sqrt(5)
 92 |   )
 93 |   list(true = actual, pred = rbind(actual, c(1, 4)), num_pairs = 10,
 94 |        true_measures = expected,
 95 |        description = "pairs with one false positive error")
 96 | }
 97 | 
 98 | make_pairs_no_true <- function() {
 99 |   # There are no true pairs: precision, specificity and accuracy are expected
100 |   # to be 0 and every other measure NaN
101 |   expected <- list(
102 |     precision_pairs = 0.0,
103 |     recall_pairs = NaN,
104 |     specificity_pairs = 0.0,
105 |     sensitivity_pairs = NaN,
106 |     f_measure_pairs = NaN,
107 |     accuracy_pairs = 0.0,
108 |     balanced_accuracy_pairs = NaN,
109 |     fowlkes_mallows_pairs = NaN
110 |   )
111 |   list(true = matrix(0L, nrow = 0, ncol = 2),
112 |        pred = rbind(c(1, 2), c(1, 3), c(2, 3)), num_pairs = 3,
113 |        true_measures = expected, description = "pairs with zero precision")
114 | }
115 | 
116 | examples_to_test <- list(make_pairs_identical,
117 |                          make_pairs_distinct,
118 |                          make_pairs_no_pred,
119 |                          make_pairs_no_true,
120 |                          make_pairs_one_fp)
121 | 
122 | # Measures to check, keyed by the context label reported for each. `fn` names
123 | # both the function under test and its entry in an example's `true_measures`;
124 | # `needs_num_pairs` marks the measures that also receive `num_pairs`.
125 | pair_measures_to_test <- list(
126 |   "Precision of Linked Pairs" =
127 |     list(fn = "precision_pairs", needs_num_pairs = FALSE),
128 |   "Recall of Linked Pairs" =
129 |     list(fn = "recall_pairs", needs_num_pairs = FALSE),
130 |   "Specificity of Linked Pairs" =
131 |     list(fn = "specificity_pairs", needs_num_pairs = TRUE),
132 |   "Sensitivity of Linked Pairs" =
133 |     list(fn = "sensitivity_pairs", needs_num_pairs = FALSE),
134 |   "F-Measure of Linked Pairs" =
135 |     list(fn = "f_measure_pairs", needs_num_pairs = FALSE),
136 |   "Accuracy of Linked Pairs" =
137 |     list(fn = "accuracy_pairs", needs_num_pairs = TRUE),
138 |   "Balanced Accuracy of Linked Pairs" =
139 |     list(fn = "balanced_accuracy_pairs", needs_num_pairs = TRUE),
140 |   "Fowlkes-Mallows Index of Linked Pairs" =
141 |     list(fn = "fowlkes_mallows_pairs", needs_num_pairs = FALSE)
142 | )
143 | 
144 | # One loop replaces eight copies of the same test body, so a new measure or
145 | # example only has to be registered once.
146 | for (context_label in names(pair_measures_to_test)) {
147 |   context(context_label)
148 |   measure <- pair_measures_to_test[[context_label]]
149 |   measure_fn <- get(measure$fn, mode = "function")
150 |   for (make_example in examples_to_test) {
151 |     example <- make_example()
152 |     test_that(paste("measure is correct for", example$description), {
153 |       args <- list(example$true, example$pred)
154 |       if (measure$needs_num_pairs) args <- c(args, list(example$num_pairs))
155 |       expect_equal(do.call(measure_fn, args),
156 |                    example$true_measures[[measure$fn]])
157 |     })
158 |   }
159 | }
160 | 


--------------------------------------------------------------------------------
/R/transformations.R:
--------------------------------------------------------------------------------
  1 | #' Transform Clustering Representations
  2 | #'
  3 | #' @description
  4 | #' Transform between different representations of a clustering.
  5 | #'
  6 | #' @param clusters a representation of a clustering as a list of vectors,
  7 | #'   where the i-th vector contains the identifiers of elements assigned to the
  8 | #'   i-th cluster. If `clust_ids` is specified (see below), the i-th cluster
  9 | #'   is identified according to the corresponding entry in `clust_ids`.
 10 | #'   Otherwise the i-th cluster is identified according to its name (if
 11 | #'   `clusters` is a named list) or its integer index i.
 12 | #' @param membership a representation of a clustering as a membership vector,
 13 | #'   where the i-th entry contains the cluster identifier for the i-th element.
 14 | #'   If `elem_ids` is specified (see below), the i-th element is identified
 15 | #'   according to the corresponding entry in `elem_ids`. Otherwise the i-th
 16 | #'   element is identified according to its name (if `membership` is a named vector)
 17 | #'   or its integer index i.
 18 | #' @param pairs a representation of a clustering as a matrix or data.frame
 19 | #'   containing all pairs of elements that are co-clustered. The rows
 20 | #'   of the matrix/data.frame index pairs and the columns index the identifiers
 21 | #'   of the constituent elements. The `elem_ids` argument (see below) must be
 22 | #'   specified in order to recover singleton clusters (containing a single
 23 | #'   element).
 24 | #' @param elem_ids a vector specifying the complete set of identifiers for the
 25 | #'   cluster elements in canonical order. Optional for all functions excluding
 26 | #'   `pairs_to_membership` and `pairs_to_clusters`.
 27 | #' @param clust_ids a vector specifying the complete set of identifiers for
 28 | #'   the clusters in canonical order. Optional for all functions.
 29 | #' @return `clusters_to_membership` and `pairs_to_membership` both return a
 30 | #'   membership vector representation of the clustering. The order of the
 31 | #'   elements is taken from `elem_ids` if specified, otherwise the elements are
 32 | #'   ordered lexicographically by their identifiers. For
 33 | #'   `pairs_to_membership`, the cluster identifiers cannot be recovered and
 34 | #'   are taken to be integers.
 35 | #'
 36 | #'   `membership_to_clusters` and `pairs_to_clusters` both return a
 37 | #'   representation of the clustering as a list of vectors. The order of the
 38 | #'   clusters is taken from `clust_ids` if specified, otherwise the clusters
 39 | #'   are ordered lexicographically by their identifiers. For
 40 | #'   `pairs_to_clusters`, the cluster identifiers cannot be recovered and
 41 | #'   are taken to be integers.
 42 | #'
 43 | #'   `clusters_to_pairs` and `membership_to_pairs` both return a
 44 | #'   representation of the clustering as a matrix of element pairs that are
 45 | #'   co-clustered. This representation results in loss of information, as
 46 | #'   singleton clusters (with one element) and cluster identifiers are not
 47 | #'   represented.
 48 | #'
 49 | #' @examples
 50 | #' ## A clustering of three items represented as a membership vector
 51 | #' m <- c("Item1" = 1, "Item2" = 2, "Item3" = 1)
 52 | #'
 53 | #' # Transform to list of clusters
 54 | #' membership_to_clusters(m)
 55 | #' # Specify different identifiers for the items
 56 | #' membership_to_clusters(m, elem_ids = c(1, 2, 3))
 57 | #' # Transform to array of pairs that are co-clustered
 58 | #' membership_to_pairs(m)
 59 | #'
 60 | #' ## A clustering represented as a list of clusters
 61 | #' cl <- list("ClustA" = c(1,3), "ClustB" = c(2))
 62 | #'
 63 | #' # Transform to membership vector representation
 64 | #' clusters_to_membership(cl)
 65 | #' # Transform to array of pairs that are co-clustered
 66 | #' clusters_to_pairs(cl)
 67 | #'
 68 | #' ## A clustering (incompletely) represented as an array of pairs that
 69 | #' ## are co-clustered
 70 | #' p <- rbind(c(1,3)) # pairs of elements in the same cluster
 71 | #' ids <- c(1,2,3)    # necessary to specify set of all elements
 72 | #'
 73 | #' # Transform to membership vector representation
 74 | #' pairs_to_membership(p, ids)
 75 | #' # Transform to list of clusters
 76 | #' pairs_to_clusters(p, ids)
 77 | #'
 78 | #' @export
 79 | #' @importFrom stats na.fail
 80 | #' @rdname clustering_representations
 81 | clusters_to_membership <- function(clusters, elem_ids = NULL, clust_ids = NULL)
 82 | {
 83 |   if (!is.null(clust_ids)) {
 84 |     # Check provided clust_ids for consistency
 85 |     if (length(clust_ids) != length(clusters))
 86 |       stop("`clust_ids` must be the same length as `clusters`")
 87 |     tryCatch(na.fail(clust_ids), error = function(e)
 88 |       stop("`clust_ids` cannot contain NA values"))
 89 |   } else if (!is.null(names(clusters))) {
 90 |     # Infer clust_ids from the names of the list when it has any ...
 91 |     clust_ids <- names(clusters)
 92 |   } else {
 93 |     # ... falling back to integer ids
 94 |     clust_ids <- seq_along(clusters)
 95 |   }
 96 | 
 97 |   # vapply always returns an integer vector; sapply returns list() for an
 98 |   # empty `clusters`, which breaks the sum() below
 99 |   clust_sizes <- vapply(clusters, length, integer(1))
100 |   if (!is.null(elem_ids)) {
101 |     if (sum(clust_sizes) != length(elem_ids))
102 |       stop("`elem_ids` does not match number of elements in `clusters`")
103 |     tryCatch(na.fail(elem_ids), error = function(e)
104 |       stop("`elem_ids` cannot contain NA values"))
105 |   }
106 | 
107 |   membership <- rep(clust_ids, times = clust_sizes)
108 |   names(membership) <- as.character(unlist(clusters))
109 | 
110 |   if (is.null(elem_ids)) {
111 |     # Order lexicographically by name
112 |     return(membership[order(names(membership))])
113 |   }
114 | 
115 |   # Use order in elem_ids; an id absent from `clusters` would otherwise
116 |   # silently produce an NA entry
117 |   if (!all(as.character(elem_ids) %in% names(membership)))
118 |     stop("`elem_ids` contains identifiers that do not appear in `clusters`")
119 |   membership[as.character(elem_ids)]
120 | }
121 | 
122 | 
123 | #' @importFrom stats na.fail
124 | #' @export
125 | #' @rdname clustering_representations
126 | membership_to_clusters <- function(membership, elem_ids = NULL, clust_ids = NULL) {
127 |   # Converts a membership vector into a list holding one vector of element
128 |   # identifiers per cluster, named by cluster identifier.
129 | 
130 |   # Settle on the element identifiers: explicit `elem_ids` win, then the
131 |   # names of `membership`, then the integer positions
132 |   if (is.null(elem_ids)) {
133 |     member_names <- names(membership)
134 |     if (is.null(member_names)) {
135 |       elem_ids <- seq_along(membership)
136 |     } else {
137 |       elem_ids <- member_names
138 |     }
139 |   } else {
140 |     if (length(elem_ids) != length(membership))
141 |       stop("`elem_ids` must be the same length as `membership`")
142 |     tryCatch(na.fail(elem_ids), error = function(e)
143 |       stop("`elem_ids` cannot contain NA values"))
144 |   }
145 | 
146 |   grouped <- split(elem_ids, membership)
147 | 
148 |   if (is.null(clust_ids)) {
149 |     # No canonical order supplied: sort the clusters lexicographically by name
150 |     return(grouped[order(names(grouped))])
151 |   }
152 | 
153 |   # Canonical order supplied: reject NA ids, then pick clusters in that order
154 |   tryCatch(na.fail(clust_ids), error = function(e)
155 |     stop("`clust_ids` cannot contain NA values"))
156 |   grouped[as.character(clust_ids)]
157 | }
158 | 
159 | 
160 | #' @importFrom utils combn
161 | #' @export
162 | #' @rdname clustering_representations
163 | clusters_to_pairs <- function(clusters) {
164 |   # Lists every unordered pair of elements that share a cluster.
165 | 
166 |   # Only clusters with at least two elements contribute pairs
167 |   multi_member <- Filter(function(members) length(members) > 1, clusters)
168 | 
169 |   if (length(multi_member) > 0) {
170 |     # Build a ? x 2 array of pairs per cluster, then stack them into one array
171 |     per_cluster <- lapply(multi_member, function(members) t(combn(members, 2)))
172 |     stacked <- do.call(rbind, per_cluster)
173 |     return(canonicalize_pairs(stacked))
174 |   }
175 | 
176 |   # No pairs to return: produce an empty 0 x 2 array instead
177 |   if (length(clusters) == 0) {
178 |     # No clusters at all
179 |     filler <- 0L
180 |   } else {
181 |     # All clusters are singletons: keep the type of the element identifiers
182 |     filler <- vector(mode = typeof(clusters[[1]]))
183 |   }
184 | 
185 |   empty_pairs <- array(dim = c(0, 2), data = filler)
186 |   return(empty_pairs)
187 | }
188 | 
189 | 
190 | #' @export
191 | #' @rdname clustering_representations
192 | membership_to_pairs <- function(membership, elem_ids = NULL) {
193 |   # Go through the list-of-clusters form, which knows how to enumerate pairs
194 |   as_clusters <- membership_to_clusters(membership, elem_ids = elem_ids)
195 |   clusters_to_pairs(as_clusters)
196 | }
197 | 
198 | 
199 | #' @importFrom stats na.fail na.omit na.action
200 | #' @export
201 | #' @rdname clustering_representations
202 | pairs_to_membership <- function(pairs, elem_ids) {
203 |   # Need to convert to matrix in order for factor to work below
204 |   pairs <- as.matrix(pairs)
205 |   pairs <- na.omit(pairs)
206 | 
207 |   if (ncol(pairs) != 2) stop("`pairs` must have exactly two columns")
208 |   if (length(na.action(pairs)) != 0)
209 |     warning("rows with NA values were removed from `pairs`")
210 | 
211 |   tryCatch(na.fail(elem_ids), error = function(e) stop("`elem_ids` cannot contain NA values"))
212 | 
213 |   # The factor levels are the identifiers that occur in `pairs`
214 |   original_dim <- dim(pairs)
215 |   pairs <- factor(pairs)
216 |   pairs_elem_ids <- levels(pairs)
217 |   char_elem_ids <- as.character(elem_ids)
218 | 
219 |   # Reject inconsistent input up front: the naming step below assumes every
220 |   # identifier occurs exactly once in `elem_ids`
221 |   if (anyDuplicated(char_elem_ids) > 0)
222 |     stop("`elem_ids` cannot contain duplicate values")
223 |   if (!all(pairs_elem_ids %in% char_elem_ids))
224 |     stop("`pairs` contains element identifiers that are missing from `elem_ids`")
225 | 
226 |   # Represent the identifiers in pairs as integers starting at 0
227 |   pairs <- unclass(pairs) - 1
228 |   dim(pairs) <- original_dim
229 |   # R indexing starts at 1
230 |   membership <- pairs_to_membership_cpp(pairs, length(elem_ids)) + 1
231 |   # Name the entries (ids from pairs first, then singletons), then sort by elem_ids
232 |   singleton_elem_ids <- setdiff(char_elem_ids, pairs_elem_ids)
233 |   names(membership) <- c(pairs_elem_ids, singleton_elem_ids)
234 |   membership[char_elem_ids]
235 | }
236 | 
237 | 
238 | #' @export
239 | #' @rdname clustering_representations
240 | pairs_to_clusters <- function(pairs, elem_ids) {
241 |   # Recover the membership vector first, then regroup the elements by cluster
242 |   by_element <- pairs_to_membership(pairs, elem_ids)
243 |   # The clusters are returned as an un-named list
244 |   unname(membership_to_clusters(by_element, elem_ids = elem_ids))
245 | }
246 | 
247 | 
248 | #' Canonicalize element pairs
249 | #'
250 | #' @description
251 | #' Coerce a collection of element pairs into canonical form. Facilitates
252 | #' testing of equivalence.
253 | #'
254 | #' @param pairs a matrix or data.frame of element pairs where rows correspond
255 | #'   to element pairs and columns correspond to element identifiers.
256 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
257 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
258 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
259 | #'   prediction, record linkage etc.
260 | #' @return Returns the element pairs as a matrix in canonical form, so that:
261 | #'   * the first element id precedes the second element id lexicographically
262 | #'     if `ordered = FALSE`---i.e. pair (3, 2) becomes pair (2, 3);
263 | #'   * duplicate pairs are removed; and
264 | #'   * the rows of the matrix are sorted lexicographically by the first
265 | #'     element id, then by the second element id.
266 | #'
267 | #'   Column names are kept. A pair with a missing id is never swapped.
268 | #'
269 | #' @examples
270 | #' messy_pairs <- rbind(c(2,1), c(1,2), c(3,1), c(1,2))
271 | #' clean_pairs <- canonicalize_pairs(messy_pairs)
272 | #' all(rbind(c(1,2), c(1,3)) == clean_pairs) # duplicates removed and order fixed
273 | #'
274 | #' @export
275 | canonicalize_pairs <- function(pairs, ordered = FALSE) {
276 |   if (is.null(dim(pairs)) || ncol(pairs) != 2) {
277 |     stop("`pairs` must have exactly two columns")
278 |   }
279 |   pairs <- as.matrix(pairs)
280 |   if (nrow(pairs) == 0) return(pairs)
281 | 
282 |   if (!ordered) {
283 |     # Swap whole rows rather than `apply(pairs, 1, sort)`: sorting each row
284 |     # can drop or permute column names and fails on `NA`. `which()` skips
285 |     # rows whose comparison is `NA`, so those are left as given.
286 |     swap <- which(pairs[, 1] > pairs[, 2])
287 |     pairs[swap, ] <- pairs[swap, 2:1, drop = FALSE]
288 |   }
289 | 
290 |   # Remove duplicate rows, then sort rows by first column and then second
291 |   pairs <- unique(pairs)
292 |   pairs[order(pairs[, 1], pairs[, 2]), , drop = FALSE]
293 | }
294 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 


--------------------------------------------------------------------------------
/R/measures_clusterings.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' @importFrom stats xtabs
  3 | #' @importFrom Matrix rowSums colSums crossprod
  4 | #' @noRd
  5 | pair_contingency_table_clusters <- function(true, pred) {
  6 |   # 2 x 2 table of pair counts: linked in `pred` (rows) vs `true` (columns).
  7 |   # Counts are over ordered pairs, so each unordered pair is counted twice.
  8 |   if (length(true) != length(pred))
  9 |     stop("`true` and `pred` must have the same length")
 10 |   # TODO: NA treatment
 11 |   # Name the columns after the formula terms so `xtabs` reads them from `data`
 12 |   data <- data.frame("true" = true, "pred" = pred, stringsAsFactors = FALSE)
 13 |   ct <- xtabs(~ pred + true, data = data, sparse = TRUE)
 14 |   sizes_true <- colSums(ct)
 15 |   sizes_pred <- rowSums(ct)
 16 |   sum_squares <- sum(ct^2)
 17 |   num_items <- length(true)
 18 |   pair_ct <- matrix(nrow = 2, ncol = 2, data = NA_integer_)
 19 |   pair_ct[1, 1] <- sum_squares - num_items # TP (self-pairs removed)
 20 |   pair_ct[2, 1] <- sum(ct %*% sizes_true) - sum_squares # FN: only in `true`
 21 |   pair_ct[1, 2] <- sum(crossprod(ct, sizes_pred)) - sum_squares # FP: only in `pred`
 22 |   pair_ct[2, 2] <- num_items^2 - pair_ct[1, 2] - pair_ct[2, 1] - sum_squares # TN
 23 |   dimnames(pair_ct) <- list("Prediction" = c("TRUE", "FALSE"), "Truth" = c("TRUE", "FALSE"))
 24 |   as.table(pair_ct)
 25 | }
 26 | 
 27 | 
 28 | #' Contingency Table for Clusterings
 29 | #'
 30 | #' @description Cross-tabulate a _predicted_ clustering against a
 31 | #'   _ground truth_ clustering.
 32 | #'
 33 | #' @param true membership vector for the ground truth clustering. There
 34 | #'    is one entry per element, whose value identifies the cluster that
 35 | #'    the element is assigned to. The particular values used as cluster
 36 | #'    identifiers are arbitrary.
 37 | #' @param pred membership vector for the predicted clustering, in the
 38 | #'    same format as `true`.
 39 | #' @return A table \eqn{C} (stored as a sparse matrix) in which entry
 40 | #'    \eqn{C_{ij}}{C_ij} is the number of elements placed in cluster
 41 | #'    \eqn{i} of `pred` and cluster \eqn{j} of `true`.
 42 | #'
 43 | #' @seealso
 44 | #' [`eval_report_clusters`] reports common evaluation measures that are
 45 | #' derived from the output of this function.
 46 | #'
 47 | #' @examples
 48 | #' true <- c(1,1,1,2,2)  # ground truth clustering
 49 | #' pred <- c(1,1,2,2,2)  # predicted clustering
 50 | #' contingency_table_clusters(true, pred)
 51 | #'
 52 | #' @export
 53 | #' @importFrom stats xtabs
 54 | contingency_table_clusters <- function(true, pred) {
 55 |   if (length(true) != length(pred)) {
 56 |     stop("`true` and `pred` must have the same length")
 57 |   }
 58 | 
 59 |   # TODO: NA treatment
 60 |   # Rows of the result index clusters in `pred`, columns clusters in `true`
 61 |   frame <- data.frame("true" = true, "pred" = pred, stringsAsFactors = FALSE)
 62 |   xtabs(~ pred + true, data = frame, sparse = TRUE)
 63 | }
 64 | 
 65 | 
 66 | #' Evaluation Report for Clustering
 67 | #'
 68 | #' @description Score a predicted clustering against a ground truth
 69 | #'   clustering using a range of evaluation measures.
 70 | #'
 71 | #' @param true membership vector for the ground truth clustering. There
 72 | #'    is one entry per element, whose value identifies the cluster that
 73 | #'    the element is assigned to. Cluster identifiers are arbitrary.
 74 | #' @param pred membership vector for the predicted clustering.
 75 | #' @return A list with one entry for each of the following measures:
 76 | #'   \describe{
 77 | #'     \item{homogeneity}{see [`homogeneity`]}
 78 | #'     \item{completeness}{see [`completeness`]}
 79 | #'     \item{v_measure}{see [`v_measure`]}
 80 | #'     \item{rand_index}{see [`rand_index`]}
 81 | #'     \item{adj_rand_index}{see [`adj_rand_index`]}
 82 | #'     \item{variation_info}{see [`variation_info`]}
 83 | #'     \item{mutual_info}{see [`mutual_info`]}
 84 | #'     \item{fowlkes_mallows}{see [`fowlkes_mallows`]}
 85 | #'   }
 86 | #'
 87 | #' @examples
 88 | #' true <- c(1,1,1,2,2)  # ground truth clustering
 89 | #' pred <- c(1,1,2,2,2)  # predicted clustering
 90 | #' eval_report_clusters(true, pred)
 91 | #'
 92 | #' @export
 93 | eval_report_clusters <- function(true, pred) {
 94 |   # 2 x 2 table of pair counts, used by the Rand-type indices
 95 |   pairwise <- pair_contingency_table_clusters(true, pred)
 96 |   # Cluster-by-cluster contingency table, used by every other measure
 97 |   crosstab <- contingency_table_clusters(true, pred)
 98 |   list(homogeneity = homogeneity_ct(crosstab),
 99 |        completeness = completeness_ct(crosstab),
100 |        v_measure = v_measure_ct(crosstab),
101 |        rand_index = rand_index_ct(pairwise),
102 |        adj_rand_index = adj_rand_index_ct(pairwise),
103 |        variation_info = variation_info_ct(crosstab),
104 |        mutual_info = mutual_info_ct(crosstab),
105 |        fowlkes_mallows = fowlkes_mallows_ct(crosstab))
106 | }
107 | 
108 | 
109 | #' Rand Index Between Clusterings
110 | #'
111 | #' @description Evaluates the Rand index (RI) for a pair of clusterings,
112 | #'    for instance a predicted clustering and a ground truth clustering.
113 | #'
114 | #' @details The Rand index (RI) is given by:
115 | #'   \deqn{\frac{a + b}{{n \choose 2}},}{(a + b)/binom(n, 2),}
116 | #'   where
117 | #'   * \eqn{n} is the number of elements,
118 | #'   * \eqn{a} is the number of element pairs that share a cluster in
119 | #'   both clusterings, and
120 | #'   * \eqn{b} is the number of element pairs that are split across
121 | #'   distinct clusters in both clusterings.
122 | #'
123 | #'   The RI lies between 0 and 1. A value of 1 means the clusterings agree
124 | #'   exactly, while a value of 0 means they disagree on every pair of
125 | #'   elements.
126 | #'
127 | #' @param true membership vector for the ground truth clustering. There
128 | #'    is one entry per element, whose value identifies the cluster that
129 | #'    the element is assigned to. The particular values used as cluster
130 | #'    identifiers are arbitrary.
131 | #' @param pred membership vector for the predicted clustering, in the
132 | #'    same format as `true`.
133 | #'
134 | #' @references
135 | #' Rand, W. M. "Objective Criteria for the Evaluation of Clustering Methods."
136 | #' _Journal of the American Statistical Association_ 66(336), 846-850 (1971).
137 | #' \doi{10.1080/01621459.1971.10482356}
138 | #'
139 | #' @examples
140 | #' true <- c(1,1,1,2,2)  # ground truth clustering
141 | #' pred <- c(1,1,2,2,2)  # predicted clustering
142 | #' rand_index(true, pred)
143 | #'
144 | #' @export
145 | rand_index <- function(true, pred) {
146 |   # The index depends on the clusterings only through their pair counts
147 |   rand_index_ct(pair_contingency_table_clusters(true, pred))
148 | }
149 | 
150 | 
151 | #' Adjusted Rand Index Between Clusterings
152 | #'
153 | #' @description Evaluates the adjusted Rand index (ARI) for a pair of
154 | #'    clusterings, for instance a predicted and a ground truth clustering.
155 | #'
156 | #' @details The adjusted Rand index (ARI) is a version of the Rand index
157 | #'   (RI) that is corrected for chance under the Permutation Model for
158 | #'   clusterings. It is obtained from the RI via:
159 | #'   \deqn{\frac{RI - E(RI)}{1 - E(RI)},}{(RI - E(RI))/(1 - E(RI)),}
160 | #'   where \eqn{E(RI)} denotes the expected value of the RI under the
161 | #'   Permutation Model.
162 | #'   In contrast to the RI, the ARI takes values in the range -1 to 1. A
163 | #'   value of 1 means that the clusterings are identical, whereas a value
164 | #'   of 0 means that the clusterings are drawn randomly independent of one
165 | #'   another.
166 | #'
167 | #' @param true membership vector for the ground truth clustering. There
168 | #'    is one entry per element, whose value identifies the cluster that
169 | #'    the element is assigned to. The particular values used as cluster
170 | #'    identifiers are arbitrary.
171 | #' @param pred membership vector for the predicted clustering, in the
172 | #'    same format as `true`.
173 | #'
174 | #' @examples
175 | #' true <- c(1,1,1,2,2)  # ground truth clustering
176 | #' pred <- c(1,1,2,2,2)  # predicted clustering
177 | #' adj_rand_index(true, pred)
178 | #'
179 | #' @references
180 | #' Hubert, L., Arabie, P. "Comparing partitions." _Journal of Classification_
181 | #' **2**, 193–218 (1985). \doi{10.1007/BF01908075}
182 | #'
183 | #' @export
184 | adj_rand_index <- function(true, pred) {
185 |   # Like the plain Rand index, the ARI is derived from the pair counts
186 |   adj_rand_index_ct(pair_contingency_table_clusters(true, pred))
187 | }
188 | 
189 | 
190 | #' Fowlkes-Mallows Index Between Clusterings
191 | #'
192 | #' @description Evaluates the Fowlkes-Mallows index for a pair of
193 | #'    clusterings, for instance a predicted and a ground truth clustering.
194 | #'
195 | #' @details The Fowlkes-Mallows index is the geometric mean of precision
196 | #'    and recall, where both are computed over pairs of elements.
197 | #'
198 | #' @param true membership vector for the ground truth clustering. There
199 | #'    is one entry per element, whose value identifies the cluster that
200 | #'    the element is assigned to. The particular values used as cluster
201 | #'    identifiers are arbitrary.
202 | #' @param pred membership vector for the predicted clustering, in the
203 | #'    same format as `true`.
204 | #'
205 | #' @references
206 | #' Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical
207 | #' Clusterings." _Journal of the American Statistical Association_ **78:383**,
208 | #' 553-569, (1983). \doi{10.1080/01621459.1983.10478008}
209 | #'
210 | #' @examples
211 | #' true <- c(1,1,1,2,2)  # ground truth clustering
212 | #' pred <- c(1,1,2,2,2)  # predicted clustering
213 | #' fowlkes_mallows(true, pred)
214 | #'
215 | #' @export
216 | fowlkes_mallows <- function(true, pred) {
217 |   # NOTE: works from the cluster contingency table, not the pair table
218 |   fowlkes_mallows_ct(contingency_table_clusters(true, pred))
219 | }
220 | 
221 | 
222 | 
223 | #' Homogeneity Between Clusterings
224 | #'
225 | #' @description Evaluates the homogeneity for a pair of clusterings, for
226 | #'    instance a predicted clustering and a ground truth clustering.
227 | #'
228 | #' @details Homogeneity is an entropy-based similarity measure for a pair
229 | #'    of clusterings, say \eqn{t} and \eqn{p}. It is high when clustering
230 | #'    \eqn{t} only assigns members of a cluster to a single cluster in
231 | #'    \eqn{p}. Homogeneity takes values between 0 and 1, with 1
232 | #'    indicating perfect homogeneity.
233 | #'
234 | #' @param true membership vector for the ground truth clustering. There
235 | #'    is one entry per element, whose value identifies the cluster that
236 | #'    the element is assigned to. The particular values used as cluster
237 | #'    identifiers are arbitrary.
238 | #' @param pred membership vector for the predicted clustering, in the
239 | #'    same format as `true`.
240 | #'
241 | #' @references
242 | #' Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." _Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning_ (EMNLP-CoNLL), (2007).
243 | #'
244 | #' @seealso [`completeness`] computes the _completeness_, the dual
245 | #' measure to _homogeneity_. [`v_measure`] computes the harmonic mean of
246 | #' _completeness_ and _homogeneity_.
247 | #'
248 | #' @examples
249 | #' true <- c(1,1,1,2,2)  # ground truth clustering
250 | #' pred <- c(1,1,2,2,2)  # predicted clustering
251 | #' homogeneity(true, pred)
252 | #'
253 | #' @export
254 | homogeneity <- function(true, pred) {
255 |   # Build the cluster contingency table and score it directly
256 |   homogeneity_ct(contingency_table_clusters(true, pred))
257 | }
258 | 
259 | 
260 | #' Completeness Between Clusterings
261 | #'
262 | #' @description Evaluates the completeness for a pair of clusterings, for
263 | #'    instance a predicted clustering and a ground truth clustering.
264 | #'
265 | #' @details Completeness is an entropy-based similarity measure for a pair
266 | #'    of clusterings, say \eqn{t} and \eqn{p}. It is high when _all_
267 | #'    members of a given cluster in \eqn{t} are assigned to a single
268 | #'    cluster in \eqn{p}. Completeness takes values between 0 and 1,
269 | #'    with 1 indicating perfect completeness.
270 | #'
271 | #' @param true membership vector for the ground truth clustering. There
272 | #'    is one entry per element, whose value identifies the cluster that
273 | #'    the element is assigned to. The particular values used as cluster
274 | #'    identifiers are arbitrary.
275 | #' @param pred membership vector for the predicted clustering, in the
276 | #'    same format as `true`.
277 | #'
278 | #' @references
279 | #' Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." _Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning_ (EMNLP-CoNLL), (2007).
280 | #'
281 | #' @seealso [`homogeneity`] computes the _homogeneity_, the dual
282 | #' measure to _completeness_. [`v_measure`] computes the harmonic mean of
283 | #' _completeness_ and _homogeneity_.
284 | #'
285 | #' @examples
286 | #' true <- c(1,1,1,2,2)  # ground truth clustering
287 | #' pred <- c(1,1,2,2,2)  # predicted clustering
288 | #' completeness(true, pred)
289 | #'
290 | #' @export
291 | completeness <- function(true, pred) {
292 |   # Build the cluster contingency table and score it directly
293 |   completeness_ct(contingency_table_clusters(true, pred))
294 | }
295 | 
296 | 
297 | #' V-measure Between Clusterings
298 | #'
299 | #' @description Computes the V-measure between two clusterings, such
300 | #'    as a predicted and ground truth clustering.
301 | #'
302 | #' @details V-measure is defined as the \eqn{\beta}{β}-weighted harmonic
303 | #'    mean of homogeneity \eqn{h} and completeness \eqn{c}:
304 | #'    \deqn{(1 + \beta)\frac{h \cdot c}{\beta \cdot h + c}.}{(1 + β)·h·c/(β·h + c).}
305 | #'    The range of V-measure is between 0 and 1, where 1 corresponds to a
306 | #'    perfect match between the clusterings. It is equivalent to the
307 | #'    normalised mutual information, when the aggregation function is the
308 | #'    arithmetic mean.
309 | #'
310 | #' @param true ground truth clustering represented as a membership
311 | #'    vector. Each entry corresponds to an element and the value identifies
312 | #'    the assigned cluster. The specific values of the cluster identifiers
313 | #'    are arbitrary.
314 | #' @param pred predicted clustering represented as a membership
315 | #'    vector.
316 | #' @param beta non-negative weight. A value of 0 assigns no weight to
317 | #'   completeness (i.e. the measure reduces to homogeneity), while larger
318 | #'   values assign increasing weight to completeness. A value of 1 weights
319 | #'   completeness and homogeneity equally.
320 | #'
321 | #' @references
322 | #' Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." _Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning_ (EMNLP-CoNLL), (2007).
323 | #'
324 | #' Becker, H. "Identification and characterization of events in social media."
325 | #' _PhD dissertation_, Columbia University, (2011).
326 | #'
327 | #' @seealso [`homogeneity`] and [`completeness`] evaluate the component
328 | #' measures upon which this measure is based.
329 | #'
330 | #' @examples
331 | #' true <- c(1,1,1,2,2)  # ground truth clustering
332 | #' pred <- c(1,1,2,2,2)  # predicted clustering
333 | #' v_measure(true, pred)
334 | #'
335 | #' @export
v_measure <- function(true, pred, beta=1) {
  # Build the cluster contingency table and hand it, together with the
  # weight, to the table-based implementation.
  overlap <- contingency_table_clusters(true, pred)
  v_measure_ct(overlap, beta = beta)
}
340 | 
341 | 
342 | #' Variation of Information Between Clusterings
343 | #'
344 | #' @description Computes the variation of information between two
345 | #'    clusterings, such as a predicted and ground truth clustering.
346 | #'
347 | #' @details Variation of information is an entropy-based distance metric
348 | #'    on the space of clusterings. It is unnormalized and varies between
349 | #'    \eqn{0} and \eqn{\log(N)}{log(N)} where \eqn{N} is the number of
350 | #'    clustered elements. Larger values of the distance metric correspond
351 | #'    to greater dissimilarity between the clusterings.
352 | #'
353 | #' @param true ground truth clustering represented as a membership
354 | #'    vector. Each entry corresponds to an element and the value identifies
355 | #'    the assigned cluster. The specific values of the cluster identifiers
356 | #'    are arbitrary.
357 | #' @param pred predicted clustering represented as a membership
358 | #'    vector.
359 | #' @param base base of the logarithm. Defaults to `exp(1)`.
360 | #'
361 | #' @references
362 | #' Arabie, P. and Boorman, S. A. "Multidimensional scaling of measures of
363 | #' distance between partitions." _Journal of Mathematical Psychology_ **10:2**,
364 | #' 148-203, (1973). \doi{10.1016/0022-2496(73)90012-6}
365 | #'
366 | #' Meilă, M. "Comparing Clusterings by the Variation of Information." In:
367 | #' Learning Theory and Kernel Machines, Lecture Notes in Computer Science
368 | #' **2777**, Springer, Berlin, Heidelberg, (2003).
369 | #' \doi{10.1007/978-3-540-45167-9_14}
370 | #'
371 | #' @examples
372 | #' true <- c(1,1,1,2,2)  # ground truth clustering
373 | #' pred <- c(1,1,2,2,2)  # predicted clustering
374 | #' variation_info(true, pred)
375 | #'
376 | #' @export
variation_info <- function(true, pred, base=exp(1)) {
  # Build the cluster contingency table and hand it, together with the
  # logarithm base, to the table-based implementation.
  overlap <- contingency_table_clusters(true, pred)
  variation_info_ct(overlap, base = base)
}
381 | 
382 | 
383 | #' Mutual Information Between Clusterings
384 | #'
385 | #' @description Computes the mutual information between two
386 | #'    clusterings, such as a predicted and ground truth clustering.
387 | #'
388 | #' @details Mutual information is an entropy-based measure of the similarity
389 | #'    between two clusterings.
390 | #'
391 | #' @param true ground truth clustering represented as a membership
392 | #'    vector. Each entry corresponds to an element and the value identifies
393 | #'    the assigned cluster. The specific values of the cluster identifiers
394 | #'    are arbitrary.
395 | #' @param pred predicted clustering represented as a membership
396 | #'    vector.
397 | #' @param base base of the logarithm. Defaults to `exp(1)`.
398 | #'
399 | #' @examples
400 | #' true <- c(1,1,1,2,2)  # ground truth clustering
401 | #' pred <- c(1,1,2,2,2)  # predicted clustering
402 | #' mutual_info(true, pred)
403 | #'
404 | #' @export
mutual_info <- function(true, pred, base=exp(1)) {
  # Build the cluster contingency table and hand it, together with the
  # logarithm base, to the table-based implementation.
  overlap <- contingency_table_clusters(true, pred)
  mutual_info_ct(overlap, base = base)
}
409 | 
410 | 
411 | # Definition of clustering measures in terms of contingency tables
rand_index_ct <- function(pair_ct) {
  # Diagonal cells hold the pairs on which the two labellings agree.
  n_agree <- sum(diag(pair_ct))
  n_pairs <- sum(pair_ct)

  # Degenerate inputs (no pairs at all, or agreement on every pair) are
  # scored as a perfect match.
  if (n_pairs == 0 || n_agree == n_pairs) {
    return(1.0)
  }

  n_agree / n_pairs
}
424 | 
adj_rand_index_ct <- function(pair_ct) {
  # Adjusted Rand index from a 2x2 pair contingency table whose rows and
  # columns are both labelled "TRUE" / "FALSE".
  #
  # The cells are coerced to double before any arithmetic: tables produced
  # by `table()` hold 32-bit integers, and the products below would overflow
  # to NA (with a warning) once the counts reach the tens of thousands.
  tp <- as.numeric(pair_ct["TRUE", "TRUE"])
  fp <- as.numeric(pair_ct["TRUE", "FALSE"])
  fn <- as.numeric(pair_ct["FALSE", "TRUE"])
  tn <- as.numeric(pair_ct["FALSE", "FALSE"])

  # Special cases: empty data or full agreement
  if (fn == 0 && fp == 0) return(1.0)

  2 * (tp * tn - fn * fp) /
    ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))
}
437 | 
438 | 
439 | #' @param ct contingency table represented as a sparse matrix, specifically
440 | #'   an object of S4 class [`Matrix::dgCMatrix-class`]
441 | #' @importFrom Matrix colSums
442 | #' @noRd
homogeneity_ct <- function(ct) {
  # Entropy of the ground-truth labelling, read off the column margins.
  h_true <- entropy_counts(colSums(ct))
  # With zero entropy the ratio below is undefined; score it as perfect.
  if (h_true == 0) {
    return(1.0)
  }
  mutual_info_ct(ct) / h_true
}
450 | 
451 | 
452 | #' @param ct contingency table represented as a sparse matrix, specifically
453 | #'   an object of S4 class [`Matrix::dgCMatrix-class`]
454 | #' @importFrom Matrix rowSums
455 | #' @noRd
completeness_ct <- function(ct) {
  # Entropy of the predicted labelling, read off the row margins.
  h_pred <- entropy_counts(rowSums(ct))
  # With zero entropy the ratio below is undefined; score it as perfect.
  if (h_pred == 0) {
    return(1.0)
  }
  mutual_info_ct(ct) / h_pred
}
463 | 
464 | 
465 | #' @param ct contingency table represented as a sparse matrix, specifically
466 | #'   an object of S4 class [`Matrix::dgCMatrix-class`]
467 | #' @importFrom Matrix rowSums colSums which
468 | #' @noRd
fowlkes_mallows_ct <- function(ct) {
  total <- sum(ct)
  # Sums of squared counts over the cells, the row margins and the column
  # margins, each reduced by the grand total.
  sq_cells <- sum(ct^2) - total
  sq_rows <- sum(rowSums(ct)^2) - total
  sq_cols <- sum(colSums(ct)^2) - total
  # A zero numerator is reported as 0 rather than evaluating the ratios.
  ifelse(sq_cells == 0, 0.0,
         sqrt(sq_cells / sq_rows) * sqrt(sq_cells / sq_cols))
}
476 | 
477 | 
478 | #' @param ct contingency table represented as a sparse matrix, specifically
479 | #'   an object of S4 class [`Matrix::dgCMatrix-class`]
480 | #' @importFrom Matrix rowSums colSums which
481 | #' @noRd
v_measure_ct <- function(ct, beta=1.0) {
  # V-measure of a cluster contingency table: the beta-weighted harmonic
  # mean of homogeneity h and completeness c,
  #   (1 + beta) * h * c / (beta * h + c),
  # following Rosenberg & Hirschberg (2007). Note the weight is `beta`, not
  # `beta^2` as in the F-measure.
  if (beta < 0)
    stop("`beta` must be non-negative")

  # Marginal entropies: columns are treated as the true clustering, rows as
  # the predicted one.
  entropy_true <- entropy_counts(colSums(ct))
  entropy_pred <- entropy_counts(rowSums(ct))
  mi <- mutual_info_ct(ct)

  # A clustering with zero entropy makes the corresponding ratio undefined;
  # it is scored as 1.
  homogeneity <- if (entropy_true == 0) 1.0 else mi / entropy_true
  completeness <- if (entropy_pred == 0) 1.0 else mi / entropy_pred

  denom <- beta * homogeneity + completeness
  # The denominator vanishes only when completeness is 0 and either beta or
  # homogeneity is 0 too. In both situations the limiting value of the
  # measure equals homogeneity, so return it instead of 0/0.
  if (denom == 0) return(homogeneity)

  (1 + beta) * homogeneity * completeness / denom
}
493 | 
494 | 
495 | #' @param ct contingency table represented as a sparse matrix, specifically
496 | #'   an object of S4 class [`Matrix::dgCMatrix-class`]
497 | #' @param base base of the logarithm. Defaults to `exp(1)`.
498 | #' @importFrom Matrix rowSums colSums
499 | #' @noRd
variation_info_ct <- function(ct, base=exp(1)) {
  # Marginal entropies of the two clusterings (column and row margins).
  h_true <- entropy_counts(colSums(ct), base=base)
  h_pred <- entropy_counts(rowSums(ct), base=base)
  shared <- mutual_info_ct(ct, base=base)
  # Sum of the marginal entropies minus twice the mutual information.
  dist <- h_true + h_pred - 2 * shared
  # Negative results are replaced by zero.
  ifelse(dist >= 0, dist, 0.0)
}
510 | 
511 | 
512 | #' @param counts numeric vector of counts for categories
513 | #' @param base base of the logarithm. Defaults to `exp(1)`.
514 | #' @noRd
entropy_counts <- function(counts, base=exp(1)) {
  # Empty categories are dropped: they contribute nothing to the entropy and
  # would otherwise feed log(0) into the sum.
  positive <- counts[counts > 0]
  n <- sum(positive)
  weights <- positive / n
  # log of each category's proportion, written as a difference of logs
  log_share <- log(positive, base=base) - log(n, base=base)
  - sum(weights * log_share)
}
520 | 
521 | 
522 | #' @param ct contingency table represented as a sparse matrix, specifically
523 | #'   an object of S4 class [`Matrix::dgCMatrix-class`]
524 | #' @param base base of the logarithm. Defaults to `exp(1)`.
525 | #' @importFrom Matrix rowSums colSums which
526 | #' @noRd
mutual_info_ct <- function(ct, base=exp(1)) {
  n <- sum(ct)
  row_margin <- rowSums(ct)
  col_margin <- colSums(ct)

  # Row/column positions of the positive cells. These are paired below with
  # the values in the sparse matrix's `x` slot.
  # NOTE(review): this assumes `ct@x` holds exactly the positive cells, in
  # the same order `which()` reports them -- confirm no explicit zeros occur.
  pos <- which(ct > 0, arr.ind = TRUE, useNames = FALSE)
  cell <- ct@x
  cell_share <- cell / n

  log_n <- log(n, base=base)
  log_cell <- log(cell, base=base)
  # Product of the row and column margins belonging to each positive cell
  margin_prod <- row_margin[pos[,1]] * col_margin[pos[,2]]
  log_margin <- - log(margin_prod, base=base) + 2 * log_n

  mi <- sum(cell_share * (log_cell - log_n) + cell_share * log_margin)
  # Negative results are replaced by zero.
  ifelse(mi >= 0, mi, 0.0)
}
541 | 


--------------------------------------------------------------------------------
/R/measures_pairs.R:
--------------------------------------------------------------------------------
  1 | #' @include transformations.R
  2 | NULL
  3 | 
  4 | #' Binary Contingency Table for Linked Pairs
  5 | #'
  6 | #' @description Compute the binary contingency table for a set of _predicted_
  7 | #'   coreferent (linked) pairs given a set of _ground truth_ coreferent pairs.
  8 | #'
  9 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
 10 | #'   data.frame, where rows index pairs and columns index the ids of the
 11 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
 12 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
 13 | #'   automatically removed.
 14 | #' @param pred_pairs set of predicted coreferent pairs, following the same
 15 | #'   specification as `true_pairs`.
 16 | #' @param num_pairs the total number of coreferent and non-coreferent pairs,
 17 | #'   excluding equivalent pairs with reversed ids. If not provided,
 18 | #'   the true negative cell will be set to `NA`.
 19 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
 20 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
 21 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
 22 | #'   prediction, record linkage etc.
 23 | #' @return Returns a \eqn{2 \times 2}{2×2} contingency table of the form:
 24 | #' \preformatted{
 25 | #'              Truth
 26 | #'    Prediction   TRUE  FALSE
 27 | #'         TRUE      TP     FP
 28 | #'         FALSE     FN     TN
 29 | #' }
 30 | #'
 31 | #' @seealso
 32 | #' The [`membership_to_pairs`] and [`clusters_to_pairs`] functions can be
 33 | #' used to transform other clustering representations into lists of pairs,
 34 | #' as required by this function.
 35 | #' The [`eval_report_pairs`] function computes common evaluation measures
 36 | #' derived from binary contingency matrices, like the ones output by this
 37 | #' function.
 38 | #'
 39 | #' @examples
 40 | #' ### Example where pairs/edges are undirected
 41 | #' # ground truth is 3-clique
 42 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3))
 43 | #' # prediction misses one edge
 44 | #' pred_pairs <- rbind(c(1,2), c(2,3))
 45 | #' # total number of pairs assuming 3 elements
 46 | #' num_pairs <- 3 * (3 - 1) / 2
#' contingency_table_pairs(true_pairs, pred_pairs, num_pairs)
 48 | #'
 49 | #' ### Example where pairs/edges are directed
 50 | #' # ground truth is a 3-star
 51 | #' true_pairs <- rbind(c(2,1), c(3,1), c(4,1))
 52 | #' # prediction gets direction of one edge incorrect
 53 | #' pred_pairs <- rbind(c(2,1), c(3,1), c(1,4))
 54 | #' # total number of pairs assuming 4 elements
 55 | #' num_pairs <- 4 * 4
#' contingency_table_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE)
 57 | #'
 58 | #' @export
 59 | contingency_table_pairs <- function(true_pairs, pred_pairs, num_pairs=NULL, ordered=FALSE) {
 60 |   if (!is.null(num_pairs)) {
 61 |     if (length(num_pairs) != 1 | num_pairs <= 0)
 62 |       stop("num_pairs must be a positive scalar or NULL")
 63 |   }
 64 | 
 65 |   # Binding pairs ensures that they are coerced to the same type
 66 |   comb_pairs <- rbind(true_pairs, pred_pairs)
 67 |   true_pairs <- comb_pairs[seq_len(nrow(true_pairs)),]
 68 |   pred_pairs <- comb_pairs[nrow(true_pairs) + seq_len(nrow(pred_pairs)),]
 69 | 
 70 |   # Canonicalize pairs
 71 |   pred_pairs <- as.data.frame(canonicalize_pairs(pred_pairs, ordered = ordered))
 72 |   true_pairs <- as.data.frame(canonicalize_pairs(true_pairs, ordered = ordered))
 73 | 
 74 |   # Standardize column names
 75 |   colnames(pred_pairs) <- c("ID.x", "ID.y")
 76 |   colnames(true_pairs) <- c("ID.x", "ID.y")
 77 | 
 78 |   # Allow for empty data frames
 79 |   pred_pairs[["PRED_MATCH"]] <- rep(TRUE, times=nrow(pred_pairs))
 80 |   true_pairs[["MATCH"]] <- rep(TRUE, times=nrow(true_pairs))
 81 | 
 82 |   # Perform a full outer join on the two data frames.
 83 |   merged_pairs <- merge(pred_pairs, true_pairs, by=c("ID.x", "ID.y"), all=TRUE)
 84 | 
 85 |   # An NA in PRED_MATCH or MATCH represents 'FALSE'
 86 |   merged_pairs$PRED_MATCH[is.na(merged_pairs$PRED_MATCH)] <- FALSE
 87 |   merged_pairs$MATCH[is.na(merged_pairs$MATCH)] <- FALSE
 88 | 
 89 |   # Convert to factors so we can use built-in table function
 90 |   prediction = factor(merged_pairs$PRED_MATCH, levels = c(TRUE, FALSE))
 91 |   truth = factor(merged_pairs$MATCH, levels = c(TRUE, FALSE))
 92 | 
 93 |   ct <- table(prediction, truth, dnn = c("Prediction", "Truth"))
 94 | 
 95 |   if (is.null(num_pairs)) {
 96 |     ct[2,2] <- NA # number of true negatives is unknown since links are incomplete
 97 |   } else {
 98 |     ct[2,2] <- num_pairs - nrow(merged_pairs)
 99 |   }
100 | 
101 |   return(ct)
102 | }
103 | 
104 | 
105 | #' Evaluation Report for Linked Pairs
106 | #'
107 | #' @description Compute various evaluation measures for a set of _predicted_
108 | #'   coreferent (linked) pairs given a set of _ground truth_ coreferent pairs.
109 | #'
110 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
111 | #'   data.frame, where rows index pairs and columns index the ids of the
112 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
113 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
114 | #'   automatically removed.
115 | #' @param pred_pairs set of predicted coreferent pairs, following the same
116 | #'   specification as `true_pairs`.
117 | #' @param num_pairs the total number of coreferent and non-coreferent pairs,
118 | #'   excluding equivalent pairs with reversed ids. If not provided,
119 | #'   measures that depend on the number of true negatives will be returned
120 | #'   as `NA`.
121 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
122 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
123 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
124 | #'   prediction, record linkage etc.
125 | #'
126 | #' @return Returns a list containing the following measures:
127 | #'   \describe{
128 | #'     \item{precision}{see [`precision_pairs`]}
129 | #'     \item{recall}{see [`recall_pairs`]}
130 | #'     \item{specificity}{see [`specificity_pairs`]}
131 | #'     \item{sensitivity}{see [`sensitivity_pairs`]}
132 | #'     \item{f1score}{see [`f_measure_pairs`]}
133 | #'     \item{accuracy}{see [`accuracy_pairs`]}
134 | #'     \item{balanced_accuracy}{see [`balanced_accuracy_pairs`]}
135 | #'     \item{fowlkes_mallows}{see [`fowlkes_mallows_pairs`]}
136 | #'   }
137 | #'
138 | #' @seealso The [`contingency_table_pairs`] function can be used to compute
139 | #'   the contingency table for entity resolution or record linkage problems.
140 | #'
141 | #' @examples
142 | #' ### Example where pairs/edges are undirected
143 | #' # ground truth is 3-clique
144 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3))
145 | #' # prediction misses one edge
146 | #' pred_pairs <- rbind(c(1,2), c(2,3))
147 | #' # total number of pairs assuming 3 elements
148 | #' num_pairs <- 3 * (3 - 1) / 2
149 | #' eval_report_pairs(true_pairs, pred_pairs, num_pairs)
150 | #'
151 | #' ### Example where pairs/edges are directed
152 | #' # ground truth is a 3-star
153 | #' true_pairs <- rbind(c(2,1), c(3,1), c(4,1))
154 | #' # prediction gets direction of one edge incorrect
155 | #' pred_pairs <- rbind(c(2,1), c(3,1), c(1,4))
156 | #' # total number of pairs assuming 4 elements
157 | #' num_pairs <- 4 * 4
158 | #' eval_report_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE)
159 | #'
160 | #' @export
eval_report_pairs <- function(true_pairs, pred_pairs, num_pairs = NULL, ordered=FALSE)
{
  ct <- contingency_table_pairs(true_pairs, pred_pairs, num_pairs = num_pairs, ordered = ordered)
  # Sensitivity is reported under both of its names, so compute it once.
  recall <- recall_pairs_ct(ct)
  # Every measure is derived from the same 2x2 table. The Fowlkes-Mallows
  # index is included so the result matches the documented return value.
  list("precision" = precision_pairs_ct(ct),
       "recall" = recall,
       "specificity" = specificity_pairs_ct(ct),
       "sensitivity" = recall,
       "f1score" = f_measure_pairs_ct(ct),
       "accuracy" = accuracy_pairs_ct(ct),
       "balanced_accuracy" = balanced_accuracy_pairs_ct(ct),
       "fowlkes_mallows" = fowlkes_mallows_pairs_ct(ct))
}
172 | 
173 | 
174 | #' Precision of Linked Pairs
175 | #'
176 | #' @description Computes the precision of a set of _predicted_ coreferent
177 | #'   (linked) pairs given a set of _ground truth_ coreferent pairs.
178 | #'
179 | #' @details The precision is defined as:
180 | #'   \deqn{\frac{|T \cap P|}{|P|}}{|T ∩ P|/|P|}
181 | #'   where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the
182 | #'   set of predicted coreferent pairs.
183 | #'
184 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
185 | #'   data.frame, where rows index pairs and columns index the ids of the
186 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
187 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
188 | #'   automatically removed.
189 | #' @param pred_pairs set of predicted coreferent pairs, following the same
190 | #'   specification as `true_pairs`.
191 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
192 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
193 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
194 | #'   prediction, record linkage etc.
195 | #'
196 | #' @examples
197 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
198 | #' pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
#' precision_pairs(true_pairs, pred_pairs)
201 | #'
202 | #' @export
precision_pairs <- function(true_pairs, pred_pairs, ordered=FALSE) {
  # The total number of pairs is not supplied here, so the table is built
  # without it.
  pair_table <- contingency_table_pairs(true_pairs, pred_pairs,
                                        ordered = ordered)
  precision_pairs_ct(pair_table)
}
207 | 
208 | 
209 | #' Recall of Linked Pairs
210 | #'
#' @description Computes the recall of a set of _predicted_ coreferent
212 | #'   (linked) pairs given a set of _ground truth_ coreferent pairs.
213 | #'
214 | #' @details The recall is defined as:
215 | #'   \deqn{\frac{|T \cap P|}{|T|}}{|T ∩ P|/|T|}
216 | #'   where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the
217 | #'   set of predicted coreferent pairs.
218 | #'
219 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
220 | #'   data.frame, where rows index pairs and columns index the ids of the
221 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
222 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
223 | #'   automatically removed.
224 | #' @param pred_pairs set of predicted coreferent pairs, following the same
225 | #'   specification as `true_pairs`.
226 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
227 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
228 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
229 | #'   prediction, record linkage etc.
230 | #'
231 | #' @examples
232 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
233 | #' pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
#' recall_pairs(true_pairs, pred_pairs)
236 | #'
237 | #' @rdname recall_pairs
238 | #' @export
recall_pairs <- function(true_pairs, pred_pairs, ordered=FALSE) {
  # The total number of pairs is not supplied here, so the table is built
  # without it.
  pair_table <- contingency_table_pairs(true_pairs, pred_pairs,
                                        ordered = ordered)
  recall_pairs_ct(pair_table)
}
243 | 
244 | 
245 | #' @note `sensitivity_pairs` is an alias for `recall_pairs`.
246 | #'
247 | #' @rdname recall_pairs
248 | #' @export
sensitivity_pairs <- function(true_pairs, pred_pairs, ordered=FALSE) {
  # Pure alias: forward every argument, including `ordered`, so directed
  # pairs are treated the same way as in `recall_pairs`.
  recall_pairs(true_pairs, pred_pairs, ordered = ordered)
}
252 | 
253 | 
254 | #' F-measure of Linked Pairs
255 | #'
256 | #' @description Computes the F-measure (a.k.a. F-score) of a set of
257 | #'   _predicted_ coreferent (linked) pairs given a set of _ground truth_
258 | #'   coreferent pairs.
259 | #'
260 | #' @details The \eqn{\beta}{β}-weighted F-measure is defined as the weighted
261 | #'   harmonic mean of precision \eqn{P} and recall \eqn{R}:
262 | #'   \deqn{(1 + \beta^2)\frac{P \cdot R}{\beta^2 \cdot P + R}.}{(1 + β^2)·P·R/(β^2·P + R).}
263 | #'
264 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
265 | #'   data.frame, where rows index pairs and columns index the ids of the
266 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
267 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
268 | #'   automatically removed.
269 | #' @param pred_pairs set of predicted coreferent pairs, following the same
270 | #'   specification as `true_pairs`.
271 | #' @param beta non-negative weight. A value of 0 assigns no weight to recall
272 | #'   (i.e. the measure reduces to precision), while larger values assign
273 | #'   increasing weight to recall. A value of 1 weights precision and recall
274 | #'   equally.
275 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
276 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
277 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
278 | #'   prediction, record linkage etc.
279 | #'
280 | #' @references
281 | #' Van Rijsbergen, C. J. "Information Retrieval." (2nd ed.).
282 | #' Butterworth-Heinemann, USA, (1979).
283 | #'
284 | #' @examples
285 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
286 | #' pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
#' f_measure_pairs(true_pairs, pred_pairs)
289 | #'
290 | #' @export
f_measure_pairs <- function(true_pairs, pred_pairs, beta=1, ordered=FALSE) {
  # The total number of pairs is not supplied here, so the table is built
  # without it.
  pair_table <- contingency_table_pairs(true_pairs, pred_pairs,
                                        ordered = ordered)
  f_measure_pairs_ct(pair_table, beta = beta)
}
295 | 
296 | 
297 | #' Specificity of Linked Pairs
298 | #'
299 | #' @description Computes the specificity of a set of _predicted_ coreferent
300 | #'   (linked) pairs given a set of _ground truth_ coreferent pairs.
301 | #'
302 | #' @details The specificity is defined as:
#'   \deqn{\frac{|P' \cap T'|}{|T'|}}{|P' ∩ T'|/|T'|}
#'   where \eqn{T'} is the set of true non-coreferent pairs and \eqn{P'} is
#'   the set of predicted non-coreferent pairs.
306 | #'
307 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
308 | #'   data.frame, where rows index pairs and columns index the ids of the
309 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
310 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
311 | #'   automatically removed.
312 | #' @param pred_pairs set of predicted coreferent pairs, following the same
313 | #'   specification as `true_pairs`.
314 | #' @param num_pairs the total number of coreferent and non-coreferent pairs,
315 | #'   excluding equivalent pairs with reversed ids.
316 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
317 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
318 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
319 | #'   prediction, record linkage etc.
320 | #'
321 | #' @examples
322 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
323 | #' pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
324 | #' num_pairs <- 3                              # assuming 3 elements
325 | #' specificity_pairs(true_pairs, pred_pairs, num_pairs)
326 | #'
327 | #' @export
specificity_pairs <- function(true_pairs, pred_pairs, num_pairs, ordered=FALSE) {
  # `num_pairs` is forwarded so the table's true-negative cell is filled in.
  pair_table <- contingency_table_pairs(true_pairs, pred_pairs,
                                        num_pairs = num_pairs,
                                        ordered = ordered)
  specificity_pairs_ct(pair_table)
}
332 | 
333 | 
334 | #' Accuracy of Linked Pairs
335 | #'
336 | #' @description Computes the accuracy of a set of _predicted_ coreferent
337 | #'   (linked) pairs given a set of _ground truth_ coreferent pairs.
338 | #'
339 | #' @details The accuracy is defined as:
340 | #'   \deqn{\frac{|T \cap P| + |T' \cap P'|}{N}}{(|T ∩ P| + |T' ∩ P'|)/N}
341 | #'   where:
342 | #'   * \eqn{T} is the set of true coreferent pairs,
343 | #'   * \eqn{P} is the set of predicted coreferent pairs,
344 | #'   * \eqn{T'} is the set of true non-coreferent pairs,
345 | #'   * \eqn{P'} is the set of predicted non-coreferent pairs, and
346 | #'   * \eqn{N} is the total number of coreferent and non-coreferent pairs.
347 | #'
348 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
349 | #'   data.frame, where rows index pairs and columns index the ids of the
350 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
351 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
352 | #'   automatically removed.
353 | #' @param pred_pairs set of predicted coreferent pairs, following the same
354 | #'   specification as `true_pairs`.
355 | #' @param num_pairs the total number of coreferent and non-coreferent pairs,
356 | #'   excluding equivalent pairs with reversed ids.
357 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
358 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
359 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
360 | #'   prediction, record linkage etc.
361 | #'
362 | #' @examples
363 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
364 | #' pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
365 | #' num_pairs <- 3                              # assuming 3 elements
366 | #' accuracy_pairs(true_pairs, pred_pairs, num_pairs)
367 | #'
368 | #' @export
accuracy_pairs <- function(true_pairs, pred_pairs, num_pairs, ordered=FALSE) {
  # `num_pairs` is forwarded so the table's true-negative cell is filled in.
  pair_table <- contingency_table_pairs(true_pairs, pred_pairs,
                                        num_pairs = num_pairs,
                                        ordered = ordered)
  accuracy_pairs_ct(pair_table)
}
373 | 
374 | 
375 | #' Balanced Accuracy of Linked Pairs
376 | #'
377 | #' @description Computes the balanced accuracy of a set of _predicted_
378 | #'   coreferent (linked) pairs given a set of _ground truth_ coreferent
379 | #'   pairs.
380 | #'
381 | #' @details The balanced accuracy is defined as:
#'   \deqn{\frac{\frac{|T \cap P|}{|T|} + \frac{|T' \cap P'|}{|T'|}}{2}}{|T ∩ P|/(2|T|) + |T' ∩ P'|/(2|T'|)}
383 | #'   where:
384 | #'   * \eqn{T} is the set of true coreferent pairs,
385 | #'   * \eqn{P} is the set of predicted coreferent pairs,
386 | #'   * \eqn{T'} is the set of true non-coreferent pairs, and
387 | #'   * \eqn{P'} is the set of predicted non-coreferent pairs.
388 | #'
389 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
390 | #'   data.frame, where rows index pairs and columns index the ids of the
391 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
392 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
393 | #'   automatically removed.
394 | #' @param pred_pairs set of predicted coreferent pairs, following the same
395 | #'   specification as `true_pairs`.
396 | #' @param num_pairs the total number of coreferent and non-coreferent pairs,
397 | #'   excluding equivalent pairs with reversed ids.
398 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
399 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
400 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
401 | #'   prediction, record linkage etc.
402 | #'
403 | #' @examples
404 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
405 | #' pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
406 | #' num_pairs <- 3                              # assuming 3 elements
407 | #' balanced_accuracy_pairs(true_pairs, pred_pairs, num_pairs)
408 | #'
409 | #' @export
balanced_accuracy_pairs <- function(true_pairs, pred_pairs, num_pairs, ordered=FALSE) {
  # `num_pairs` is forwarded so the table's true-negative cell is filled in.
  pair_table <- contingency_table_pairs(true_pairs, pred_pairs,
                                        num_pairs = num_pairs,
                                        ordered = ordered)
  balanced_accuracy_pairs_ct(pair_table)
}
414 | 
415 | 
416 | #' Fowlkes-Mallows Index of Linked Pairs
417 | #'
418 | #' @description Computes the Fowlkes-Mallows index for a set of _predicted_
419 | #'   coreferent (linked) pairs given a set of _ground truth_ coreferent pairs.
420 | #'
421 | #' @details The Fowlkes-Mallows index is defined as the geometric mean of
422 | #'   precision \eqn{P} and recall \eqn{R}:
423 | #'   \deqn{\sqrt{P R}.}{√(P·R).}
424 | #'
425 | #' @param true_pairs set of true coreferent pairs stored in a matrix or
426 | #'   data.frame, where rows index pairs and columns index the ids of the
427 | #'   constituents. Any pairs not included are assumed to be _non-coreferent_.
428 | #'   Duplicate pairs (including equivalent pairs with reversed ids) are
429 | #'   automatically removed.
430 | #' @param pred_pairs set of predicted coreferent pairs, following the same
431 | #'   specification as `true_pairs`.
432 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether
433 | #'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
434 | #'   Defaults to FALSE, which is appropriate for clustering, undirected link
435 | #'   prediction, record linkage etc.
436 | #'
437 | #' @references
438 | #' Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical
439 | #' Clusterings." _Journal of the American Statistical Association_ **78:383**,
440 | #' 553-569, (1983). \doi{10.1080/01621459.1983.10478008}.
441 | #'
442 | #' @examples
443 | #' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
444 | #' pred_pairs <- rbind(c(1,2), c(2,3))         # prediction misses one edge
#' fowlkes_mallows_pairs(true_pairs, pred_pairs)
447 | #'
448 | #' @export
fowlkes_mallows_pairs <- function(true_pairs, pred_pairs, ordered=FALSE) {
  # The total number of pairs is not supplied here, so the table is built
  # without it.
  pair_table <- contingency_table_pairs(true_pairs, pred_pairs,
                                        ordered = ordered)
  fowlkes_mallows_pairs_ct(pair_table)
}
453 | 
454 | 
455 | # Definition of measures in terms of contingency table
precision_pairs_ct <- function(ct) {
  # Share of the predicted links (TP + FP) that are true links.
  true_pos <- ct["TRUE", "TRUE"]
  false_pos <- ct["TRUE", "FALSE"]
  true_pos / (true_pos + false_pos)
}
462 | 
recall_pairs_ct <- function(ct) {
  # Share of the true links (TP + FN) that were predicted.
  true_pos <- ct["TRUE", "TRUE"]
  false_neg <- ct["FALSE", "TRUE"]
  true_pos / (true_pos + false_neg)
}
469 | 
f_measure_pairs_ct <- function(ct, beta=1.0) {
  if (beta < 0) {
    stop("`beta` must be non-negative")
  }
  # Weighted harmonic mean: precision carries weight 1/(1 + beta^2) and
  # recall carries the remainder.
  w_precision <- 1/(1 + beta^2)
  w_recall <- 1 - w_precision
  1 / (w_precision / precision_pairs_ct(ct) + w_recall / recall_pairs_ct(ct))
}
478 | 
specificity_pairs_ct <- function(ct) {
  # Share of the true non-links (TN + FP) that were predicted as non-links.
  false_pos <- ct["TRUE", "FALSE"]
  true_neg <- ct["FALSE", "FALSE"]
  true_neg / (true_neg + false_pos)
}
485 | 
accuracy_pairs_ct <- function(ct) {
  # Read the four cells by their "Prediction" (row) / "Truth" (column) labels.
  true_pos <- ct["TRUE", "TRUE"]
  false_pos <- ct["TRUE", "FALSE"]
  false_neg <- ct["FALSE", "TRUE"]
  true_neg <- ct["FALSE", "FALSE"]
  # Correctly classified pairs over all pairs.
  (true_pos + true_neg) / (true_pos + false_pos + true_neg + false_neg)
}
495 | 
balanced_accuracy_pairs_ct <- function(ct) {
  # Plain average of sensitivity (i.e. recall) and specificity.
  (recall_pairs_ct(ct) + specificity_pairs_ct(ct)) / 2
}
501 | 
fowlkes_mallows_pairs_ct <- function(ct) {
  # Geometric mean of precision and recall, taking the square roots
  # separately before multiplying.
  root_precision <- sqrt(precision_pairs_ct(ct))
  root_recall <- sqrt(recall_pairs_ct(ct))
  root_precision * root_recall
}
507 | 


--------------------------------------------------------------------------------