├── .gitattributes ├── tests ├── testthat.R └── testthat │ ├── test-measures_clusterings.R │ ├── test-transformations.R │ └── test-measures_pairs.R ├── .gitignore ├── .Rbuildignore ├── R ├── clevr.R ├── RcppExports.R ├── transformations.R ├── measures_clusterings.R └── measures_pairs.R ├── cran-comments.md ├── NEWS.md ├── clevr.Rproj ├── src ├── pairs_to_membership.cpp └── RcppExports.cpp ├── man ├── clevr-package.Rd ├── mutual_info.Rd ├── contingency_table_clusters.Rd ├── fowlkes_mallows.Rd ├── eval_report_clusters.Rd ├── canonicalize_pairs.Rd ├── adj_rand_index.Rd ├── rand_index.Rd ├── precision_pairs.Rd ├── homogeneity.Rd ├── completeness.Rd ├── variation_info.Rd ├── recall_pairs.Rd ├── specificity_pairs.Rd ├── fowlkes_mallows_pairs.Rd ├── accuracy_pairs.Rd ├── f_measure_pairs.Rd ├── balanced_accuracy_pairs.Rd ├── v_measure.Rd ├── contingency_table_pairs.Rd ├── eval_report_pairs.Rd └── clustering_representations.Rd ├── NAMESPACE ├── DESCRIPTION ├── README.Rmd ├── README.md └── LICENSE /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(clevr) 3 | 4 | test_check("clevr") 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | .bak 9 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^cran-comments.md$ 5 | ^CRAN-RELEASE$ 6 | ^LICENSE$ 7 | 
-------------------------------------------------------------------------------- /R/clevr.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | #' @import Rcpp 5 | #' @importFrom Rcpp evalCpp 6 | #' @useDynLib clevr 7 | NULL 8 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Comments 2 | 3 | Minor release to address incompatibility with upcoming release of Matrix package. 4 | 5 | ## Test environments 6 | * Fedora 38, R 4.3.1 7 | * winbuilder, R 4.3.1 8 | 9 | ## R CMD check results 10 | 11 | 0 errors v | 0 warnings v | 0 note v 12 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # clevr 0.1.2 2 | * Address compatibility issue with Matrix 1.6-2 release 3 | 4 | # clevr 0.1.1 5 | * Fix behavior when pairs are represented using different types 6 | * Improve documentation by adding examples 7 | * First release to CRAN 8 | 9 | # clevr 0.1.0 10 | * Initial release 11 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | pairs_to_membership_cpp <- function(pairs, num_records) { 5 | .Call('_clevr_pairs_to_membership_cpp', PACKAGE = 'clevr', pairs, num_records) 6 | } 7 | 8 | -------------------------------------------------------------------------------- /clevr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | 
EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /src/pairs_to_membership.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | #include <boost/graph/adjacency_list.hpp> 3 | #include <boost/graph/connected_components.hpp> 4 | using namespace Rcpp; 5 | 6 | // [[Rcpp::export]] 7 | IntegerVector pairs_to_membership_cpp(const IntegerMatrix &pairs, int num_records) { 8 | using namespace boost; 9 | 10 | typedef adjacency_list<vecS, vecS, undirectedS> Graph; 11 | 12 | Graph G(num_records); 13 | for (int i = 0; i < pairs.nrow(); i++) { 14 | add_edge(pairs.at(i, 0), pairs.at(i, 1), G); 15 | } 16 | 17 | IntegerVector membership(num_records); 18 | connected_components(G, &membership[0]); 19 | 20 | return membership; 21 | } 22 | -------------------------------------------------------------------------------- /man/clevr-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clevr.R 3 | \docType{package} 4 | \name{clevr-package} 5 | \alias{clevr} 6 | \alias{clevr-package} 7 | \title{clevr: Clustering and Link Prediction Evaluation in R} 8 | \description{ 9 | Tools for evaluating link prediction and clustering algorithms with respect to ground truth. Includes efficient implementations of common performance measures such as pairwise precision/recall, cluster homogeneity/completeness, variation of information, Rand index etc. 
10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/cleanzr/clevr} 15 | \item Report bugs at \url{https://github.com/cleanzr/clevr/issues} 16 | } 17 | 18 | } 19 | \author{ 20 | \strong{Maintainer}: Neil Marchant \email{ngmarchant@gmail.com} 21 | 22 | Authors: 23 | \itemize{ 24 | \item Rebecca Steorts \email{beka@stat.duke.edu} 25 | } 26 | 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/mutual_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{mutual_info} 4 | \alias{mutual_info} 5 | \title{Mutual Information Between Clusterings} 6 | \usage{ 7 | mutual_info(true, pred, base = exp(1)) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | 18 | \item{base}{base of the logarithm. Defaults to \code{exp(1)}.} 19 | } 20 | \description{ 21 | Computes the mutual information between two 22 | clusterings, such as a predicted and ground truth clustering. 23 | } 24 | \details{ 25 | Mutual information is an entropy-based measure of the similarity 26 | between two clusterings. 
27 | } 28 | \examples{ 29 | true <- c(1,1,1,2,2) # ground truth clustering 30 | pred <- c(1,1,2,2,2) # predicted clustering 31 | mutual_info(true, pred) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(accuracy_pairs) 4 | export(adj_rand_index) 5 | export(balanced_accuracy_pairs) 6 | export(canonicalize_pairs) 7 | export(clusters_to_membership) 8 | export(clusters_to_pairs) 9 | export(completeness) 10 | export(contingency_table_clusters) 11 | export(contingency_table_pairs) 12 | export(eval_report_clusters) 13 | export(eval_report_pairs) 14 | export(f_measure_pairs) 15 | export(fowlkes_mallows) 16 | export(fowlkes_mallows_pairs) 17 | export(homogeneity) 18 | export(membership_to_clusters) 19 | export(membership_to_pairs) 20 | export(mutual_info) 21 | export(pairs_to_clusters) 22 | export(pairs_to_membership) 23 | export(precision_pairs) 24 | export(rand_index) 25 | export(recall_pairs) 26 | export(sensitivity_pairs) 27 | export(specificity_pairs) 28 | export(v_measure) 29 | export(variation_info) 30 | import(Rcpp) 31 | importFrom(Matrix,colSums) 32 | importFrom(Matrix,crossprod) 33 | importFrom(Matrix,rowSums) 34 | importFrom(Matrix,which) 35 | importFrom(Rcpp,evalCpp) 36 | importFrom(stats,na.action) 37 | importFrom(stats,na.fail) 38 | importFrom(stats,na.omit) 39 | importFrom(stats,xtabs) 40 | importFrom(utils,combn) 41 | useDynLib(clevr) 42 | -------------------------------------------------------------------------------- /man/contingency_table_clusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{contingency_table_clusters} 4 | \alias{contingency_table_clusters} 5 | \title{Contingency Table for 
Clusterings} 6 | \usage{ 7 | contingency_table_clusters(true, pred) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | } 18 | \value{ 19 | Returns a table \eqn{C} (stored as a sparse matrix) such that 20 | \eqn{C_{ij}}{C_ij} counts the number of elements assigned to 21 | cluster \eqn{i} in \code{pred} and cluster \eqn{j} in \code{true}. 22 | } 23 | \description{ 24 | Compute the contingency table for a \emph{predicted} clustering 25 | given a \emph{ground truth} clustering. 26 | } 27 | \examples{ 28 | true <- c(1,1,1,2,2) # ground truth clustering 29 | pred <- c(1,1,2,2,2) # predicted clustering 30 | contingency_table_clusters(true, pred) 31 | 32 | } 33 | \seealso{ 34 | \code{\link{eval_report_clusters}} computes common evaluation measures derived 35 | from the output of this function. 36 | } 37 | -------------------------------------------------------------------------------- /man/fowlkes_mallows.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{fowlkes_mallows} 4 | \alias{fowlkes_mallows} 5 | \title{Fowlkes-Mallows Index Between Clusterings} 6 | \usage{ 7 | fowlkes_mallows(true, pred) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. 
The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | } 18 | \description{ 19 | Computes the Fowlkes-Mallows index between two clusterings, 20 | such as a predicted and ground truth clustering. 21 | } 22 | \details{ 23 | The Fowlkes-Mallows index is defined as the geometric mean of 24 | precision and recall, computed with respect to pairs of elements. 25 | } 26 | \examples{ 27 | true <- c(1,1,1,2,2) # ground truth clustering 28 | pred <- c(1,1,2,2,2) # predicted clustering 29 | fowlkes_mallows(true, pred) 30 | 31 | } 32 | \references{ 33 | Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical 34 | Clusterings." \emph{Journal of the American Statistical Association} \strong{78:383}, 35 | 553-569, (1983). \doi{10.1080/01621459.1983.10478008} 36 | } 37 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include <Rcpp.h> 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // pairs_to_membership_cpp 14 | IntegerVector pairs_to_membership_cpp(const IntegerMatrix& pairs, int num_records); 15 | RcppExport SEXP _clevr_pairs_to_membership_cpp(SEXP pairsSEXP, SEXP num_recordsSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< const IntegerMatrix& >::type pairs(pairsSEXP); 20 | Rcpp::traits::input_parameter< int >::type num_records(num_recordsSEXP); 21 | rcpp_result_gen = Rcpp::wrap(pairs_to_membership_cpp(pairs, num_records)); 22 | return rcpp_result_gen; 
23 | END_RCPP 24 | } 25 | 26 | static const R_CallMethodDef CallEntries[] = { 27 | {"_clevr_pairs_to_membership_cpp", (DL_FUNC) &_clevr_pairs_to_membership_cpp, 2}, 28 | {NULL, NULL, 0} 29 | }; 30 | 31 | RcppExport void R_init_clevr(DllInfo *dll) { 32 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 33 | R_useDynamicSymbols(dll, FALSE); 34 | } 35 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: clevr 2 | Type: Package 3 | Title: Clustering and Link Prediction Evaluation in R 4 | Version: 0.1.2 5 | Date: 2023-09-16 6 | Authors@R: c( 7 | person(given = "Neil", 8 | family = "Marchant", 9 | email = "ngmarchant@gmail.com", 10 | role = c("aut", "cre")), 11 | person(given = "Rebecca", 12 | family = "Steorts", 13 | email = "beka@stat.duke.edu", 14 | role = c("aut")), 15 | person(given = "Olivier", 16 | family = "Binette", 17 | email = "olivier.binette@gmail.com", 18 | role = c("ctb"))) 19 | Maintainer: Neil Marchant <ngmarchant@gmail.com> 20 | Description: Tools for evaluating link prediction and clustering algorithms 21 | with respect to ground truth. Includes efficient implementations of 22 | common performance measures such as pairwise precision/recall, 23 | cluster homogeneity/completeness, variation of information, 24 | Rand index etc. 
25 | License: GPL-2 26 | Encoding: UTF-8 27 | Depends: R (>= 3.0.2) 28 | Imports: Rcpp (>= 1.0.5), 29 | stats, 30 | Matrix 31 | LinkingTo: Rcpp, BH (>= 1.69.0) 32 | RoxygenNote: 7.2.3 33 | Roxygen: list(markdown = TRUE) 34 | Suggests: testthat 35 | URL: https://github.com/cleanzr/clevr 36 | BugReports: https://github.com/cleanzr/clevr/issues 37 | Collate: 38 | 'RcppExports.R' 39 | 'clevr.R' 40 | 'measures_clusterings.R' 41 | 'transformations.R' 42 | 'measures_pairs.R' 43 | -------------------------------------------------------------------------------- /man/eval_report_clusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{eval_report_clusters} 4 | \alias{eval_report_clusters} 5 | \title{Evaluation Report for Clustering} 6 | \usage{ 7 | eval_report_clusters(true, pred) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | } 18 | \value{ 19 | Returns a list containing the following measures: 20 | \describe{ 21 | \item{homogeneity}{see \code{\link{homogeneity}}} 22 | \item{completeness}{see \code{\link{completeness}}} 23 | \item{v_measure}{see \code{\link{v_measure}}} 24 | \item{rand_index}{see \code{\link{rand_index}}} 25 | \item{adj_rand_index}{see \code{\link{adj_rand_index}}} 26 | \item{variation_info}{see \code{\link{variation_info}}} 27 | \item{mutual_info}{see \code{\link{mutual_info}}} 28 | \item{fowlkes_mallows}{see \code{\link{fowlkes_mallows}}} 29 | } 30 | } 31 | \description{ 32 | Compute various evaluation measures for a predicted 33 | clustering using a ground truth clustering as a reference. 
34 | } 35 | \examples{ 36 | true <- c(1,1,1,2,2) # ground truth clustering 37 | pred <- c(1,1,2,2,2) # predicted clustering 38 | eval_report_clusters(true, pred) 39 | 40 | } 41 | -------------------------------------------------------------------------------- /man/canonicalize_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/transformations.R 3 | \name{canonicalize_pairs} 4 | \alias{canonicalize_pairs} 5 | \title{Canonicalize element pairs} 6 | \usage{ 7 | canonicalize_pairs(pairs, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{pairs}{a matrix or data.frame of element pairs where rows correspond 11 | to element pairs and columns correspond to element identifiers.} 12 | 13 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 14 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 15 | Defaults to FALSE, which is appropriate for clustering, undirected link 16 | prediction, record linkage etc.} 17 | } 18 | \value{ 19 | Returns the element pairs in canonical form, so that: 20 | \itemize{ 21 | \item the first element id precedes the second element id lexicographically 22 | if \code{ordered = FALSE}---i.e. pair (3, 2) becomes pair (2, 3); 23 | \item pairs with missing element ids are removed; 24 | \item duplicate pairs are removed; and 25 | \item the rows in the matrix/data.frame pairs are sorted lexicographically 26 | by the first element id, then by the second element id. 27 | } 28 | } 29 | \description{ 30 | Coerce a collection of element pairs into canonical form. Facilitates 31 | testing of equivalence. 
32 | } 33 | \examples{ 34 | messy_pairs <- rbind(c(2,1), c(1,2), c(3,1), c(1,2)) 35 | clean_pairs <- canonicalize_pairs(messy_pairs) 36 | all(rbind(c(1,2), c(1,3)) == clean_pairs) # duplicates removed and order fixed 37 | 38 | } 39 | -------------------------------------------------------------------------------- /man/adj_rand_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{adj_rand_index} 4 | \alias{adj_rand_index} 5 | \title{Adjusted Rand Index Between Clusterings} 6 | \usage{ 7 | adj_rand_index(true, pred) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | } 18 | \description{ 19 | Computes the adjusted Rand index (ARI) between two clusterings, 20 | such as a predicted and ground truth clustering. 21 | } 22 | \details{ 23 | The adjusted Rand index (ARI) is a variant of the Rand index (RI) 24 | which is corrected for chance using the Permutation Model for 25 | clusterings. It is related to the RI as follows: 26 | \deqn{\frac{RI - E(RI)}{1 - E(RI)},}{(RI - E(RI))/(1 - E(RI)),} 27 | where \eqn{E(RI)} is the expected value of the RI under the Permutation 28 | Model. 29 | Unlike the RI, the ARI takes values in the range -1 to 1. A value 30 | of 1 indicates that the clusterings are identical, while a value of 31 | 0 indicates the clusterings are drawn randomly independent of one 32 | another. 33 | } 34 | \examples{ 35 | true <- c(1,1,1,2,2) # ground truth clustering 36 | pred <- c(1,1,2,2,2) # predicted clustering 37 | adj_rand_index(true, pred) 38 | 39 | } 40 | \references{ 41 | Hubert, L., Arabie, P. 
"Comparing partitions." \emph{Journal of Classification} 42 | \strong{2}, 193–218 (1985). \doi{10.1007/BF01908075} 43 | } 44 | -------------------------------------------------------------------------------- /man/rand_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{rand_index} 4 | \alias{rand_index} 5 | \title{Rand Index Between Clusterings} 6 | \usage{ 7 | rand_index(true, pred) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | } 18 | \description{ 19 | Computes the Rand index (RI) between two clusterings, such 20 | as a predicted and ground truth clustering. 21 | } 22 | \details{ 23 | The Rand index (RI) can be expressed as: 24 | \deqn{\frac{a + b}{{n \choose 2}}.}{(a + b)/binom(n, 2).} 25 | where 26 | \itemize{ 27 | \item \eqn{n} is the number of elements, 28 | \item \eqn{a} is the number of pairs of elements that appear in the 29 | same cluster in both clusterings, and 30 | \item \eqn{b} is the number of pairs of elements that appear in distinct 31 | clusters in both clusterings. 32 | } 33 | 34 | The RI takes on values between 0 and 1, where 1 denotes exact agreement 35 | between the clusterings and 0 denotes disagreement on all pairs of 36 | elements. 37 | } 38 | \examples{ 39 | true <- c(1,1,1,2,2) # ground truth clustering 40 | pred <- c(1,1,2,2,2) # predicted clustering 41 | rand_index(true, pred) 42 | 43 | } 44 | \references{ 45 | Rand, W. M. "Objective Criteria for the Evaluation of Clustering Methods." 46 | \emph{Journal of the American Statistical Association} 66(336), 846-850 (1971). 
47 | \doi{10.1080/01621459.1971.10482356} 48 | } 49 | -------------------------------------------------------------------------------- /man/precision_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{precision_pairs} 4 | \alias{precision_pairs} 5 | \title{Precision of Linked Pairs} 6 | \usage{ 7 | precision_pairs(true_pairs, pred_pairs, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 11 | data.frame, where rows index pairs and columns index the ids of the 12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 13 | Duplicate pairs (including equivalent pairs with reversed ids) are 14 | automatically removed.} 15 | 16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 17 | specification as \code{true_pairs}.} 18 | 19 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 20 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 21 | Defaults to FALSE, which is appropriate for clustering, undirected link 22 | prediction, record linkage etc.} 23 | } 24 | \description{ 25 | Computes the precision of a set of \emph{predicted} coreferent 26 | (linked) pairs given a set of \emph{ground truth} coreferent pairs. 27 | } 28 | \details{ 29 | The precision is defined as: 30 | \deqn{\frac{|T \cap P|}{|P|}}{|T ∩ P|/|P|} 31 | where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the 32 | set of predicted coreferent pairs. 
33 | } 34 | \examples{ 35 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique 36 | pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge 37 | precision_pairs(true_pairs, pred_pairs) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /man/homogeneity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{homogeneity} 4 | \alias{homogeneity} 5 | \title{Homogeneity Between Clusterings} 6 | \usage{ 7 | homogeneity(true, pred) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | } 18 | \description{ 19 | Computes the homogeneity between two clusterings, such 20 | as a predicted and ground truth clustering. 21 | } 22 | \details{ 23 | Homogeneity is an entropy-based measure of the similarity 24 | between two clusterings, say \eqn{t} and \eqn{p}. The homogeneity 25 | is high if each cluster in \eqn{p} only contains members of 26 | a single cluster in \eqn{t}. The homogeneity ranges between 0 27 | and 1, where 1 indicates perfect homogeneity. 28 | } 29 | \examples{ 30 | true <- c(1,1,1,2,2) # ground truth clustering 31 | pred <- c(1,1,2,2,2) # predicted clustering 32 | homogeneity(true, pred) 33 | 34 | } 35 | \references{ 36 | Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." 
\emph{Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning} (EMNLP-CoNLL), (2007). 37 | } 38 | \seealso{ 39 | \code{\link{completeness}} evaluates the \emph{completeness}, which is a dual 40 | measure to \emph{homogeneity}. \code{\link{v_measure}} evaluates the harmonic mean of 41 | \emph{completeness} and \emph{homogeneity}. 42 | } 43 | -------------------------------------------------------------------------------- /man/completeness.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{completeness} 4 | \alias{completeness} 5 | \title{Completeness Between Clusterings} 6 | \usage{ 7 | completeness(true, pred) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | } 18 | \description{ 19 | Computes the completeness between two clusterings, such 20 | as a predicted and ground truth clustering. 21 | } 22 | \details{ 23 | Completeness is an entropy-based measure of the similarity 24 | between two clusterings, say \eqn{t} and \eqn{p}. The completeness 25 | is high if \emph{all} members of a given cluster in \eqn{t} are assigned 26 | to a single cluster in \eqn{p}. The completeness ranges between 0 27 | and 1, where 1 indicates perfect completeness. 28 | } 29 | \examples{ 30 | true <- c(1,1,1,2,2) # ground truth clustering 31 | pred <- c(1,1,2,2,2) # predicted clustering 32 | completeness(true, pred) 33 | 34 | } 35 | \references{ 36 | Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." 
\emph{Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning} (EMNLP-CoNLL), (2007). 37 | } 38 | \seealso{ 39 | \code{\link{homogeneity}} evaluates the \emph{homogeneity}, which is a dual 40 | measure to \emph{completeness}. \code{\link{v_measure}} evaluates the harmonic mean of 41 | \emph{completeness} and \emph{homogeneity}. 42 | } 43 | -------------------------------------------------------------------------------- /man/variation_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{variation_info} 4 | \alias{variation_info} 5 | \title{Variation of Information Between Clusterings} 6 | \usage{ 7 | variation_info(true, pred, base = exp(1)) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | 18 | \item{base}{base of the logarithm. Defaults to \code{exp(1)}.} 19 | } 20 | \description{ 21 | Computes the variation of information between two 22 | clusterings, such as a predicted and ground truth clustering. 23 | } 24 | \details{ 25 | Variation of information is an entropy-based distance metric 26 | on the space of clusterings. It is unnormalized and varies between 27 | \eqn{0} and \eqn{\log(N)}{log(N)} where \eqn{N} is the number of 28 | clustered elements. Larger values of the distance metric correspond 29 | to greater dissimilarity between the clusterings. 
30 | } 31 | \examples{ 32 | true <- c(1,1,1,2,2) # ground truth clustering 33 | pred <- c(1,1,2,2,2) # predicted clustering 34 | variation_info(true, pred) 35 | 36 | } 37 | \references{ 38 | Arabie, P. and Boorman, S. A. "Multidimensional scaling of measures of 39 | distance between partitions." \emph{Journal of Mathematical Psychology} \strong{10:2}, 40 | 148-203, (1973). \doi{10.1016/0022-2496(73)90012-6} 41 | 42 | Meilă, M. "Comparing Clusterings by the Variation of Information." In: 43 | Learning Theory and Kernel Machines, Lecture Notes in Computer Science 44 | \strong{2777}, Springer, Berlin, Heidelberg, (2003). 45 | \doi{10.1007/978-3-540-45167-9_14} 46 | } 47 | -------------------------------------------------------------------------------- /man/recall_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{recall_pairs} 4 | \alias{recall_pairs} 5 | \alias{sensitivity_pairs} 6 | \title{Recall of Linked Pairs} 7 | \usage{ 8 | recall_pairs(true_pairs, pred_pairs, ordered = FALSE) 9 | 10 | sensitivity_pairs(true_pairs, pred_pairs, ordered = FALSE) 11 | } 12 | \arguments{ 13 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 14 | data.frame, where rows index pairs and columns index the ids of the 15 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 16 | Duplicate pairs (including equivalent pairs with reversed ids) are 17 | automatically removed.} 18 | 19 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 20 | specification as \code{true_pairs}.} 21 | 22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 
24 | Defaults to FALSE, which is appropriate for clustering, undirected link 25 | prediction, record linkage etc.} 26 | } 27 | \description{ 28 | Computes the recall of a set of \emph{predicted} coreferent 29 | (linked) pairs given a set of \emph{ground truth} coreferent pairs. 30 | } 31 | \details{ 32 | The recall is defined as: 33 | \deqn{\frac{|T \cap P|}{|T|}}{|T ∩ P|/|T|} 34 | where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the 35 | set of predicted coreferent pairs. 36 | } 37 | \note{ 38 | \code{sensitivity_pairs} is an alias for \code{recall_pairs}. 39 | } 40 | \examples{ 41 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique 42 | pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge 43 | recall_pairs(true_pairs, pred_pairs) 44 | 45 | } 46 | -------------------------------------------------------------------------------- /man/specificity_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{specificity_pairs} 4 | \alias{specificity_pairs} 5 | \title{Specificity of Linked Pairs} 6 | \usage{ 7 | specificity_pairs(true_pairs, pred_pairs, num_pairs, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 11 | data.frame, where rows index pairs and columns index the ids of the 12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 
13 | Duplicate pairs (including equivalent pairs with reversed ids) are 14 | automatically removed.} 15 | 16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 17 | specification as \code{true_pairs}.} 18 | 19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs, 20 | excluding equivalent pairs with reversed ids.} 21 | 22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 24 | Defaults to FALSE, which is appropriate for clustering, undirected link 25 | prediction, record linkage etc.} 26 | } 27 | \description{ 28 | Computes the specificity of a set of \emph{predicted} coreferent 29 | (linked) pairs given a set of \emph{ground truth} coreferent pairs. 30 | } 31 | \details{ 32 | The specificity is defined as: 33 | \deqn{\frac{|P' \cap T'|}{|T'|}}{|P' ∩ T'|/|T'|} 34 | where \eqn{T'} is the set of true non-coreferent pairs and \eqn{P'} is the 35 | set of predicted non-coreferent pairs. 
36 | } 37 | \examples{ 38 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique 39 | pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge 40 | num_pairs <- 3 # assuming 3 elements 41 | specificity_pairs(true_pairs, pred_pairs, num_pairs) 42 | 43 | } 44 | -------------------------------------------------------------------------------- /man/fowlkes_mallows_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{fowlkes_mallows_pairs} 4 | \alias{fowlkes_mallows_pairs} 5 | \title{Fowlkes-Mallows Index of Linked Pairs} 6 | \usage{ 7 | fowlkes_mallows_pairs(true_pairs, pred_pairs, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 11 | data.frame, where rows index pairs and columns index the ids of the 12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 13 | Duplicate pairs (including equivalent pairs with reversed ids) are 14 | automatically removed.} 15 | 16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 17 | specification as \code{true_pairs}.} 18 | 19 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 20 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 21 | Defaults to FALSE, which is appropriate for clustering, undirected link 22 | prediction, record linkage etc.} 23 | } 24 | \description{ 25 | Computes the Fowlkes-Mallows index for a set of \emph{predicted} 26 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent pairs. 
27 | } 28 | \details{ 29 | The Fowlkes-Mallows index is defined as the geometric mean of 30 | precision \eqn{P} and recall \eqn{R}: 31 | \deqn{\sqrt{P R}.}{√(P·R).} 32 | } 33 | \examples{ 34 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique 35 | pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge 36 | num_pairs <- 3 # assuming 3 elements 37 | fowlkes_mallows_pairs(true_pairs, pred_pairs) 38 | 39 | } 40 | \references{ 41 | Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical 42 | Clusterings." \emph{Journal of the American Statistical Association} \strong{78:383}, 43 | 553-569, (1983). \doi{10.1080/01621459.1983.10478008}. 44 | } 45 | -------------------------------------------------------------------------------- /man/accuracy_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{accuracy_pairs} 4 | \alias{accuracy_pairs} 5 | \title{Accuracy of Linked Pairs} 6 | \usage{ 7 | accuracy_pairs(true_pairs, pred_pairs, num_pairs, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 11 | data.frame, where rows index pairs and columns index the ids of the 12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 13 | Duplicate pairs (including equivalent pairs with reversed ids) are 14 | automatically removed.} 15 | 16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 17 | specification as \code{true_pairs}.} 18 | 19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs, 20 | excluding equivalent pairs with reversed ids.} 21 | 22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 
24 | Defaults to FALSE, which is appropriate for clustering, undirected link 25 | prediction, record linkage etc.} 26 | } 27 | \description{ 28 | Computes the accuracy of a set of \emph{predicted} coreferent 29 | (linked) pairs given a set of \emph{ground truth} coreferent pairs. 30 | } 31 | \details{ 32 | The accuracy is defined as: 33 | \deqn{\frac{|T \cap P| + |T' \cap P'|}{N}}{(|T ∩ P| + |T' ∩ P'|)/N} 34 | where: 35 | \itemize{ 36 | \item \eqn{T} is the set of true coreferent pairs, 37 | \item \eqn{P} is the set of predicted coreferent pairs, 38 | \item \eqn{T'} is the set of true non-coreferent pairs, 39 | \item \eqn{P'} is the set of predicted non-coreferent pairs, and 40 | \item \eqn{N} is the total number of coreferent and non-coreferent pairs. 41 | } 42 | } 43 | \examples{ 44 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique 45 | pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge 46 | num_pairs <- 3 # assuming 3 elements 47 | accuracy_pairs(true_pairs, pred_pairs, num_pairs) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/f_measure_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{f_measure_pairs} 4 | \alias{f_measure_pairs} 5 | \title{F-measure of Linked Pairs} 6 | \usage{ 7 | f_measure_pairs(true_pairs, pred_pairs, beta = 1, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 11 | data.frame, where rows index pairs and columns index the ids of the 12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 
13 | Duplicate pairs (including equivalent pairs with reversed ids) are 14 | automatically removed.} 15 | 16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 17 | specification as \code{true_pairs}.} 18 | 19 | \item{beta}{non-negative weight. A value of 0 assigns no weight to recall 20 | (i.e. the measure reduces to precision), while larger values assign 21 | increasing weight to recall. A value of 1 weights precision and recall 22 | equally.} 23 | 24 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 25 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 26 | Defaults to FALSE, which is appropriate for clustering, undirected link 27 | prediction, record linkage etc.} 28 | } 29 | \description{ 30 | Computes the F-measure (a.k.a. F-score) of a set of 31 | \emph{predicted} coreferent (linked) pairs given a set of \emph{ground truth} 32 | coreferent pairs. 33 | } 34 | \details{ 35 | The \eqn{\beta}{β}-weighted F-measure is defined as the weighted 36 | harmonic mean of precision \eqn{P} and recall \eqn{R}: 37 | \deqn{(1 + \beta^2)\frac{P \cdot R}{\beta^2 \cdot P + R}.}{(1 + β^2)·P·R/(β^2·P + R).} 38 | } 39 | \examples{ 40 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique 41 | pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge 42 | num_pairs <- 3 # assuming 3 elements 43 | f_measure_pairs(true_pairs, pred_pairs) 44 | 45 | } 46 | \references{ 47 | Van Rijsbergen, C. J. "Information Retrieval." (2nd ed.). 48 | Butterworth-Heinemann, USA, (1979). 
49 | } 50 | -------------------------------------------------------------------------------- /man/balanced_accuracy_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{balanced_accuracy_pairs} 4 | \alias{balanced_accuracy_pairs} 5 | \title{Balanced Accuracy of Linked Pairs} 6 | \usage{ 7 | balanced_accuracy_pairs(true_pairs, pred_pairs, num_pairs, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 11 | data.frame, where rows index pairs and columns index the ids of the 12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 13 | Duplicate pairs (including equivalent pairs with reversed ids) are 14 | automatically removed.} 15 | 16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 17 | specification as \code{true_pairs}.} 18 | 19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs, 20 | excluding equivalent pairs with reversed ids.} 21 | 22 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 23 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 24 | Defaults to FALSE, which is appropriate for clustering, undirected link 25 | prediction, record linkage etc.} 26 | } 27 | \description{ 28 | Computes the balanced accuracy of a set of \emph{predicted} 29 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent 30 | pairs. 
31 | } 32 | \details{ 33 | The balanced accuracy is defined as: 34 | \deqn{\frac{\frac{|T \cap P|}{|T|} + \frac{|T' \cap P'|}{|T'|}}{2}}{|T ∩ P|/(2|T|) + |T' ∩ P'|/(2|T'|)} 35 | where: 36 | \itemize{ 37 | \item \eqn{T} is the set of true coreferent pairs, 38 | \item \eqn{P} is the set of predicted coreferent pairs, 39 | \item \eqn{T'} is the set of true non-coreferent pairs, and 40 | \item \eqn{P'} is the set of predicted non-coreferent pairs. 41 | } 42 | } 43 | \examples{ 44 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique 45 | pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge 46 | num_pairs <- 3 # assuming 3 elements 47 | balanced_accuracy_pairs(true_pairs, pred_pairs, num_pairs) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/v_measure.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_clusterings.R 3 | \name{v_measure} 4 | \alias{v_measure} 5 | \title{V-measure Between Clusterings} 6 | \usage{ 7 | v_measure(true, pred, beta = 1) 8 | } 9 | \arguments{ 10 | \item{true}{ground truth clustering represented as a membership 11 | vector. Each entry corresponds to an element and the value identifies 12 | the assigned cluster. The specific values of the cluster identifiers 13 | are arbitrary.} 14 | 15 | \item{pred}{predicted clustering represented as a membership 16 | vector.} 17 | 18 | \item{beta}{non-negative weight. A value of 0 assigns no weight to 19 | completeness (i.e. the measure reduces to homogeneity), while larger 20 | values assign increasing weight to completeness. A value of 1 weights 21 | completeness and homogeneity equally.} 22 | } 23 | \description{ 24 | Computes the V-measure between two clusterings, such 25 | as a predicted and ground truth clustering. 
26 | } 27 | \details{ 28 | V-measure is defined as the \eqn{\beta}{β}-weighted harmonic 29 | mean of homogeneity \eqn{h} and completeness \eqn{c}: 30 | \deqn{(1 + \beta)\frac{h \cdot c}{\beta \cdot h + c}.}{(1 + β)·h·c/(β·h + c).} 31 | The range of V-measure is between 0 and 1, where 1 corresponds to a 32 | perfect match between the clusterings. It is equivalent to the 33 | normalised mutual information, when the aggregation function is the 34 | arithmetic mean. 35 | } 36 | \examples{ 37 | true <- c(1,1,1,2,2) # ground truth clustering 38 | pred <- c(1,1,2,2,2) # predicted clustering 39 | v_measure(true, pred) 40 | 41 | } 42 | \references{ 43 | Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based external cluster evaluation measure." \emph{Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning} (EMNLP-CoNLL), (2007). 44 | 45 | Becker, H. "Identification and characterization of events in social media." 46 | \emph{PhD dissertation}, Columbia University, (2011). 47 | } 48 | \seealso{ 49 | \code{\link{homogeneity}} and \code{\link{completeness}} evaluate the component 50 | measures upon which this measure is based. 51 | } 52 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # clevr: Clustering and Link Prediction Evaluation in R 17 | 18 | 19 | 20 | 21 | clevr implements functions for evaluating link prediction and clustering 22 | algorithms in R. 
It includes efficient implementations of common performance 23 | measures, such as: 24 | 25 | * pairwise precision, recall, F-measure; 26 | * homogeneity, completeness and V-measure; 27 | * (adjusted) Rand index; 28 | * variation of information; and 29 | * mutual information. 30 | 31 | While the current focus is on supervised (a.k.a. external) performance 32 | measures, unsupervised (internal) measures are also in scope for future 33 | releases. 34 | 35 | ## Installation 36 | 37 | You can install the latest release from [CRAN](https://CRAN.R-project.org) 38 | by entering: 39 | 40 | ``` r 41 | install.packages("clevr") 42 | ``` 43 | 44 | The development version can be installed from GitHub using `devtools`: 45 | 46 | ``` r 47 | # install.packages("devtools") 48 | devtools::install_github("cleanzr/clevr") 49 | ``` 50 | 51 | ## Example 52 | 53 | Several functions are included which transform between different clustering 54 | representations. 55 | 56 | ```{r example} 57 | library(clevr) 58 | # A clustering of four records represented as a membership vector 59 | pred_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 1, "Record4" = 2) 60 | 61 | # Represent as a set of record pairs that appear in the same cluster 62 | pred_pairs <- membership_to_pairs(pred_membership) 63 | print(pred_pairs) 64 | 65 | # Represent as a list of record clusters 66 | pred_clusters <- membership_to_clusters(pred_membership) 67 | print(pred_clusters) 68 | ``` 69 | 70 | Performance measures are available for evaluating linked pairs: 71 | 72 | ```{r pair-measures} 73 | true_pairs <- rbind(c("Record1", "Record2"), c("Record3", "Record4")) 74 | 75 | pr <- precision_pairs(true_pairs, pred_pairs) 76 | print(pr) 77 | 78 | re <- recall_pairs(true_pairs, pred_pairs) 79 | print(re) 80 | ``` 81 | 82 | and for evaluating clusterings: 83 | 84 | ```{r clust-measures} 85 | true_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 2, "Record4" = 2) 86 | 87 | ari <- adj_rand_index(true_membership, 
pred_membership) 88 | print(ari) 89 | 90 | vi <- variation_info(true_membership, pred_membership) 91 | print(vi) 92 | ``` 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # clevr: Clustering and Link Prediction Evaluation in R 5 | 6 | 7 | 8 | 9 | 10 | clevr implements functions for evaluating link prediction and clustering 11 | algorithms in R. It includes efficient implementations of common 12 | performance measures, such as: 13 | 14 | - pairwise precision, recall, F-measure; 15 | - homogeneity, completeness and V-measure; 16 | - (adjusted) Rand index; 17 | - variation of information; and 18 | - mutual information. 19 | 20 | While the current focus is on supervised (a.k.a. external) performance 21 | measures, unsupervised (internal) measures are also in scope for future 22 | releases. 23 | 24 | ## Installation 25 | 26 | You can install the latest release from 27 | [CRAN](https://CRAN.R-project.org) by entering: 28 | 29 | ``` r 30 | install.packages("clevr") 31 | ``` 32 | 33 | The development version can be installed from GitHub using `devtools`: 34 | 35 | ``` r 36 | # install.packages("devtools") 37 | devtools::install_github("cleanzr/clevr") 38 | ``` 39 | 40 | ## Example 41 | 42 | Several functions are included which transform between different 43 | clustering representations. 
44 | 45 | ``` r 46 | library(clevr) 47 | # A clustering of four records represented as a membership vector 48 | pred_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 1, "Record4" = 2) 49 | 50 | # Represent as a set of record pairs that appear in the same cluster 51 | pred_pairs <- membership_to_pairs(pred_membership) 52 | print(pred_pairs) 53 | #> [,1] [,2] 54 | #> [1,] "Record1" "Record2" 55 | #> [2,] "Record1" "Record3" 56 | #> [3,] "Record2" "Record3" 57 | 58 | # Represent as a list of record clusters 59 | pred_clusters <- membership_to_clusters(pred_membership) 60 | print(pred_clusters) 61 | #> $`1` 62 | #> [1] "Record1" "Record2" "Record3" 63 | #> 64 | #> $`2` 65 | #> [1] "Record4" 66 | ``` 67 | 68 | Performance measures are available for evaluating linked pairs: 69 | 70 | ``` r 71 | true_pairs <- rbind(c("Record1", "Record2"), c("Record3", "Record4")) 72 | 73 | pr <- precision_pairs(true_pairs, pred_pairs) 74 | print(pr) 75 | #> [1] 0.3333333 76 | 77 | re <- recall_pairs(true_pairs, pred_pairs) 78 | print(re) 79 | #> [1] 0.5 80 | ``` 81 | 82 | and for evaluating clusterings: 83 | 84 | ``` r 85 | true_membership <- c("Record1" = 1, "Record2" = 1, "Record3" = 2, "Record4" = 2) 86 | 87 | ari <- adj_rand_index(true_membership, pred_membership) 88 | print(ari) 89 | #> [1] 0 90 | 91 | vi <- variation_info(true_membership, pred_membership) 92 | print(vi) 93 | #> [1] 0.8239592 94 | ``` 95 | -------------------------------------------------------------------------------- /man/contingency_table_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{contingency_table_pairs} 4 | \alias{contingency_table_pairs} 5 | \title{Binary Contingency Table for Linked Pairs} 6 | \usage{ 7 | contingency_table_pairs( 8 | true_pairs, 9 | pred_pairs, 10 | num_pairs = NULL, 11 | ordered = FALSE 12 | ) 13 | } 14 | \arguments{ 15 | 
\item{true_pairs}{set of true coreferent pairs stored in a matrix or 16 | data.frame, where rows index pairs and columns index the ids of the 17 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 18 | Duplicate pairs (including equivalent pairs with reversed ids) are 19 | automatically removed.} 20 | 21 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 22 | specification as \code{true_pairs}.} 23 | 24 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs, 25 | excluding equivalent pairs with reversed ids. If not provided, 26 | the true negative cell will be set to \code{NA}.} 27 | 28 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 29 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 30 | Defaults to FALSE, which is appropriate for clustering, undirected link 31 | prediction, record linkage etc.} 32 | } 33 | \value{ 34 | Returns a \eqn{2 \times 2}{2×2} contingency table of the form: 35 | \preformatted{ 36 | Truth 37 | Prediction TRUE FALSE 38 | TRUE TP FP 39 | FALSE FN TN 40 | } 41 | } 42 | \description{ 43 | Compute the binary contingency table for a set of \emph{predicted} 44 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent pairs. 
45 | } 46 | \examples{ 47 | ### Example where pairs/edges are undirected 48 | # ground truth is 3-clique 49 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) 50 | # prediction misses one edge 51 | pred_pairs <- rbind(c(1,2), c(2,3)) 52 | # total number of pairs assuming 3 elements 53 | num_pairs <- 3 * (3 - 1) / 2 54 | contingency_table_pairs(true_pairs, pred_pairs, num_pairs) 55 | 56 | ### Example where pairs/edges are directed 57 | # ground truth is a 3-star 58 | true_pairs <- rbind(c(2,1), c(3,1), c(4,1)) 59 | # prediction gets direction of one edge incorrect 60 | pred_pairs <- rbind(c(2,1), c(3,1), c(1,4)) 61 | # total number of pairs assuming 4 elements 62 | num_pairs <- 4 * 4 63 | contingency_table_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE) 64 | 65 | } 66 | \seealso{ 67 | The \code{\link{membership_to_pairs}} and \code{\link{clusters_to_pairs}} functions can be 68 | used to transform other clustering representations into lists of pairs, 69 | as required by this function. 70 | The \code{\link{eval_report_pairs}} function computes common evaluation measures 71 | derived from binary contingency matrices, like the ones output by this 72 | function. 73 | } 74 | -------------------------------------------------------------------------------- /man/eval_report_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/measures_pairs.R 3 | \name{eval_report_pairs} 4 | \alias{eval_report_pairs} 5 | \title{Evaluation Report for Linked Pairs} 6 | \usage{ 7 | eval_report_pairs(true_pairs, pred_pairs, num_pairs = NULL, ordered = FALSE) 8 | } 9 | \arguments{ 10 | \item{true_pairs}{set of true coreferent pairs stored in a matrix or 11 | data.frame, where rows index pairs and columns index the ids of the 12 | constituents. Any pairs not included are assumed to be \emph{non-coreferent}. 
13 | Duplicate pairs (including equivalent pairs with reversed ids) are 14 | automatically removed.} 15 | 16 | \item{pred_pairs}{set of predicted coreferent pairs, following the same 17 | specification as \code{true_pairs}.} 18 | 19 | \item{num_pairs}{the total number of coreferent and non-coreferent pairs, 20 | excluding equivalent pairs with reversed ids. If not provided, 21 | measures that depend on the number of true negatives will be returned 22 | as \code{NA}.} 23 | 24 | \item{ordered}{whether to treat the element pairs as ordered---i.e. whether 25 | pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 26 | Defaults to FALSE, which is appropriate for clustering, undirected link 27 | prediction, record linkage etc.} 28 | } 29 | \value{ 30 | Returns a list containing the following measures: 31 | \describe{ 32 | \item{precision}{see \code{\link{precision_pairs}}} 33 | \item{recall}{see \code{\link{recall_pairs}}} 34 | \item{specificity}{see \code{\link{specificity_pairs}}} 35 | \item{sensitivity}{see \code{\link{sensitivity_pairs}}} 36 | \item{f1score}{see \code{\link{f_measure_pairs}}} 37 | \item{accuracy}{see \code{\link{accuracy_pairs}}} 38 | \item{balanced_accuracy}{see \code{\link{balanced_accuracy_pairs}}} 39 | \item{fowlkes_mallows}{see \code{\link{fowlkes_mallows_pairs}}} 40 | } 41 | } 42 | \description{ 43 | Compute various evaluation measures for a set of \emph{predicted} 44 | coreferent (linked) pairs given a set of \emph{ground truth} coreferent pairs. 
45 | } 46 | \examples{ 47 | ### Example where pairs/edges are undirected 48 | # ground truth is 3-clique 49 | true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) 50 | # prediction misses one edge 51 | pred_pairs <- rbind(c(1,2), c(2,3)) 52 | # total number of pairs assuming 3 elements 53 | num_pairs <- 3 * (3 - 1) / 2 54 | eval_report_pairs(true_pairs, pred_pairs, num_pairs) 55 | 56 | ### Example where pairs/edges are directed 57 | # ground truth is a 3-star 58 | true_pairs <- rbind(c(2,1), c(3,1), c(4,1)) 59 | # prediction gets direction of one edge incorrect 60 | pred_pairs <- rbind(c(2,1), c(3,1), c(1,4)) 61 | # total number of pairs assuming 4 elements 62 | num_pairs <- 4 * 4 63 | eval_report_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE) 64 | 65 | } 66 | \seealso{ 67 | The \code{\link{contingency_table_pairs}} function can be used to compute 68 | the contingency table for entity resolution or record linkage problems. 69 | } 70 | -------------------------------------------------------------------------------- /tests/testthat/test-measures_clusterings.R: -------------------------------------------------------------------------------- 1 | 2 | # Examples to test: each make_* function returns a list holding the true and predicted membership vectors, the expected value of every measure (keyed by function name) and a description 3 | make_clusterings_identical <- function() { 4 | true <- c(1,1,1,2,2) 5 | pred <- c(1,1,1,2,2) 6 | measures <- list( 7 | "rand_index" = 1.0, 8 | "adj_rand_index" = 1.0, 9 | "fowlkes_mallows" = 1.0, 10 | "homogeneity" = 1.0, 11 | "completeness" = 1.0, 12 | "v_measure" = 1.0, 13 | "variation_info" = 0.0, 14 | "mutual_info" = 0.6730116670092563 15 | ) 16 | list("true" = true, "pred" = pred, "true_measures" = measures, 17 | "description" = "clusterings in perfect agreement") 18 | } 19 | 20 | make_clusterings_distinct <- function() { 21 | true <- c(1,2,3,4,5) 22 | pred <- c(1,1,1,1,1) 23 | measures <- list( 24 | "rand_index" = 0.0, 25 | "adj_rand_index" = 0.0, 26 | "fowlkes_mallows" = 0.0, 27 | "homogeneity" = 0.0, 28 | "completeness" = 1.0, 29 | "v_measure" = 0.0, 30 | "variation_info" =
1.6094379124341003, 31 | "mutual_info" = 0.0 32 | ) 33 | list("true" = true, "pred" = pred, "true_measures" = measures, 34 | "description" = "clusterings in complete disagreement") 35 | } 36 | 37 | make_clusterings_one_difference <- function() { 38 | true <- c(1,1,2,2,2) 39 | pred <- c(1,1,1,2,2) 40 | measures <- list( 41 | "rand_index" = 0.6, 42 | "adj_rand_index" = 0.16666666666666666, 43 | "fowlkes_mallows" = 0.5, 44 | "homogeneity" = 0.43253806776631243, 45 | "completeness" = 0.43253806776631243, 46 | "v_measure" = 0.43253806776631243, 47 | "variation_info" = 0.7638170019537754, 48 | "mutual_info" = 0.2911031660323686 49 | ) 50 | list("true" = true, "pred" = pred, "true_measures" = measures, 51 | "description" = "clusterings with one difference") 52 | } 53 | 54 | make_clusterings_anticorrelated <- function() { 55 | true <- c(1,1,1,2,3) 56 | pred <- c(1,2,3,4,4) 57 | measures <- list( 58 | "rand_index" = 0.6, 59 | "adj_rand_index" = -0.176470588235294, 60 | "fowlkes_mallows" = 0.0, 61 | "homogeneity" = 0.7082316448032829, 62 | "completeness" = 0.5051961085524235, 63 | "v_measure" = 0.5897275217561567, 64 | "variation_info" = 0.936426245424844, 65 | "mutual_info" = 0.6730116670092563 66 | ) 67 | list("true" = true, "pred" = pred, "true_measures" = measures, 68 | "description" = "clusterings that are anti-correlated") 69 | } 70 | 71 | examples_to_test <- list(make_clusterings_identical, 72 | make_clusterings_distinct, 73 | make_clusterings_one_difference, 74 | make_clusterings_anticorrelated) 75 | 76 | measures_to_test <- c("Rand Index" = "rand_index", 77 | "Adjusted Rand Index" = "adj_rand_index", 78 | "Fowlkes-Mallows Index" = "fowlkes_mallows", 79 | "Homogeneity" = "homogeneity", 80 | "Completeness" = "completeness", 81 | "V-Measure" = "v_measure", 82 | "Variation Information" = "variation_info", 83 | "Mutual Information" = "mutual_info") 84 | 85 | 86 | for (measure_name in names(measures_to_test)) { 87 | context(measure_name) 88 | measure <-
measures_to_test[[measure_name]] # bare function name of the measure under test 89 | for (make_example in examples_to_test) { 90 | example <- make_example() 91 | test_that(paste(measure_name, "is correct for", example$description), { 92 | true <- example$true 93 | pred <- example$pred 94 | expect_equal(do.call(measure, list(true, pred)), # call the measure by name instead of eval(parse()) 95 | example$true_measures[[measure]]) 96 | }) 97 | } 98 | } 99 | 100 | test_that("V-Measure is correct for a simple example when beta != 1", { 101 | true <- c(1,1,2,2,2) 102 | pred <- c(1,1,1,2,2) 103 | expect_equal(v_measure(true, pred, beta = 0), homogeneity(true, pred)) 104 | expect_equal(v_measure(true, pred, beta = Inf), completeness(true, pred)) 105 | }) 106 | -------------------------------------------------------------------------------- /man/clustering_representations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/transformations.R 3 | \name{clusters_to_membership} 4 | \alias{clusters_to_membership} 5 | \alias{membership_to_clusters} 6 | \alias{clusters_to_pairs} 7 | \alias{membership_to_pairs} 8 | \alias{pairs_to_membership} 9 | \alias{pairs_to_clusters} 10 | \title{Transform Clustering Representations} 11 | \usage{ 12 | clusters_to_membership(clusters, elem_ids = NULL, clust_ids = NULL) 13 | 14 | membership_to_clusters(membership, elem_ids = NULL, clust_ids = NULL) 15 | 16 | clusters_to_pairs(clusters) 17 | 18 | membership_to_pairs(membership, elem_ids = NULL) 19 | 20 | pairs_to_membership(pairs, elem_ids) 21 | 22 | pairs_to_clusters(pairs, elem_ids) 23 | } 24 | \arguments{ 25 | \item{clusters}{a representation of a clustering as a list of vectors, 26 | where the i-th vector contains the identifiers of elements assigned to the 27 | i-th cluster. If \code{clust_ids} is specified (see below), the i-th cluster 28 | is identified according to the corresponding entry in \code{clust_ids}. 
29 | Otherwise the i-th cluster is identified according to its name (if 30 | \code{clusters} is a named list) or its integer index i.} 31 | 32 | \item{elem_ids}{a vector specifying the complete set of identifiers for the 33 | cluster elements in canonical order. Optional for all functions excluding 34 | \code{pairs_to_membership} and \code{pairs_to_clusters}.} 35 | 36 | \item{clust_ids}{a vector specifying the complete set of identifiers for 37 | the clusters in canonical order. Optional for all functions.} 38 | 39 | \item{membership}{a representation of a clustering as a membership vector, 40 | where the i-th entry contains the cluster identifier for the i-th element. 41 | If \code{elem_ids} is specified (see below), the i-th element is identified 42 | according to the corresponding entry in \code{elem_ids}. Otherwise the i-th 43 | element is identified according to its name (if \code{membership} is a named vector) 44 | or its integer index i.} 45 | 46 | \item{pairs}{a representation of a clustering as a matrix or data.frame 47 | containing all pairs of elements that are co-clustered. The rows 48 | of the matrix/data.frame index pairs and the columns index the identifiers 49 | of the constituent elements. The \code{elem_ids} argument (see below) must be 50 | specified in order to recover singleton clusters (containing a single 51 | element).} 52 | } 53 | \value{ 54 | \code{clusters_to_membership} and \code{pairs_to_membership} both return a 55 | membership vector representation of the clustering. The order of the 56 | elements is taken from \code{elem_ids} if specified, otherwise the elements are 57 | ordered lexicographically by their identifiers. For 58 | \code{pairs_to_membership}, the cluster identifiers cannot be recovered and 59 | are taken to be integers. 60 | 61 | \code{membership_to_clusters} and \code{pairs_to_clusters} both return a 62 | representation of the clustering as a list of vectors. 
The order of the 63 | clusters is taken from \code{clust_ids} if specified, otherwise the clusters 64 | are ordered lexicographically by their identifiers. For 65 | \code{pairs_to_clusters}, the cluster identifiers cannot be recovered and 66 | are taken to be integers. 67 | 68 | \code{clusters_to_pairs} and \code{membership_to_pairs} both return a 69 | representation of the clustering as a matrix of element pairs that are 70 | co-clustered. This representation results in loss of information, as 71 | singleton clusters (with one element) and cluster identifiers are not 72 | represented. 73 | } 74 | \description{ 75 | Transform between different representations of a clustering. 76 | } 77 | \examples{ 78 | ## A clustering of three items represented as a membership vector 79 | m <- c("Item1" = 1, "Item2" = 2, "Item3" = 1) 80 | 81 | # Transform to list of clusters 82 | membership_to_clusters(m) 83 | # Specify different identifiers for the items 84 | membership_to_clusters(m, elem_ids = c(1, 2, 3)) 85 | # Transform to array of pairs that are co-clustered 86 | membership_to_pairs(m) 87 | 88 | ## A clustering represented as a list of clusters 89 | cl <- list("ClustA" = c(1,3), "ClustB" = c(2)) 90 | 91 | # Transform to membership vector representation 92 | clusters_to_membership(cl) 93 | # Transform to array of pairs that are co-clustered 94 | clusters_to_pairs(cl) 95 | 96 | ## A clustering (incompletely) represented as an array of pairs that 97 | ## are co-clustered 98 | p <- rbind(c(1,3)) # pairs of elements in the same cluster 99 | ids <- c(1,2,3) # necessary to specify set of all elements 100 | 101 | # Transform to membership vector representation 102 | pairs_to_membership(p, ids) 103 | # Transform to list of clusters 104 | pairs_to_clusters(p, ids) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /tests/testthat/test-transformations.R: -------------------------------------------------------------------------------- 1 
| context("Clusters to membership vector") 2 | 3 | test_that("un-named list of integer vectors correctly transformed to membership vector", { 4 | clusters <- list(c(100L, 1L), c(2L)) 5 | clust_ids <- c("A", "B") 6 | elem_ids <- c(1L, 2L, 100L) 7 | expect_equal(clusters_to_membership(clusters), 8 | c("1" = 1L, "100" = 1L, "2" = 2L)) 9 | expect_equal(clusters_to_membership(clusters, elem_ids = elem_ids), 10 | c("1" = 1L, "2" = 2L, "100" = 1L)) 11 | expect_equal(clusters_to_membership(clusters, clust_ids = clust_ids), 12 | c("1" = "A", "100" = "A", "2" = "B")) 13 | expect_equal(clusters_to_membership(clusters, elem_ids = elem_ids, clust_ids = clust_ids), 14 | c("1" = "A", "2" = "B", "100" = "A")) 15 | }) 16 | 17 | test_that("un-named list of character vectors correctly transformed to membership vector", { 18 | clusters <- list(c("ELEM3", "ELEM1"), c("ELEM2")) 19 | clust_ids <- c("A", "B") 20 | elem_ids <- c("ELEM3", "ELEM2", "ELEM1") 21 | expect_equal(clusters_to_membership(clusters), 22 | c("ELEM1" = 1L, "ELEM2" = 2L, "ELEM3" = 1L)) 23 | expect_equal(clusters_to_membership(clusters, elem_ids = elem_ids), 24 | c("ELEM3" = 1L, "ELEM2" = 2L, "ELEM1" = 1L)) 25 | expect_equal(clusters_to_membership(clusters, clust_ids = clust_ids), 26 | c("ELEM1" = "A", "ELEM2" = "B", "ELEM3" = "A")) 27 | expect_equal(clusters_to_membership(clusters, elem_ids = elem_ids, clust_ids = clust_ids), 28 | c("ELEM3" = "A", "ELEM2" = "B", "ELEM1" = "A")) 29 | }) 30 | 31 | test_that("named list of integer vectors correctly transformed to membership vector", { 32 | clusters <- list("A" = c(100L, 1L), "B" = c(2L)) 33 | clust_ids <- c("A", "B") 34 | elem_ids <- c(1L, 2L, 100L) 35 | expect_equal(clusters_to_membership(clusters), 36 | c("1" = "A", "100" = "A", "2" = "B")) 37 | expect_equal(clusters_to_membership(clusters, elem_ids = elem_ids), 38 | c("1" = "A", "2" = "B", "100" = "A")) 39 | expect_equal(clusters_to_membership(clusters, clust_ids = clust_ids), 40 | c("1" = "A", "100" = "A", "2" = "B")) 
41 | expect_equal(clusters_to_membership(clusters, elem_ids = elem_ids, clust_ids = clust_ids), 42 | c("1" = "A", "2" = "B", "100" = "A")) 43 | }) 44 | 45 | 46 | context("Membership vector to clusters") 47 | 48 | test_that("un-named integer membership vector correctly transformed to list of vectors", { 49 | membership <- c(1L, 2L, 1L) 50 | clust_ids <- c(2L, 1L) 51 | elem_ids <- c(1L, 2L, 100L) 52 | expect_equal(membership_to_clusters(membership), 53 | list("1" = c(1L, 3L), "2" = 2L)) 54 | expect_equal(membership_to_clusters(membership, elem_ids = elem_ids), 55 | list("1" = c(1L, 100L), "2" = 2L)) 56 | expect_equal(membership_to_clusters(membership, clust_ids = clust_ids), 57 | list("2" = 2L, "1" = c(1L, 3L))) 58 | expect_equal(membership_to_clusters(membership, elem_ids = elem_ids, clust_ids = clust_ids), 59 | list("2" = 2L, "1" = c(1L, 100L))) 60 | }) 61 | 62 | test_that("un-named character membership vector correctly transformed to list of vectors", { 63 | membership <- c("B", "A", "B") 64 | clust_ids <- c("B", "A") 65 | elem_ids <- c(1L, 2L, 100L) 66 | expect_equal(membership_to_clusters(membership), 67 | list("A" = 2L, "B" = c(1L, 3L))) 68 | expect_equal(membership_to_clusters(membership, elem_ids = elem_ids), 69 | list("A" = 2L, "B" = c(1L, 100L))) 70 | expect_equal(membership_to_clusters(membership, clust_ids = clust_ids), 71 | list("B" = c(1L, 3L), "A" = 2L)) 72 | expect_equal(membership_to_clusters(membership, elem_ids = elem_ids, clust_ids = clust_ids), 73 | list("B" = c(1L, 100L), "A" = 2L)) 74 | }) 75 | 76 | test_that("named character membership vector correctly transformed to list of vectors", { 77 | membership <- c("1" = "B", "2" = "A", "100" = "B") 78 | clust_ids <- c("B", "A") 79 | elem_ids <- c(1L, 2L, 100L) 80 | expect_equal(membership_to_clusters(membership), 81 | list("A" = "2", "B" = c("1", "100"))) 82 | expect_equal(membership_to_clusters(membership, elem_ids = elem_ids), 83 | list("A" = 2L, "B" = c(1L, 100L))) 84 | 
expect_equal(membership_to_clusters(membership, clust_ids = clust_ids), 85 | list("B" = c("1", "100"), "A" = "2")) 86 | expect_equal(membership_to_clusters(membership, elem_ids = elem_ids, clust_ids = clust_ids), 87 | list("B" = c(1L, 100L), "A" = 2L)) 88 | }) 89 | 90 | 91 | context("Pairs to membership vector") 92 | 93 | test_that("integer matrix of pairs correctly transformed to membership vector", { 94 | pairs <- rbind(c(1L, 2L), c(1L, 3L), c(2L, 3L), c(4L, 5L)) 95 | elem_ids <- seq_len(5) 96 | expect_equal(pairs_to_membership(pairs, elem_ids), 97 | c("1" = 1, "2" = 1, "3" = 1, "4" = 2, "5" = 2)) 98 | }) 99 | 100 | test_that("special case of no pairs handled correctly", { 101 | pairs <- matrix(0L, nrow = 0, ncol = 2) 102 | elem_ids <- seq_len(5) 103 | expect_equal(pairs_to_membership(pairs, elem_ids), 104 | c("1" = 1, "2" = 2, "3" = 3, "4" = 4, "5" = 5)) 105 | }) 106 | 107 | test_that("character matrix of pairs correctly transformed to membership vector", { 108 | pairs <- rbind(c("A", "B"), c("B", "C"), c("A", "C"), c("D", "E")) 109 | elem_ids <- LETTERS[1:5] 110 | expect_equal(pairs_to_membership(pairs, elem_ids), 111 | c("A" = 1, "B" = 1, "C" = 1, "D" = 2, "E" = 2)) 112 | }) 113 | 114 | test_that("missing element identifiers in pairs produces a warning", { 115 | pairs <- rbind(c(NA, 2L), c(1L, 3L), c(2L, 3L)) 116 | elem_ids <- seq_len(5) 117 | expect_warning(pairs_to_membership(pairs, elem_ids)) 118 | }) 119 | 120 | test_that("missing element identifiers in `elem_ids` results in error", { 121 | pairs <- rbind(c(1L, 2L), c(1L, 3L), c(2L, 3L)) 122 | elem_ids <- c(1L, NA, 3L) 123 | expect_error(pairs_to_membership(pairs, elem_ids)) 124 | }) 125 | 126 | test_that("passing pairs with incorrect dimensions results in error", { 127 | pairs <- rbind(c(1L, 2L), c(1L, 3L), c(2L, 3L)) 128 | elem_ids <- c(1L, 2L, 3L) 129 | expect_error(pairs_to_membership(pairs[,0], elem_ids)) 130 | }) 131 | 132 | 133 | context("Canonicalize pairs") 134 | 135 | test_that("rows are ordered 
lexicographically by first column then second column", { 136 | pairs <- rbind(c(3,4), c(1,5), c(1,2)) 137 | expect_equal(canonicalize_pairs(pairs), 138 | rbind(c(1,2), c(1,5), c(3,4))) 139 | }) 140 | 141 | test_that("identifiers are ordered lexicographically within each row", { 142 | pairs <- rbind(c(4,3), c(1,5), c(2,1)) 143 | expect_equal(canonicalize_pairs(pairs), 144 | rbind(c(1,2), c(1,5), c(3,4))) 145 | }) 146 | 147 | test_that("duplicate pairs are removed", { 148 | pairs <- rbind(c(1,2), c(2,1)) 149 | expect_equal(canonicalize_pairs(pairs), 150 | rbind(c(1,2))) 151 | }) 152 | -------------------------------------------------------------------------------- /tests/testthat/test-measures_pairs.R: -------------------------------------------------------------------------------- 1 | 2 | context("Binary Contingency Table for Linked Pairs") 3 | 4 | test_that("pairwise contingency table is correct for a simple example", { 5 | pred_pairs <- rbind(c(1, 2), c(1, 3), c(4, 5)) 6 | true_pairs <- rbind(c(1, 2), c(1, 5)) 7 | result <- contingency_table_pairs(true_pairs, pred_pairs, num_pairs = 25) 8 | true_result <- rbind("TRUE" = c("TRUE" = 1,"FALSE" = 2), "FALSE" = c("TRUE" = 1, "FALSE" = 21)) 9 | true_result <- as.table(true_result) 10 | names(dimnames(true_result)) <- c("Prediction", "Truth") 11 | expect_equal(result, true_result) 12 | }) 13 | 14 | test_that("pairwise contingency table is correct when pairs are represented using different types", { 15 | pred_pairs <- rbind(c(2,17), c(16, 17), c(18, 23)) 16 | true_pairs <- pred_pairs 17 | storage.mode(true_pairs) <- "character" 18 | result <- contingency_table_pairs(true_pairs, pred_pairs) 19 | true_result <- rbind("TRUE" = c("TRUE" = 3,"FALSE" = 0), "FALSE" = c("TRUE" = 0, "FALSE" = NA)) 20 | true_result <- as.table(true_result) 21 | names(dimnames(true_result)) <- c("Prediction", "Truth") 22 | expect_equal(result, true_result) 23 | }) 24 | 25 | # Examples to test 26 | make_pairs_identical <- function() { 27 | true <- 
rbind(c(1, 2), c(1, 3), c(2, 3), c(4, 5)) 28 | pred <- rbind(c(1, 2), c(1, 3), c(2, 3), c(4, 5)) 29 | num_pairs <- 10 30 | measures <- list( 31 | "precision_pairs" = 1.0, 32 | "recall_pairs" = 1.0, 33 | "specificity_pairs" = 1.0, 34 | "sensitivity_pairs" = 1.0, 35 | "f_measure_pairs" = 1.0, 36 | "accuracy_pairs" = 1.0, 37 | "balanced_accuracy_pairs" = 1.0, 38 | "fowlkes_mallows_pairs" = 1.0 39 | ) 40 | list("true" = true, "pred" = pred, "num_pairs" = num_pairs, "true_measures" = measures, 41 | "description" = "pairs in complete agreement") 42 | } 43 | 44 | make_pairs_distinct <- function() { 45 | true <- rbind(c(1, 2), c(1, 3), c(2, 3)) 46 | pred <- rbind(c(1, 4), c(2, 4), c(3, 4)) 47 | num_pairs <- 6 48 | measures <- list( 49 | "precision_pairs" = 0.0, 50 | "recall_pairs" = 0.0, 51 | "specificity_pairs" = 0.0, 52 | "sensitivity_pairs" = 0.0, 53 | "f_measure_pairs" = 0.0, 54 | "accuracy_pairs" = 0.0, 55 | "balanced_accuracy_pairs" = 0.0, 56 | "fowlkes_mallows_pairs" = 0.0 57 | ) 58 | list("true" = true, "pred" = pred, "num_pairs" = num_pairs, "true_measures" = measures, 59 | "description" = "pairs in complete disagreement") 60 | } 61 | 62 | make_pairs_no_pred <- function() { 63 | true <- rbind(c(1, 2), c(1, 3), c(2, 3)) 64 | pred <- matrix(0L, nrow = 0, ncol = 2) 65 | num_pairs <- 3 66 | measures <- list( 67 | "precision_pairs" = NaN, 68 | "recall_pairs" = 0.0, 69 | "specificity_pairs" = NaN, 70 | "sensitivity_pairs" = 0.0, 71 | "f_measure_pairs" = NaN, 72 | "accuracy_pairs" = 0.0, 73 | "balanced_accuracy_pairs" = NaN, 74 | "fowlkes_mallows_pairs" = NaN 75 | ) 76 | list("true" = true, "pred" = pred, "num_pairs" = num_pairs, "true_measures" = measures, 77 | "description" = "pairs with zero recall") 78 | } 79 | 80 | make_pairs_one_fp <- function() { 81 | true <- rbind(c(1, 2), c(1, 3), c(2, 3), c(4, 5)) 82 | pred <- rbind(c(1, 2), c(1, 3), c(2, 3), c(4, 5), c(1, 4)) 83 | num_pairs <- 10 84 | measures <- list( 85 | "precision_pairs" = 4/5, 86 | "recall_pairs" = 1.0, 
87 | "specificity_pairs" = 5/6, 88 | "sensitivity_pairs" = 1.0, 89 | "f_measure_pairs" = 8/9, 90 | "accuracy_pairs" = 9/10, 91 | "balanced_accuracy_pairs" = 11/12, 92 | "fowlkes_mallows_pairs" = 2/sqrt(5) 93 | ) 94 | list("true" = true, "pred" = pred, "num_pairs" = num_pairs, "true_measures" = measures, 95 | "description" = "pairs with one false positive error") 96 | } 97 | 98 | make_pairs_no_true <- function() { 99 | true <- matrix(0L, nrow = 0, ncol = 2) 100 | pred <- rbind(c(1, 2), c(1, 3), c(2, 3)) 101 | num_pairs <- 3 102 | measures <- list( 103 | "precision_pairs" = 0.0, 104 | "recall_pairs" = NaN, 105 | "specificity_pairs" = 0.0, 106 | "sensitivity_pairs" = NaN, 107 | "f_measure_pairs" = NaN, 108 | "accuracy_pairs" = 0.0, 109 | "balanced_accuracy_pairs" = NaN, 110 | "fowlkes_mallows_pairs" = NaN 111 | ) 112 | list("true" = true, "pred" = pred, "num_pairs" = num_pairs, "true_measures" = measures, 113 | "description" = "pairs with zero precision") 114 | } 115 | 116 | examples_to_test <- list(make_pairs_identical, 117 | make_pairs_distinct, 118 | make_pairs_no_pred, 119 | make_pairs_no_true, 120 | make_pairs_one_fp) 121 | 122 | 123 | context("Precision of Linked Pairs") 124 | for (example in examples_to_test) { 125 | example <- example() 126 | test_that(paste("measure is correct for", example$description), { 127 | true <- example$true 128 | pred <- example$pred 129 | expect_equal(precision_pairs(true, pred), 130 | example$true_measures[["precision_pairs"]]) 131 | }) 132 | } 133 | 134 | context("Recall of Linked Pairs") 135 | for (example in examples_to_test) { 136 | example <- example() 137 | test_that(paste("measure is correct for", example$description), { 138 | true <- example$true 139 | pred <- example$pred 140 | expect_equal(recall_pairs(true, pred), 141 | example$true_measures[["recall_pairs"]]) 142 | }) 143 | } 144 | 145 | context("Specificity of Linked Pairs") 146 | for (example in examples_to_test) { 147 | example <- example() 148 | 
test_that(paste("measure is correct for", example$description), { 149 | true <- example$true 150 | pred <- example$pred 151 | num_pairs <- example$num_pairs 152 | expect_equal(specificity_pairs(true, pred, num_pairs), 153 | example$true_measures[["specificity_pairs"]]) 154 | }) 155 | } 156 | 157 | context("Sensitivity of Linked Pairs") 158 | for (example in examples_to_test) { 159 | example <- example() 160 | test_that(paste("measure is correct for", example$description), { 161 | true <- example$true 162 | pred <- example$pred 163 | expect_equal(sensitivity_pairs(true, pred), 164 | example$true_measures[["sensitivity_pairs"]]) 165 | }) 166 | } 167 | 168 | context("F-Measure of Linked Pairs") 169 | for (example in examples_to_test) { 170 | example <- example() 171 | test_that(paste("measure is correct for", example$description), { 172 | true <- example$true 173 | pred <- example$pred 174 | expect_equal(f_measure_pairs(true, pred), 175 | example$true_measures[["f_measure_pairs"]]) 176 | }) 177 | } 178 | 179 | context("Accuracy of Linked Pairs") 180 | for (example in examples_to_test) { 181 | example <- example() 182 | test_that(paste("measure is correct for", example$description), { 183 | true <- example$true 184 | pred <- example$pred 185 | num_pairs <- example$num_pairs 186 | expect_equal(accuracy_pairs(true, pred, num_pairs), 187 | example$true_measures[["accuracy_pairs"]]) 188 | }) 189 | } 190 | 191 | context("Balanced Accuracy of Linked Pairs") 192 | for (example in examples_to_test) { 193 | example <- example() 194 | test_that(paste("measure is correct for", example$description), { 195 | true <- example$true 196 | pred <- example$pred 197 | num_pairs <- example$num_pairs 198 | expect_equal(balanced_accuracy_pairs(true, pred, num_pairs), 199 | example$true_measures[["balanced_accuracy_pairs"]]) 200 | }) 201 | } 202 | 203 | context("Fowlkes-Mallows Index of Linked Pairs") 204 | for (example in examples_to_test) { 205 | example <- example() 206 | 
test_that(paste("measure is correct for", example$description), { 207 | true <- example$true 208 | pred <- example$pred 209 | expect_equal(fowlkes_mallows_pairs(true, pred), 210 | example$true_measures[["fowlkes_mallows_pairs"]]) 211 | }) 212 | } 213 | -------------------------------------------------------------------------------- /R/transformations.R: -------------------------------------------------------------------------------- 1 | #' Transform Clustering Representations 2 | #' 3 | #' @description 4 | #' Transform between different representations of a clustering. 5 | #' 6 | #' @param clusters a representation of a clustering as a list of vectors, 7 | #' where the i-th vector contains the identifiers of elements assigned to the 8 | #' i-th cluster. If `clust_ids` is specified (see below), the i-th cluster 9 | #' is identified according to the corresponding entry in `clust_ids`. 10 | #' Otherwise the i-th cluster is identified according to its name (if 11 | #' `clusters` is a named list) or its integer index i. 12 | #' @param membership a representation of a clustering as a membership vector, 13 | #' where the i-th entry contains the cluster identifier for the i-th element. 14 | #' If `elem_ids` is specified (see below), the i-th element is identified 15 | #' according to the corresponding entry in `elem_ids`. Otherwise the i-th 16 | #' element is identified according to its name (if `membership` is a named vector) 17 | #' or its integer index i. 18 | #' @param pairs a representation of a clustering as a matrix or data.frame 19 | #' containing all pairs of elements that are co-clustered. The rows 20 | #' of the matrix/data.frame index pairs and columns index the identifiers 21 | #' of the constituent elements. The `elem_ids` argument (see below) must be 22 | #' specified in order to recover singleton clusters (containing a single 23 | #' element).
24 | #' @param elem_ids a vector specifying the complete set of identifiers for the 25 | #' cluster elements in canonical order. Optional for all functions excluding 26 | #' `pairs_to_membership` and `pairs_to_clusters`. 27 | #' @param clust_ids a vector specifying the complete set of identifiers for 28 | #' the clusters in canonical order. Optional for all functions. 29 | #' @return `clusters_to_membership` and `pairs_to_membership` both return a 30 | #' membership vector representation of the clustering. The order of the 31 | #' elements is taken from `elem_ids` if specified, otherwise the elements are 32 | #' ordered lexicographically by their identifiers. For 33 | #' `pairs_to_membership`, the cluster identifiers cannot be recovered and 34 | #' are taken to be integers. 35 | #' 36 | #' `membership_to_clusters` and `pairs_to_clusters` both return a 37 | #' representation of the clustering as a list of vectors. The order of the 38 | #' clusters is taken from `clust_ids` if specified, otherwise the clusters 39 | #' are ordered lexicographically by their identifiers. For 40 | #' `pairs_to_clusters`, the cluster identifiers cannot be recovered and 41 | #' are taken to be integers. 42 | #' 43 | #' `clusters_to_pairs` and `membership_to_pairs` both return a 44 | #' representation of the clustering as a matrix of element pairs that are 45 | #' co-clustered. This representation results in loss of information, as 46 | #' singleton clusters (with one element) and cluster identifiers are not 47 | #' represented. 
#'
#' @examples
#' ## A clustering of three items represented as a membership vector
#' m <- c("Item1" = 1, "Item2" = 2, "Item3" = 1)
#'
#' # Transform to list of clusters
#' membership_to_clusters(m)
#' # Specify different identifiers for the items
#' membership_to_clusters(m, elem_ids = c(1, 2, 3))
#' # Transform to array of pairs that are co-clustered
#' membership_to_pairs(m)
#'
#' ## A clustering represented as a list of clusters
#' cl <- list("ClustA" = c(1,3), "ClustB" = c(2))
#'
#' # Transform to membership vector representation
#' clusters_to_membership(cl)
#' # Transform to array of pairs that are co-clustered
#' clusters_to_pairs(cl)
#'
#' ## A clustering (incompletely) represented as an array of pairs that
#' ## are co-clustered
#' p <- rbind(c(1,3)) # pairs of elements in the same cluster
#' ids <- c(1,2,3) # necessary to specify set of all elements
#'
#' # Transform to membership vector representation
#' pairs_to_membership(p, ids)
#' # Transform to list of clusters
#' pairs_to_clusters(p, ids)
#'
#' @export
#' @importFrom stats na.fail
#' @rdname clustering_representations
clusters_to_membership <- function(clusters, elem_ids = NULL, clust_ids = NULL) {
  # Convert a list of clusters into a named membership vector: names are the
  # element ids (as character), values are the cluster ids.
  if (is.null(clust_ids)) {
    # Infer clust_ids from names first, falling back to integer ids
    clust_ids <- if (!is.null(names(clusters))) {
      names(clusters)
    } else {
      seq_along(clusters)
    }
  } else {
    # Check provided clust_ids for consistency
    if (length(clust_ids) != length(clusters)) {
      stop("`clust_ids` must be the same length as `clusters`", call. = FALSE)
    }
    if (anyNA(clust_ids)) {
      stop("`clust_ids` cannot contain NA values", call. = FALSE)
    }
  }

  # lengths() is type-stable: it returns integer(0) for an empty list, whereas
  # sapply(clusters, length) returns list() and breaks sum() below
  clust_sizes <- lengths(clusters, use.names = FALSE)

  if (!is.null(elem_ids)) {
    if (sum(clust_sizes) != length(elem_ids)) {
      stop("`elem_ids` does not match number of elements in `clusters`",
           call. = FALSE)
    }
    if (anyNA(elem_ids)) {
      stop("`elem_ids` cannot contain NA values", call. = FALSE)
    }
  }

  membership <- rep(clust_ids, times = clust_sizes)
  names(membership) <- as.character(unlist(clusters, use.names = FALSE))

  if (is.null(elem_ids)) {
    # Order lexicographically by (character) element id
    return(membership[order(names(membership))])
  }

  # Use order in elem_ids. Guard against ids that are absent from `clusters`,
  # which would otherwise be returned silently as NA entries.
  elem_names <- as.character(elem_ids)
  if (!all(elem_names %in% names(membership))) {
    stop("`elem_ids` contains identifiers that do not appear in `clusters`",
         call. = FALSE)
  }
  membership[elem_names]
}


#' @importFrom stats na.fail
#' @export
#' @rdname clustering_representations
membership_to_clusters <- function(membership, elem_ids = NULL,
                                   clust_ids = NULL) {
  # Convert a membership vector into a named list of clusters, where each
  # entry holds the ids of the elements assigned to that cluster.
  if (is.null(elem_ids)) {
    # Infer elem_ids from names first, falling back to integer ids
    elem_ids <- if (!is.null(names(membership))) {
      names(membership)
    } else {
      seq_along(membership)
    }
  } else {
    # Check provided elem_ids for consistency
    if (length(elem_ids) != length(membership)) {
      stop("`elem_ids` must be the same length as `membership`", call. = FALSE)
    }
    if (anyNA(elem_ids)) {
      stop("`elem_ids` cannot contain NA values", call. = FALSE)
    }
  }

  clusters <- split(elem_ids, membership)

  if (is.null(clust_ids)) {
    # Order lexicographically by (character) cluster id
    return(clusters[order(names(clusters))])
  }

  # Use order in clust_ids, but first check consistency
  if (anyNA(clust_ids)) {
    stop("`clust_ids` cannot contain NA values", call. = FALSE)
  }
  clusters[as.character(clust_ids)]
}


#' @importFrom utils combn
#' @export
#' @rdname clustering_representations
clusters_to_pairs <- function(clusters) {
  # Enumerate all unordered pairs of elements that share a cluster and return
  # them as a two-column matrix in canonical form.
  non_singletons <- Filter(function(x) length(x) > 1, clusters)

  if (length(non_singletons) == 0) {
    # No pairs to return: either there are no clusters at all (integer ids are
    # assumed) or every cluster is a singleton (keep the element id type)
    id_type <- if (length(clusters) == 0) "integer" else typeof(clusters[[1]])
    return(matrix(vector(mode = id_type), nrow = 0, ncol = 2))
  }

  # Make ? x 2 array of pairs for each cluster and store in a list
  pairs <- lapply(non_singletons, function(x) t(combn(x, 2)))
  # Merge pairs from all clusters into single ? x 2 array. The list is unnamed
  # first so that cluster names can never be matched to arguments of rbind()
  # (e.g. a cluster called "deparse.level").
  pairs <- do.call(rbind, unname(pairs))

  canonicalize_pairs(pairs)
}


#' @export
#' @rdname clustering_representations
membership_to_pairs <- function(membership, elem_ids = NULL) {
  # Go via the list-of-clusters representation
  clusters_to_pairs(membership_to_clusters(membership, elem_ids = elem_ids))
}


#' @importFrom stats na.fail na.omit na.action
#' @export
#' @rdname clustering_representations
pairs_to_membership <- function(pairs, elem_ids) {
  # Recover a membership vector (named by element id, in the order given by
  # `elem_ids`) from the matrix of co-clustered pairs.

  # Need to convert to matrix in order for factor to work below
  pairs <- as.matrix(pairs)
  if (ncol(pairs) != 2) {
    stop("`pairs` must have exactly two columns", call. = FALSE)
  }

  pairs <- na.omit(pairs)
  if (length(na.action(pairs)) != 0) {
    warning("rows with NA values were removed from `pairs`")
  }

  if (anyNA(elem_ids)) {
    stop("`elem_ids` cannot contain NA values", call. = FALSE)
  }
  char_elem_ids <- as.character(elem_ids)
  # Duplicates would make length(elem_ids) disagree with the number of
  # distinct elements that are named below
  if (anyDuplicated(char_elem_ids) > 0) {
    stop("`elem_ids` cannot contain duplicate values", call. = FALSE)
  }

  # Transform pairs so that elem_ids are represented as integers starting at 0.
  # The levels are saved before any arithmetic so that the mapping back to the
  # original ids does not depend on attributes surviving later operations.
  pairs_fct <- factor(pairs)
  pairs_elem_ids <- levels(pairs_fct)

  # Every id used in `pairs` must be a known element; otherwise the index
  # matrix would refer to more records than are passed to the native routine
  if (!all(pairs_elem_ids %in% char_elem_ids)) {
    stop("`pairs` contains element identifiers that are not in `elem_ids`",
         call. = FALSE)
  }

  pairs_idx <- matrix(as.integer(pairs_fct) - 1, nrow = nrow(pairs), ncol = 2)

  # R indexing starts at 1
  membership <- pairs_to_membership_cpp(pairs_idx, length(elem_ids)) + 1

  # Fill names with elem_ids: ids that occur in pairs come first (in factor
  # level order), followed by the remaining ids (singletons)
  singleton_elem_ids <- setdiff(char_elem_ids, pairs_elem_ids)
  names(membership) <- c(pairs_elem_ids, singleton_elem_ids)

  # Sort according to elem_ids
  membership[char_elem_ids]
}


#' @export
#' @rdname clustering_representations
pairs_to_clusters <- function(pairs, elem_ids) {
  membership <- pairs_to_membership(pairs, elem_ids)
  clusters <- membership_to_clusters(membership, elem_ids = elem_ids)
  # Cluster identifiers cannot be recovered from pairs, so drop the names
  unname(clusters)
}


#' Canonicalize element pairs
#'
#' @description
#' Coerce a collection of element pairs into canonical form. Facilitates
#' testing of equivalence.
#'
#' @param pairs a matrix or data.frame of element pairs where rows correspond
#'   to element pairs and columns correspond to element identifiers.
#' @param ordered whether to treat the element pairs as ordered---i.e. whether
#'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
#'   Defaults to FALSE, which is appropriate for clustering, undirected link
#'   prediction, record linkage etc.
#' @return Returns the element pairs in canonical form, so that:
#' * the first element id precedes the second element id lexicographically
#'   if `ordered = FALSE`---i.e. pair (3, 2) becomes pair (2, 3);
#' * duplicate pairs are removed; and
#' * the rows in the matrix/data.frame pairs are sorted lexicographically
#'   by the first element id, then by the second element id.
#'
#' @examples
#' messy_pairs <- rbind(c(2,1), c(1,2), c(3,1), c(1,2))
#' clean_pairs <- canonicalize_pairs(messy_pairs)
#' all(rbind(c(1,2), c(1,3)) == clean_pairs) # duplicates removed and order fixed
#'
#' @export
canonicalize_pairs <- function(pairs, ordered = FALSE) {
  # Coerce before checking the shape so that inputs without a `dim` attribute
  # (e.g. a plain vector) get the informative error below
  pairs <- as.matrix(pairs)
  if (ncol(pairs) != 2) {
    stop("`pairs` must have exactly two columns", call. = FALSE)
  }

  if (nrow(pairs) == 0) return(pairs)

  # Put the smaller identifier first in each row. Done with a vectorised swap
  # rather than a row-wise sort(): sort() drops NA values (giving ragged
  # results) and carries column names along with the values.
  if (!ordered) {
    swap <- pairs[, 1] > pairs[, 2]
    swap[is.na(swap)] <- FALSE  # rows with a missing id are left as they are
    pairs[swap, ] <- pairs[swap, 2:1, drop = FALSE]
  }

  # Remove duplicate rows
  pairs <- unique(pairs)

  # Sort rows lexicographically, by first column then second
  row_order <- order(pairs[, 1], pairs[, 2])
  pairs[row_order, , drop = FALSE]
}

-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too.
20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. 
To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 
86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.

END OF TERMS AND CONDITIONS

--------------------------------------------------------------------------------
/R/measures_clusterings.R:
--------------------------------------------------------------------------------

#' Pairwise contingency table for two clusterings
#'
#' Cross-classifies ordered pairs of distinct elements by whether the two
#' elements share a cluster in `pred` and whether they share a cluster in
#' `true`. Because pairs are ordered, every unordered pair is counted twice.
#'
#' @param true ground truth clustering represented as a membership vector.
#' @param pred predicted clustering represented as a membership vector.
#'   Must have the same length as `true`.
#' @return A 2 x 2 table with dimensions `Prediction` and `Truth`, each with
#'   levels `"TRUE"` and `"FALSE"`.
#' @importFrom stats xtabs
#' @importFrom Matrix rowSums colSums crossprod
#' @noRd
pair_contingency_table_clusters <- function(true, pred) {
  if (length(true) != length(pred)) {
    stop("`true` and `pred` must have the same length")
  }

  # TODO: NA treatment
  # The column names match the variables in the formula so that they are
  # looked up in `data` rather than in the enclosing environment.
  data <- data.frame("true" = true, "pred" = pred, stringsAsFactors = FALSE)
  ct <- xtabs(~ pred + true, data = data, sparse = TRUE)

  sizes_true <- colSums(ct)
  sizes_pred <- rowSums(ct)
  sum_squares <- sum(ct^2)
  num_items <- length(true)

  # Ordered pairs (self-pairs included) that share a cluster in `true` and
  # in `pred` respectively. Equal to sum(ct %*% sizes_true) and
  # sum(crossprod(ct, sizes_pred)), without the matrix products.
  same_true <- sum(sizes_true^2)
  same_pred <- sum(sizes_pred^2)

  tp <- sum_squares - num_items                # together in both
  fn <- same_true - sum_squares                # together in `true` only
  fp <- same_pred - sum_squares                # together in `pred` only
  tn <- num_items^2 - fp - fn - sum_squares    # together in neither

  # Filled column-wise: column 1 is Truth = TRUE, column 2 is Truth = FALSE
  pair_ct <- matrix(
    c(tp, fn, fp, tn),
    nrow = 2, ncol = 2,
    dimnames = list("Prediction" = c("TRUE", "FALSE"),
                    "Truth" = c("TRUE", "FALSE"))
  )
  as.table(pair_ct)
}


#' Contingency Table for Clusterings
#'
#'
#' @description Compute the contingency table for a _predicted_ clustering
#' given a _ground truth_ clustering.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#' @return Returns a table \eqn{C} (stored as a sparse matrix) such that
#' \eqn{C_{ij}}{C_ij} counts the number of elements assigned to
#' cluster \eqn{i} in `pred` and cluster \eqn{j} in `true`.
#'
#' @seealso
#' [`eval_report_clusters`] computes common evaluation measures derived
#' from the output of this function.
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' contingency_table_clusters(true, pred)
#'
#' @export
#' @importFrom stats xtabs
contingency_table_clusters <- function(true, pred) {
  if (length(true) != length(pred)) {
    stop("`true` and `pred` must have the same length")
  }

  # TODO: NA treatment
  memberships <- data.frame(true = true, pred = pred, stringsAsFactors = FALSE)

  # Rows index the clusters of `pred`, columns the clusters of `true`
  xtabs(~ pred + true, data = memberships, sparse = TRUE)
}


#' Evaluation Report for Clustering
#'
#' @description Compute various evaluation measures for a predicted
#' clustering using a ground truth clustering as a reference.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#' @return Returns a list containing the following measures:
#' \describe{
#'   \item{homogeneity}{see [`homogeneity`]}
#'   \item{completeness}{see [`completeness`]}
#'   \item{v_measure}{see [`v_measure`]}
#'   \item{rand_index}{see [`rand_index`]}
#'   \item{adj_rand_index}{see [`adj_rand_index`]}
#'   \item{variation_info}{see [`variation_info`]}
#'   \item{mutual_info}{see [`mutual_info`]}
#'   \item{fowlkes_mallows}{see [`fowlkes_mallows`]}
#' }
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' eval_report_clusters(true, pred)
#'
#' @export
eval_report_clusters <- function(true, pred) {
  # Build each table once and share it between the measures that need it
  cluster_table <- contingency_table_clusters(true, pred)
  pair_table <- pair_contingency_table_clusters(true, pred)

  list(
    homogeneity = homogeneity_ct(cluster_table),
    completeness = completeness_ct(cluster_table),
    v_measure = v_measure_ct(cluster_table),
    rand_index = rand_index_ct(pair_table),
    adj_rand_index = adj_rand_index_ct(pair_table),
    variation_info = variation_info_ct(cluster_table),
    mutual_info = mutual_info_ct(cluster_table),
    fowlkes_mallows = fowlkes_mallows_ct(cluster_table)
  )
}


#' Rand Index Between Clusterings
#'
#' @description Computes the Rand index (RI) between two clusterings, such
#' as a predicted and ground truth clustering.
#'
#' @details The Rand index (RI) can be expressed as:
#' \deqn{\frac{a + b}{{n \choose 2}}.}{(a + b)/binom(n, 2).}
#' where
#' * \eqn{n} is the number of elements,
#' * \eqn{a} is the number of pairs of elements that appear in the
#' same cluster in both clusterings, and
#' * \eqn{b} is the number of pairs of elements that appear in distinct
#' clusters in both clusterings.
#'
#' The RI takes on values between 0 and 1, where 1 denotes exact agreement
#' between the clusterings and 0 denotes disagreement on all pairs of
#' elements.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#'
#' @references
#' Rand, W. M. "Objective Criteria for the Evaluation of Clustering Methods."
#' _Journal of the American Statistical Association_ 66(336), 846-850 (1971).
#' \doi{10.1080/01621459.1971.10482356}
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' rand_index(true, pred)
#'
#' @export
rand_index <- function(true, pred) {
  # Score the table of pair counts built from the two membership vectors
  rand_index_ct(pair_contingency_table_clusters(true, pred))
}


#' Adjusted Rand Index Between Clusterings
#'
#' @description Computes the adjusted Rand index (ARI) between two clusterings,
#' such as a predicted and ground truth clustering.
#'
#' @details The adjusted Rand index (ARI) is a variant of the Rand index (RI)
#' which is corrected for chance using the Permutation Model for
#' clusterings. It is related to the RI as follows:
#' \deqn{\frac{RI - E(RI)}{1 - E(RI)},}{(RI - E(RI))/(1 - E(RI)),}
#' where \eqn{E(RI)} is the expected value of the RI under the Permutation
#' Model.
#' Unlike the RI, the ARI takes values in the range -1 to 1. A value
#' of 1 indicates that the clusterings are identical, while a value of
#' 0 indicates the clusterings are drawn randomly independent of one
#' another.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' adj_rand_index(true, pred)
#'
#' @references
#' Hubert, L., Arabie, P. "Comparing partitions." _Journal of Classification_
#' **2**, 193–218 (1985). \doi{10.1007/BF01908075}
#'
#' @export
adj_rand_index <- function(true, pred) {
  # Score the table of pair counts built from the two membership vectors
  adj_rand_index_ct(pair_contingency_table_clusters(true, pred))
}


#' Fowlkes-Mallows Index Between Clusterings
#'
#' @description Computes the Fowlkes-Mallows index between two clusterings,
#' such as a predicted and ground truth clustering.
#'
#' @details The Fowlkes-Mallows index is defined as the geometric mean of
#' precision and recall, computed with respect to pairs of elements.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#'
#' @references
#' Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical
#' Clusterings." _Journal of the American Statistical Association_ **78:383**,
#' 553-569, (1983).
#' \doi{10.1080/01621459.1983.10478008}
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' fowlkes_mallows(true, pred)
#'
#' @export
fowlkes_mallows <- function(true, pred) {
  # Score the cluster-by-cluster contingency table of the two clusterings
  fowlkes_mallows_ct(contingency_table_clusters(true, pred))
}



#' Homogeneity Between Clusterings
#'
#' @description Computes the homogeneity between two clusterings, such
#' as a predicted and ground truth clustering.
#'
#' @details Homogeneity is an entropy-based measure of the similarity
#' between two clusterings, say \eqn{t} and \eqn{p}. The homogeneity
#' is high if clustering \eqn{t} only assigns members of a cluster to
#' a single cluster in \eqn{p}. The homogeneity ranges between 0
#' and 1, where 1 indicates a perfect homogeneity.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#'
#' @references
#' Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based
#' external cluster evaluation measure." _Proceedings of the 2007 Joint
#' Conference on Empirical Methods in Natural Language Processing and
#' Computational Natural Language Learning_ (EMNLP-CoNLL), (2007).
#'
#' @seealso [`completeness`] evaluates the _completeness_, which is a dual
#' measure to _homogeneity_. [`v_measure`] evaluates the harmonic mean of
#' _completeness_ and _homogeneity_.
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' homogeneity(true, pred)
#'
#' @export
homogeneity <- function(true, pred) {
  # Score the cluster-by-cluster contingency table of the two clusterings
  homogeneity_ct(contingency_table_clusters(true, pred))
}


#' Completeness Between Clusterings
#'
#' @description Computes the completeness between two clusterings, such
#' as a predicted and ground truth clustering.
#'
#' @details Completeness is an entropy-based measure of the similarity
#' between two clusterings, say \eqn{t} and \eqn{p}. The completeness
#' is high if _all_ members of a given cluster in \eqn{t} are assigned
#' to a single cluster in \eqn{p}. The completeness ranges between 0
#' and 1, where 1 indicates perfect completeness.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#'
#' @references
#' Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based
#' external cluster evaluation measure." _Proceedings of the 2007 Joint
#' Conference on Empirical Methods in Natural Language Processing and
#' Computational Natural Language Learning_ (EMNLP-CoNLL), (2007).
#'
#' @seealso [`homogeneity`] evaluates the _homogeneity_, which is a dual
#' measure to _completeness_. [`v_measure`] evaluates the harmonic mean of
#' _completeness_ and _homogeneity_.
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' completeness(true, pred)
#'
#' @export
completeness <- function(true, pred) {
  # Score the cluster-by-cluster contingency table of the two clusterings
  completeness_ct(contingency_table_clusters(true, pred))
}


#' V-measure Between Clusterings
#'
#' @description Computes the V-measure between two clusterings, such
#' as a predicted and ground truth clustering.
#'
#' @details V-measure is defined as the \eqn{\beta}{β}-weighted harmonic
#' mean of homogeneity \eqn{h} and completeness \eqn{c}:
#' \deqn{(1 + \beta)\frac{h \cdot c}{\beta \cdot h + c}.}{(1 + β)·h·c/(β·h + c).}
#' The range of V-measure is between 0 and 1, where 1 corresponds to a
#' perfect match between the clusterings. It is equivalent to the
#' normalised mutual information, when the aggregation function is the
#' arithmetic mean.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#' @param beta non-negative weight. A value of 0 assigns no weight to
#' completeness (i.e. the measure reduces to homogeneity), while larger
#' values assign increasing weight to completeness. A value of 1 weights
#' completeness and homogeneity equally.
#'
#' @references
#' Rosenberg, A. and Hirschberg, J. "V-measure: A conditional entropy-based
#' external cluster evaluation measure." _Proceedings of the 2007 Joint
#' Conference on Empirical Methods in Natural Language Processing and
#' Computational Natural Language Learning_ (EMNLP-CoNLL), (2007).
#'
#' Becker, H. "Identification and characterization of events in social media."
#' _PhD dissertation_, Columbia University, (2011).
#'
#' @seealso [`homogeneity`] and [`completeness`] evaluate the component
#' measures upon which this measure is based.
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' v_measure(true, pred)
#'
#' @export
v_measure <- function(true, pred, beta = 1) {
  # Score the cluster-by-cluster contingency table, forwarding the weight
  v_measure_ct(contingency_table_clusters(true, pred), beta = beta)
}


#' Variation of Information Between Clusterings
#'
#' @description Computes the variation of information between two
#' clusterings, such as a predicted and ground truth clustering.
#'
#' @details Variation of information is an entropy-based distance metric
#' on the space of clusterings. It is unnormalized and varies between
#' \eqn{0} and \eqn{\log(N)}{log(N)} where \eqn{N} is the number of
#' clustered elements. Larger values of the distance metric correspond
#' to greater dissimilarity between the clusterings.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#' @param base base of the logarithm. Defaults to `exp(1)`.
#'
#' @references
#' Arabie, P. and Boorman, S. A. "Multidimensional scaling of measures of
#' distance between partitions." _Journal of Mathematical Psychology_ **10:2**,
#' 148-203, (1973). \doi{10.1016/0022-2496(73)90012-6}
#'
#' Meilă, M. "Comparing Clusterings by the Variation of Information."
#' In:
#' Learning Theory and Kernel Machines, Lecture Notes in Computer Science
#' **2777**, Springer, Berlin, Heidelberg, (2003).
#' \doi{10.1007/978-3-540-45167-9_14}
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' variation_info(true, pred)
#'
#' @export
variation_info <- function(true, pred, base = exp(1)) {
  # Score the cluster-by-cluster contingency table, forwarding the log base
  variation_info_ct(contingency_table_clusters(true, pred), base = base)
}


#' Mutual Information Between Clusterings
#'
#' @description Computes the mutual information between two
#' clusterings, such as a predicted and ground truth clustering.
#'
#' @details Mutual information is an entropy-based measure of the similarity
#' between two clusterings.
#'
#' @param true ground truth clustering represented as a membership
#' vector. Each entry corresponds to an element and the value identifies
#' the assigned cluster. The specific values of the cluster identifiers
#' are arbitrary.
#' @param pred predicted clustering represented as a membership
#' vector.
#' @param base base of the logarithm. Defaults to `exp(1)`.
#'
#' @examples
#' true <- c(1,1,1,2,2) # ground truth clustering
#' pred <- c(1,1,2,2,2) # predicted clustering
#' mutual_info(true, pred)
#'
#' @export
mutual_info <- function(true, pred, base=exp(1)) {
  ct <- contingency_table_clusters(true, pred)
  mutual_info_ct(ct, base=base)
}


# Definition of clustering measures in terms of contingency tables

#' Rand index from a pairwise contingency table
#'
#' @param pair_ct 2 x 2 table of pair counts whose diagonal holds the pairs
#'   on which the two clusterings agree.
#' @return The fraction of pairs on the diagonal, or 1 when the table is
#'   empty.
#' @noRd
rand_index_ct <- function(pair_ct) {
  correct <- sum(diag(pair_ct))
  total <- sum(pair_ct)

  # Special cases: no clustering since the data is not split;
  # or trivial clustering where each item is assigned a unique
  # cluster. These are perfect matches hence return 1.0.
  if (correct == total || total == 0) {
    return(1.0)
  }

  correct / total
}

#' Adjusted Rand index from a pairwise contingency table
#'
#' @param pair_ct 2 x 2 table of pair counts with both dimensions labelled
#'   `"TRUE"`/`"FALSE"`; rows are looked up as the prediction and columns
#'   as the truth.
#' @return The adjusted Rand index, or 1 when there are no disagreeing
#'   pairs.
#' @noRd
adj_rand_index_ct <- function(pair_ct) {
  tp <- pair_ct["TRUE", "TRUE"]
  fp <- pair_ct["TRUE", "FALSE"]
  fn <- pair_ct["FALSE", "TRUE"]
  tn <- pair_ct["FALSE", "FALSE"]

  # Special cases: empty data or full agreement
  if (fn == 0 && fp == 0) {
    return(1.0)
  }

  2 * (tp * tn - fn * fp) /
    ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))
}


#' Homogeneity from a contingency table
#'
#' @param ct contingency table represented as a sparse matrix, specifically
#' an object of S4 class [`Matrix::dgCMatrix-class`]
#' @return Mutual information divided by the entropy of the column totals,
#'   or 1 when that entropy is zero.
#' @importFrom Matrix colSums
#' @noRd
homogeneity_ct <- function(ct) {
  true_counts <- colSums(ct)
  entropy <- entropy_counts(true_counts)
  if (entropy == 0) return(1.0)
  mi <- mutual_info_ct(ct)
  mi / entropy
}


#' Completeness from a contingency table
#'
#' @param ct contingency table represented as a sparse matrix, specifically
#' an object of S4 class [`Matrix::dgCMatrix-class`]
#' @return Mutual information divided by the entropy of the row totals,
#'   or 1 when that entropy is zero.
#' @importFrom Matrix rowSums
#' @noRd
completeness_ct <- function(ct) {
  pred_counts <- rowSums(ct)
  entropy <- entropy_counts(pred_counts)
  if (entropy == 0) return(1.0)
  mi <- mutual_info_ct(ct)
  mi / entropy
}


#' Fowlkes-Mallows index from a contingency table
#'
#' @param ct contingency table represented as a sparse matrix, specifically
#' an object of S4 class [`Matrix::dgCMatrix-class`]
#' @return `sqrt(tk / pk) * sqrt(tk / qk)`, where `tk`, `pk` and `qk` are the
#'   sums of squared cell counts, row totals and column totals, each minus
#'   the number of elements; 0 when `tk` is 0.
#' @importFrom Matrix rowSums colSums which
#' @noRd
fowlkes_mallows_ct <- function(ct) {
  n <- sum(ct)
  tk <- sum(ct^2) - n
  pk <- sum(rowSums(ct)^2) - n
  qk <- sum(colSums(ct)^2) - n
  # Scalar condition: plain if/else rather than the vectorised ifelse()
  if (tk == 0) 0.0 else sqrt(tk / pk) * sqrt(tk / qk)
}


#' V-measure from a contingency table
#'
#' Evaluates \eqn{(1 + \beta) h c / (\beta h + c)} for homogeneity \eqn{h}
#' and completeness \eqn{c}.
#'
#' @param ct contingency table represented as a sparse matrix, specifically
#' an object of S4 class [`Matrix::dgCMatrix-class`]
#' @param beta non-negative weight given to completeness relative to
#'   homogeneity. Defaults to 1 (equal weight).
#' @return The V-measure; 0 when `beta * h + c` is zero.
#' @importFrom Matrix rowSums colSums which
#' @noRd
v_measure_ct <- function(ct, beta=1.0) {
  true_counts <- colSums(ct)
  pred_counts <- rowSums(ct)
  entropy_true <- entropy_counts(true_counts)
  entropy_pred <- entropy_counts(pred_counts)
  mi <- mutual_info_ct(ct)
  homogeneity <- if (entropy_true == 0) 1.0 else mi / entropy_true
  completeness <- if (entropy_pred == 0) 1.0 else mi / entropy_pred

  # Limit of the formula as beta grows without bound; evaluating it
  # directly would give Inf / Inf.
  if (is.infinite(beta)) return(completeness)

  weighted_sum <- beta * homogeneity + completeness
  # Avoid 0 / 0 when both terms of the denominator vanish
  if (weighted_sum == 0) return(0.0)

  (1 + beta) * homogeneity * completeness / weighted_sum
}


#' @param ct contingency table represented as a sparse matrix, specifically
#' an object of S4 class [`Matrix::dgCMatrix-class`]
#' @param base base of the logarithm. Defaults to `exp(1)`.
#' @importFrom Matrix rowSums colSums
#' @noRd
variation_info_ct <- function(ct, base=exp(1)) {
  # Entropy of each clustering, taken from the table's marginal totals
  true_counts <- colSums(ct)
  pred_counts <- rowSums(ct)
  entropy_true <- entropy_counts(true_counts, base=base)
  entropy_pred <- entropy_counts(pred_counts, base=base)
  mi <- mutual_info_ct(ct, base=base)
  vi <- entropy_true + entropy_pred - 2 * mi
  # Never return a negative distance
  max(vi, 0.0)
}


#' Entropy of a categorical distribution given by counts
#'
#' Zero counts are dropped before the entropy is computed.
#'
#' @param counts numeric vector of counts for categories
#' @param base base of the logarithm. Defaults to `exp(1)`.
#' @return The entropy of the proportions `counts / sum(counts)`.
#' @noRd
entropy_counts <- function(counts, base=exp(1)) {
  counts <- counts[counts > 0]
  total <- sum(counts)
  # log(counts / total) written as a difference of logs
  -sum(counts / total * (log(counts, base = base) - log(total, base = base)))
}


#' @param ct contingency table represented as a sparse matrix, specifically
#' an object of S4 class [`Matrix::dgCMatrix-class`]
#' @param base base of the logarithm. Defaults to `exp(1)`.
#' @importFrom Matrix rowSums colSums which
#' @noRd
mutual_info_ct <- function(ct, base=exp(1)) {
  # Get array indices of non-zero elements
  nz_ind <- which(ct > 0, arr.ind = TRUE, useNames = FALSE)
  total <- sum(ct)
  row_totals <- rowSums(ct)
  col_totals <- colSums(ct)
  # Read the values at exactly those indices so that values and positions
  # cannot get out of step, whatever the storage of `ct`.
  ct_nz <- ct[nz_ind]
  ct_nz_norm <- ct_nz / total # normalized non-zero entries
  log_ct_nz <- log(ct_nz, base=base)
  # Product of the marginal totals belonging to each non-zero cell
  outer <- row_totals[nz_ind[,1]] * col_totals[nz_ind[,2]]
  log_outer <- - log(outer, base=base) + 2 * log(total, base=base)
  # Sum over non-zero cells of p_ij * log(n_ij * total / (row_i * col_j))
  mi <- sum(ct_nz_norm * (log_ct_nz - log(total, base=base)) + ct_nz_norm * log_outer)
  # Never return a negative value
  max(mi, 0.0)
}

--------------------------------------------------------------------------------
/R/measures_pairs.R:
--------------------------------------------------------------------------------
#' @include transformations.R
NULL

#' Binary Contingency Table for Linked Pairs
#'
#' @description Compute the binary contingency table for a set of _predicted_
#' coreferent (linked) pairs given a set of _ground truth_ coreferent pairs.
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#' data.frame, where rows index pairs and columns index the ids of the
#' constituents. Any pairs not included are assumed to be _non-coreferent_.
#' Duplicate pairs (including equivalent pairs with reversed ids) are
#' automatically removed.
#' @param pred_pairs set of predicted coreferent pairs, following the same
#' specification as `true_pairs`.
#' @param num_pairs the total number of coreferent and non-coreferent pairs,
#' excluding equivalent pairs with reversed ids. If not provided,
#' the true negative cell will be set to `NA`.
#' @param ordered whether to treat the element pairs as ordered---i.e.
#'   whether pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for
#'   \eqn{x \neq y}. Defaults to FALSE, which is appropriate for clustering,
#'   undirected link prediction, record linkage etc.
#' @return Returns a \eqn{2 \times 2}{2×2} contingency table of the form:
#' \preformatted{
#'              Truth
#' Prediction   TRUE  FALSE
#'      TRUE    TP     FP
#'      FALSE   FN     TN
#' }
#'
#' @seealso
#' The [`membership_to_pairs`] and [`clusters_to_pairs`] functions can be
#' used to transform other clustering representations into lists of pairs,
#' as required by this function.
#' The [`eval_report_pairs`] function computes common evaluation measures
#' derived from binary contingency matrices, like the ones output by this
#' function.
#'
#' @examples
#' ### Example where pairs/edges are undirected
#' # ground truth is 3-clique
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3))
#' # prediction misses one edge
#' pred_pairs <- rbind(c(1,2), c(2,3))
#' # total number of pairs assuming 3 elements
#' num_pairs <- 3 * (3 - 1) / 2
#' contingency_table_pairs(true_pairs, pred_pairs, num_pairs)
#'
#' ### Example where pairs/edges are directed
#' # ground truth is a 3-star
#' true_pairs <- rbind(c(2,1), c(3,1), c(4,1))
#' # prediction gets direction of one edge incorrect
#' pred_pairs <- rbind(c(2,1), c(3,1), c(1,4))
#' # total number of pairs assuming 4 elements
#' num_pairs <- 4 * 4
#' contingency_table_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE)
#'
#' @export
contingency_table_pairs <- function(true_pairs, pred_pairs, num_pairs = NULL,
                                    ordered = FALSE) {
  if (!is.null(num_pairs)) {
    # `||` short-circuits, so the comparison is only evaluated for a single,
    # non-missing value and invalid input reaches the intended message.
    if (length(num_pairs) != 1 || is.na(num_pairs) || num_pairs <= 0) {
      stop("num_pairs must be a positive scalar or NULL")
    }
  }

  # Record the sizes before either input is overwritten below
  n_true <- nrow(true_pairs)
  n_pred <- nrow(pred_pairs)

  # Binding pairs ensures that they are coerced to the same type.
  # `drop = FALSE` keeps a single pair as a one-row matrix instead of
  # collapsing it to a vector.
  comb_pairs <- rbind(true_pairs, pred_pairs)
  true_pairs <- comb_pairs[seq_len(n_true), , drop = FALSE]
  pred_pairs <- comb_pairs[n_true + seq_len(n_pred), , drop = FALSE]

  # Canonicalize pairs
  pred_pairs <- as.data.frame(canonicalize_pairs(pred_pairs, ordered = ordered))
  true_pairs <- as.data.frame(canonicalize_pairs(true_pairs, ordered = ordered))

  # Standardize column names
  colnames(pred_pairs) <- c("ID.x", "ID.y")
  colnames(true_pairs) <- c("ID.x", "ID.y")

  # Allow for empty data frames
  pred_pairs[["PRED_MATCH"]] <- rep(TRUE, times = nrow(pred_pairs))
  true_pairs[["MATCH"]] <- rep(TRUE, times = nrow(true_pairs))

  # Perform a full outer join on the two data frames.
  merged_pairs <- merge(pred_pairs, true_pairs, by = c("ID.x", "ID.y"),
                        all = TRUE)

  # An NA in PRED_MATCH or MATCH represents 'FALSE'
  merged_pairs$PRED_MATCH[is.na(merged_pairs$PRED_MATCH)] <- FALSE
  merged_pairs$MATCH[is.na(merged_pairs$MATCH)] <- FALSE

  # Convert to factors so we can use built-in table function
  prediction <- factor(merged_pairs$PRED_MATCH, levels = c(TRUE, FALSE))
  truth <- factor(merged_pairs$MATCH, levels = c(TRUE, FALSE))

  ct <- table(prediction, truth, dnn = c("Prediction", "Truth"))

  if (is.null(num_pairs)) {
    # number of true negatives is unknown since links are incomplete
    ct[2, 2] <- NA
  } else {
    # every pair that is neither predicted nor true is a true negative
    ct[2, 2] <- num_pairs - nrow(merged_pairs)
  }

  ct
}


#' Evaluation Report for Linked Pairs
#'
#' @description Compute various evaluation measures for a set of _predicted_
#'   coreferent (linked) pairs given a set of _ground truth_ coreferent pairs.
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#'   data.frame, where rows index pairs and columns index the ids of the
#'   constituents. Any pairs not included are assumed to be _non-coreferent_.
113 | #' Duplicate pairs (including equivalent pairs with reversed ids) are 114 | #' automatically removed. 115 | #' @param pred_pairs set of predicted coreferent pairs, following the same 116 | #' specification as `true_pairs`. 117 | #' @param num_pairs the total number of coreferent and non-coreferent pairs, 118 | #' excluding equivalent pairs with reversed ids. If not provided, 119 | #' measures that depend on the number of true negatives will be returned 120 | #' as `NA`. 121 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether 122 | #' pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 123 | #' Defaults to FALSE, which is appropriate for clustering, undirected link 124 | #' prediction, record linkage etc. 125 | #' 126 | #' @return Returns a list containing the following measures: 127 | #' \describe{ 128 | #' \item{precision}{see [`precision_pairs`]} 129 | #' \item{recall}{see [`recall_pairs`]} 130 | #' \item{specificity}{see [`specificity_pairs`]} 131 | #' \item{sensitivity}{see [`sensitivity_pairs`]} 132 | #' \item{f1score}{see [`f_measure_pairs`]} 133 | #' \item{accuracy}{see [`accuracy_pairs`]} 134 | #' \item{balanced_accuracy}{see [`balanced_accuracy_pairs`]} 135 | #' \item{fowlkes_mallows}{see [`fowlkes_mallows_pairs`]} 136 | #' } 137 | #' 138 | #' @seealso The [`contingency_table_pairs`] function can be used to compute 139 | #' the contingency table for entity resolution or record linkage problems. 
#'
#' @examples
#' ### Example where pairs/edges are undirected
#' # ground truth is 3-clique
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3))
#' # prediction misses one edge
#' pred_pairs <- rbind(c(1,2), c(2,3))
#' # total number of pairs assuming 3 elements
#' num_pairs <- 3 * (3 - 1) / 2
#' eval_report_pairs(true_pairs, pred_pairs, num_pairs)
#'
#' ### Example where pairs/edges are directed
#' # ground truth is a 3-star
#' true_pairs <- rbind(c(2,1), c(3,1), c(4,1))
#' # prediction gets direction of one edge incorrect
#' pred_pairs <- rbind(c(2,1), c(3,1), c(1,4))
#' # total number of pairs assuming 4 elements
#' num_pairs <- 4 * 4
#' eval_report_pairs(true_pairs, pred_pairs, num_pairs, ordered = TRUE)
#'
#' @export
eval_report_pairs <- function(true_pairs, pred_pairs, num_pairs = NULL,
                              ordered = FALSE) {
  # Build the 2x2 table once and derive every measure from it
  ct <- contingency_table_pairs(true_pairs, pred_pairs,
                                num_pairs = num_pairs, ordered = ordered)
  # Sensitivity is reported under its own name but is the same quantity as
  # recall, so it is computed once.
  recall <- recall_pairs_ct(ct)
  list(
    "precision" = precision_pairs_ct(ct),
    "recall" = recall,
    "specificity" = specificity_pairs_ct(ct),
    "sensitivity" = recall,
    "f1score" = f_measure_pairs_ct(ct),
    "accuracy" = accuracy_pairs_ct(ct),
    "balanced_accuracy" = balanced_accuracy_pairs_ct(ct),
    "fowlkes_mallows" = fowlkes_mallows_pairs_ct(ct)
  )
}


#' Precision of Linked Pairs
#'
#' @description Computes the precision of a set of _predicted_ coreferent
#'   (linked) pairs given a set of _ground truth_ coreferent pairs.
#'
#' @details The precision is defined as:
#'   \deqn{\frac{|T \cap P|}{|P|}}{|T ∩ P|/|P|}
#'   where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the
#'   set of predicted coreferent pairs.
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#'   data.frame, where rows index pairs and columns index the ids of the
#'   constituents. Any pairs not included are assumed to be _non-coreferent_.
#'   Duplicate pairs (including equivalent pairs with reversed ids) are
#'   automatically removed.
#' @param pred_pairs set of predicted coreferent pairs, following the same
#'   specification as `true_pairs`.
#' @param ordered whether to treat the element pairs as ordered---i.e. whether
#'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
#'   Defaults to FALSE, which is appropriate for clustering, undirected link
#'   prediction, record linkage etc.
#'
#' @examples
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
#' pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge
#' precision_pairs(true_pairs, pred_pairs)
#'
#' @export
precision_pairs <- function(true_pairs, pred_pairs, ordered = FALSE) {
  # Precision does not involve true negatives, so `num_pairs` is not needed
  ct <- contingency_table_pairs(true_pairs, pred_pairs, ordered = ordered)
  precision_pairs_ct(ct)
}


#' Recall of Linked Pairs
#'
#' @description Computes the recall of a set of _predicted_ coreferent
#'   (linked) pairs given a set of _ground truth_ coreferent pairs.
#'
#' @details The recall is defined as:
#'   \deqn{\frac{|T \cap P|}{|T|}}{|T ∩ P|/|T|}
#'   where \eqn{T} is the set of true coreferent pairs and \eqn{P} is the
#'   set of predicted coreferent pairs.
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#'   data.frame, where rows index pairs and columns index the ids of the
#'   constituents. Any pairs not included are assumed to be _non-coreferent_.
#'   Duplicate pairs (including equivalent pairs with reversed ids) are
#'   automatically removed.
#' @param pred_pairs set of predicted coreferent pairs, following the same
#'   specification as `true_pairs`.
#' @param ordered whether to treat the element pairs as ordered---i.e. whether
#'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
#'   Defaults to FALSE, which is appropriate for clustering, undirected link
#'   prediction, record linkage etc.
#'
#' @examples
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
#' pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge
#' recall_pairs(true_pairs, pred_pairs)
#'
#' @rdname recall_pairs
#' @export
recall_pairs <- function(true_pairs, pred_pairs, ordered = FALSE) {
  # Recall does not involve true negatives, so `num_pairs` is not needed
  ct <- contingency_table_pairs(true_pairs, pred_pairs, ordered = ordered)
  recall_pairs_ct(ct)
}


#' @note `sensitivity_pairs` is an alias for `recall_pairs`.
#'
#' @rdname recall_pairs
#' @export
sensitivity_pairs <- function(true_pairs, pred_pairs, ordered = FALSE) {
  # Forward `ordered` so the alias gives the same answer as `recall_pairs`
  recall_pairs(true_pairs, pred_pairs, ordered = ordered)
}


#' F-measure of Linked Pairs
#'
#' @description Computes the F-measure (a.k.a. F-score) of a set of
#'   _predicted_ coreferent (linked) pairs given a set of _ground truth_
#'   coreferent pairs.
#'
#' @details The \eqn{\beta}{β}-weighted F-measure is defined as the weighted
#'   harmonic mean of precision \eqn{P} and recall \eqn{R}:
#'   \deqn{(1 + \beta^2)\frac{P \cdot R}{\beta^2 \cdot P + R}.}{(1 + β^2)·P·R/(β^2·P + R).}
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#'   data.frame, where rows index pairs and columns index the ids of the
#'   constituents.
#'   Any pairs not included are assumed to be _non-coreferent_.
#'   Duplicate pairs (including equivalent pairs with reversed ids) are
#'   automatically removed.
#' @param pred_pairs set of predicted coreferent pairs, following the same
#'   specification as `true_pairs`.
#' @param beta non-negative weight. A value of 0 assigns no weight to recall
#'   (i.e. the measure reduces to precision), while larger values assign
#'   increasing weight to recall. A value of 1 weights precision and recall
#'   equally.
#' @param ordered whether to treat the element pairs as ordered---i.e. whether
#'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
#'   Defaults to FALSE, which is appropriate for clustering, undirected link
#'   prediction, record linkage etc.
#'
#' @references
#' Van Rijsbergen, C. J. "Information Retrieval." (2nd ed.).
#' Butterworth-Heinemann, USA, (1979).
#'
#' @examples
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
#' pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge
#' f_measure_pairs(true_pairs, pred_pairs)
#' # weight recall more heavily than precision
#' f_measure_pairs(true_pairs, pred_pairs, beta = 2)
#'
#' @export
f_measure_pairs <- function(true_pairs, pred_pairs, beta = 1, ordered = FALSE) {
  # The F-measure does not involve true negatives, so `num_pairs` is not needed
  ct <- contingency_table_pairs(true_pairs, pred_pairs, ordered = ordered)
  f_measure_pairs_ct(ct, beta)
}


#' Specificity of Linked Pairs
#'
#' @description Computes the specificity of a set of _predicted_ coreferent
#'   (linked) pairs given a set of _ground truth_ coreferent pairs.
#'
#' @details The specificity is defined as:
#'   \deqn{\frac{|P' \cap T'|}{|T'|}}{|P' ∩ T'|/|T'|}
#'   where \eqn{T'} is the set of true non-coreferent pairs and \eqn{P'} is
#'   the set of predicted non-coreferent pairs.
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#'   data.frame, where rows index pairs and columns index the ids of the
#'   constituents. Any pairs not included are assumed to be _non-coreferent_.
#'   Duplicate pairs (including equivalent pairs with reversed ids) are
#'   automatically removed.
#' @param pred_pairs set of predicted coreferent pairs, following the same
#'   specification as `true_pairs`.
#' @param num_pairs the total number of coreferent and non-coreferent pairs,
#'   excluding equivalent pairs with reversed ids.
#' @param ordered whether to treat the element pairs as ordered---i.e. whether
#'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
#'   Defaults to FALSE, which is appropriate for clustering, undirected link
#'   prediction, record linkage etc.
#'
#' @examples
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
#' pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge
#' num_pairs <- 3 # assuming 3 elements
#' specificity_pairs(true_pairs, pred_pairs, num_pairs)
#'
#' @export
specificity_pairs <- function(true_pairs, pred_pairs, num_pairs,
                              ordered = FALSE) {
  # `num_pairs` is passed on so the table's true-negative cell is filled in,
  # which the specificity is computed from.
  specificity_pairs_ct(
    contingency_table_pairs(
      true_pairs,
      pred_pairs,
      num_pairs = num_pairs,
      ordered = ordered
    )
  )
}


#' Accuracy of Linked Pairs
#'
#' @description Computes the accuracy of a set of _predicted_ coreferent
#'   (linked) pairs given a set of _ground truth_ coreferent pairs.
338 | #' 339 | #' @details The accuracy is defined as: 340 | #' \deqn{\frac{|T \cap P| + |T' \cap P'|}{N}}{(|T ∩ P| + |T' ∩ P'|)/N} 341 | #' where: 342 | #' * \eqn{T} is the set of true coreferent pairs, 343 | #' * \eqn{P} is the set of predicted coreferent pairs, 344 | #' * \eqn{T'} is the set of true non-coreferent pairs, 345 | #' * \eqn{P'} is the set of predicted non-coreferent pairs, and 346 | #' * \eqn{N} is the total number of coreferent and non-coreferent pairs. 347 | #' 348 | #' @param true_pairs set of true coreferent pairs stored in a matrix or 349 | #' data.frame, where rows index pairs and columns index the ids of the 350 | #' constituents. Any pairs not included are assumed to be _non-coreferent_. 351 | #' Duplicate pairs (including equivalent pairs with reversed ids) are 352 | #' automatically removed. 353 | #' @param pred_pairs set of predicted coreferent pairs, following the same 354 | #' specification as `true_pairs`. 355 | #' @param num_pairs the total number of coreferent and non-coreferent pairs, 356 | #' excluding equivalent pairs with reversed ids. 357 | #' @param ordered whether to treat the element pairs as ordered---i.e. whether 358 | #' pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}. 359 | #' Defaults to FALSE, which is appropriate for clustering, undirected link 360 | #' prediction, record linkage etc. 
#'
#' @examples
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
#' pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge
#' num_pairs <- 3 # assuming 3 elements
#' accuracy_pairs(true_pairs, pred_pairs, num_pairs)
#'
#' @export
accuracy_pairs <- function(true_pairs, pred_pairs, num_pairs, ordered = FALSE) {
  # `num_pairs` is passed on so the table's true-negative cell is filled in,
  # which the accuracy is computed from.
  ct <- contingency_table_pairs(true_pairs, pred_pairs,
                                num_pairs = num_pairs, ordered = ordered)
  accuracy_pairs_ct(ct)
}


#' Balanced Accuracy of Linked Pairs
#'
#' @description Computes the balanced accuracy of a set of _predicted_
#'   coreferent (linked) pairs given a set of _ground truth_ coreferent
#'   pairs.
#'
#' @details The balanced accuracy is defined as:
#'   \deqn{\frac{\frac{|T \cap P|}{|T|} + \frac{|T' \cap P'|}{|T'|}}{2}}{|T ∩ P|/(2|T|) + |T' ∩ P'|/(2|T'|)}
#'   where:
#'   * \eqn{T} is the set of true coreferent pairs,
#'   * \eqn{P} is the set of predicted coreferent pairs,
#'   * \eqn{T'} is the set of true non-coreferent pairs, and
#'   * \eqn{P'} is the set of predicted non-coreferent pairs.
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#'   data.frame, where rows index pairs and columns index the ids of the
#'   constituents. Any pairs not included are assumed to be _non-coreferent_.
#'   Duplicate pairs (including equivalent pairs with reversed ids) are
#'   automatically removed.
#' @param pred_pairs set of predicted coreferent pairs, following the same
#'   specification as `true_pairs`.
#' @param num_pairs the total number of coreferent and non-coreferent pairs,
#'   excluding equivalent pairs with reversed ids.
#' @param ordered whether to treat the element pairs as ordered---i.e. whether
#'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
#'   Defaults to FALSE, which is appropriate for clustering, undirected link
#'   prediction, record linkage etc.
#'
#' @examples
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
#' pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge
#' num_pairs <- 3 # assuming 3 elements
#' balanced_accuracy_pairs(true_pairs, pred_pairs, num_pairs)
#'
#' @export
balanced_accuracy_pairs <- function(true_pairs, pred_pairs, num_pairs,
                                    ordered = FALSE) {
  # `num_pairs` is passed on so the table's true-negative cell is filled in
  # before the measure is derived from the table.
  balanced_accuracy_pairs_ct(
    contingency_table_pairs(
      true_pairs,
      pred_pairs,
      num_pairs = num_pairs,
      ordered = ordered
    )
  )
}


#' Fowlkes-Mallows Index of Linked Pairs
#'
#' @description Computes the Fowlkes-Mallows index for a set of _predicted_
#'   coreferent (linked) pairs given a set of _ground truth_ coreferent pairs.
#'
#' @details The Fowlkes-Mallows index is defined as the geometric mean of
#'   precision \eqn{P} and recall \eqn{R}:
#'   \deqn{\sqrt{P R}.}{√(P·R).}
#'
#' @param true_pairs set of true coreferent pairs stored in a matrix or
#'   data.frame, where rows index pairs and columns index the ids of the
#'   constituents. Any pairs not included are assumed to be _non-coreferent_.
#'   Duplicate pairs (including equivalent pairs with reversed ids) are
#'   automatically removed.
#' @param pred_pairs set of predicted coreferent pairs, following the same
#'   specification as `true_pairs`.
#' @param ordered whether to treat the element pairs as ordered---i.e. whether
#'   pair \eqn{(x, y)} is distinct from pair \eqn{(y, x)} for \eqn{x \neq y}.
#'   Defaults to FALSE, which is appropriate for clustering, undirected link
#'   prediction, record linkage etc.
#'
#' @references
#' Fowlkes, E. B. and Mallows, C. L. "A Method for Comparing Two Hierarchical
#' Clusterings."
#' _Journal of the American Statistical Association_ **78:383**,
#' 553-569, (1983). \doi{10.1080/01621459.1983.10478008}.
#'
#' @examples
#' true_pairs <- rbind(c(1,2), c(2,3), c(1,3)) # ground truth is 3-clique
#' pred_pairs <- rbind(c(1,2), c(2,3)) # prediction misses one edge
#' fowlkes_mallows_pairs(true_pairs, pred_pairs)
#'
#' @export
fowlkes_mallows_pairs <- function(true_pairs, pred_pairs, ordered = FALSE) {
  # The index does not involve true negatives, so `num_pairs` is not needed
  ct <- contingency_table_pairs(true_pairs, pred_pairs, ordered = ordered)
  fowlkes_mallows_pairs_ct(ct)
}


# Definition of measures in terms of contingency table
#
# Each helper takes the 2x2 table `ct` and looks cells up by name: the first
# index is the prediction ("TRUE"/"FALSE"), the second is the truth.

# Precision: TP / (TP + FP)
precision_pairs_ct <- function(ct) {
  tp <- ct["TRUE", "TRUE"]
  fp <- ct["TRUE", "FALSE"]
  tp / (tp + fp)
}

# Recall: TP / (TP + FN)
recall_pairs_ct <- function(ct) {
  tp <- ct["TRUE", "TRUE"]
  fn <- ct["FALSE", "TRUE"]
  tp / (tp + fn)
}

# F-measure: weighted harmonic mean of precision and recall, with weight
# alpha = 1 / (1 + beta^2) on precision. Stops if `beta` is negative.
f_measure_pairs_ct <- function(ct, beta = 1.0) {
  if (beta < 0) {
    stop("`beta` must be non-negative")
  }
  P <- precision_pairs_ct(ct)
  R <- recall_pairs_ct(ct)
  alpha <- 1 / (1 + beta^2)
  1 / (alpha / P + (1 - alpha) / R)
}

# Specificity: TN / (TN + FP)
specificity_pairs_ct <- function(ct) {
  fp <- ct["TRUE", "FALSE"]
  tn <- ct["FALSE", "FALSE"]
  tn / (tn + fp)
}

# Accuracy: (TP + TN) / (TP + FP + TN + FN)
accuracy_pairs_ct <- function(ct) {
  tp <- ct["TRUE", "TRUE"]
  fp <- ct["TRUE", "FALSE"]
  fn <- ct["FALSE", "TRUE"]
  tn <- ct["FALSE", "FALSE"]
  correct <- tp + tn
  total <- tp + fp + tn + fn
  correct / total
}

# Balanced accuracy: mean of recall (sensitivity) and specificity
balanced_accuracy_pairs_ct <- function(ct) {
  sensitivity <- recall_pairs_ct(ct)
  specificity <- specificity_pairs_ct(ct)
  (sensitivity + specificity) / 2
}

# Fowlkes-Mallows index: sqrt(P) * sqrt(R)
fowlkes_mallows_pairs_ct <- function(ct) {
  P <- precision_pairs_ct(ct)
  R <- recall_pairs_ct(ct)
  sqrt(P) * sqrt(R)
}

# --------------------------------------------------------------------------------