├── data ├── distinct1.cnl ├── 2cls3nds.cnl ├── 2cls3nds_novp.cnl ├── 3cls5nds_novp.cnl ├── 3cls5nds_novp2.cnl ├── 1lev4nds3cls1s.cnl ├── 1lev4nds3cls2.cnl ├── 1lev4nds4cls.cnl ├── 4nds_1lev_novp1.cnl ├── 4nds_1lev_novp2.cnl ├── 15nds_novp2.cnl ├── 15nds_novp3.cnl ├── 15nds_novp1.cnl ├── 4nds_2lev_novp1.cnl ├── 4nds_2lev_novp2.cnl ├── 3cls5nds_2lev.cnl ├── omega_c2-1.cnl ├── omega_c2-2.cnl ├── omega_c4.3-2.cnl ├── 3cls5nds_novp_3lev.cnl ├── omega_c4.3-1.cnl ├── 4cls5nds.cnl ├── 4cls6nds.cnl ├── 1lev4nds2cls.cnl ├── 4cls5nds_2lev.cnl ├── 4cls6nds_2lev.cnl └── 3cls5nds.cnl ├── docs └── xmeasures.pdf ├── images └── CPU-Timings-DBLP.png ├── GenerateArgparser.sh ├── .gitignore ├── shared ├── macrodef.h ├── agghash.hpp ├── fileio.cpp └── fileio.hpp ├── xmeasures.py ├── xmeasures.cbp ├── Makefile_lib ├── libxmeasures.cbp ├── Makefile ├── src ├── interface.cpp ├── main.cpp └── interface_c.cpp ├── include ├── interface_c.h └── interface.h ├── args.ggo ├── LICENSE ├── README.md └── autogen └── cmdline.h /data/distinct1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 1, Nodes: 1, Fuzzy: 0, Numbered: 0 2 | 9 3 | -------------------------------------------------------------------------------- /data/2cls3nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 2, Nodes:3,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | -------------------------------------------------------------------------------- /data/2cls3nds_novp.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 2, Nodes:3,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 4 | -------------------------------------------------------------------------------- /data/3cls5nds_novp.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:0,Fuzzy: 0, Numbered: 0 2 | 1 2 3 3 | 4 4 | 5 5 | 
-------------------------------------------------------------------------------- /data/3cls5nds_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:0,Fuzzy: 0, Numbered: 0 2 | 1 2 3 3 | 4 5 4 | -------------------------------------------------------------------------------- /data/1lev4nds3cls1s.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:4,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | 4 5 | -------------------------------------------------------------------------------- /data/1lev4nds3cls2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:4,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | 4 2 5 | -------------------------------------------------------------------------------- /docs/xmeasures.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eXascaleInfolab/xmeasures/HEAD/docs/xmeasures.pdf -------------------------------------------------------------------------------- /data/1lev4nds4cls.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:4,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | 4 2 5 | 3 4 6 | -------------------------------------------------------------------------------- /data/4nds_1lev_novp1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes: 4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 4 5 | -------------------------------------------------------------------------------- /data/4nds_1lev_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 6, Nodes:4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 5 | 4 6 | -------------------------------------------------------------------------------- /images/CPU-Timings-DBLP.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eXascaleInfolab/xmeasures/HEAD/images/CPU-Timings-DBLP.png -------------------------------------------------------------------------------- /data/15nds_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:15,Fuzzy: 0, Numbered: 0 2 | 1 2 3 11 12 13 3 | 4 5 4 | 6 7 8 14 15 5 | 9 10 6 | -------------------------------------------------------------------------------- /data/15nds_novp3.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:15,Fuzzy: 0, Numbered: 0 2 | 1 2 3 8 4 5 3 | 6 7 4 | 11 12 13 14 15 5 | 9 10 6 | -------------------------------------------------------------------------------- /data/15nds_novp1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 5, Nodes:15,Fuzzy: 0, Numbered: 0 2 | 1 2 3 3 | 4 5 14 4 | 6 7 8 5 | 9 10 11 12 13 6 | 15 7 | -------------------------------------------------------------------------------- /data/4nds_2lev_novp1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes: 4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 4 5 | # Level 2 6 | 1 2 3 4 7 | -------------------------------------------------------------------------------- /data/4nds_2lev_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 6, Nodes:4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 5 | 4 6 | # Level 2 7 | 1 2 3 8 | 4 9 | -------------------------------------------------------------------------------- /data/3cls5nds_2lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 0 Nodes:5 2 | # Multilevel and overlapping collection 3 | # First level 4 | 0> 1 2:0.2 3:1 5 | 1> 2:0.3 4 6 | 2> 5 2:0.3 7 | # Second 
level 8 | 1 3 9 | 2 4 5 10 | -------------------------------------------------------------------------------- /data/omega_c2-1.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 236 (8) 3 | # Pairs 15 (6, 9) => 6*5, i.e. 6 nodes 4 | 0 1 2 5 | 3 4 5 6 | -------------------------------------------------------------------------------- /data/omega_c2-2.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 236 (8) 3 | # Pairs 15 (6, 9) => 6*5, i.e. 6 nodes 4 | 0 1 5 | 2 3 4 6 | 5 7 | -------------------------------------------------------------------------------- /data/omega_c4.3-2.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 240 (12) 3 | # Nodes: 10, Clusters: 3 4 | 0 1 2 3 5 | 2 3 4 5 6 6 | 7 8 9 7 | -------------------------------------------------------------------------------- /data/3cls5nds_novp_3lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 0, Nodes:5,Fuzzy: 0, Numbered: 0 2 | # Multilevel flattened collection 3 | # First level 4 | 1 2 3 5 | 4 6 | 5 7 | # Second level 8 | 1 2 3 9 | 4 5 10 | # Third level 11 | 1 2 3 4 5 12 | -------------------------------------------------------------------------------- /data/omega_c4.3-1.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 
240 (12) 3 | # Nodes: 10, Clusters: 4 4 | 0 1 2 3 5 | 3 4 5 6 6 | 7 8 7 | 9 8 | -------------------------------------------------------------------------------- /data/4cls5nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | 7 | # Cleared version: 8 | # 1 2 [Matches in 3cls5nds: 1+3=4] 9 | # 2 4 [3+1=4] 10 | # 5 2 3 [1+3+1=5] 11 | # 3 [1] 12 | -------------------------------------------------------------------------------- /data/4cls6nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:6, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 6 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | 7 | # Cleared version: 8 | # 1 2 6 [Matches in 3cls5nds: 1+3+0=4] 9 | # 2 4 [3+1=4] 10 | # 5 2 3 [1+3+1=5] 11 | # 3 [1] 12 | -------------------------------------------------------------------------------- /data/1lev4nds2cls.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 2, Nodes:4,Fuzzy: 0, Numbered: 0 2 | # Note: 1lev4nds2cls matching with 1lev4nds3cls1s vs 1lev4nds3cls2 is an example 3 | # where standard novp NMI (not gecmi NMIovp) fails to evaluate overlapping 4 | # matching unlike F1. 
5 | 1 2 6 | 3 4 2 7 | -------------------------------------------------------------------------------- /data/4cls5nds_2lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | # Lev 2 7 | 1 2 4 8 | 5 2 3 9 | 10 | # Cleared version: 11 | # 1 2 [Matches in 3cls5nds: 1+3=4] 12 | # 2 4 [3+1=4] 13 | # 5 2 3 [1+3+1=5] 14 | # 3 [1] 15 | -------------------------------------------------------------------------------- /data/4cls6nds_2lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:6, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 6 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | # Lev 2 7 | 1 2 6 3 8 | 5 2 3 4 9 | 10 | # Cleared version: 11 | # 1 2 6 [Matches in 3cls5nds: 1+3+0=4] 12 | # 2 4 [3+1=4] 13 | # 5 2 3 [1+3+1=5] 14 | # 3 [1] 15 | -------------------------------------------------------------------------------- /GenerateArgparser.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Generate Arguments parser 3 | 4 | sh -c 'mkdir autogen' 2> /dev/null 5 | gengetopt --output-dir autogen -i args.ggo 6 | 7 | if [ $? 
-eq 0 ]; then 8 | echo "The arguments parser is generated" 9 | else 10 | echo "The arguments parser generation is FAILED" 11 | fi 12 | -------------------------------------------------------------------------------- /data/3cls5nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3 Nodes:5,Fuzzy: 1, Numbered: 1 2 | # Note that the number of clusters corresponds to the number of payload lines in the file 3 | 0> 1 2:0.2 3:1 4 | # Empty lines and comments are allowed 5 | 1> 2:0.3 4 6 | 2> 5 2:0.3 7 | 8 | # Cleared version: 9 | # 1 2 3 [Mathces in 4cls6nds: 1+3+2=6] 10 | # 2 4 [3+1=4] 11 | # 5 2 [1+3=4] 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | 31 | # Codeblocks files 32 | *.depend 33 | *.layout 34 | -------------------------------------------------------------------------------- /shared/macrodef.h: -------------------------------------------------------------------------------- 1 | //! \brief Global macro definitions. 2 | //! The Dao (Deterministic Agglomerative Overlapping) of Clustering library: 3 | //! Robust & Fine-grained Deterministic Clustering for Large Networks. 4 | //! 5 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 6 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 7 | //! 8 | //! Copyright (c) 9 | //! \authr Artem Lutov 10 | //! \email luart@ya.ru 11 | //! 
\date 2016-07-25 12 | 13 | #ifndef MACRODEF_H 14 | #define MACRODEF_H 15 | 16 | // Global MACROSES: 17 | // - VALIDATE - use alternative evaluations to validate results 18 | // - 0 - turn off heavy validation 19 | // - 1 - default value for the heavy validation 20 | // - 2 - extra heavy validation (might duplicate already performed heavy validation) 21 | // - 3 - cross validation of functions (executed on each call, but only once is enough) 22 | // 23 | // - TRACE, TRACE_EXTRA - detailed tracing under debug (trace nodes weights) 24 | // - 0 - turn off the tracing 25 | // - 1 - brief tracing that can be used in release to show warnings, etc. 26 | // - 2 - detailed tracing for DEBUG 27 | // - 3 - extra detailed tracing 28 | // 29 | // - FTRACE_GLOBAL - use global ftrace file for the whole project, or "shared/" headers 30 | // define it locally 31 | // 32 | // - UTEST - build [also] unit tests, requires installation and linking of the unit test library. 33 | // 34 | // - NO_FILEIO - omit STL file I/O related routines 35 | // 36 | // NOTE: undefined macro definition is interpreted as having value 0 37 | 38 | #ifndef TRACE 39 | #ifdef DEBUG 40 | #define TRACE 2 41 | #elif !defined(NDEBUG) // RELEASE, !NDEBUG 42 | #define TRACE 1 43 | //#else // RELEASE, NDEBUG 44 | // #define TRACE 0 45 | #endif // DEBUG 46 | #endif // TRACE 47 | 48 | #ifndef VALIDATE 49 | #ifdef DEBUG 50 | #define VALIDATE 2 51 | #elif !defined(NDEBUG) // RELEASE, !NDEBUG 52 | #define VALIDATE 1 53 | //#else // ELEASE, NDEBUG 54 | // #define VALIDATE 0 55 | #endif // DEBUG 56 | #endif // VALIDATE 57 | 58 | // SWIG related macro definitions 59 | // Swig 3.0.12 does not understand some structures, workarounds are applied 60 | // Note: defined only for SWIG interfaces 61 | #ifdef SWIG 62 | // Just skip the static assert 63 | #define static_assert(a, b) 64 | #endif // SWIG 65 | 66 | // Note: SWIG_VERSION is not defined for SWIGJAVA and SWIGCSHARP 67 | // Note: defined both for the SWIG interfaces and 
implementation 68 | #if defined(SWIG_VERSION) || defined(SWIGJAVA) || defined(SWIGCSHARP) 69 | // Defined automatically when any SWIG processing is performed 70 | // (either the included as SWIG interface or implementation) 71 | #define DAOC_SWIGPROC 72 | #endif // SWIG processing 73 | 74 | // Define macros for the case when SWIG supports functions overloading 75 | #if defined(SWIGCSHARP) || defined(SWIGD) || defined(SWIGJAVA) 76 | #define SWIG_OVERLOADS 77 | #endif // OVERLOADS 78 | 79 | 80 | #endif // MACRODEF_H 81 | -------------------------------------------------------------------------------- /xmeasures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | :Description: Python API and usage example for C API of the xmeasures library 5 | Note that items in collections are allowed to be non-contiguous (i.e., hashes can be used as ids) 6 | 7 | :Authors: (c) Artem Lutov 8 | :Date: 2020-03-12 9 | """ 10 | 11 | import pathlib 12 | import numpy as np 13 | from ctypes import Structure, CDLL, POINTER, c_uint, c_float #, c_void_p 14 | from collections.abc import Iterable 15 | 16 | # Python wrappers for C types ------------------------------------------------------------------------------------------ 17 | c_uint_p = POINTER(c_uint) 18 | c_float_p = POINTER(c_float) 19 | # null_ptr = c_void_p() 20 | 21 | 22 | class ClusterNodes(Structure): 23 | _fields_ = [('num', c_uint), 24 | ('ids', c_uint_p), 25 | ('weights', c_float_p)] 26 | ClusterNodesPtr = POINTER(ClusterNodes) 27 | 28 | def clusterNodes(ids, weights=None): 29 | """ClusterNodes initialization 30 | 31 | ids: iterable(uint) - cluster node ids 32 | weights: iterable - cluster node weights 33 | 34 | return ClusterNodes 35 | """ 36 | assert isinstance(ids, Iterable) and (weights is None or isinstance(weights, Iterable)), 'Invalid argument types' 37 | cnIds = (c_uint * len(ids))(*ids) 38 | cnWeights = c_float_p() if not 
weights else (c_float * len(weights))(*weights) 39 | return ClusterNodes(len(ids), cnIds, cnWeights) 40 | 41 | 42 | class ClusterCollection(Structure): 43 | _fields_ = [('num', c_uint), 44 | ('nodes', ClusterNodesPtr)] 45 | 46 | def clusterCollection(clusters): 47 | """ClusterCollection initialization 48 | 49 | clusters: iterable(iterable(uint)) - clusters (collection of nodes) 50 | 51 | return ClusterCollection 52 | """ 53 | assert isinstance(clusters, Iterable) and isinstance(clusters[0], Iterable), 'Invalid argument type' 54 | cc = (ClusterNodes * len(clusters))(*(clusterNodes(nds) for nds in clusters)) 55 | return ClusterCollection(len(clusters), cc) 56 | 57 | 58 | def weightedClusterCollection(clusters): 59 | """ClusterCollection initialization 60 | 61 | nodes: iterable((iterable(uint), iterable(float))) - weighted clusters (collections of nodes and their weights) 62 | 63 | return ClusterCollection 64 | """ 65 | assert isinstance(clusters, Iterable) and len(clusters[0]) == 2 and isinstance(clusters[0][0], Iterable), 'Invalid argument type' 66 | cc = (c_uint * len(clusters))(*(clusterNodes(nds, wgs) for nds, wgs in clusters)) 67 | return ClusterCollection(len(clusters), cc) 68 | 69 | 70 | # Example of xmeasures usage from Python ------------------------------------------------------------------------------- 71 | if __name__ == "__main__": 72 | # Load the shared library into ctypes 73 | libXms = pathlib.Path().absolute() / "bin/Release/libxmeasures.so" 74 | xms = CDLL(libXms) 75 | # Set proper return types for the importing functions 76 | xms.f1p.restype = c_float 77 | xms.f1h.restype = c_float 78 | xms.omegaExt.restype = c_float 79 | xms.omega.restype = c_float 80 | # Perform evaluations 81 | nc1 = clusterCollection(((9,2,4), (2,13))) 82 | nc2 = clusterCollection([[9,13,2], [2,4]]) 83 | print('F1p: {}, F1h: {}, omegaExt: {}, omega: {}'.format( 84 | xms.f1p(nc1, nc2), 85 | xms.f1h(nc1, nc2), 86 | xms.omegaExt(nc1, nc2), 87 | xms.omega(nc1, nc2) 88 | )) 89 | 
-------------------------------------------------------------------------------- /xmeasures.cbp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 106 | 107 | -------------------------------------------------------------------------------- /Makefile_lib: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # This makefile was generated by 'cbp2make' tool rev.147 # 3 | #------------------------------------------------------------------------------# 4 | 5 | 6 | WORKDIR = `pwd` 7 | 8 | CC = gcc 9 | CXX = g++ 10 | AR = ar 11 | LD = g++ 12 | WINDRES = windres 13 | 14 | INC = -Iinclude -Iautogen -Ishared 15 | CFLAGS = -Wnon-virtual-dtor -Winit-self -Wcast-align -Wundef -Wfloat-equal -Wunreachable-code -Weffc++ -std=c++14 -Wmissing-include-dirs -Wzero-as-null-pointer-constant -fpic -fexceptions -fstack-protector-strong -fstack-clash-protection -Werror=format-security -Wold-style-cast -Wno-unused-function -Wno-noexcept-type -D_FORTIFY_SOURCE=2 -DNO_FILEIO -DC_API 16 | RESINC = 17 | LIBDIR = 18 | LIB = 19 | LDFLAGS = -Wl,-z,defs 20 | 21 | INC_DEBUG = $(INC) 22 | CFLAGS_DEBUG = $(CFLAGS) -Wredundant-decls -Winline -Wswitch-enum -Wswitch-default -Wmain -Wall -fPIC -Og -g -Wsuggest-final-types -Wsuggest-final-methods -ftemplate-backtrace-limit=32 -fasynchronous-unwind-tables -fsanitize=leak -fsanitize=address -DDEBUG -D_GLIBCXX_DEBUG -D_GLIBCXX_ASSERTIONS -DTRACE=2 -DVALIDATE=2 23 | RESINC_DEBUG = $(RESINC) 24 | RCFLAGS_DEBUG = $(RCFLAGS) 25 | LIBDIR_DEBUG = $(LIBDIR) 26 | LIB_DEBUG = $(LIB)-lasan 27 | LDFLAGS_DEBUG = $(LDFLAGS) 28 | OBJDIR_DEBUG = obj/Debug 29 | DEP_DEBUG = 30 | OUT_DEBUG = bin/Debug/libxmeasures.so 31 | 32 | INC_RELEASE = $(INC) 33 | CFLAGS_RELEASE = $(CFLAGS) -fomit-frame-pointer -O3 -march=core2 -ftemplate-backtrace-limit=32 -Wno-strict-aliasing -DTRACE=1 -DVALIDATE=1 34 | RESINC_RELEASE = 
$(RESINC) 35 | RCFLAGS_RELEASE = $(RCFLAGS) 36 | LIBDIR_RELEASE = $(LIBDIR) 37 | LIB_RELEASE = $(LIB) 38 | LDFLAGS_RELEASE = $(LDFLAGS) -s 39 | OBJDIR_RELEASE = obj/Release 40 | DEP_RELEASE = 41 | OUT_RELEASE = bin/Release/libxmeasures.so 42 | 43 | OBJ_DEBUG = $(OBJDIR_DEBUG)/src/interface.o $(OBJDIR_DEBUG)/src/interface_c.o 44 | 45 | OBJ_RELEASE = $(OBJDIR_RELEASE)/src/interface.o $(OBJDIR_RELEASE)/src/interface_c.o 46 | 47 | all: debug release 48 | 49 | clean: clean_debug clean_release 50 | 51 | before_debug: 52 | test -d bin/Debug || mkdir -p bin/Debug 53 | test -d $(OBJDIR_DEBUG)/src || mkdir -p $(OBJDIR_DEBUG)/src 54 | 55 | after_debug: 56 | 57 | debug: before_debug out_debug after_debug 58 | 59 | out_debug: before_debug $(OBJ_DEBUG) $(DEP_DEBUG) 60 | $(LD) -shared $(LIBDIR_DEBUG) $(OBJ_DEBUG) -o $(OUT_DEBUG) $(LDFLAGS_DEBUG) $(LIB_DEBUG) 61 | 62 | $(OBJDIR_DEBUG)/src/interface.o: src/interface.cpp 63 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/interface.cpp -o $(OBJDIR_DEBUG)/src/interface.o 64 | 65 | $(OBJDIR_DEBUG)/src/interface_c.o: src/interface_c.cpp 66 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/interface_c.cpp -o $(OBJDIR_DEBUG)/src/interface_c.o 67 | 68 | clean_debug: 69 | rm -f $(OBJ_DEBUG) $(OUT_DEBUG) 70 | rm -rf bin/Debug 71 | rm -rf $(OBJDIR_DEBUG)/src 72 | 73 | before_release: 74 | test -d bin/Release || mkdir -p bin/Release 75 | test -d $(OBJDIR_RELEASE)/src || mkdir -p $(OBJDIR_RELEASE)/src 76 | 77 | after_release: 78 | 79 | release: before_release out_release after_release 80 | 81 | out_release: before_release $(OBJ_RELEASE) $(DEP_RELEASE) 82 | $(LD) -shared $(LIBDIR_RELEASE) $(OBJ_RELEASE) -o $(OUT_RELEASE) $(LDFLAGS_RELEASE) $(LIB_RELEASE) 83 | 84 | $(OBJDIR_RELEASE)/src/interface.o: src/interface.cpp 85 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/interface.cpp -o $(OBJDIR_RELEASE)/src/interface.o 86 | 87 | $(OBJDIR_RELEASE)/src/interface_c.o: src/interface_c.cpp 88 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/interface_c.cpp -o 
$(OBJDIR_RELEASE)/src/interface_c.o 89 | 90 | clean_release: 91 | rm -f $(OBJ_RELEASE) $(OUT_RELEASE) 92 | rm -rf bin/Release 93 | rm -rf $(OBJDIR_RELEASE)/src 94 | 95 | .PHONY: before_debug after_debug clean_debug before_release after_release clean_release 96 | 97 | -------------------------------------------------------------------------------- /libxmeasures.cbp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 108 | 109 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # This makefile was generated by 'cbp2make' tool rev.147 # 3 | #------------------------------------------------------------------------------# 4 | 5 | 6 | WORKDIR = `pwd` 7 | 8 | CC = gcc 9 | CXX = g++ 10 | AR = ar 11 | LD = g++ 12 | WINDRES = windres 13 | 14 | INC = -Iinclude -Iautogen -Ishared 15 | CFLAGS = -Wnon-virtual-dtor -Winit-self -Wcast-align -Wundef -Wfloat-equal -Wunreachable-code -Wmissing-include-dirs -Weffc++ -Wzero-as-null-pointer-constant -Wall -std=c++14 -fexceptions -fstack-protector-strong -fstack-clash-protection -Werror=format-security -Wold-style-cast -Wno-float-equal -D_FORTIFY_SOURCE=2 16 | RESINC = 17 | LIBDIR = 18 | LIB = -lstdc++fs 19 | LDFLAGS = -Wl,-z,defs 20 | 21 | INC_DEBUG = $(INC) 22 | CFLAGS_DEBUG = $(CFLAGS) -Wredundant-decls -Wswitch-default -Wmain -Wextra -Og -g -fasynchronous-unwind-tables -fsanitize=address -fsanitize=leak -DDEBUG -D_GLIBCXX_DEBUG -D_GLIBCXX_ASSERTIONS -DTRACE=2 -DVALIDATE=2 23 | RESINC_DEBUG = $(RESINC) 24 | RCFLAGS_DEBUG = $(RCFLAGS) 25 | LIBDIR_DEBUG = $(LIBDIR) 26 | LIB_DEBUG = $(LIB) -lasan 27 | LDFLAGS_DEBUG = $(LDFLAGS) 28 | OBJDIR_DEBUG = obj/Debug 29 | DEP_DEBUG = 30 | OUT_DEBUG = bin/Debug/xmeasures 31 | 32 | INC_RELEASE = $(INC) 33 | CFLAGS_RELEASE = $(CFLAGS) 
-march=core2 -fomit-frame-pointer -O3 -Wfatal-errors -DTRACE=1 -DVALIDATE=1 34 | RESINC_RELEASE = $(RESINC) 35 | RCFLAGS_RELEASE = $(RCFLAGS) 36 | LIBDIR_RELEASE = $(LIBDIR) 37 | LIB_RELEASE = $(LIB) 38 | LDFLAGS_RELEASE = $(LDFLAGS) -s -Wl,-z,relro,-z,now 39 | OBJDIR_RELEASE = obj/Release 40 | DEP_RELEASE = 41 | OUT_RELEASE = bin/Release/xmeasures 42 | 43 | OBJ_DEBUG = $(OBJDIR_DEBUG)/autogen/cmdline.o $(OBJDIR_DEBUG)/shared/fileio.o $(OBJDIR_DEBUG)/src/interface.o $(OBJDIR_DEBUG)/src/main.o 44 | 45 | OBJ_RELEASE = $(OBJDIR_RELEASE)/autogen/cmdline.o $(OBJDIR_RELEASE)/shared/fileio.o $(OBJDIR_RELEASE)/src/interface.o $(OBJDIR_RELEASE)/src/main.o 46 | 47 | all: debug release 48 | 49 | clean: clean_debug clean_release 50 | 51 | before_debug: 52 | test -d bin/Debug || mkdir -p bin/Debug 53 | test -d $(OBJDIR_DEBUG)/autogen || mkdir -p $(OBJDIR_DEBUG)/autogen 54 | test -d $(OBJDIR_DEBUG)/shared || mkdir -p $(OBJDIR_DEBUG)/shared 55 | test -d $(OBJDIR_DEBUG)/src || mkdir -p $(OBJDIR_DEBUG)/src 56 | 57 | after_debug: 58 | 59 | debug: before_debug out_debug after_debug 60 | 61 | out_debug: before_debug $(OBJ_DEBUG) $(DEP_DEBUG) 62 | $(LD) $(LIBDIR_DEBUG) -o $(OUT_DEBUG) $(OBJ_DEBUG) $(LDFLAGS_DEBUG) $(LIB_DEBUG) 63 | 64 | $(OBJDIR_DEBUG)/autogen/cmdline.o: autogen/cmdline.c 65 | $(CC) $(CFLAGS_DEBUG) $(INC_DEBUG) -c autogen/cmdline.c -o $(OBJDIR_DEBUG)/autogen/cmdline.o 66 | 67 | $(OBJDIR_DEBUG)/shared/fileio.o: shared/fileio.cpp 68 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c shared/fileio.cpp -o $(OBJDIR_DEBUG)/shared/fileio.o 69 | 70 | $(OBJDIR_DEBUG)/src/interface.o: src/interface.cpp 71 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/interface.cpp -o $(OBJDIR_DEBUG)/src/interface.o 72 | 73 | $(OBJDIR_DEBUG)/src/main.o: src/main.cpp 74 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/main.cpp -o $(OBJDIR_DEBUG)/src/main.o 75 | 76 | clean_debug: 77 | rm -f $(OBJ_DEBUG) $(OUT_DEBUG) 78 | rm -rf bin/Debug 79 | rm -rf $(OBJDIR_DEBUG)/autogen 80 | rm -rf $(OBJDIR_DEBUG)/shared 81 | 
rm -rf $(OBJDIR_DEBUG)/src 82 | 83 | before_release: 84 | test -d bin/Release || mkdir -p bin/Release 85 | test -d $(OBJDIR_RELEASE)/autogen || mkdir -p $(OBJDIR_RELEASE)/autogen 86 | test -d $(OBJDIR_RELEASE)/shared || mkdir -p $(OBJDIR_RELEASE)/shared 87 | test -d $(OBJDIR_RELEASE)/src || mkdir -p $(OBJDIR_RELEASE)/src 88 | 89 | after_release: 90 | 91 | release: before_release out_release after_release 92 | 93 | out_release: before_release $(OBJ_RELEASE) $(DEP_RELEASE) 94 | $(LD) $(LIBDIR_RELEASE) -o $(OUT_RELEASE) $(OBJ_RELEASE) $(LDFLAGS_RELEASE) $(LIB_RELEASE) 95 | 96 | $(OBJDIR_RELEASE)/autogen/cmdline.o: autogen/cmdline.c 97 | $(CC) $(CFLAGS_RELEASE) $(INC_RELEASE) -c autogen/cmdline.c -o $(OBJDIR_RELEASE)/autogen/cmdline.o 98 | 99 | $(OBJDIR_RELEASE)/shared/fileio.o: shared/fileio.cpp 100 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c shared/fileio.cpp -o $(OBJDIR_RELEASE)/shared/fileio.o 101 | 102 | $(OBJDIR_RELEASE)/src/interface.o: src/interface.cpp 103 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/interface.cpp -o $(OBJDIR_RELEASE)/src/interface.o 104 | 105 | $(OBJDIR_RELEASE)/src/main.o: src/main.cpp 106 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/main.cpp -o $(OBJDIR_RELEASE)/src/main.o 107 | 108 | clean_release: 109 | rm -f $(OBJ_RELEASE) $(OUT_RELEASE) 110 | rm -rf bin/Release 111 | rm -rf $(OBJDIR_RELEASE)/autogen 112 | rm -rf $(OBJDIR_RELEASE)/shared 113 | rm -rf $(OBJDIR_RELEASE)/src 114 | 115 | .PHONY: before_debug after_debug clean_debug before_release after_release clean_release 116 | 117 | -------------------------------------------------------------------------------- /src/interface.cpp: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation interface implementation. 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 
6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! \date 2017-12-15 10 | 11 | #include 12 | //#include 13 | #include 14 | 15 | #include "operations.hpp" 16 | #include "interface.h" 17 | 18 | 19 | using std::overflow_error; 20 | using std::invalid_argument; 21 | using namespace daoc; 22 | 23 | 24 | // Omega Index related types and functions ------------------------------------- 25 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b, const Id nmax) noexcept 26 | { 27 | #if VALIDATE >= 2 28 | assert(a && b && "mutualnum(), valid containers are expected"); 29 | #endif // VALIDATE 30 | Id num = 0; 31 | if(b->size() < a->size()) { 32 | auto t = b; 33 | b = a; 34 | a = t; 35 | } 36 | const auto eb = b->end(); 37 | auto ib = b->begin(); 38 | for(auto acp: *a) { 39 | while(ib != eb && cmpBase(*ib, acp)) 40 | ++ib; 41 | if(ib == eb 42 | || (*ib == acp && ++num >= nmax)) 43 | break; 44 | } 45 | return num; 46 | } 47 | 48 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b) noexcept 49 | { 50 | #if VALIDATE >= 2 51 | assert(a && b && "mutualnum(), valid containers are expected"); 52 | #endif // VALIDATE 53 | Id num = 0; 54 | if(b->size() < a->size()) { 55 | auto t = b; 56 | b = a; 57 | a = t; 58 | } 59 | const auto eb = b->end(); 60 | auto ib = b->begin(); 61 | for(auto acp: *a) { 62 | while(ib != eb && cmpBase(*ib, acp)) 63 | ++ib; 64 | if(ib == eb) 65 | break; 66 | if(*ib == acp) 67 | ++num; 68 | } 69 | return num; 70 | } 71 | 72 | // Other Measures related functions -------------------------------------------- 73 | //string to_string(Evaluation eval, bool bitstr) 74 | //{ 75 | // static_assert(sizeof(Evaluation) == sizeof(EvalBase) 76 | // , "to_string(), Evaluation type must be the same size as EvalBase"); 77 | // // Convert to bit string 78 | // if(bitstr) 79 | // return bitset(static_cast(eval)) 80 | // .to_string().insert(0, "0b"); 81 | // 82 | // // Convert to semantic string 83 | // string val; 84 | // 
switch(eval) { 85 | // case Evaluation::MULTIRES: 86 | // val = "MULTIRES"; 87 | // break; 88 | // case Evaluation::OVERLAPPING: 89 | // val = "OVERLAPPING"; 90 | // break; 91 | // case Evaluation::MULRES_OVP: 92 | // val = "MULRES_OVP"; 93 | // break; 94 | // case Evaluation::NONE: 95 | // default: 96 | // val = "NONE"; 97 | // } 98 | // return val; 99 | //} 100 | 101 | string to_string(F1 f1) 102 | { 103 | // Convert to semantic string 104 | string val; 105 | switch(f1) { 106 | case F1::PARTPROB: 107 | val = "PARTPROB"; 108 | break; 109 | case F1::HARMONIC: 110 | val = "HARMONIC"; 111 | break; 112 | case F1::AVERAGE: 113 | val = "AVERAGE"; // Suggested by Leskovec 114 | break; 115 | case F1::NONE: 116 | default: 117 | val = "NONE"; 118 | } 119 | return val; 120 | } 121 | 122 | string to_string(Match mkind) 123 | { 124 | // Convert to semantic string 125 | string val; 126 | switch(mkind) { 127 | case Match::WEIGHTED: 128 | val = "WEIGHTED"; 129 | break; 130 | case Match::UNWEIGHTED: 131 | val = "UNWEIGHTED"; 132 | break; 133 | case Match::COMBINED: 134 | val = "COMBINED"; 135 | break; 136 | case Match::NONE: 137 | default: 138 | val = "NONE"; 139 | } 140 | return val; 141 | } 142 | 143 | bool xwmatch(Match m) noexcept 144 | { 145 | return m == Match::WEIGHTED || m == Match::COMBINED; 146 | } 147 | 148 | 149 | bool xumatch(Match m) noexcept 150 | { 151 | return m == Match::UNWEIGHTED || m == Match::COMBINED; 152 | } 153 | 154 | #ifndef NO_FILEIO 155 | NodeBase NodeBase::load(const char* filename, float membership 156 | , ::AggHash* ahash, size_t cmin, size_t cmax, bool verbose) 157 | { 158 | NodeBase nb; // Return using NRVO optimization 159 | NamedFileWrapper finp(filename, "r"); 160 | if(finp) 161 | static_cast(nb) = loadNodes(finp, membership 162 | , ahash, cmin, cmax, verbose); 163 | else perror((string("WARNING load(), can't open ") += filename).c_str()); 164 | 165 | return nb; 166 | } 167 | #endif // NO_FILEIO 168 | 169 | // Accessory functions 
--------------------------------------------------------- 170 | Id parseId(char* str) 171 | { 172 | #if VALIDATE >= 2 173 | assert(!errno && "Initial errno should be zero"); 174 | #endif // VALIDATE 175 | auto nid = strtoul(str, nullptr, 10); 176 | static_assert(sizeof(nid) >= sizeof(Id), "Parsing value type is too small for Id"); 177 | if(nid > numeric_limits::max() || (!nid && errno != 0)) { 178 | if(nid > numeric_limits::max()) 179 | throw overflow_error("Loaded value of id is too large: " + std::to_string(nid) + "\n"); 180 | else if(errno != 0) 181 | throw invalid_argument(string("Conversion to id can't be performed: ").append(str) 182 | + ", errno: " + std::to_string(errno).append("\n")); 183 | } 184 | return nid; 185 | } 186 | 187 | AccProb hmean(AccProb a, AccProb b) noexcept 188 | { 189 | static_assert(is_floating_point::value, "AccProb should be a floating point type"); 190 | // Note: both a = b = 0 and a = -b are considered and yield 0 191 | return a + b != 0 ? 2 * a / (a + b) * b : 0; 192 | } 193 | 194 | AccProb gmean(AccProb a, AccProb b) noexcept 195 | { 196 | #ifdef DEBUG 197 | assert(a >= 0 && b >= 0 && "gmean(), the probabilities should E [0, 1]"); 198 | #endif // DEBUG 199 | return sqrt(a * b); 200 | } 201 | 202 | AccProb amean(AccProb a, AccProb b) noexcept 203 | { 204 | return (a + b) / 2; 205 | } 206 | -------------------------------------------------------------------------------- /include/interface_c.h: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation interface. 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! 
\date 2021-03-11 10 | 11 | #ifndef INTERFACE_C_H_INCLUDED 12 | #define INTERFACE_C_H_INCLUDED 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif // __cplusplus 17 | 18 | #include // uintX_t 19 | 20 | 21 | typedef uint32_t NodeId; //!< Node Id type 22 | typedef uint64_t AccNodeId; //!< Accumulated Node Id type 23 | typedef float LinkWeight; ///< Link weight 24 | 25 | //! \brief Node relations 26 | typedef struct { 27 | NodeId num; //!< The number of cluster nodes 28 | NodeId* ids; //!< Node ids 29 | LinkWeight* weights; //!< Node weights in this cluster, can be NULL which means equal weights = 1 30 | } ClusterNodes; 31 | 32 | //! \brief Node collection (clusters) 33 | typedef struct { 34 | NodeId num; //!< The number of node relations (clusters) in a collection 35 | ClusterNodes* nodes; //!< Relations of nodes 36 | } ClusterCollection; 37 | 38 | //! \brief F1 Kind 39 | typedef enum { 40 | //! Not initialized 41 | F1_NONE = 0, 42 | //! Harmonic mean of the [weighted] average of the greatest (maximal) match 43 | //! by partial probabilities 44 | F1_PARTPROB, 45 | //! Harmonic mean of the [weighted] average of the greatest (maximal) match by F1s 46 | F1_HARMONIC, 47 | //! Arithmetic mean (average) of the [weighted] average of the greatest (maximal) 48 | //! match by F1s, i.e. F1-Score 49 | F1_AVERAGE // Suggested by Leskovec 50 | } F1Kind; 51 | 52 | //! \brief Collection matching kind 53 | typedef enum { 54 | MATCH_NONE = 0, //!< Note initialized 55 | MATCH_WEIGHTED, //!< Weighted matching by the number of members in each cluster (macro weighting) 56 | MATCH_UNWEIGHTED, //!< Unweighted matching of each cluster (micro weighting) 57 | MATCH_COMBINED //!< Combined of macro and micro weightings using geometric mean 58 | } MatchKind; 59 | 60 | typedef float Probability; 61 | 62 | //! \brief Specified F1 evaluation of the Greatest (Max) Match for the 63 | //! multi-resolution clustering with possibly unequal node base 64 | //! 65 | //! 
Supported F1 measures are F1p <= F1h <= F1s, where: 66 | //! - F1p - Harmonic mean of the [weighted] average of partial probabilities, 67 | //! the most discriminative and satisfies the largest number of the Formal 68 | //! Constraints (homogeneity, completeness, rag bag, size/quantity, balance); 69 | //! - F1h - Harmonic mean of the [weighted] average of F1s; 70 | //! - F1a - Average F1-Score, i.e. arithmetic mean (average) of the [weighted] 71 | //! average of F1s, the least discriminative and satisfies the lowest number 72 | //! of the Formal Constraints. 73 | //! 74 | //! of the Greatest (Max) Match [Weighted] Average Harmonic Mean evaluation 75 | //! \note Undirected (symmetric) evaluation 76 | //! 77 | //! \param cn1 const ClusterCollection - first collection of clusters (node relations) 78 | //! \param cn2 const ClusterCollection - second collection 79 | //! \param kind F1Kind - kind of F1 to be evaluated 80 | //! \param[out] rec Probability* - recall of cn2 relative to the ground-truth cn1 or 81 | //! 0 if the matching strategy does not have the precision/recall notations 82 | //! \param[out] prc Probability* - precision of cn2 relative to the ground-truth cn1 or 83 | //! 0 if the matching strategy does not have the precision/recall notations 84 | //! \param mkind=MATCH_WEIGHTED MatchKind - matching kind 85 | //! \param sync uint8_t - synchronize node base of the input collections, by appending 86 | //! the lacking single-node clusters 87 | //! \param makeunique uint8_t - ensure that clusters contain unique members by 88 | //! removing the duplicates 89 | //! \param verbose=0 uint8_t - print intermediate results to the stdout 90 | //! 
\return Probability - resulting F1_gm 91 | Probability f1x(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 92 | , Probability* rec, Probability* prc, MatchKind mkind, uint8_t sync, uint8_t makeunique, uint8_t verbose); 93 | Probability f1(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 94 | , Probability* rec, Probability* prc); // MATCH_WEIGHTED, false 95 | Probability f1p(const ClusterCollection cn1, const ClusterCollection cn2); // MATCH_WEIGHTED, false 96 | Probability f1h(const ClusterCollection cn1, const ClusterCollection cn2); // MATCH_WEIGHTED, false 97 | 98 | //! \brief (Extended) Omega Index evaluation 99 | //! 100 | //! \param cn1 const ClusterCollection - first collection of clusters (node relations) 101 | //! \param cn2 const ClusterCollection - second collection 102 | //! \return Probability - omega index 103 | Probability omega(const ClusterCollection cn1, const ClusterCollection cn2); 104 | Probability omegaExt(const ClusterCollection cn1, const ClusterCollection cn2); 105 | 106 | //! \brief (Extended) Omega Index evaluation 107 | //! 108 | //! \param cn1 const ClusterCollection - first collection of clusters (node relations) 109 | //! \param cn2 const ClusterCollection - second collection 110 | //! \param ext uint8_t - evaluate extended omega index 111 | //! \param sync uint8_t - synchronize node base of the input collections, by appending the lacking single-node clusters 112 | //! \param makeunique uint8_t - ensure that clusters contain unique members by 113 | //! removing the duplicates 114 | //! 
\return Probability - omega index 115 | Probability omegax(const ClusterCollection cn1, const ClusterCollection cn2, uint8_t ext, uint8_t sync, uint8_t makeunique); 116 | 117 | #ifdef __cplusplus 118 | }; 119 | #endif // __cplusplus 120 | 121 | #endif // INTERFACE_C_H_INCLUDED 122 | -------------------------------------------------------------------------------- /shared/agghash.hpp: -------------------------------------------------------------------------------- 1 | //! \brief AggHash simple (Aggregating Order Invariant Hashing) of the DAOC clustering library 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! \date 2017-02-21 10 | 11 | #ifndef CODING_HPP 12 | #define CODING_HPP 13 | 14 | #include // uintX_t 15 | //#include // size_t 16 | #include // uintX_t 17 | #include // hash 18 | //#include // memcmp 19 | #include // is_integral 20 | #include // numeric_limits 21 | #include // numeric_limits 22 | 23 | 24 | namespace daoc { 25 | 26 | using std::string; 27 | using std::is_integral; 28 | using std::numeric_limits; 29 | using std::domain_error; 30 | 31 | // Type Declarations --------------------------------------------------- 32 | //! \brief Aggregation hash of ids 33 | //! \pre Template types should be integral 34 | //! 35 | //! \tparam Id - type of the member ids 36 | //! \tparam AccId - type of the accumulated Ids and accumulated squares of Ids 37 | //! should have at least twice magnitude of the Id type (i.e. 
squared) 38 | template 39 | class AggHash { 40 | static_assert(is_integral::value && is_integral::value 41 | && sizeof(AccId) >= 2*sizeof(Id), "AggHash, types constraints are violated"); 42 | 43 | // ATTENTION: type of the m_size should not be less than of m_idsum to 44 | // avoid gaps filled with trash on memory alignment 45 | // Note: size should be first as the most discriminative attribute, which 46 | // can be potentially used for the ordering 47 | // Note: the size is redundant and does not have any impact except for the structured ordering 48 | // if the sum does not increase AccId_MAX or if zero value of id is NOT allowed. The size is 49 | // necessary if id=0 may be present in the clusters. 50 | AccId m_size; //!< Size of the container 51 | AccId m_idsum; //!< Sum of the member ids 52 | AccId m_id2sum; //!< Sum of the squared member ids 53 | protected: 54 | //! Id correction to prevent collisions 55 | constexpr static Id idcor = sqrt(numeric_limits::max()); 56 | public: 57 | // Export the template parameter types 58 | using IdT = Id; //!< Type of the member ids 59 | using AccIdT = AccId; //!< Type of the accumulated Ids and accumulated squares of Ids 60 | 61 | //! \brief Default constructor 62 | AggHash() noexcept 63 | : m_size(0), m_idsum(0), m_id2sum(0) {} 64 | 65 | //! \brief Add id to the aggregation 66 | //! \note In case correction is used and id becomes out of range (initial id > IDMAX - IDCORR) 67 | //! then an exception is thrown, which crashes the whole application, which is OK 68 | //! 69 | //! \param id Id - id to be included into the hash 70 | //! \return void 71 | void add(Id id) noexcept; 72 | 73 | //! \brief Clear/reset the aggregation 74 | //! 75 | //! \return void 76 | void clear() noexcept; 77 | 78 | //! \brief Number of the aggregated ids 79 | //! 80 | //! \return size_t - number of the aggregated ids 81 | size_t size() const noexcept { return m_size; } 82 | 83 | //! \brief Sum of the aggregated ids 84 | //! 85 | //! 
\return size_t - sum of the aggregated ids 86 | size_t idsum() const noexcept { return m_idsum; } 87 | 88 | //! \brief Sum of squares of the aggregated ids 89 | //! 90 | //! \return size_t - sum of squares of the aggregated ids 91 | size_t id2sum() const noexcept { return m_id2sum; } 92 | 93 | // //! \brief The hash is empty 94 | // //! 95 | // //! \return bool - the hash is empty 96 | // bool empty() const noexcept { return !m_size; } 97 | 98 | //! \brief Evaluate hash of the aggregation 99 | //! 100 | //! \return size_t - resulting hash 101 | size_t hash() const; 102 | 103 | //! \brief Operator less 104 | //! 105 | //! \param ah const AggHash& - comparing object 106 | //! \return bool operator - result of the comparison 107 | inline bool operator <(const AggHash& ah) const noexcept; 108 | 109 | //! \brief Operator less or equal 110 | //! 111 | //! \param ah const AggHash& - comparing object 112 | //! \return bool operator - result of the comparison 113 | inline bool operator <=(const AggHash& ah) const noexcept; 114 | 115 | //! \brief Operator greater 116 | //! 117 | //! \param ah const AggHash& - comparing object 118 | //! \return bool operator - result of the comparison 119 | bool operator >(const AggHash& ah) const noexcept { return !(*this <= ah); } 120 | 121 | //! \brief Operator greater or equal 122 | //! 123 | //! \param ah const AggHash& - comparing object 124 | //! \return bool operator - result of the comparison 125 | bool operator >=(const AggHash& ah) const noexcept { return !(*this < ah); } 126 | 127 | //! \brief Operator equal 128 | //! 129 | //! \param ah const AggHash& - comparing object 130 | //! \return bool operator - result of the comparison 131 | inline bool operator ==(const AggHash& ah) const noexcept; 132 | 133 | //! \brief Operator unequal (not equal) 134 | //! 135 | //! \param ah const AggHash& - comparing object 136 | //! 
\return bool operator - result of the comparison 137 | bool operator !=(const AggHash& ah) const noexcept { return !(*this == ah); } 138 | }; 139 | 140 | // Type Definitions ---------------------------------------------------- 141 | #pragma GCC diagnostic push 142 | #pragma GCC diagnostic ignored "-Wterminate" // Disable the warning about the exception throwing function marked as noexcept 143 | template 144 | void AggHash::add(Id id) noexcept 145 | { 146 | id += idcor; // Correct id to prevent collisions (see AgordiHash for details) 147 | // Check for the overflow after the correction 148 | // Note: the exception will crash the whole app since noexcept is used but it is fine 149 | if(id < idcor) 150 | throw domain_error(string("The corrected value of ").append(std::to_string(id)) 151 | .append(" is too large and causes the overflow\n")); 152 | ++m_size; 153 | m_idsum += id; 154 | m_id2sum += id * id; 155 | } 156 | #pragma GCC diagnostic pop 157 | 158 | template 159 | void AggHash::clear() noexcept 160 | { 161 | m_size = 0; 162 | m_idsum = 0; 163 | m_id2sum = 0; 164 | } 165 | 166 | template 167 | size_t AggHash::hash() const 168 | { 169 | // ATTENTION: requires filling with zero memory alignment trash or avoid the padding 170 | return std::hash()(string(reinterpret_cast(this), sizeof *this)); 171 | } 172 | 173 | template 174 | bool AggHash::operator <(const AggHash& ah) const noexcept 175 | { 176 | return m_size < ah.m_size || (m_size == ah.m_size 177 | && (m_idsum < ah.m_idsum || (m_idsum == ah.m_idsum && m_id2sum < ah.m_id2sum))); 178 | } 179 | 180 | template 181 | bool AggHash::operator <=(const AggHash& ah) const noexcept 182 | { 183 | return m_size < ah.m_size || (m_size == ah.m_size 184 | && (m_idsum < ah.m_idsum || (m_idsum == ah.m_idsum && m_id2sum <= ah.m_id2sum))); 185 | } 186 | 187 | template 188 | bool AggHash::operator ==(const AggHash& ah) const noexcept 189 | { 190 | return m_size == ah.m_size && m_idsum == ah.m_idsum && m_id2sum == ah.m_id2sum; 191 | 
//return !memcmp(this, &ah, sizeof(AggHash)); // Note: memcmp returns 0 on full match 192 | } 193 | 194 | } // daoc 195 | 196 | #endif // CODING_HPP 197 | -------------------------------------------------------------------------------- /args.ggo: -------------------------------------------------------------------------------- 1 | # Configuration file for the automatic generation of the input options parsing 2 | 3 | package "xmeasures" 4 | version "4.0.4" 5 | versiontext "Author: (c) Artem Lutov 6 | Sources: https://github.com/eXascaleInfolab/xmeasures 7 | Paper: \"Accuracy Evaluation of Overlapping and Multi-resolution Clustering Algorithms on Large Datasets\" by Artem Lutov, Mourad Khayati and Philippe Cudré-Mauroux, BigComp 2019 8 | " 9 | 10 | purpose "Extrinsic measures evaluation: Omega Index (a fuzzy version of the\ 11 | Adjusted Rand Index, identical to the Fuzzy Rand Index) and [mean] F1-score\ 12 | (prob, harm and avg) for the overlapping multi-resolution clusterings,\ 13 | and standard NMI for the non-overlapping clustering on a single resolution.\ 14 | Unequal node base is allowed in the evaluating clusterings and optionally can\ 15 | be synchronized removing nodes from the clusters missed in one of the clusterings (collections)." 16 | 17 | usage "xmeasures [OPTIONS] clustering1 clustering2 18 | 19 | clustering - input file, collection of the clusters to be evaluated. 20 | 21 | Examples: 22 | $ ./xmeasures -fp -kc networks/5K25.cnl tests/5K25_l0.825/5K25_l0.825_796.cnl 23 | $ ./xmeasures -fh -kc -i tests/5K25.cll -ph -l networks/5K25.cnl tests/5K25_l0.825/5K25_l0.825_796.cnl 24 | $ ./xmeasures -ox tests/clsevalsx/omega_c4.3-1.cnl tests/clsevalsx/omega_c4.3-2.cnl 25 | " 26 | 27 | description "Extrinsic measures are evaluated, i.e. two input clusterings\ 28 | (collections of clusters) are compared to each other. Optionally, a labeling\ 29 | of the evaluating clusters with the specified ground-truth clusters is performed. 
30 | NOTE: 31 | - Multiple evaluating measures can be specified. 32 | - Each cluster should contain unique members, which is ensured only if the\ 33 | 'unique' option is specified. 34 | - All clusters should be unique to not affect Omega Index evaluation, which\ 35 | can be ensured by the [resmerge](https://github.com/eXascaleInfolab/resmerge) utility. 36 | - Non-corrected unequal node base in the clusterings is allowed, it penalizes the match.\ 37 | Use [OvpNMI](https://github.com/eXascaleInfolab/OvpNMI) or\ 38 | [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) for NMI evaluation\ 39 | in the arbitrary collections (still each cluster should contain unique members). 40 | 41 | Evaluating measures are: 42 | - OI - Omega Index (a fuzzy version of the Adjusted Rand Index, identical to\ 43 | the Fuzzy Rand Index), which yields the same value as Adjusted Rand Index when\ 44 | applied to the non-overlapping clusterings. 45 | - [M]F1 - various [mean] F1 measures of the Greatest (Max) Match including\ 46 | the Average F1-Score (suggested by J. Leskovec) with the optional weighting. 47 | NOTE: There are 3 matching policies available for each kind of F1. The most\ 48 | representative evaluation is performed by the F1p with combined matching\ 49 | policy (considers both micro and macro weighting). 50 | - NMI - Normalized Mutual Information, normalized by either max or also sqrt,\ 51 | avg and min information content denominators. 52 | ATTENTION: This is a standard NMI, which should be used ONLY for the HARD\ 53 | partitioning evaluation (non-overlapping clustering on a single resolution).\ 54 | It penalizes overlapping and multi-resolution structures. 
55 | " 56 | 57 | option "ovp" O "evaluate overlapping instead of the multi-resolution clusters,\ 58 | where max matching for any shared member between R overlapping clusters is 1/R\ 59 | (the member is shared) instead of 1 (the member fully belongs to each [hierarchical\ 60 | sub]group) for the member belonging to R distinct clusters on R resolutions. 61 | NOTE: It has no effect for the Omega Index evaluation." 62 | flag off 63 | #NOTE: Multi-resolution mode can be used as approximation of the overlapping\ 64 | # clusters evaluation, but not vice verse" flag off 65 | # Note: ovp option requires shares evaluation/reading and processing of the directory 66 | # of collections in case of both multi-resolution and overlapping clustering evaluation 67 | option "unique" q "ensure on loading that all cluster members are unique by\ 68 | removing all duplicates." 69 | flag off 70 | option "sync" s "synchronize with the specified node base omitting the non-matching nodes. 71 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the\ 72 | latter case this option should precede the evaluating filename not repeating it" 73 | string typestr="filename" 74 | option "membership" m "average expected membership of the nodes in the clusters,\ 75 | > 0, typically >= 1. Used only to facilitate estimation of the nodes number on\ 76 | the containers preallocation if this number is not specified in the file header." 77 | float default="1" 78 | option "detailed" d "detailed (verbose) results output" flag off 79 | 80 | section "Omega Index" 81 | option "omega" o "evaluate Omega Index (a fuzzy version of the Adjusted Rand Index,\ 82 | identical to the Fuzzy Rand Index and on the non-overlapping clusterings equals to ARI)." 83 | flag off 84 | option "extended" x "evaluate extended (Soft) Omega Index, which does not excessively\ 85 | penalize distinctly shared nodes." 
flag off dependon="omega" 86 | 87 | section "Mean F1" 88 | option "f1" f "evaluate mean F1 of the [weighted] average of the greatest (maximal)\ 89 | match by F1 or partial probability. 90 | NOTE: F1h <= F1a, where: 91 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities,\ 92 | the most indicative as satisfies the largest number of the Formal Constraints\ 93 | (homogeneity, completeness and size/quantity except the rag bag in some cases); 94 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1\ 95 | (harmonic means of the Precision and Recall of the best matches of the clusters); 96 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1,\ 97 | the least discriminative and satisfies the lowest number of the Formal Constraints. 98 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 99 | " 100 | values="partprob","harmonic","average" enum default="partprob" argoptional 101 | option "kind" k "kind of the matching policy: 102 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 103 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 104 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 105 | " 106 | values ="weighted","unweighed","combined" enum default="weighted" argoptional 107 | dependon="f1" 108 | 109 | section "Clusters Labeling & F1 evaluation with Precision and Recall" 110 | option "label" l "label evaluating clusters with the specified ground-truth (gt)\ 111 | cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED\ 112 | labeled clusters only (without the probable subclusters). 
113 | NOTE: If 'sync' option is specified then the file name of the clusters labels\ 114 | should be the same as the node base (if specified) and should be in the .cnl format.\ 115 | The file name can be either a separate or an evaluating CNL file, in the\ 116 | latter case this option should precede the evaluating filename not repeating it. 117 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 118 | " 119 | string typestr="gt_filename" 120 | option "policy" p "Labels matching policy: 121 | - p - Partial Probabilities (maximizes gain) 122 | - h - Harmonic Mean (minimizes loss, maximizes F1) 123 | " 124 | values="partprob","harmonic" enum default="harmonic" argoptional dependon="label" 125 | option "unweighted" u "Labels weighting policy on F1 evaluation: weighted by the number\ 126 | of instances in each label by default (micro weighting, F1_micro) or unweighed,\ 127 | where each label is treated equally (i.e. macro weighting, F1_macro)" 128 | flag off dependon="label" 129 | option "identifiers" i "output labels (identifiers) of the evaluating clusters\ 130 | as lines of space-separated indices of the ground-truth clusters (.cll - clusters\ 131 | labels list) 132 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the\ 133 | .cnl besides the 134 | " string typestr="labels_filename" dependon="label" 135 | 136 | section "NMI" 137 | option "nmi" n "evaluate NMI (Normalized Mutual Information), applicable only\ 138 | to the non-overlapping clusters" flag off 139 | option "all" a "evaluate all NMIs using sqrt, avg and min denominators besides\ 140 | the max one" flag off dependon="nmi" 141 | option "ln" e "use ln (exp base) instead of log2 (Shannon entropy, bits)\ 142 | for the information measuring" flag off dependon="nmi" 143 | # Note: log2 vs ln have no any influence on the resulting value 144 | 145 | # Set optional options by default, allow input files to be unnamed parameters 
146 | args "--default-optional --unamed-opts=clusterings" 147 | #args "--unamed-opts=clusterings" # Allow input files to be unnamed parameters 148 | 149 | 150 | # = Changelog = 151 | # v4.0.4 - Precision and recall added to the MF1 output, mixed Prc, Rec in F1 fixed 152 | # v4.0.3 - Renamed F1s -> F1a to be synced with the paper, description refined 153 | # v4.0.2 - Description and output measures notations refined 154 | # v4.0.1 - Aggregated output for multiple measures added 155 | # v4.0.0 - Omega index added and bound to the "-o" argument 156 | # - the former "-o" argument (overlaps) renamed to "-O" 157 | # - F1s renamed to F1a (average), option "-s" renamed to "-a" 158 | # v3.2.2 - F1 weighting considered for the labels 159 | # v3.2.1 - Interpretation of F1p modified to have semantic of geometric mean, now F1h < F1p < F1s 160 | # v3.2.0 - Clusters labeling, labels F1 (with precision and recall) evaluation 161 | # v3.1.0 - Matching policy for the F measures parameterized (weighted, unweighted, combined) 162 | # v3.0.2 - NMI_sqrt added 163 | # v3.0 - Command line interface changed for F1 164 | # - Standard F1-Score added 165 | # - Option for the detailed results output added 166 | # v2.3 - Node base synchronization added 167 | # v2.2 - Input arguments redesigned 168 | # v2.1 - F1 of partial probabilities implemented besides F1 of f1s 169 | # v2.0 - Standard NMI implemented and evaluation base parameterized (exp, 2) 170 | # v1.1 - Weighted F1 implemented 171 | # v1.0 - Initial Release 172 | 173 | #TODO: 174 | # NF1 measure 175 | 176 | 177 | # Basic structure, see https://www.gnu.org/software/gengetopt/gengetopt.html#Basic-Usage 178 | # package "" 179 | # version "" 180 | # purpose "" 181 | # usage "" 182 | # description "" 183 | # versiontext "" 184 | # 185 | # args "" 186 | # 187 | # option "" 188 | # {details=""} 189 | # {argtype} {typestr=""} 190 | # {values="","",...} 191 | # {default=""} 192 | # {dependon=""} 193 | # {required} {argoptional} {multiple} 194 | 
# {hidden} 195 | # 196 | # option "" flag 197 | # 198 | # section "section name" {sectiondesc="optional section description"} 199 | # 200 | # text "a textual sentence" 201 | # 202 | # 203 | # Mutually exclusive options should belong to a group: 204 | # 205 | # defgroup "" {groupdesc=""} {required} 206 | # groupoption "" group="" \ 207 | # {argoptional} {multiple} 208 | # 209 | # defgroup "my grp2" 210 | # defgroup "grp1" groupdesc="an option of this group is required" required 211 | # groupoption "opta" a "string a" group="grp1" multiple 212 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation for overlapping multi-resolution clusterings 2 | //! with possible unequal node base. 3 | //! 4 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 5 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 6 | //! 7 | //! Copyright (c) 8 | //! \authr Artem Lutov 9 | //! \email luart@ya.ru 10 | //! \date 2017-02-13 11 | 12 | #include 13 | #include 14 | #include 15 | #include "cmdline.h" // Arguments parsing 16 | #include "macrodef.h" 17 | #include "interface.hpp" 18 | 19 | using std::stringstream; 20 | 21 | 22 | //! 
\brief Arguments parser 23 | struct ArgParser: gengetopt_args_info { 24 | ArgParser(int argc, char **argv) { 25 | auto err = cmdline_parser(argc, argv, this); 26 | if(err) 27 | throw std::invalid_argument("Arguments parsing failed: " + to_string(err)); 28 | } 29 | 30 | ~ArgParser() { 31 | cmdline_parser_free(this); 32 | } 33 | }; 34 | 35 | 36 | int main(int argc, char **argv) 37 | { 38 | ArgParser args_info(argc, argv); 39 | 40 | // Validate required xmeasure 41 | if(!args_info.omega_flag && !args_info.nmi_flag && !args_info.f1_given && !args_info.label_given) { 42 | fputs("WARNING, no any measures to evaluate are specified\n", stderr); 43 | cmdline_parser_print_help(); 44 | return EINVAL; 45 | } 46 | 47 | if(args_info.membership_arg <= 0) { 48 | fprintf(stderr, "ERROR, positive membership is expected: %G\n", args_info.membership_arg); 49 | return EDOM; 50 | } 51 | 52 | { // Validate the number of input files 53 | // Note: sync_arg is specified if sync_given 54 | const auto inpfiles = args_info.inputs_num + (args_info.sync_given || args_info.label_given); // The number of input files 55 | if(inpfiles < 2 || inpfiles > 2 + args_info.sync_given + args_info.label_given) { 56 | fputs("ERROR, 2 input clusterings are required with possibly additional" 57 | " node base and clusters labels, i.e. 
2-4 input files in total\n", stderr); 58 | cmdline_parser_print_help(); 59 | return EINVAL; 60 | } 61 | } 62 | 63 | // Verify that labeled clusters correspond to the node base if any of them is specified 64 | if(args_info.sync_given && args_info.label_given && (strcmp(args_info.sync_arg, args_info.label_arg) 65 | || (args_info.inputs_num == 2 && strcmp(args_info.sync_arg, args_info.inputs[0])))) 66 | throw invalid_argument("ERROR, node base file should correspond to the labeled clusters and" 67 | " represent the first evaluating collection if both are specified\n"); 68 | 69 | // Load node base if required 70 | NodeBase ndbase; 71 | ::AggHash nbhash; 72 | // Note: if label_given then either inputs_num < 2 or inputs_num[0] = sync_arg = label_arg 73 | if(args_info.sync_given && args_info.inputs_num == 2 && !args_info.label_given) 74 | ndbase = NodeBase::load(args_info.sync_arg, args_info.membership_arg 75 | , &nbhash, 0, 0, args_info.detailed_flag); 76 | 77 | auto process = [&](auto evaluation) -> int { 78 | using Count = decltype(evaluation); 79 | using Collection = Collection; 80 | // Load collections as relations 81 | ::AggHash cn1hash, cn2hash; 82 | // Note: cn1 is nodebase if specified and not in the separated file 83 | const bool cn1base = (args_info.sync_given || args_info.label_given) && args_info.inputs_num < 2; 84 | //const char* nbfile = args_info.sync_given 85 | auto cn1 = Collection::load(cn1base ? args_info.sync_given ? args_info.sync_arg 86 | : args_info.label_arg : args_info.inputs[0] 87 | , args_info.unique_flag, args_info.membership_arg, &cn1hash 88 | , ndbase ? 
&ndbase : nullptr, nullptr, args_info.detailed_flag); 89 | if(ndbase) { 90 | if(nbhash != cn1hash) { 91 | fprintf(stderr, "ERROR, nodebase hash %lu (%lu nodes) != filtered" 92 | " collection nodes hash %lu (%lu)\n", nbhash.hash(), nbhash.size() 93 | , cn1hash.hash(), cn1hash.size()); 94 | return EINVAL; 95 | } 96 | ndbase.clear(); 97 | } 98 | RawIds lostcls; 99 | auto cn2 = Collection::load(args_info.inputs[!cn1base] 100 | , args_info.unique_flag, args_info.membership_arg, &cn2hash 101 | , args_info.sync_given ? &cn1 : nullptr 102 | , args_info.sync_given && args_info.label_given ? &lostcls : nullptr 103 | , args_info.detailed_flag); 104 | 105 | if(!cn1.ndsnum() || ! cn2.ndsnum()) { 106 | fprintf(stderr, "WARNING, at least one of the collections is empty, there is nothing" 107 | " to evaluate. Collection nodes sizes: %u, %u\n", cn1.ndsnum(), cn2.ndsnum()); 108 | return EINVAL; 109 | } 110 | 111 | // Check the collections' nodebase 112 | if(cn1hash != cn2hash) { 113 | fprintf(stderr, "WARNING, the nodes in the collections differ (the quality will be penalized)" 114 | ": %u nodes with hash %lu, size: %lu, ids: %lu, id2s: %lu) !=" 115 | " %u nodes with hash %lu, size: %lu, ids: %lu, id2s: %lu); synchronize: %s, label: %s\n" 116 | , cn1.ndsnum(), cn1hash.hash(), cn1hash.size(), cn1hash.idsum(), cn1hash.id2sum() 117 | , cn2.ndsnum(), cn2hash.hash(), cn2hash.size(), cn2hash.idsum(), cn2hash.id2sum() 118 | , daoc::toYesNo(args_info.sync_given), daoc::toYesNo(args_info.label_given)); 119 | //if(args_info.sync_given) { 120 | // fputs("ERROR, the nodes base should be synchronized\n", stderr); 121 | // return EINVAL; 122 | //} 123 | } 124 | 125 | // The number of outputting measures (1 .. 
4) 126 | uint8_t outsnum = args_info.omega_flag + args_info.nmi_flag 127 | + args_info.f1_given + args_info.label_given; 128 | stringstream aggouts; // Aggregated outputs 129 | // Evaluate and output measures 130 | // Note: evaluation of overlapping F1 after NMI allows to reuse some 131 | // calculations, for other cases the order of evaluations does not matter 132 | puts(string("= ").append(is_floating_point::value 133 | ? "Overlaps" : "Multi-resolution").append(" Evaluation =").c_str()); 134 | if(args_info.nmi_flag) { 135 | auto rnmi = Collection::nmi(cn1, cn2, args_info.ln_flag, args_info.detailed_flag); 136 | // Set NMI to NULL if collections have no any mutual information 137 | // ATTENTION: for some cases, for example when one of the collections is a single cluster, 138 | // NMI will always yield 0 for any clusters in the second collection, which is limitation 139 | // of the original NMI measure. Similar issues possible in more complex configurations. 140 | if(rnmi.mi <= precision_limit()) { // Note: strict ! 
is fine here 141 | throw domain_error("NMI is not applicable to the specified collections: 0, which says nothing about the similarity\n"); 142 | rnmi.h1 = rnmi.h2 = 1; 143 | } 144 | const auto nmix = rnmi.mi / std::max(rnmi.h1, rnmi.h2); 145 | if(args_info.all_flag) { 146 | printf("NMI_max: %G, NMI_sqrt: %G, NMI_avg: %G, NMI_min: %G\n" 147 | , nmix, rnmi.mi / sqrt(rnmi.h1 * rnmi.h2) 148 | , 2 * rnmi.mi / (rnmi.h1 + rnmi.h2) 149 | , rnmi.mi / std::min(rnmi.h1, rnmi.h2)); 150 | if(--outsnum || aggouts.tellp()) 151 | aggouts << "NMI_max: " << nmix 152 | << ", NMI_sqrt: " << rnmi.mi / sqrt(rnmi.h1 * rnmi.h2) 153 | << ", NMI_avg: " << 2 * rnmi.mi / (rnmi.h1 + rnmi.h2) 154 | << ", NMI_min: " << rnmi.mi / std::min(rnmi.h1, rnmi.h2); 155 | } else { 156 | printf("NMI_max:\n%G\n", nmix); 157 | if(--outsnum || aggouts.tellp()) 158 | aggouts << "NMI_max: " << nmix; 159 | } 160 | } 161 | if(args_info.f1_given) { 162 | // Assign required F1 type 163 | F1 f1kind = F1::NONE; 164 | // Note: args_info.f1_orig is empty if default value is used 165 | char f1suf = '-'; // Suffix char of the selected F1 measure 166 | switch(args_info.f1_arg) { 167 | case f1_arg_partprob: 168 | f1kind = F1::PARTPROB; 169 | f1suf = 'p'; 170 | break; 171 | case f1_arg_harmonic: 172 | f1kind = F1::HARMONIC; 173 | f1suf = 'h'; 174 | break; 175 | case f1_arg_average: 176 | f1kind = F1::AVERAGE; // Suggested by Leskovec 177 | f1suf = 'a'; 178 | break; 179 | default: 180 | throw invalid_argument("main(), UNKNOWN F1 policy specified\n"); 181 | } 182 | // Assign matching kind 183 | Match mkind = Match::NONE; 184 | // Note: args_info.kind_orig is empty if default value is used 185 | char kindsuf = '-'; // Suffix char of the selected F1 measure 186 | switch(args_info.kind_arg) { 187 | case kind_arg_weighted: 188 | mkind = Match::WEIGHTED; 189 | kindsuf = 'w'; 190 | break; 191 | case kind_arg_unweighed: 192 | mkind = Match::UNWEIGHTED; 193 | kindsuf = 'u'; 194 | break; 195 | case kind_arg_combined: 196 | mkind = 
Match::COMBINED; 197 | kindsuf = 'c'; 198 | break; 199 | default: 200 | throw invalid_argument("main(), UNKNOWN Matching policy specified\n"); 201 | } 202 | 203 | //if(args_info.nmi_flag) 204 | // fputs("; ", stdout); 205 | Prob prc, rec; // Precision and recall of cn2 relative to ground-truth cn1 206 | const auto f1val = Collection::f1(cn1, cn2, f1kind, rec, prc, mkind, args_info.detailed_flag); 207 | printf("MF1%c_%c (%s, %s):\n%G", f1suf, kindsuf, to_string(f1kind).c_str() 208 | , to_string(mkind).c_str(), f1val); 209 | if(prc || rec) 210 | printf(" (Prc: %G, Rec: %G)", prc, rec); 211 | fputc('\n', stdout); 212 | if(--outsnum || aggouts.tellp()) { 213 | if(aggouts.tellp()) 214 | aggouts << "; "; 215 | aggouts << "MF1" << f1suf << '_' << kindsuf << ": " << f1val; 216 | // Note: prc and rec are zeroized if the matching strategy does not support them 217 | if(prc || rec) 218 | aggouts << " (Prc: " << prc << ", Rec: " << rec << ')'; 219 | } 220 | } 221 | // Label clusters with the ground-truth clusters indices and output F1 for the labels if required 222 | if(args_info.label_given) { 223 | if(args_info.policy_arg == policy__NULL) { 224 | fputs("WARNING f1(), labels matching policy is not specified, the evaluation is skipped\n", stderr); 225 | return 0; 226 | } 227 | // Reset cluster counters if they were set (could be set only by F1) 228 | if(args_info.f1_given) { 229 | cn1.clearcounts(); 230 | cn2.clearcounts(); 231 | } 232 | const bool prob = args_info.policy_arg == policy_arg_partprob; // Partial Probabilities matching policy 233 | const bool weighted = !args_info.unweighted_flag; 234 | PrcRec pr = Collection::label(cn1, cn2 //, lostcls 235 | , prob, weighted, args_info.identifiers_arg); //, args_info.detailed_flag); 236 | // Note: each measure name should form a single world to be properly parsed in a uniform way (see Clubmark), 237 | // that is why doubled underscore is used rather than a single space. 
238 | printf("F1%c_%c__labels: %G (Prc: %G, Rec: %G)\n" 239 | , prob ? 'p' : 'h', weighted ? 'w' : 'u' 240 | , hmean(pr.prc, pr.rec), pr.prc, pr.rec); 241 | if(--outsnum || aggouts.tellp()) { 242 | if(aggouts.tellp()) 243 | aggouts << "; "; 244 | aggouts << "F1" << (prob ? 'p' : 'h') << '_' << (weighted ? 'w' : 'u') 245 | << "__labels: " << hmean(pr.prc, pr.rec) 246 | << " (Prc: " << pr.prc << ", Rec: " << pr.rec << ')'; 247 | } 248 | } 249 | if(args_info.omega_flag) { 250 | // Transform loaded and pre-processed collection to the representation 251 | // suitable for Omega Index evaluation 252 | RawClusters cls1; 253 | RawClusters cls2; 254 | NodeRClusters ndrcs; 255 | 256 | cn1.template transfer(cls1, ndrcs); 257 | cn2.template transfer(cls2, ndrcs); 258 | const auto oi = args_info.extended_flag 259 | ? omega(ndrcs, cls1, cls2) 260 | : omega(ndrcs, cls1, cls2) 261 | ; 262 | printf("OI%s:\n%G\n", args_info.extended_flag ? "x" : "", oi); 263 | if(--outsnum || aggouts.tellp()) { 264 | if(aggouts.tellp()) 265 | aggouts << "; "; 266 | aggouts << "OI" << (args_info.extended_flag ? "x" : "") << ": " << oi; 267 | } 268 | } 269 | if(aggouts.tellp()) 270 | puts(aggouts.str().c_str()); 271 | 272 | return 0; 273 | }; 274 | 275 | 276 | return args_info.ovp_flag ? process(AccProb()) : process(Id()); 277 | } 278 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016, Artem Lutov 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /shared/fileio.cpp: -------------------------------------------------------------------------------- 1 | //! \brief File IO utils 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! 
\date 2017-02-13 10 | 11 | #include 12 | #include // error_code 13 | //#include 14 | 15 | #ifdef __unix__ 16 | #include 17 | #endif // __unix__ 18 | 19 | #define INCLUDE_STL_FS 20 | #include "fileio.hpp" 21 | 22 | 23 | using std::error_code; 24 | using std::to_string; 25 | using fs::path; 26 | using fs::create_directories; 27 | using fs::is_directory; 28 | using fs::exists; 29 | using fs::status; 30 | using std::logic_error; 31 | using namespace daoc; 32 | 33 | // File IO Types definitions --------------------------------------------------- 34 | size_t NamedFileWrapper::size() const noexcept 35 | { 36 | size_t cmsbytes = -1; // Return -1 on error 37 | #ifdef __unix__ // sqrt(cmsbytes) lines => linebuf = max(4-8Kb, sqrt(cmsbytes) * 2) with dynamic realloc 38 | struct stat filest; 39 | int fd = fileno(m_file); 40 | if(fd != -1 && !fstat(fd, &filest)) 41 | return filest.st_size; 42 | #endif // __unix 43 | error_code err; 44 | cmsbytes = fs::file_size(m_name, err); 45 | if(cmsbytes == size_t(-1)) 46 | fprintf(stderr, "WARNING size(), file size evaluation failed: %s\n" 47 | , err.message().c_str()); 48 | 49 | // // Get length of the file 50 | // fseek(m_file, 0, SEEK_END); 51 | // cmsbytes = ftell(m_file); // The number of bytes in the input communities 52 | // if(cmsbytes == size_t(-1)) 53 | // perror("WARNING size(), file size evaluation failed"); 54 | // //fprintf(stderr, " %s: %lu bytes\n", fname, cmsbytes); 55 | // rewind(m_file); // Set position to the begin of the file 56 | 57 | return cmsbytes; 58 | } 59 | 60 | NamedFileWrapper& NamedFileWrapper::reset(const char* filename, const char* mode) 61 | { 62 | if(filename) { 63 | m_file.reset(fopen(filename, mode)); 64 | m_name = filename; 65 | } else m_file.reset(); 66 | return *this; 67 | } 68 | 69 | // File Reading Types ---------------------------------------------------------- 70 | StringBuffer::StringBuffer(size_t size) 71 | : StringBufferBase(size), m_cur(0), m_length(0) 72 | { 73 | if(size <= 2) 74 | size = 2; 
75 | *data() = 0; // Set first element to 0 76 | data()[size-2] = 0; // Set prelast reserved element to 0 77 | // Note: data()[size-1] is set to 0 automatically on file read if 78 | // the reading data size >= size - 1 bytes 79 | } 80 | 81 | void StringBuffer::reset(size_t size) 82 | { 83 | // Reset writing position 84 | m_cur = 0; 85 | m_length = 0; 86 | // Reset the buffer 87 | resize(size); // Note: can throw bad_alloc 88 | shrink_to_fit(); // Free reserved memory 89 | *data() = 0; // Set first element to 0 90 | data()[size-2] = 0; // Set prelast reserved element to 0 91 | // Note: data()[size-1] is set to 0 automatically on file read if 92 | // the reading data size >= size - 1 bytes 93 | } 94 | 95 | //size_t StringBuffer::length() const 96 | //#if VALIDATE < 2 97 | // noexcept 98 | //#endif // VALIDATE 99 | //{ 100 | //#if VALIDATE >= 2 101 | // const auto slen = strlen(data()); 102 | // if(m_length != slen) { 103 | //#if TRACE >= 2 104 | // fprintf(stderr, "length(), string: %s\n", data()); 105 | //#endif // TRACE 106 | // throw logic_error("ERROR length(), m_length (" + to_string(m_length) 107 | // + ") != actual string length (" + to_string(slen) + ")\n"); 108 | // } 109 | //#endif // VALIDATE 110 | // return m_length; 111 | //} 112 | 113 | bool StringBuffer::empty() const 114 | #if VALIDATE < 2 115 | noexcept 116 | #endif // VALIDATE 117 | { 118 | #if VALIDATE >= 2 119 | if((!front() || front() == '\n') && m_length >= 2) 120 | throw logic_error("ERROR empty(), m_length (" + to_string(m_length) 121 | + ") != actual string length (" + to_string(int(front() != 0)) + ")\n"); 122 | #endif // VALIDATE 123 | return !front() || front() == '\n'; 124 | } 125 | 126 | bool StringBuffer::readline(FILE* input) 127 | { 128 | #if VALIDATE >= 2 129 | assert(input && !m_cur 130 | && "readline(), valid file stream should be specified and have initial m_cur = 0"); 131 | #endif // VALIDATE 132 | *data() = 0; // Set first element to 0 as an initialization to have the empty 
string on errors 133 | const auto ibeg = ftell(input); 134 | // Read data from file until the string is read or an error occurs 135 | while(fgets(data() + m_cur, size() - m_cur, input) && data()[size()-2]) { 136 | #if TRACE >= 3 // Verified 137 | fprintf(stderr, "readline(), resizing buffer of %lu bytes, %lu pos: %s\n" 138 | , size(), m_cur, data()); 139 | #endif // TRACE 140 | m_cur = size() - 1; // Start overwriting ending '0' of the string 141 | resize(size() + (size() / (spagesize * 2) + 1) * spagesize); 142 | data()[size() - 2] = 0; // Set prelast element to 0 143 | } 144 | const auto iend = ftell(input); 145 | #if VALIDATE >= 2 146 | if(iend == -1 || ibeg == -1) 147 | perror("ERROR, file position reading error"); 148 | const size_t slen = strlen(data()); 149 | if(!((!m_cur || slen >= m_cur) && size_t(iend - ibeg) == slen)) { 150 | fprintf(stderr, "readline(), m_cur: %lu, slen: %lu, dpos: %li, str: %s\n" 151 | , m_cur, slen, iend - ibeg, data()); 152 | assert(0 && "readline(), string size validation failed"); 153 | } 154 | #endif // VALIDATE 155 | m_cur = 0; // Reset the writing (appending) position 156 | // Note: prelast and last elements of the buffer will be always zero 157 | 158 | // Set string length 159 | m_length = iend != -1 && ibeg != -1 ? 
iend - ibeg : strlen(data()); 160 | 161 | // Check for errors 162 | if((!m_length && feof(input)) || ferror(input)) { 163 | if(ferror(input)) 164 | perror("ERROR readline(), file reading error"); 165 | return false; // No more lines can be read 166 | } 167 | 168 | return true; // More lines can be read 169 | } 170 | 171 | // File I/O functions ---------------------------------------------------------- 172 | namespace daoc { 173 | 174 | void ensureDir(const string& dir) 175 | { 176 | #if TRACE >= 3 177 | fprintf(stderr, "ensureDir(), ensuring existence of: %s\n", dir.c_str()); 178 | #endif // TRACE 179 | // Check whether the output directory exists and create it otherwise 180 | path outdir = dir; 181 | if(!exists(outdir)) { 182 | error_code err; 183 | if(!create_directories(outdir, err)) 184 | // fputs(string("ERROR ensureDir(), target directory '").append(dir) 185 | // .append("' can't be created: ").append(err.message()) 186 | // .append("\n").c_str(), stderr); 187 | throw std::ios_base::failure(string("ERROR ensureDir(), target directory '") 188 | .append(dir).append("' can't be created: ") += err.message()); 189 | } else if(!is_directory(outdir)) 190 | // fputs(string("ERROR ensureDir(), target entry '").append(dir) 191 | // .append("' already exists as a non-directory path\n").c_str(), stderr); 192 | throw std::ios_base::failure(string("ERROR ensureDir(), target entry '").append(dir) 193 | += "' already exists as a non-directory path\n"); 194 | } 195 | 196 | void parseCnlHeader(NamedFileWrapper& fcls, StringBuffer& line, size_t& clsnum 197 | , size_t& ndsnum, [[maybe_unused]] bool verbose) 198 | { 199 | //! Parse count value 200 | //! 
\return - id value of 0 in case of parsing errors 201 | auto parseCount = []() noexcept -> size_t { 202 | char* tok = strtok(nullptr, " \t,"); // Note: the value can't be ended with ':' 203 | //errno = 0; 204 | const auto val = strtoul(tok, nullptr, 10); 205 | if(errno) 206 | perror(string("WARNING parseCount(), id value parsing error for the tok '") 207 | .append(tok).append("'").c_str()); 208 | return val; 209 | }; 210 | 211 | errno = 0; // Reset errno 212 | // Process the header, which is a special initial comment 213 | // The target header is: # Clusters: [,] Nodes: 214 | constexpr char clsmark[] = "clusters"; 215 | constexpr char ndsmark[] = "nodes"; 216 | constexpr char attrnameDelim[] = " \t:,"; 217 | #if TRACE >= 2 218 | size_t lnum = 0; // The number of lines read 219 | #endif // TRACE 220 | while(line.readline(fcls)) { 221 | #if TRACE >= 2 222 | ++lnum; 223 | #endif // TRACE 224 | // Skip empty lines 225 | if(line.empty()) 226 | continue; 227 | // Consider only subsequent comments 228 | if(line[0] != '#') 229 | break; 230 | 231 | // Tokenize the line 232 | char *tok = strtok(line + 1, attrnameDelim); // Note: +1 to skip the leading '#' 233 | // Skip comment without the string continuation and continuous comment 234 | if(!tok || tok[0] == '#') 235 | continue; 236 | uint8_t attrs = 0; // The number of read attributes 237 | do { 238 | // Lowercase the token 239 | for(char* pos = tok; *pos; ++pos) 240 | *pos = tolower(*pos); 241 | 242 | // Identify the attribute and read it's value 243 | if(!strcmp(tok, clsmark)) { 244 | clsnum = parseCount(); 245 | ++attrs; 246 | #if TRACE >= 2 247 | fprintf(stderr, "parseCnlHeader(), clusters: %lu\n", clsnum); 248 | #endif // TRACE 249 | } else if(!strcmp(tok, ndsmark)) { 250 | ndsnum = parseCount(); 251 | ++attrs; 252 | #if TRACE >= 2 253 | fprintf(stderr, "parseCnlHeader(), nodes: %lu\n", ndsnum); 254 | #endif // TRACE 255 | } else { 256 | #if TRACE >= 1 257 | #if TRACE < 2 258 | if(verbose) 259 | #endif // TRACE 2 260 | 
fprintf( 261 | #if TRACE >= 2 262 | stderr 263 | #else 264 | stdout 265 | #endif // TRACE 2 266 | , "WARNING parseCnlHeader(), the header parsing is omitted" 267 | " because of the unexpected attribute: %s\n", tok); 268 | #endif // TRACE 1 269 | break; 270 | } 271 | } while((tok = strtok(nullptr, attrnameDelim)) && attrs < 2); 272 | 273 | // Validate and correct the number of clusters if required 274 | // Note: it's better to reallocate a container a few times than too much overconsume the memory 275 | if(ndsnum && clsnum > ndsnum) { 276 | fprintf(stderr, "WARNING parseCnlHeader(), clsnum (%lu) typically should be" 277 | " less than ndsnum (%lu)\n", clsnum, ndsnum); 278 | clsnum = ndsnum; 279 | //assert(0 && "parseCnlHeader(), clsnum typically should be less than ndsnum"); 280 | } 281 | // Get following line for the unified subsequent processing 282 | line.readline(fcls); 283 | break; 284 | } 285 | #if TRACE >= 2 286 | fprintf(stderr, "parseCnlHeader(), processed %lu lines of '%s'\n" 287 | , lnum, fcls.name().c_str()); 288 | #endif // TRACE 289 | } 290 | 291 | size_t estimateCnlNodes(size_t filesize, float membership) noexcept 292 | { 293 | if(membership <= 0) { 294 | fprintf(stderr, "WARNING estimateCnlNodes(), invalid membership = %G specified" 295 | ", reseted to 1\n", membership); 296 | membership = 1; 297 | //throw invalid_argument("estimateCnlNodes(), membership = " 298 | // + to_string(membership) + " should be positive\n"); 299 | } 300 | 301 | size_t ndsnum = 0; // The estimated number of nodes 302 | if(filesize) { 303 | size_t magn = 10; // Decimal ids magnitude 304 | unsigned img = 1; // Index of the magnitude (10^1) 305 | size_t reminder = filesize % magn; // Reminder in bytes 306 | ndsnum = reminder / ++img; // img digits + 1 delimiter for each element 307 | while(filesize >= magn) { 308 | magn *= 10; 309 | ndsnum += (filesize - reminder) % magn / ++img; 310 | reminder = filesize % magn; 311 | } 312 | } 313 | return ndsnum / membership; 314 | } 315 | 
//! \brief Estimate the number of clusters from the number of nodes
//!
//! \param ndsnum size_t  - the number of nodes; 0 skips the estimation
//! \param membership float  - expected membership of the nodes, > 0;
//! 	non-positive values are reset to 1 with a warning
//! \return size_t  - the estimated number of clusters (0 if ndsnum is 0)
size_t estimateClusters(size_t ndsnum, float membership) noexcept
{
	if(membership <= 0) {
		fprintf(stderr, "WARNING estimateClusters(), invalid membership = %G specified"
			", reseted to 1\n", membership);
		membership = 1;
		//throw invalid_argument("estimateClusters(), membership = "
		//	+ to_string(membership) + " should be positive\n");
	}

	size_t clsnum = 0;  // The estimated number of clusters
	// Usually the number of clusters does not increase square root of the number of nodes
	// Note: do not estimate in case the number of nodes is not specified
	if(ndsnum)
		clsnum = sqrt(ndsnum * membership) + 1;  // Note: +1 to consider rounding down
	return clsnum;
}

} // daoc

--------------------------------------------------------------------------------
/src/interface_c.cpp:
--------------------------------------------------------------------------------
//! \brief Extrinsic measures evaluation interface implementation.
//!
//! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html
//! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0)
//!
//! Copyright (c)
//! \authr Artem Lutov
//! \email luart@ya.ru
//! \date 2021-03-11

#include  // move
#include
#include
#include
// For the template definitions
#include  // sqrt
#include  // sort
#include
#include "agghash.hpp"
#include "interface.hpp"
#include "interface_c.h"

using std::move;
using std::string;
using std::vector;
using std::unordered_set;

// Accessory routines ----------------------------------------------------------

// Note: a dedicated declaration id required to define default parameters
//! \brief Load collection from the provided raw collection
//!
\note This is an accessory routine for C API 33 | //! \pre All clusters in the collection are expected to be unique and not validated for 34 | //! the mutual match until makeunique is set; 35 | //! (reduce == (nodebase->ndsnum() < rcn.num)) || nodebase->ndsnum() == rcn.num 36 | //! 37 | //! \param rcn const ClusterCollection - raw collection of clusters 38 | //! \param makeunique=false bool - ensure that clusters contain unique members by 39 | //! removing the duplicates 40 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 41 | //! Used only for the node container preallocation to estimate the number of nodes 42 | //! if not specified in the file header 43 | //! \param ahash=nullptr AggHash* - resulting hash of the loaded 44 | //! member ids base (unique ids only are hashed, not all ids) if not nullptr 45 | //! \param const nodebase=nullptr NodeBaseI* - node base to filter-out or complement nodes if required 46 | //! \param reduce=false bool - whether to reduce collections by removing the non-matching nodes 47 | //! or extend collections by appending those nodes them to a single "noise" cluster 48 | //! \param lostcls=nullptr RawIds* - indices of the lost clusters during the node base 49 | //! synchronization 50 | //! \param verbose=false bool - print the number of loaded nodes to the stdout 51 | //! 
\return CollectionT - the collection is loaded successfully 52 | Collection loadCollection(const ClusterCollection rcn, bool makeunique=false 53 | , float membership=1, ::AggHash* ahash=nullptr, const NodeBaseI* nodebase=nullptr 54 | , bool reduce=false, RawIds* lostcls=nullptr, bool verbose=false); 55 | 56 | Collection loadCollection(const ClusterCollection rcn, bool makeunique, float membership 57 | , ::AggHash* ahash, const NodeBaseI* nodebase, bool reduce, RawIds* lostcls, bool verbose) 58 | { 59 | Collection cn; // Return using NRVO, named return value optimization 60 | 61 | #ifdef DEBUG 62 | // Note: asserts break libraries (=> may crash a webservice), and, hence, should be avoided in the release mode 63 | assert(((reduce == (nodebase->ndsnum() < rcn.num)) || nodebase->ndsnum() == rcn.num) 64 | && "Nodebase is not synced with the reduce argument"); 65 | #else 66 | if(!((reduce == (nodebase->ndsnum() < rcn.num)) || nodebase->ndsnum() == rcn.num)) { 67 | fprintf(stderr, "ERROR: loadCollection(). 
Nodebase is not synced with the reduce argument (reduce: %u, nodebase: %u, rcn: %u)\n" 68 | , reduce, nodebase->ndsnum(), rcn.num); 69 | return cn; 70 | } 71 | #endif // DEBUG 72 | 73 | if(!rcn.nodes) { 74 | fputs("WARNING loadCollection(), the empty input collection is omitted\n", stderr); 75 | return cn; 76 | } 77 | 78 | // Preallocate space for the clusters and nodes 79 | size_t nsnum = rcn.num * 2; // The (estimated) number of nodes 80 | if(cn.m_cls.capacity() < rcn.num) // * cn.m_cls.max_load_factor() 81 | cn.m_cls.reserve(rcn.num); 82 | if(cn.m_ndcs.bucket_count() * cn.m_ndcs.max_load_factor() < nsnum) 83 | cn.m_ndcs.reserve(nsnum); 84 | 85 | // Load clusters 86 | #if TRACE >= 2 87 | fprintf(stderr, "loadCollection(), expected %lu clusters, %lu nodes from %u raw node relations\n" 88 | , rcn.num, nsnum, rcn.num); 89 | if(nodebase) 90 | fprintf(stderr, "loadCollection(), nodebase provided with %u nodes\n", nodebase->ndsnum()); 91 | #endif // TRACE 92 | 93 | // Parse clusters 94 | ClusterHolder chd(new Cluster()); 95 | for(NodeId i = 0; i < rcn.num; ++i) { 96 | Cluster* const pcl = chd.get(); 97 | auto& members = pcl->members; 98 | const auto& ndrels = rcn.nodes[i]; 99 | members.reserve(ndrels.num); 100 | for(NodeId j = 0; j < ndrels.num; ++j) { 101 | #ifdef DEBUG 102 | assert(ndrels.ids && "Invalid (non-allocated) node relations"); 103 | #else 104 | if(!ndrels.ids) { 105 | fputs("ERROR: loadCollection(). 
Invalid (non-allocated) node relations\n", stderr); 106 | cn = Collection(); 107 | return cn; 108 | } 109 | #endif // DEBUG 110 | const auto did = ndrels.ids[j]; 111 | // Filter out nodes if required 112 | if(nodebase && reduce && !nodebase->nodeExists(did)) 113 | continue; 114 | members.push_back(did); 115 | auto& ncs = cn.m_ndcs[did]; 116 | ncs.push_back(pcl); 117 | } 118 | if(!members.empty()) { 119 | members.shrink_to_fit(); // Free over reserved space 120 | if(makeunique) { 121 | // Ensure or validate that members are unique 122 | std::sort(members.begin(), members.end()); 123 | const auto im = unique(members.begin(), members.end()); 124 | //const auto im = adjacent_find(members.begin(), members.end()); 125 | if(im != members.end()) { 126 | fprintf(stderr, "WARNING loadCollection(), #%lu cluster contained %lu duplicated members, corrected.\n" 127 | , cn.m_cls.size(), distance(im, members.end())); 128 | // Remove associated clusters 129 | for(auto jm = im; jm != members.end(); ++jm) 130 | cn.m_ndcs[*jm].pop_back(); 131 | // Remove the tail of duplicated node ids 132 | members.erase(im, members.end()); 133 | //fprintf(stderr, "WARNING loadCollection(), #%lu cluster contains duplicated member #%lu: %u\n" 134 | // , cn.m_cls.size(), distance(members.begin(), im), *im); 135 | //throw invalid_argument("loadCollection(), the cluster contains duplicated members\n"); 136 | } 137 | } 138 | members.shrink_to_fit(); // Free over reserved space 139 | //for(auto v: members) 140 | // printf(" %u", v); 141 | //puts(""); 142 | cn.m_cls.push_back(chd.release()); 143 | // Start filling a new cluster 144 | chd.reset(new Cluster()); 145 | } else if(lostcls) 146 | lostcls->push_back(lostcls->size() + cn.m_cls.size()); 147 | } 148 | 149 | // Extend collection with a single "noise" cluster containing missed nodes if required 150 | if(nodebase && !reduce && cn.m_ndcs.size() < nodebase->ndsnum()) { 151 | // Fetch complementary nodes 152 | RawIds nids; 153 | 
nids.reserve(nodebase->ndsnum() - cn.m_ndcs.size()); 154 | for(auto nid: nodebase->nodes()) 155 | if(!cn.m_ndcs.count(nid)) 156 | nids.push_back(nid); 157 | // Add complementary nodes to the 158 | Cluster* const pcl = chd.get(); 159 | pcl->members.insert(pcl->members.end(), nids.begin(), nids.end()); 160 | for(auto nid: nids) 161 | cn.m_ndcs[nid].push_back(pcl); 162 | cn.m_cls.push_back(chd.release()); 163 | } 164 | 165 | // Save some space if it is essential 166 | if(cn.m_cls.size() < cn.m_cls.capacity() / 2) 167 | cn.m_cls.shrink_to_fit(); 168 | // Rehash the clusters and nodes for faster traversing if required 169 | //if(cn.m_cls.size() < cn.m_cls.bucket_count() * cn.m_cls.max_load_factor() / 2) 170 | // cn.m_cls.reserve(cn.m_cls.size()); 171 | if(cn.m_ndcs.size() < cn.m_ndcs.bucket_count() * cn.m_ndcs.max_load_factor() / 2) 172 | cn.m_ndcs.reserve(cn.m_ndcs.size()); 173 | 174 | // Evaluate the node hash 175 | ::AggHash mbhash; // Nodes hash (only unique nodes, not all the members) 176 | for(const auto& ndcl: cn.m_ndcs) 177 | mbhash.add(ndcl.first); 178 | // Assign hash to the results 179 | cn.m_ndshash = mbhash.hash(); // Note: required to identify the unequal node base in the processing collections 180 | if(ahash) 181 | *ahash = move(mbhash); 182 | #if TRACE >= 2 183 | printf("loadCollection(), loaded %lu clusters (capacity: %lu, overhead: %0.2f %%) and" 184 | " %lu nodes (reserved %lu buckets, overhead: %0.2f %%) with hash %lu from %u raw node relations\n" 185 | , cn.m_cls.size(), cn.m_cls.capacity() 186 | , cn.m_cls.size() ? float(cn.m_cls.capacity() - cn.m_cls.size()) / cn.m_cls.size() * 100 187 | : numeric_limits::infinity() 188 | , cn.m_ndcs.size(), cn.m_ndcs.bucket_count() 189 | , cn.m_ndcs.size() ? 
float(cn.m_ndcs.bucket_count() - cn.m_ndcs.size()) / cn.m_ndcs.size() * 100 190 | : numeric_limits::infinity() 191 | , cn.m_ndshash, rcn.num); 192 | #elif TRACE >= 1 193 | if(verbose) 194 | printf("loadCollection(), loaded %lu clusters %lu nodes from %u raw node relations\n", cn.m_cls.size() 195 | , cn.m_ndcs.size(), rcn.num); 196 | #endif 197 | 198 | return cn; 199 | } 200 | 201 | /// \brief Fetch nodes from the raw collection of clusters 202 | /// 203 | /// \param cn const ClusterCollection - raw collection of clusters 204 | /// \return UniqIds - cluster nodes 205 | UniqIds fetchNodes(const ClusterCollection cn) 206 | { 207 | UniqIds nodes; // Uses NRVO return value optimization 208 | nodes.reserve(cn.num * 2); 209 | 210 | if(cn.nodes) { 211 | for(NodeId i = 0; i < cn.num; ++i) { 212 | const auto& ndrs = cn.nodes[i]; 213 | if(!ndrs.ids) { 214 | fprintf(stderr, "WARNING %s(), the empty node ids (nominally: %u ids) is omitted\n", __FUNCTION__, ndrs.num); 215 | continue; 216 | } 217 | for(NodeId j = 0; j < ndrs.num; ++j) 218 | nodes.insert(nodes.end(), ndrs.ids[j]); 219 | } 220 | } else fprintf(stderr, "WARNING %s(), the empty input collection (nominally: %u nodes) is omitted\n", __FUNCTION__, cn.num); 221 | 222 | return nodes; 223 | } 224 | 225 | /// \brief Fetch nodebase from collection of clusters, reduced (intersection) or extended (union) one 226 | /// 227 | /// \param cn1 ClusterCollection const - first raw collection of clusters 228 | /// \param cn2 ClusterCollection const - second raw collection of clusters 229 | /// \param reduced=false bool - whether reduce or extend tham 230 | /// \return NodeBase - resulting nodebase 231 | NodeBase fetchNodebase(const ClusterCollection cn1, const ClusterCollection cn2, bool reduced=false) 232 | { 233 | NodeBase nodes; // Uses NRVO return value optimization 234 | if(reduced) { 235 | UniqIds nds1 = fetchNodes(cn1); 236 | UniqIds nds2 = fetchNodes(cn2); 237 | nodes.reserve(abs(static_cast(nds1.size()) - 
static_cast(nds2.size()))); 238 | for(auto nid: nds1) 239 | if(!nds2.count(nid)) 240 | nodes.insert(nodes.end(), nid); 241 | for(auto nid: nds2) 242 | if(!nds1.count(nid)) 243 | nodes.insert(nodes.end(), nid); 244 | #if VALIDATE >= 2 245 | assert((nodes.ndsnum() <= min(nds1.size(), nds2.size())) && "Unexpected size of resulting nodes"); 246 | #endif // VALIDATE 247 | } else for(const auto& cn: {cn1, cn2}) { 248 | const auto partnds = fetchNodes(cn); 249 | nodes.insert(partnds.begin(), partnds.end()); 250 | } 251 | return nodes; 252 | } 253 | 254 | // Interface implementation ---------------------------------------------------- 255 | Probability f1p(const ClusterCollection cn1, const ClusterCollection cn2) 256 | { 257 | return f1(cn1, cn2, F1_PARTPROB, nullptr, nullptr); 258 | } 259 | 260 | Probability f1h(const ClusterCollection cn1, const ClusterCollection cn2) 261 | { 262 | return f1(cn1, cn2, F1_HARMONIC, nullptr, nullptr); 263 | } 264 | 265 | Probability f1(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 266 | , Probability* rec, Probability* prc) 267 | { 268 | Probability tmp; // Temporary buffer, a placeholder 269 | return f1x(cn1, cn2, kind, rec ? rec : &tmp, prc ? 
prc : &tmp, MATCH_WEIGHTED, 1, 1, 0); 270 | } 271 | 272 | Probability f1x(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 273 | , Probability* rec, Probability* prc, MatchKind mkind, uint8_t sync, uint8_t makeunique, uint8_t verbose) 274 | { 275 | #if TRACE >= 2 276 | if(verbose) 277 | printf("%s(), loading clustering collections of size: %u, %u\n", __FUNCTION__ 278 | , cn1.num, cn2.num); 279 | #endif // TRACE 280 | assert(rec && prc && "Invalid output arguments"); 281 | // Load nodes 282 | const bool reduce = false; // Whether to reduce or extend collections of clusters 283 | Probability res = 0; 284 | if(sync) { 285 | NodeBase ndbase = fetchNodebase(cn1, cn2, reduce); 286 | Collection c1 = loadCollection(cn1, makeunique, 1, nullptr, &ndbase, reduce); 287 | Collection c2 = loadCollection(cn2, makeunique, 1, nullptr, &ndbase, reduce); 288 | res = Collection::f1(c1, c2, static_cast(kind), *rec, *prc, static_cast(mkind), verbose); 289 | } else { 290 | Collection c1 = loadCollection(cn1); 291 | Collection c2 = loadCollection(cn2); 292 | res = Collection::f1(c1, c2, static_cast(kind), *rec, *prc, static_cast(mkind), verbose); 293 | } 294 | return res; 295 | } 296 | 297 | Probability omega(const ClusterCollection cn1, const ClusterCollection cn2) 298 | { 299 | return omegax(cn1, cn2, 0, 1, 1); 300 | } 301 | 302 | Probability omegaExt(const ClusterCollection cn1, const ClusterCollection cn2) 303 | { 304 | return omegax(cn1, cn2, 1, 1, 1); 305 | } 306 | 307 | Probability omegax(const ClusterCollection cn1, const ClusterCollection cn2, uint8_t ext, uint8_t sync, uint8_t makeunique) 308 | { 309 | // Transform loaded and pre-processed collection to the representation 310 | // suitable for Omega Index evaluation 311 | RawClusters cls1; 312 | RawClusters cls2; 313 | NodeRClusters ndrcs; 314 | 315 | const bool reduce = false; // Whether to reduce or expand collections of clusters 316 | if(sync) { 317 | NodeBase ndbase = fetchNodebase(cn1, cn2, reduce); 318 
| Collection c1 = loadCollection(cn1, makeunique, 1, nullptr, &ndbase, reduce); 319 | Collection c2 = loadCollection(cn2, makeunique, 1, nullptr, &ndbase, reduce); 320 | c1.template transfer(cls1, ndrcs); 321 | c2.template transfer(cls2, ndrcs); 322 | } else { 323 | Collection c1 = loadCollection(cn1); 324 | Collection c2 = loadCollection(cn2); 325 | c1.template transfer(cls1, ndrcs); 326 | c2.template transfer(cls2, ndrcs); 327 | } 328 | return ext ? omega(ndrcs, cls1, cls2) 329 | : omega(ndrcs, cls1, cls2); 330 | } 331 | -------------------------------------------------------------------------------- /shared/fileio.hpp: -------------------------------------------------------------------------------- 1 | //! \brief File IO utils 2 | //! 3 | //! Interface macro definitions: 4 | //! INCLUDE_STL_FS - include STL filesystem library under fs namespace. This macros is 5 | //! defined to avoid repetitive conditional inclusion of the STL FS. 6 | //! 7 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 8 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 9 | //! 10 | //! Copyright (c) 11 | //! \authr Artem Lutov 12 | //! \email luart@ya.ru 13 | //! \date 2017-02-13 14 | 15 | #ifndef FILEIO_H 16 | #define FILEIO_H 17 | 18 | #include // uintX_t 19 | #include // FILE 20 | #include // move 21 | #include 22 | #include 23 | #include 24 | // For the template definitions 25 | #include // strtok 26 | #include // sqrt 27 | 28 | #ifdef INCLUDE_STL_FS 29 | #if defined(__has_include) && __has_include() && __cplusplus >= 201703L // C++17+ 30 | #include 31 | namespace fs = std::filesystem; 32 | #elif defined(__has_include) && __has_include() 33 | #include 34 | namespace fs = std::experimental::filesystem; 35 | #else 36 | #error "STL filesystem is not available. The native alternative is not implemented." 
37 | #endif // __has_include 38 | #endif // INCLUDE_STL_FS 39 | 40 | #include "agghash.hpp" 41 | 42 | //#include "types.h" 43 | 44 | 45 | namespace daoc { 46 | 47 | using std::move; 48 | using std::string; 49 | using std::vector; 50 | using std::unordered_set; 51 | 52 | // File Wrapping Types --------------------------------------------------------- 53 | //! \brief Wrapper around the FILE* to prevent hanging file descriptors 54 | class FileWrapper { 55 | FILE* m_dsc; 56 | bool m_tidy; 57 | public: 58 | //! \brief Constructor 59 | //! 60 | //! \param fd FILE* - the file descriptor to be held 61 | //! \param cleanup=true bool - close the file descriptor on destruction 62 | //! (typically false if stdin/out is supplied) 63 | FileWrapper(FILE* fd=nullptr, bool cleanup=true) noexcept 64 | : m_dsc(fd), m_tidy(cleanup) {} 65 | 66 | //! \brief Copy constructor 67 | //! \note Any file descriptor should have a single owner 68 | FileWrapper(const FileWrapper&)=delete; 69 | 70 | //! \brief Move constructor 71 | // ATTENTION: fw.m_dsc is not set to nullptr by the default move operation 72 | // ATTENTION: std::vector will move their elements if the elements' move constructor 73 | // is noexcept, and copy otherwise (unless the copy constructor is not accessible) 74 | FileWrapper(FileWrapper&& fw) noexcept 75 | : FileWrapper(fw.m_dsc, fw.m_tidy) 76 | { 77 | fw.m_dsc = nullptr; 78 | } 79 | 80 | //! \brief Copy assignment 81 | //! \note Any file descriptor should have the single owner 82 | FileWrapper& operator= (const FileWrapper&)=delete; 83 | 84 | //! \brief Move assignment 85 | // ATTENTION: fw.m_dsc is not set to nullptr by the default move operation 86 | FileWrapper& operator= (FileWrapper&& fw) noexcept 87 | { 88 | reset(fw.m_dsc, fw.m_tidy); 89 | fw.m_dsc = nullptr; 90 | return *this; 91 | } 92 | 93 | //! \brief Destructor 94 | ~FileWrapper() // noexcept by default 95 | { 96 | if(m_dsc && m_tidy) { 97 | fclose(m_dsc); 98 | m_dsc = nullptr; 99 | } 100 | } 101 | 102 | //! 
\brief Implicit conversion to the file descriptor 103 | //! 104 | //! \return FILE* - self as a file descriptor 105 | operator FILE*() const noexcept { return m_dsc; } 106 | 107 | //! \brief Reset the wrapper 108 | //! 109 | //! \param fd FILE* - the file descriptor to be held 110 | //! \param cleanup=true bool - close the file descriptor on destruction 111 | //! (typically false if stdin/out is supplied) 112 | //! \return void 113 | void reset(FILE* fd=nullptr, bool cleanup=true) noexcept 114 | { 115 | if(m_dsc && m_tidy && m_dsc != fd) 116 | fclose(m_dsc); 117 | m_dsc = fd; 118 | m_tidy = cleanup; 119 | } 120 | 121 | //! \brief Release ownership of the holding file 122 | //! 123 | //! \return FILE* - file descriptor 124 | FILE* release() noexcept 125 | { 126 | auto fd = m_dsc; 127 | m_dsc = nullptr; 128 | return fd; 129 | } 130 | }; 131 | 132 | //! \brief Wrapper around the FILE* that holds also the filename giving ability 133 | //! to reopen it and perform meaningful 134 | // Note: we can't inherit from the FileWrapper because semantic of reset differs 135 | class NamedFileWrapper { 136 | FileWrapper m_file; //!< File descriptor 137 | string m_name; //!< File name 138 | public: 139 | //! \brief Default Constructor 140 | // Note: Required tor return empty objects using NRVO optimization 141 | NamedFileWrapper() noexcept: m_file(), m_name() {} 142 | 143 | //! \brief Constructor 144 | //! \pre Parent directory must exists 145 | //! 146 | //! \param filename const char* - new file name to be opened 147 | //! \param mode const char* - opening mode, the same as fopen() has 148 | NamedFileWrapper(const char* filename, const char* mode) 149 | : m_file(filename && mode ? fopen(filename, mode) : nullptr) 150 | , m_name(filename ? filename : "") {} 151 | 152 | //! \brief Copy constructor 153 | //! \note Any file descriptor should have a single owner 154 | NamedFileWrapper(const NamedFileWrapper&)=delete; 155 | 156 | //! 
\brief Move constructor 157 | // ATTENTION: std::vector will move their elements if the elements' move constructor 158 | // is noexcept, and copy otherwise (unless the copy constructor is not accessible) 159 | NamedFileWrapper(NamedFileWrapper&& fw) noexcept 160 | : m_file(move(fw.m_file)), m_name(move(fw.m_name)) {} 161 | 162 | //! \brief Copy assignment 163 | //! \note Any file descriptor should have the single owner 164 | NamedFileWrapper& operator= (const NamedFileWrapper&)=delete; 165 | 166 | //! \brief Move assignment 167 | NamedFileWrapper& operator= (NamedFileWrapper&& fw) noexcept 168 | { 169 | m_file = move(fw.m_file); 170 | m_name = move(fw.m_name); 171 | return *this; 172 | } 173 | 174 | //! \brief File name 175 | //! 176 | //! \return const string& - file name 177 | const string& name() const noexcept { return m_name; } 178 | 179 | //! \brief File size 180 | //! 181 | //! \return size_t - file size or -1 on error 182 | size_t size() const noexcept; 183 | 184 | //! \brief Implicit conversion to the file descriptor 185 | //! 186 | //! \return FILE* - file descriptor 187 | operator FILE*() const noexcept { return m_file; } 188 | 189 | //! \brief Reopen the file under another mode 190 | //! 191 | //! \param mode const char* - the mode of operations, the same as in fopen() 192 | //! \return NamedFileWrapper& - the reopened file or closed (if can't be opened) 193 | NamedFileWrapper& reopen(const char* mode) 194 | { 195 | m_file.reset(freopen(nullptr, mode, m_file)); // m_name.c_str() 196 | return *this; 197 | } 198 | 199 | //! \brief Reset the file, closes current file and opens another one 200 | //! \pre Parent directory must exists 201 | //! 202 | //! \param filename const char* - new file name to be opened 203 | //! \param mode const char* - opening mode, the same as fopen() has 204 | //! \return NamedFileWrapper& - the newly opened file or just the old one closed 205 | NamedFileWrapper& reset(const char* filename, const char* mode); 206 | 207 | //! 
\brief Release ownership of the holding file 208 | //! 209 | //! \return FILE* - file descriptor 210 | FILE* release() noexcept { return m_file.release(); } 211 | }; 212 | 213 | // File Reading Types ---------------------------------------------------------- 214 | //! \brief Base of the StringBuffer 215 | using StringBufferBase = vector; 216 | 217 | //! \brief String buffer to real file by lines using c-strings 218 | //! \note The last symbol in the string is always set to 0 automatically 219 | class StringBuffer: protected StringBufferBase { 220 | constexpr static size_t spagesize = 4096; // Small page size on x64 221 | 222 | size_t m_cur; //! Current position for the writing 223 | size_t m_length; //! Current length of the holding c-string 224 | //protected: 225 | // StringBufferBase::size(); 226 | public: 227 | //! \brief 228 | //! \post the allocated buffer will have size >= 2 229 | //! 230 | //! \param size=spagesize size_t - size of the buffer 231 | // Note: can throw bad_alloc 232 | StringBuffer(size_t size=spagesize); 233 | 234 | //! \brief Reset the string and it's shrink the allocated buffer 235 | //! 236 | //! \param size=spagesize size_t - new initial size of the string buffer 237 | //! \return void 238 | void reset(size_t size=spagesize); 239 | 240 | //! \brief Length of the string including the terminating '\n' if present, 241 | //! but without the terminating '0' 242 | //! 243 | //! \return size_t - length of the holding c-string without the null terminator 244 | size_t length() const noexcept { return m_length; } 245 | 246 | //! \brief Whether the string is empty or starts with the newline symbol 247 | //! \attention empty() is true for '\n' when length() == 1 248 | //! 249 | //! \return bool - the string is empty or starts with the '\n' 250 | bool empty() const 251 | #if VALIDATE < 2 252 | noexcept 253 | #endif // VALIDATE 254 | ; 255 | 256 | //! 
\brief C-string including '\n' if it was present in the file 257 | operator char*() noexcept { return data(); } 258 | 259 | //! \brief Const C-string including '\n' if it was present in the file 260 | operator const char*() const noexcept { return data(); } 261 | 262 | //! \brief Make public indexing operators 263 | using StringBufferBase::operator[]; 264 | using StringBufferBase::at; 265 | 266 | //! \brief Read line from the file and store including the terminating '\n' symbol 267 | //! \attention The read string contains the trailing '\n' if exist in the file 268 | //! \note The buffer might contain [part of] the read line on reading error 269 | //! 270 | //! \param input FILE* - processing file 271 | //! \return bool - whether the current line is read without any errors or 272 | //! the all lines already read (and the current one is empty) 273 | bool readline(FILE* input); 274 | }; 275 | 276 | #ifndef NO_FILEIO // Turn off file I/O 277 | 278 | // File I/O functions declaration ---------------------------------------------- 279 | //! \brief Ensure existence of the specified directory 280 | //! 281 | //! \param dir const string& - directory to be created if has not existed 282 | //! \return void 283 | void ensureDir(const string& dir); 284 | 285 | //! \brief Parse the header of CNL file and validate the results 286 | //! \post clsnum <= ndsnum if ndsnum > 0. 0 means not specified 287 | //! 288 | //! \param fcls NamedFileWrapper& - the reading file 289 | //! \param line StringBuffer& - processing line (string, header) being read from the file 290 | //! \param[out] clsnum size_t& - resulting number of clusters if specified, 0 in case of parsing errors 291 | //! \param[out] ndsnum size_t& - resulting number of nodes if specified, 0 in case of parsing errors 292 | //! \param verbose=false bool - print information about the header parsing issue to the stdout 293 | //! 
\return void 294 | void parseCnlHeader(NamedFileWrapper& fcls, StringBuffer& line, size_t& clsnum 295 | , size_t& ndsnum, bool verbose=false); 296 | 297 | //! \brief Load all unique nodes from the CNL file with optional filtering by the cluster size 298 | //! 299 | //! \tparam Id - Node id type 300 | //! \tparam AccId - Accumulated node ids type 301 | //! 302 | //! \param file NamedFileWrapper& - input collection of clusters in the CNL format 303 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 304 | //! Used only for the node container preallocation to estimate the number of nodes 305 | //! if not specified in the file header 306 | //! \param ahash=nullptr AggHash* - resulting aggregated hash of the loaded 307 | //! node ids if not nullptr 308 | //! \param cmin=0 size_t - min allowed cluster size 309 | //! \param cmax=0 size_t - max allowed cluster size, 0 means any size 310 | //! \param verbose=true bool - print the number of loaded nodes to the stdout 311 | //! \return unordered_set - the loaded collection of nodes 312 | template 313 | unordered_set loadNodes(NamedFileWrapper& file, float membership=1 314 | , AggHash* ahash=nullptr, size_t cmin=0, size_t cmax=0, bool verbose=true); 315 | 316 | //! \brief Estimate the number of nodes from the CNL file size 317 | //! 318 | //! \param filesize size_t - the number of bytes in the CNL file 319 | //! \param membership=1.f float - average membership of the node, 320 | //! > 0, typically ~= 1 321 | //! \return size_t - estimated number of nodes 322 | size_t estimateCnlNodes(size_t filesize, float membership=1.f) noexcept; 323 | 324 | //! \brief Estimate the number of clusters from the number of nodes 325 | //! 326 | //! \param ndsnum size_t - the number of nodes 327 | //! \param membership=1.f float - average membership of the node, 328 | //! > 0, typically ~= 1 329 | //! 
\return size_t - estimated number of clusters 330 | size_t estimateClusters(size_t ndsnum, float membership=1.f) noexcept; 331 | 332 | //! \brief Convert value to yes/no c-string 333 | //! 334 | //! \param val bool - value to be converted 335 | //! \return constexpr const char* - resulting c-string 336 | constexpr const char* toYesNo(bool val) noexcept { return val ? "yes" : "no"; } 337 | 338 | // File I/O templates definition ----------------------------------------------- 339 | template 340 | unordered_set loadNodes(NamedFileWrapper& file, float membership 341 | , AggHash* ahash, size_t cmin, size_t cmax, bool verbose) 342 | { 343 | unordered_set nodebase; // Node base; Note: returned using NRVO optimization 344 | 345 | if(!file) 346 | return nodebase; 347 | 348 | // Note: CNL [CSN] format only is supported 349 | size_t clsnum = 0; // The number of clusters 350 | size_t ndsnum = 0; // The number of nodes 351 | 352 | // Note: strings defined out of the cycle to avoid reallocations 353 | StringBuffer line; // Reading line 354 | // Parse header and read the number of clusters if specified 355 | // Note: line includes terminating '\n' 356 | parseCnlHeader(file, line, clsnum, ndsnum, verbose); 357 | 358 | // Estimate the number of nodes in the file if not specified 359 | if(!ndsnum) { 360 | size_t cmsbytes = file.size(); 361 | if(cmsbytes != size_t(-1)) // File length fetching failed 362 | ndsnum = estimateCnlNodes(cmsbytes, membership); 363 | else if(clsnum) 364 | ndsnum = 2 * clsnum; // / membership; // Note: use optimistic estimate instead of pessimistic (square / membership) to not overuse the memory 365 | #if TRACE >= 2 366 | fprintf(stderr, "loadNodes(), estimated %lu nodes\n", ndsnum); 367 | #endif // TRACE 368 | } 369 | #if TRACE >= 2 370 | else fprintf(stderr, "loadNodes(), specified %lu nodes\n", ndsnum); 371 | #endif // TRACE 372 | 373 | // Preallocate space for nodes 374 | if(ndsnum) 375 | nodebase.reserve(ndsnum); 376 | 377 | // Load clusters 378 | // 
ATTENTION: without '\n' delimiter the terminating '\n' is read as an item 379 | constexpr char mbdelim[] = " \t\n"; // Delimiter for the members 380 | vector cnds; // Cluster nodes. Note: a dedicated container is required to filter clusters by size 381 | cnds.reserve(sqrt(ndsnum)); // Note: typically cluster size does not increase the square root of the number of nodes 382 | #if TRACE >= 2 383 | size_t totmbs = 0; // The number of read member nodes from the file including repetitions 384 | size_t fclsnum = 0; // The number of read clusters from the file 385 | #endif // TRACE 386 | do { 387 | #if TRACE >= 3 388 | // Note: line includes terminating '\n' 389 | fprintf(stderr, "%lu> %s", fclsnum, static_cast(line)); 390 | #endif // TRACE 391 | char *tok = strtok(line, mbdelim); // const_cast(line.data()) 392 | 393 | // Skip comments 394 | if(!tok || tok[0] == '#') 395 | continue; 396 | // Skip the cluster id if present 397 | if(tok[strlen(tok) - 1] == '>') { 398 | const char* cidstr = tok; 399 | tok = strtok(nullptr, mbdelim); 400 | // Skip empty clusters, which actually should not exist 401 | if(!tok) { 402 | fprintf(stderr, "WARNING loadNodes(), empty cluster" 403 | " exists: '%s', skipped\n", cidstr); 404 | continue; 405 | } 406 | } 407 | do { 408 | // Note: only node id is parsed, share part is skipped if exists, 409 | // but potentially can be considered in NMI and F1 evaluation. 410 | // In the latter case abs diff of shares instead of co occurrence 411 | // counting should be performed. 
412 | Id nid = strtoul(tok, nullptr, 10); 413 | #if VALIDATE >= 2 414 | if(!nid && tok[0] != '0') { 415 | fprintf(stderr, "WARNING loadNodes(), conversion error of '%s' into 0: %s\n" 416 | , tok, strerror(errno)); 417 | continue; 418 | } 419 | #endif // VALIDATE 420 | #if TRACE >= 2 421 | ++totmbs; // Update the total number of read members 422 | #endif // TRACE 423 | cnds.push_back(nid); 424 | } while((tok = strtok(nullptr, mbdelim))); 425 | #if TRACE >= 2 426 | ++fclsnum; // The number of valid read lines, i.e. clusters 427 | #endif // TRACE 428 | 429 | // Filter read cluster by size 430 | if(cnds.size() >= cmin && (!cmax || cnds.size() <= cmax)) 431 | nodebase.insert(cnds.begin(), cnds.end()); 432 | // Prepare outer vars for the next iteration 433 | cnds.clear(); 434 | } while(line.readline(file)); 435 | // // Rehash the nodes decreasing the allocated space if required 436 | // if(nodebase.size() <= nodebase.bucket_count() * nodebase.max_load_factor() / 3) 437 | // nodebase.reserve(nodebase.size()); 438 | #if TRACE >= 2 439 | printf("loadNodes(), the loaded base has %lu nodes from the input %lu members of %lu clusters\n" 440 | , nodebase.size(), totmbs, fclsnum); 441 | #else 442 | if(verbose) 443 | printf("loadNodes(), nodebase nodes loaded: %lu\n", nodebase.size()); 444 | #endif // TRACE 2 445 | 446 | // Evaluate nodes hash if required 447 | if(ahash && nodebase.size()) { 448 | AggHash ndsh; 449 | for(auto nid: nodebase) 450 | ndsh.add(nid); 451 | *ahash = move(ndsh); 452 | } 453 | 454 | return nodebase; 455 | } 456 | 457 | #endif // NO_FILEIO 458 | 459 | } // daoc 460 | 461 | #endif // FILEIO_H 462 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xmeasures - Extrinsic Clustering Measures 2 | Extremely fast evaluation of accuracy (extrinsic quality) measures for the [overlapping/fuzzy] clusterings (collections of groups of items): 
3 | family of **[mean] F1 measures** (including Average F1-Score) and **Omega Index** *(fuzzy version of the Adjusted Rand Index)* for overlapping multi-resolution clusterings with unequal node base (and optional node base synchronization) using various matching policies (micro, macro and combined weighting), 4 | and standard **NMI** for non-overlapping clustering on a single resolution. `xmeasures` also provides cluster labeling with the indices of the ground-truth clusters considering 1:n match and evaluating F1, precision and recall of the labeled clusters. 5 | 6 | `xmeasures` evaluates F1 and NMI for collections of hundreds of thousands of [overlapping] clusters (covers, communities) within a dozen seconds on an ordinary laptop using a single CPU core. The computational time is O(N) 7 | unlike O(N \* C) 8 | of the existing state-of-the-art implementations, where N is the number of nodes in the network and C is the number of clusters. Computational complexity for Omega Index is standard and equals O(N^2 \* s/2), where s is the average sharing ratio (membership) of the nodes, typically s -> 1. 9 | `xmeasures` is one of the utilities designed for the [PyCaBeM](https://github.com/eXascaleInfolab/PyCABeM) clustering benchmark to evaluate clusterings of large networks. 
10 | 11 | `The paper:` [Accuracy Evaluation of Overlapping and Multi-resolution Clustering Algorithms on Large Datasets](https://github.com/eXascaleInfolab/xmeasures/blob/master/docs/xmeasures.pdf) 12 | ```bibtex 13 | @inproceedings{Xms19, 14 | author={Artem Lutov and Mourad Khayati and Philippe Cudr{\'e}-Mauroux}, 15 | title={Accuracy Evaluation of Overlapping and Multi-resolution Clustering Algorithms on Large Datasets}, 16 | booktitle={6th IEEE International Conference on Big Data and Smart Computing (BigComp 2019)}, 17 | year={2019}, 18 | keywords={accuracy metrics, overlapping community evaluation, multi-resolution clustering evaluation, Generalized NMI, Omega Index, MF1, similarity of collections of sets} 19 | } 20 | ``` 21 | 22 | Related papers about the implemented measures: 23 | - [Omega Index](http://dx.doi.org/10.1207/s15327906mbr2302_6) ([fuzzy version of the Adjusted Rand Index](http://iopscience.iop.org/article/10.1088/1742-5468/2011/02/P02017/meta)), which equal to ARI when applied for the non-overlapping clusterings; 24 | - Mean F1 measures: [F1a (Average F1-Score)](https://cs.stanford.edu/people/jure/pubs/bigclam-wsdm13.pdf), F1p is much more indicative and discriminative than the presented there F1a but the respective paper has not been published yet; 25 | - [NMI measure](http://www.jmlr.org/papers/volume11/vinh10a/vinh10a.pdf). 26 | > Standard NMI is implemented considering overlapping and multi-resolution clustering only to demonstrate non-applicability of the standard NMI for such cases, where it yields unfair results. See [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) for the fair generalized NMI evaluation. 
27 | 28 | The execution time and the total processing time (relative power consumption) of `xmeasures` on a single CPU core vs [ParallelComMetric](https://github.com/eXascaleInfolab/ParallelComMetric) on multiple SMP cores evaluated on the SNAP DBLP dataset and shown in the log scale demonstrates that `xmeasures` evaluates F1 family measures multiple orders of magnitude faster than other state-of-the-art solutions: 29 | ![Clubmark_Poster-w1024](images/CPU-Timings-DBLP.png) 30 | 31 | 32 | Author: (c) Artem Lutov 33 | 34 | ## Content 35 | - [Deployment](#deployment) 36 | - [Requirements](#requirements) 37 | - [Compilation](#compilation) 38 | - [Application Compilation](#application-compilation) 39 | - [Library Compilation](#library-compilation) 40 | - [Usage](#usage) 41 | - [Related Projects](#related-projects) 42 | 43 | # Deployment 44 | 45 | The target platform is NIX/Posix, the binary is compiled for Linux Ubuntu x64 and also should work on Windows 10+ x64 (see details in [this article](https://www.howtogeek.com/249966/how-to-install-and-use-the-linux-bash-shell-on-windows-10/)). 46 | 47 | ## Requirements 48 | There are no requirements for the execution or compilation except the *standard C++ library*. 49 | 50 | To run the *prebuilt executable* on Linux Ubuntu 16.04 x64, the standard library can be installed by: `$ sudo apt-get install libstdc++6`. 51 | 52 | ## Compilation 53 | ### Application Compilation 54 | ``` 55 | $ make release 56 | ``` 57 | 58 | The following **build errors** might occur on some platforms and should be resolved as outlined. 59 | * If your default compiler is *g++/gcc < 5.x*, then `g++-5` or higher should be installed and `Makefile` might need to be edited replacing `g++`, `gcc` with `g++-5`, `gcc-5`. 60 | * `-fstack-clash-protection` compiler flag is added since `xmeasures v4.0.5`, which might not be supported by Clang/LLVM and *GCC < 8.2*.
This flag just hardens the application against some stack overflow attacks and should be excluded from the `Makefile` if not supported on your platform. 61 | 62 | To **update/extend the input parameters**, modify `args.ggo` and run `GenerateArgparser.sh` (calls `gengetopt`) before running `make`. To install [*gengetopt*](https://www.gnu.org/software/gengetopt), execute: `$ sudo apt-get install gengetopt`. 63 | 64 | 65 | ### Library Compilation 66 | Some core functionality of xmeasures is available as a library with C API, making it possible to link the library from Python and other scripting languages. 67 | The interface is defined in `include/interface_c.h`. 68 | To build the library, execute: 69 | ``` 70 | $ make -f Makefile_lib release 71 | ``` 72 | 73 | # Usage 74 | Execution Options: 75 | ``` 76 | $ ../xmeasures -h 77 | xmeasures 4.0.4 78 | 79 | Extrinsic measures evaluation: Omega Index (a fuzzy version of the Adjusted 80 | Rand Index, identical to the Fuzzy Rand Index) and [mean] F1-score (prob, harm 81 | and avg) for the overlapping multi-resolution clusterings, and standard NMI for 82 | the non-overlapping clustering on a single resolution. Unequal node base is 83 | allowed in the evaluating clusterings and optionally can be synchronized 84 | removing nodes from the clusters missed in one of the clusterings 85 | (collections). 86 | 87 | Usage: xmeasures [OPTIONS] clustering1 clustering2 88 | 89 | clustering - input file, collection of the clusters to be evaluated. 90 | 91 | Examples: 92 | $ ./xmeasures -fp -kc networks/5K25.cnl tests/5K25_l0.825/5K25_l0.825_796.cnl 93 | $ ./xmeasures -fh -kc -i tests/5K25.cll -ph -l networks/5K25.cnl 94 | tests/5K25_l0.825/5K25_l0.825_796.cnl 95 | $ ./xmeasures -ox tests/clsevalsx/omega_c4.3-1.cnl 96 | tests/clsevalsx/omega_c4.3-2.cnl 97 | 98 | 99 | Extrinsic measures are evaluated, i.e. two input clusterings (collections of 100 | clusters) are compared to each other. 
Optionally, a labeling of the evaluating 101 | clusters with the specified ground-truth clusters is performed. 102 | NOTE: 103 | - Multiple evaluating measures can be specified. 104 | - Each cluster should contain unique members, which is ensured only if the 105 | 'unique' option is specified. 106 | - All clusters should be unique to not affect Omega Index evaluation, which 107 | can be ensured by the [resmerge](https://github.com/eXascaleInfolab/resmerge) 108 | utility. 109 | - Non-corrected unequal node base in the clusterings is allowed, it penalizes 110 | the match.Use [OvpNMI](https://github.com/eXascaleInfolab/OvpNMI) or 111 | [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) for NMI evaluation 112 | in the arbitrary collections (still each cluster should contain unique 113 | members). 114 | 115 | Evaluating measures are: 116 | - OI - Omega Index (a fuzzy version of the Adjusted Rand Index, identical to 117 | the Fuzzy Rand Index), which yields the same value as Adjusted Rand Index when 118 | applied to the non-overlapping clusterings. 119 | - [M]F1 - various [mean] F1 measures of the Greatest (Max) Match including 120 | the Average F1-Score (suggested by J. Leskovec) with the optional weighting. 121 | NOTE: There are 3 matching policies available for each kind of F1. The most 122 | representative evaluation is performed by the F1p with combined matching 123 | policy (considers both micro and macro weighting). 124 | - NMI - Normalized Mutual Information, normalized by either max or also 125 | sqrt, avg and min information content denominators. 126 | ATTENTION: This is a standard NMI, which should be used ONLY for the HARD 127 | partitioning evaluation (non-overlapping clustering on a single resolution). 128 | It penalizes overlapping and multi-resolution structures. 
129 | 130 | 131 | -h, --help Print help and exit 132 | -V, --version Print version and exit 133 | -O, --ovp evaluate overlapping instead of the 134 | multi-resolution clusters, where max matching 135 | for any shared member between R overlapping 136 | clusters is 1/R (the member is shared) 137 | instead of 1 (the member fully belongs to 138 | each [hierarchical sub]group) for the member 139 | belonging to R distinct clusters on R 140 | resolutions. 141 | NOTE: It has no effect for the Omega Index 142 | evaluation. (default=off) 143 | -q, --unique ensure on loading that all cluster members are 144 | unique by removing all duplicates. 145 | (default=off) 146 | -s, --sync=filename synchronize with the specified node base 147 | omitting the non-matching nodes. 148 | NOTE: The node base can be either a separate, 149 | or an evaluating CNL file, in the latter case 150 | this option should precede the evaluating 151 | filename not repeating it 152 | -m, --membership=FLOAT average expected membership of the nodes in the 153 | clusters, > 0, typically >= 1. Used only to 154 | facilitate estimation of the nodes number on 155 | the containers preallocation if this number 156 | is not specified in the file header. 157 | (default=`1') 158 | -d, --detailed detailed (verbose) results output 159 | (default=off) 160 | 161 | Omega Index: 162 | -o, --omega evaluate Omega Index (a fuzzy version of the 163 | Adjusted Rand Index, identical to the Fuzzy 164 | Rand Index and on the non-overlapping 165 | clusterings equals to ARI). (default=off) 166 | -x, --extended evaluate extended (Soft) Omega Index, which 167 | does not excessively penalize distinctly 168 | shared nodes. (default=off) 169 | 170 | Mean F1: 171 | -f, --f1[=ENUM] evaluate mean F1 of the [weighted] average of 172 | the greatest (maximal) match by F1 or partial 173 | probability. 
174 | NOTE: F1h <= F1a, where: 175 | - p (F1p or Ph) - Harmonic mean (F1) of two 176 | [weighted] averages of the Partial 177 | Probabilities, the most indicative as 178 | satisfies the largest number of the Formal 179 | Constraints (homogeneity, completeness and 180 | size/quantity except the rag bag in some 181 | cases); 182 | - h (F1h) - Harmonic mean (F1) of two 183 | [weighted] averages of all local F1 (harmonic 184 | means of the Precision and Recall of the best 185 | matches of the clusters); 186 | - a (F1a) - Arithmetic mean (average) of 187 | two [weighted] averages of all local F1, the 188 | least discriminative and satisfies the lowest 189 | number of the Formal Constraints. 190 | Precision and recall are evaluated relative 191 | to the FIRST clustering dataset 192 | (ground-truth, gold standard). 193 | (possible values="partprob", 194 | "harmonic", "average" default=`partprob') 195 | -k, --kind[=ENUM] kind of the matching policy: 196 | - w - Weighted by the number of nodes in 197 | each cluster (known as micro weighting, 198 | MF1_micro) 199 | - u - Unweighed, where each cluster is 200 | treated equally (known as macro weighting, 201 | MF1_macro) 202 | - c - Combined(w, u) using geometric mean 203 | (drops the value not so much as harmonic 204 | mean) 205 | (possible values="weighted", 206 | "unweighed", "combined" 207 | default=`weighted') 208 | 209 | Clusters Labeling & F1 evaluation with Precision and Recall: 210 | -l, --label=gt_filename label evaluating clusters with the specified 211 | ground-truth (gt) cluster indices and 212 | evaluate F1 (including Precision and Recall) 213 | of the (best) MATCHED labeled clusters only 214 | (without the probable subclusters). 215 | NOTE: If 'sync' option is specified then the 216 | file name of the clusters labels should be 217 | the same as the node base (if specified) and 218 | should be in the .cnl format. 
The file name 219 | can be either a separate or an evaluating CNL 220 | file, in the latter case this option should 221 | precede the evaluating filename not repeating 222 | it. 223 | Precision and recall are evaluated relative 224 | to the FIRST clustering dataset 225 | (ground-truth, gold standard). 226 | 227 | -p, --policy[=ENUM] Labels matching policy: 228 | - p - Partial Probabilities (maximizes 229 | gain) 230 | - h - Harmonic Mean (minimizes loss, 231 | maximizes F1) 232 | (possible values="partprob", "harmonic" 233 | default=`harmonic') 234 | -u, --unweighted Labels weighting policy on F1 evaluation: 235 | weighted by the number of instances in each 236 | label by default (micro weighting, F1_micro) 237 | or unweighed, where each label is treated 238 | equally (i.e. macro weighting, F1_macro) 239 | (default=off) 240 | -i, --identifiers=labels_filename 241 | output labels (identifiers) of the evaluating 242 | clusters as lines of space-separated indices 243 | of the ground-truth clusters (.cll - clusters 244 | labels list) 245 | NOTE: If 'sync' option is specified then the 246 | reduced collection is outputted to the 247 | .cnl besides the 248 | 249 | 250 | NMI: 251 | -n, --nmi evaluate NMI (Normalized Mutual Information), 252 | applicable only to the non-overlapping 253 | clusters (default=off) 254 | -a, --all evaluate all NMIs using sqrt, avg and min 255 | denominators besides the max one 256 | (default=off) 257 | -e, --ln use ln (exp base) instead of log2 (Shannon 258 | entropy, bits) for the information measuring 259 | (default=off) 260 | ``` 261 | 262 | > Empty lines and comments (lines starting with #) in the input file (cnl format) are omitted. 
263 | 264 | **Examples** 265 | Evaluate harmonic mean of the weighted average of the greatest (maximal) match by partial probabilities (the most discriminative F1-measure) using macro weighting (default as the most frequently used, though combined weighting is the most indicative one): 266 | ``` 267 | $ ./xmeasures -f data/3cls5nds.cnl data/4cls6nds.cnl 268 | ``` 269 | 270 | Evaluate harmonic mean of the weighted average (by the cluster size) of the greatest (maximal) match by F1s and ensure that all cluster members are unique (the duplicated members are removed): 271 | ``` 272 | $ ./xmeasures -fh -q data/3cls5nds.cnl data/4cls6nds.cnl 273 | ``` 274 | 275 | Evaluate harmonic mean of the [unweighted] average of the greatest (maximal) match by partial probabilities and synchronize the node base with the first evaluating collection, and considering overlapping clusters instead of multi-resolutions (`-O` does not matter for the case of non-overlapping single resolution collections): 276 | ``` 277 | $ ./xmeasures -sku -fp -O data/3cls5nds.cnl data/4cls6nds.cnl 278 | ``` 279 | 280 | Evaluate arithmetic mean of the weighted average (by the cluster size) of the greatest (maximal) match by F1s and NMI with all denominators synchronizing node base of the evaluating collections with `1lev4nds2cls.cnl`: 281 | ``` 282 | $ ./xmeasures -fa -na -s data/1lev4nds2cls.cnl data/3cls5nds.cnl data/4cls6nds.cnl 283 | ``` 284 | 285 | Evaluate combined weighed and unweighted F1h (harmonic mean of the average F1s), label the clusters with the indices of provided labels, evaluate standard F1, precision and recall of the labeled clusters and output the labels to the `clslbs.cll`: 286 | ``` 287 | $ ./xmeasures -fh -kc -i clslbs.cll -l labels.cnl clusters.cnl 288 | ``` 289 | 290 | Evaluate extended Omega Index and mean F1h (harmonic mean of the weighted average of the greatest (maximal) match by F1): 291 | ``` 292 | $ ./xmeasures -ox -fh omega_c4.3-1.cnl omega_c4.3-2.cnl 293 | ``` 294 | 295 | 
**Note:** Please, [star this project](https://github.com/eXascaleInfolab/xmeasures) if you use it. 296 | 297 | # Related Projects 298 | - [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) - Overlapping NMI evaluation that is compatible with the original NMI and suitable for both overlapping and multi resolution (hierarchical) clustering evaluation. 299 | - [OvpNMI](https://github.com/eXascaleInfolab/OvpNMI) - NMI evaluation for the overlapping clusters (communities) that is not compatible with the standard NMI value unlike GenConvNMI, but it is much faster than GenConvNMI. 300 | - [Clubmark](https://github.com/eXascaleInfolab/clubmark) - A parallel isolation framework for benchmarking and profiling clustering (community detection) algorithms considering overlaps (covers). 301 | - [ParallelComMetric](https://github.com/eXascaleInfolab/ParallelComMetric) - A parallel toolkit implemented with Pthreads (or MPI) to calculate various extrinsic and intrinsic quality metrics (with and without ground truth community structure) for non-overlapping (hard, single membership) clusterings. 302 | - [CluSim](https://github.com/Hoosier-Clusters/clusim) - A Python module that evaluates (slowly) various extrinsic quality metrics (accuracy) for non-overlapping (hard, single membership) clusterings. 303 | - [resmerge](https://github.com/eXascaleInfolab/resmerge) - Resolution levels clustering merger with filtering. Flattens hierarchy/list of multiple resolutions levels (clusterings) into the single flat clustering with clusters on various resolution levels synchronizing the node base. 304 | - [ExecTime](https://bitbucket.org/lumais/exectime/) - A lightweight resource consumption profiler. 305 | - [TInfES](https://github.com/eXascaleInfolab/TInfES) - Type inference evaluation scripts and accessory apps used for the benchmarking. 
306 | -------------------------------------------------------------------------------- /autogen/cmdline.h: -------------------------------------------------------------------------------- 1 | /** @file cmdline.h 2 | * @brief The header file for the command line option parser 3 | * generated by GNU Gengetopt version 2.23 4 | * http://www.gnu.org/software/gengetopt. 5 | * DO NOT modify this file, since it can be overwritten 6 | * @author GNU Gengetopt */ 7 | 8 | #ifndef CMDLINE_H 9 | #define CMDLINE_H 10 | 11 | /* If we use autoconf. */ 12 | #ifdef HAVE_CONFIG_H 13 | #include "config.h" 14 | #endif 15 | 16 | #include /* for FILE */ 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif /* __cplusplus */ 21 | 22 | #ifndef CMDLINE_PARSER_PACKAGE 23 | /** @brief the program name (used for printing errors) */ 24 | #define CMDLINE_PARSER_PACKAGE "xmeasures" 25 | #endif 26 | 27 | #ifndef CMDLINE_PARSER_PACKAGE_NAME 28 | /** @brief the complete program name (used for help and version) */ 29 | #define CMDLINE_PARSER_PACKAGE_NAME "xmeasures" 30 | #endif 31 | 32 | #ifndef CMDLINE_PARSER_VERSION 33 | /** @brief the program version */ 34 | #define CMDLINE_PARSER_VERSION "4.0.4" 35 | #endif 36 | 37 | enum enum_f1 { f1__NULL = -1, f1_arg_partprob = 0, f1_arg_harmonic, f1_arg_average }; 38 | enum enum_kind { kind__NULL = -1, kind_arg_weighted = 0, kind_arg_unweighed, kind_arg_combined }; 39 | enum enum_policy { policy__NULL = -1, policy_arg_partprob = 0, policy_arg_harmonic }; 40 | 41 | /** @brief Where the command line options are stored */ 42 | struct gengetopt_args_info 43 | { 44 | const char *help_help; /**< @brief Print help and exit help description. */ 45 | const char *version_help; /**< @brief Print version and exit help description. 
*/ 46 | int ovp_flag; /**< @brief evaluate overlapping instead of the multi-resolution clusters, where max matching for any shared member between R overlapping clusters is 1/R (the member is shared) instead of 1 (the member fully belongs to each [hierarchical sub]group) for the member belonging to R distinct clusters on R resolutions. 47 | NOTE: It has no effect for the Omega Index evaluation. (default=off). */ 48 | const char *ovp_help; /**< @brief evaluate overlapping instead of the multi-resolution clusters, where max matching for any shared member between R overlapping clusters is 1/R (the member is shared) instead of 1 (the member fully belongs to each [hierarchical sub]group) for the member belonging to R distinct clusters on R resolutions. 49 | NOTE: It has no effect for the Omega Index evaluation. help description. */ 50 | int unique_flag; /**< @brief ensure on loading that all cluster members are unique by removing all duplicates. (default=off). */ 51 | const char *unique_help; /**< @brief ensure on loading that all cluster members are unique by removing all duplicates. help description. */ 52 | char * sync_arg; /**< @brief synchronize with the specified node base omitting the non-matching nodes. 53 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. */ 54 | char * sync_orig; /**< @brief synchronize with the specified node base omitting the non-matching nodes. 55 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it original value given at command line. */ 56 | const char *sync_help; /**< @brief synchronize with the specified node base omitting the non-matching nodes. 
57 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it help description. */ 58 | float membership_arg; /**< @brief average expected membership of the nodes in the clusters, > 0, typically >= 1. Used only to facilitate estimation of the nodes number on the containers preallocation if this number is not specified in the file header. (default='1'). */ 59 | char * membership_orig; /**< @brief average expected membership of the nodes in the clusters, > 0, typically >= 1. Used only to facilitate estimation of the nodes number on the containers preallocation if this number is not specified in the file header. original value given at command line. */ 60 | const char *membership_help; /**< @brief average expected membership of the nodes in the clusters, > 0, typically >= 1. Used only to facilitate estimation of the nodes number on the containers preallocation if this number is not specified in the file header. help description. */ 61 | int detailed_flag; /**< @brief detailed (verbose) results output (default=off). */ 62 | const char *detailed_help; /**< @brief detailed (verbose) results output help description. */ 63 | int omega_flag; /**< @brief evaluate Omega Index (a fuzzy version of the Adjusted Rand Index, identical to the Fuzzy Rand Index and on the non-overlapping clusterings equals to ARI). (default=off). */ 64 | const char *omega_help; /**< @brief evaluate Omega Index (a fuzzy version of the Adjusted Rand Index, identical to the Fuzzy Rand Index and on the non-overlapping clusterings equals to ARI). help description. */ 65 | int extended_flag; /**< @brief evaluate extended (Soft) Omega Index, which does not excessively penalize distinctly shared nodes. (default=off). */ 66 | const char *extended_help; /**< @brief evaluate extended (Soft) Omega Index, which does not excessively penalize distinctly shared nodes. help description. 
*/ 67 | enum enum_f1 f1_arg; /**< @brief evaluate mean F1 of the [weighted] average of the greatest (maximal) match by F1 or partial probability. 68 | NOTE: F1h <= F1a, where: 69 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities, the most indicative as satisfies the largest number of the Formal Constraints (homogeneity, completeness and size/quantity except the rag bag in some cases); 70 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1 (harmonic means of the Precision and Recall of the best matches of the clusters); 71 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1, the least discriminative and satisfies the lowest number of the Formal Constraints. 72 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 73 | (default='partprob'). */ 74 | char * f1_orig; /**< @brief evaluate mean F1 of the [weighted] average of the greatest (maximal) match by F1 or partial probability. 75 | NOTE: F1h <= F1a, where: 76 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities, the most indicative as satisfies the largest number of the Formal Constraints (homogeneity, completeness and size/quantity except the rag bag in some cases); 77 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1 (harmonic means of the Precision and Recall of the best matches of the clusters); 78 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1, the least discriminative and satisfies the lowest number of the Formal Constraints. 79 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 80 | original value given at command line. */ 81 | const char *f1_help; /**< @brief evaluate mean F1 of the [weighted] average of the greatest (maximal) match by F1 or partial probability. 
82 | NOTE: F1h <= F1a, where: 83 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities, the most indicative as satisfies the largest number of the Formal Constraints (homogeneity, completeness and size/quantity except the rag bag in some cases); 84 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1 (harmonic means of the Precision and Recall of the best matches of the clusters); 85 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1, the least discriminative and satisfies the lowest number of the Formal Constraints. 86 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 87 | help description. */ 88 | enum enum_kind kind_arg; /**< @brief kind of the matching policy: 89 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 90 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 91 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 92 | (default='weighted'). */ 93 | char * kind_orig; /**< @brief kind of the matching policy: 94 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 95 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 96 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 97 | original value given at command line. */ 98 | const char *kind_help; /**< @brief kind of the matching policy: 99 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 100 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 101 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 102 | help description. 
*/ 103 | char * label_arg; /**< @brief label evaluating clusters with the specified ground-truth (gt) cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED labeled clusters only (without the probable subclusters). 104 | NOTE: If 'sync' option is specified then the file name of the clusters labels should be the same as the node base (if specified) and should be in the .cnl format. The file name can be either a separate or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. 105 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 106 | . */ 107 | char * label_orig; /**< @brief label evaluating clusters with the specified ground-truth (gt) cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED labeled clusters only (without the probable subclusters). 108 | NOTE: If 'sync' option is specified then the file name of the clusters labels should be the same as the node base (if specified) and should be in the .cnl format. The file name can be either a separate or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. 109 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 110 | original value given at command line. */ 111 | const char *label_help; /**< @brief label evaluating clusters with the specified ground-truth (gt) cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED labeled clusters only (without the probable subclusters). 112 | NOTE: If 'sync' option is specified then the file name of the clusters labels should be the same as the node base (if specified) and should be in the .cnl format. The file name can be either a separate or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. 
113 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 114 | help description. */ 115 | enum enum_policy policy_arg; /**< @brief Labels matching policy: 116 | - p - Partial Probabilities (maximizes gain) 117 | - h - Harmonic Mean (minimizes loss, maximizes F1) 118 | (default='harmonic'). */ 119 | char * policy_orig; /**< @brief Labels matching policy: 120 | - p - Partial Probabilities (maximizes gain) 121 | - h - Harmonic Mean (minimizes loss, maximizes F1) 122 | original value given at command line. */ 123 | const char *policy_help; /**< @brief Labels matching policy: 124 | - p - Partial Probabilities (maximizes gain) 125 | - h - Harmonic Mean (minimizes loss, maximizes F1) 126 | help description. */ 127 | int unweighted_flag; /**< @brief Labels weighting policy on F1 evaluation: weighted by the number of instances in each label by default (micro weighting, F1_micro) or unweighed, where each label is treated equally (i.e. macro weighting, F1_macro) (default=off). */ 128 | const char *unweighted_help; /**< @brief Labels weighting policy on F1 evaluation: weighted by the number of instances in each label by default (micro weighting, F1_micro) or unweighed, where each label is treated equally (i.e. macro weighting, F1_macro) help description. */ 129 | char * identifiers_arg; /**< @brief output labels (identifiers) of the evaluating clusters as lines of space-separated indices of the ground-truth clusters (.cll - clusters labels list) 130 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the .cnl besides the 131 | . */ 132 | char * identifiers_orig; /**< @brief output labels (identifiers) of the evaluating clusters as lines of space-separated indices of the ground-truth clusters (.cll - clusters labels list) 133 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the .cnl besides the 134 | original value given at command line. 
*/ 135 | const char *identifiers_help; /**< @brief output labels (identifiers) of the evaluating clusters as lines of space-separated indices of the ground-truth clusters (.cll - clusters labels list) 136 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the .cnl besides the 137 | help description. */ 138 | int nmi_flag; /**< @brief evaluate NMI (Normalized Mutual Information), applicable only to the non-overlapping clusters (default=off). */ 139 | const char *nmi_help; /**< @brief evaluate NMI (Normalized Mutual Information), applicable only to the non-overlapping clusters help description. */ 140 | int all_flag; /**< @brief evaluate all NMIs using sqrt, avg and min denominators besides the max one (default=off). */ 141 | const char *all_help; /**< @brief evaluate all NMIs using sqrt, avg and min denominators besides the max one help description. */ 142 | int ln_flag; /**< @brief use ln (exp base) instead of log2 (Shannon entropy, bits) for the information measuring (default=off). */ 143 | const char *ln_help; /**< @brief use ln (exp base) instead of log2 (Shannon entropy, bits) for the information measuring help description. */ 144 | 145 | unsigned int help_given ; /**< @brief Whether help was given. */ 146 | unsigned int version_given ; /**< @brief Whether version was given. */ 147 | unsigned int ovp_given ; /**< @brief Whether ovp was given. */ 148 | unsigned int unique_given ; /**< @brief Whether unique was given. */ 149 | unsigned int sync_given ; /**< @brief Whether sync was given. */ 150 | unsigned int membership_given ; /**< @brief Whether membership was given. */ 151 | unsigned int detailed_given ; /**< @brief Whether detailed was given. */ 152 | unsigned int omega_given ; /**< @brief Whether omega was given. */ 153 | unsigned int extended_given ; /**< @brief Whether extended was given. */ 154 | unsigned int f1_given ; /**< @brief Whether f1 was given. */ 155 | unsigned int kind_given ; /**< @brief Whether kind was given. 
*/ 156 | unsigned int label_given ; /**< @brief Whether label was given. */ 157 | unsigned int policy_given ; /**< @brief Whether policy was given. */ 158 | unsigned int unweighted_given ; /**< @brief Whether unweighted was given. */ 159 | unsigned int identifiers_given ; /**< @brief Whether identifiers was given. */ 160 | unsigned int nmi_given ; /**< @brief Whether nmi was given. */ 161 | unsigned int all_given ; /**< @brief Whether all was given. */ 162 | unsigned int ln_given ; /**< @brief Whether ln was given. */ 163 | 164 | char **inputs ; /**< @brief unnamed options (options without names) */ 165 | unsigned inputs_num ; /**< @brief unnamed options number */ 166 | } ; 167 | 168 | /** @brief The additional parameters to pass to parser functions */ 169 | struct cmdline_parser_params 170 | { 171 | int override; /**< @brief whether to override possibly already present options (default 0) */ 172 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 173 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 174 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 175 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 176 | } ; 177 | 178 | /** @brief the purpose string of the program */ 179 | extern const char *gengetopt_args_info_purpose; 180 | /** @brief the usage string of the program */ 181 | extern const char *gengetopt_args_info_usage; 182 | /** @brief the description string of the program */ 183 | extern const char *gengetopt_args_info_description; 184 | /** @brief all the lines making the help output */ 185 | extern const char *gengetopt_args_info_help[]; 186 | 187 | /** 188 | * The command line parser 189 | * @param argc the number of command line options 190 | * @param argv the command line options 191 | 
* @param args_info the structure where option information will be stored 192 | * @return 0 if everything went fine, NON 0 if an error took place 193 | */ 194 | int cmdline_parser (int argc, char **argv, 195 | struct gengetopt_args_info *args_info); 196 | 197 | /** 198 | * The command line parser (version with additional parameters - deprecated) 199 | * @param argc the number of command line options 200 | * @param argv the command line options 201 | * @param args_info the structure where option information will be stored 202 | * @param override whether to override possibly already present options 203 | * @param initialize whether to initialize the option structure my_args_info 204 | * @param check_required whether to check that all required options were provided 205 | * @return 0 if everything went fine, NON 0 if an error took place 206 | * @deprecated use cmdline_parser_ext() instead 207 | */ 208 | int cmdline_parser2 (int argc, char **argv, 209 | struct gengetopt_args_info *args_info, 210 | int override, int initialize, int check_required); 211 | 212 | /** 213 | * The command line parser (version with additional parameters) 214 | * @param argc the number of command line options 215 | * @param argv the command line options 216 | * @param args_info the structure where option information will be stored 217 | * @param params additional parameters for the parser 218 | * @return 0 if everything went fine, NON 0 if an error took place 219 | */ 220 | int cmdline_parser_ext (int argc, char **argv, 221 | struct gengetopt_args_info *args_info, 222 | struct cmdline_parser_params *params); 223 | 224 | /** 225 | * Save the contents of the option struct into an already open FILE stream. 
226 | * @param outfile the stream where to dump options 227 | * @param args_info the option struct to dump 228 | * @return 0 if everything went fine, NON 0 if an error took place 229 | */ 230 | int cmdline_parser_dump(FILE *outfile, 231 | struct gengetopt_args_info *args_info); 232 | 233 | /** 234 | * Save the contents of the option struct into a (text) file. 235 | * This file can be read by the config file parser (if generated by gengetopt) 236 | * @param filename the file where to save 237 | * @param args_info the option struct to save 238 | * @return 0 if everything went fine, NON 0 if an error took place 239 | */ 240 | int cmdline_parser_file_save(const char *filename, 241 | struct gengetopt_args_info *args_info); 242 | 243 | /** 244 | * Print the help 245 | */ 246 | void cmdline_parser_print_help(void); 247 | /** 248 | * Print the version 249 | */ 250 | void cmdline_parser_print_version(void); 251 | 252 | /** 253 | * Initializes all the fields a cmdline_parser_params structure 254 | * to their default values 255 | * @param params the structure to initialize 256 | */ 257 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 258 | 259 | /** 260 | * Allocates dynamically a cmdline_parser_params structure and initializes 261 | * all its fields to their default values 262 | * @return the created and initialized cmdline_parser_params structure 263 | */ 264 | struct cmdline_parser_params *cmdline_parser_params_create(void); 265 | 266 | /** 267 | * Initializes the passed gengetopt_args_info structure's fields 268 | * (also set default values for options that have a default) 269 | * @param args_info the structure to initialize 270 | */ 271 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 272 | /** 273 | * Deallocates the string fields of the gengetopt_args_info structure 274 | * (but does not deallocate the structure itself) 275 | * @param args_info the structure to deallocate 276 | */ 277 | void cmdline_parser_free (struct 
gengetopt_args_info *args_info); 278 | 279 | /** 280 | * Checks that all the required options were specified 281 | * @param args_info the structure to check 282 | * @param prog_name the name of the program that will be used to print 283 | * possible errors 284 | * @return 285 | */ 286 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 287 | const char *prog_name); 288 | 289 | extern const char *cmdline_parser_f1_values[]; /**< @brief Possible values for f1. */ 290 | extern const char *cmdline_parser_kind_values[]; /**< @brief Possible values for kind. */ 291 | extern const char *cmdline_parser_policy_values[]; /**< @brief Possible values for policy. */ 292 | 293 | 294 | #ifdef __cplusplus 295 | } 296 | #endif /* __cplusplus */ 297 | #endif /* CMDLINE_H */ 298 | -------------------------------------------------------------------------------- /include/interface.h: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation interface. 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! 
\date 2017-02-13 10 | 11 | #ifndef INTERFACE_H 12 | #define INTERFACE_H 13 | 14 | #include 15 | #include // unique_ptr 16 | #include 17 | #include 18 | #include 19 | #if VALIDATE >= 1 20 | #include 21 | #endif // VALIDATE 22 | 23 | #define INCLUDE_STL_FS 24 | #include "fileio.hpp" 25 | #if VALIDATE >= 2 26 | #include "operations.hpp" 27 | #endif // VALIDATE 2 28 | 29 | #ifdef C_API 30 | #include "interface_c.h" 31 | #endif // C_API 32 | 33 | 34 | using std::vector; 35 | using std::unordered_set; 36 | using std::unordered_map; 37 | using std::unique_ptr; 38 | using std::string; 39 | using std::pair; 40 | using std::is_integral; 41 | using std::is_pointer; 42 | using std::is_floating_point; 43 | using std::is_arithmetic; 44 | using std::is_same; 45 | //using std::enable_if; 46 | using std::enable_if_t; 47 | using std::conditional_t; 48 | using std::numeric_limits; 49 | #if VALIDATE >= 2 50 | using std::invalid_argument; 51 | #endif // VALIDATE 52 | 53 | // Data Types ------------------------------------------------------------------ 54 | using Id = uint32_t; //!< Node id type 55 | // Note: Size should a magnitude larger than Id to hold Id*Id 56 | using AccId = uint64_t; //!< Accumulated Id type 57 | 58 | using Prob = float; //!< Probability 59 | using AccProb = double; //!< Accumulated Probability 60 | 61 | //! Aggregated Hash of the loading cluster member ids 62 | using AggHash = daoc::AggHash; 63 | 64 | using RawIds = vector; //!< Node ids, unordered 65 | 66 | // Omega Index related types and functions ------------------------------------- 67 | using RawCluster = RawIds; //!< Raw cluster of member node ids 68 | using RawClusters = vector; //!< Raw clustering, container of the raw clusters 69 | using RawClusterPtrs = vector; 70 | using NodeRClusters = unordered_map>; //!< Raw node membership in the clusters 71 | 72 | //! \brief Omega Index evaluation 73 | //! 74 | //! \tparam EXT bool - extended Omega Index, which does not excessively penalize 75 | //! 
distinct node shares 76 | //! 77 | //! \param ndrcs const NodeRClusters& - node raw clusters relations 78 | //! \param cls1 const RawClusters& - clusters of the first collection 79 | //! \param cls2 const RawClusters& - clusters of the second collection 80 | //! \return Prob - omega index 81 | template 82 | Prob omega(const NodeRClusters& ndrcs, const RawClusters& cls1, const RawClusters& cls2); 83 | 84 | //! \brief Evaluate the number of mutual raw cluster pointers in the containers 85 | //! 86 | //! \pre Input raw clusters pointer containers are ordered by the cmpBase 87 | //! 88 | //! \param a const RawClusterPtrs* - first raw cluster pointers 89 | //! \param b const RawClusterPtrs* - second raw cluster pointers 90 | //! \param nmax const Id - max number of matches for the early termination, 91 | //! 0 is allowed but senseless. 92 | //! \return Id - the number of mutual members 93 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b, const Id nmax) noexcept; 94 | 95 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b) noexcept; 96 | 97 | // F1 & NMI related data types ------------------------------------------------- 98 | template 99 | struct Cluster; 100 | 101 | //! Cluster matching counter 102 | //! \note Required only for F1 evaluation 103 | //! \tparam Count - arithmetic counting type 104 | template 105 | class Counter { 106 | public: 107 | static_assert(is_arithmetic::value 108 | , "Counter(), Count should be an arithmetic type"); 109 | using CountT = Count; //!< Count type, arithmetic 110 | using ClusterT = Cluster; 111 | private: 112 | // Note: it's OK to copy this pointer on assignment since it is never 113 | // allocated in this class 114 | ClusterT* m_orig; //!< Originator cluster 115 | CountT m_count; //!< Occurrences counter, <= members size 116 | public: 117 | //! Default constructor 118 | Counter(): m_orig(nullptr), m_count(0) {} 119 | 120 | //! \brief Update the counter from the specified origin 121 | //! 122 | //! 
\param orig ClusterT* - counter origin 123 | //! \param cont Count - contribution or share, actual only for the floating point counter 124 | //! \return void 125 | void operator()(ClusterT* orig, Count cont) 126 | #if VALIDATE < 2 127 | noexcept 128 | #endif // VALIDATE 129 | { 130 | if(m_orig != orig) { 131 | m_orig = orig; 132 | m_count = 0; 133 | } 134 | if(is_integral::value) 135 | ++m_count; 136 | else { 137 | static_assert(!is_floating_point::value || sizeof(m_count) >= sizeof(double) 138 | , "operator(), types validation failed"); 139 | #if VALIDATE >= 2 140 | if(cont <= 0 || cont > 1) 141 | throw invalid_argument("operator(), cont should E (0, 1]\n"); 142 | #endif // VALIDATE 143 | m_count += cont; 144 | } 145 | } 146 | 147 | //! \brief Get counted value 148 | //! 149 | //! \return CountT - counted value 150 | CountT operator()() const noexcept { return m_count; } 151 | 152 | //! \brief Get counter origin 153 | //! 154 | //! \return ClusterT* - counter origin 155 | ClusterT* origin() const noexcept { return m_orig; } 156 | 157 | //! \brief Clear (reset) the counter 158 | void clear() noexcept 159 | { 160 | m_orig = nullptr; 161 | m_count = 0; 162 | } 163 | }; 164 | 165 | //! Cluster 166 | //! \tparam Count - nodes contribution counter type 167 | template 168 | struct Cluster { 169 | static_assert(is_arithmetic::value 170 | , "Counter(), Count should be an arithmetic type"); 171 | using CountT = Count; //!< Count type, arithmetic 172 | 173 | RawIds members; //!< Node ids, unordered 174 | // Note: used by F1 only and always 175 | Counter counter; //!< Cluster matching counter 176 | ////! 
Accumulated contribution 177 | //using AccCont = conditional_t; 178 | //!< Contribution from members 179 | // Note: used only in case of a) overlaps (by all measures) or 180 | // b) multiple resolutions (by NMI only) 181 | Count mbscont; 182 | static_assert(!is_floating_point::value || sizeof(mbscont) >= sizeof(double) 183 | , "operator(), types validation failed"); 184 | 185 | //! Default constructor 186 | Cluster(); 187 | 188 | //! \brief F1 measure 189 | //! \pre Clusters should be valid, i.e. non-empty 190 | //! 191 | //! \param matches Count - the number of matched members 192 | //! \param capacity Count - contributions capacity of the matching foreign cluster 193 | //! \return AccProb - resulting F1 194 | AccProb f1(Count matches, Count capacity) const 195 | #if VALIDATE < 2 196 | noexcept 197 | #endif // VALIDATE 198 | { 199 | // F1 = 2 * pr * rc / (pr + rc) 200 | // pr = m / c1 201 | // rc = m / c2 202 | // F1 = 2 * m/c1 * m/c2 / (m/c1 + m/c2) = 2 * m / (c2 + c1) 203 | // ATTENTION: F1 compares clusters per-pair, so it is much simpler and has another 204 | // semantics of contribution for the multi-resolution case 205 | const Count contrib = is_floating_point::value ? 
cont() : members.size(); 206 | #if VALIDATE >= 2 207 | if(matches < 0 || daoc::less::value 208 | , Prob, Count>>(capacity, matches) || contrib <= 0) 209 | throw invalid_argument(string("f1(), both clusters should be non-empty, matches: ") 210 | .append(std::to_string(matches)).append(", capacity: ").append(std::to_string(capacity)) 211 | .append(", contrib: ").append(std::to_string(contrib)) += '\n'); 212 | #endif // VALIDATE 213 | return 2 * matches / AccProb(capacity + contrib); // E [0, 1] 214 | // Note that partial probability (non-normalized to the remained matches, 215 | // it says only how far this match from the full match) of the match is: 216 | // P = AccProb(matches * matches) / AccProb(size * members.size()), 217 | // where nodes contribution instead of the size should be use for overlaps. 218 | // The probability is more discriminative than F1 for high values 219 | } 220 | 221 | //! \brief Partial probability of the match (non-normalized to the other matches) 222 | //! \pre Clusters should be valid, i.e. non-empty 223 | //! 224 | //! \param matches Count - the number of matched members 225 | //! \param capacity Count - contributions capacity of the matching foreign cluster 226 | //! \return AccProb - resulting probability 227 | AccProb pprob(Count matches, Count capacity) const 228 | #if VALIDATE < 2 229 | noexcept 230 | #endif // VALIDATE 231 | { 232 | // P = P1 * P2 = m/c1 * m/c2 = m*m / (c1*c2), 233 | // where nodes contribution instead of the size should be used for overlaps. 234 | // ATTENTION: F1 compares clusters per-pair, so it is much simpler and has another 235 | // semantics of contribution for the multi-resolution case comparing to NMI 236 | // that also uses cont() 237 | constexpr bool floating = is_floating_point::value; 238 | const Count contrib = floating ? 
cont() : members.size(); 239 | #if VALIDATE >= 2 240 | if(matches < 0 || daoc::less> 241 | (capacity, matches) || contrib <= 0) 242 | throw invalid_argument(string("pprob(), both clusters should be non-empty, matches: ") 243 | .append(std::to_string(matches)).append(", capacity: ").append(std::to_string(capacity)) 244 | .append(", contrib: ").append(std::to_string(contrib)) += '\n'); 245 | #endif // VALIDATE 246 | return floating ? static_cast(matches) * matches / (static_cast(capacity) * contrib) 247 | : static_cast(static_cast(matches) * matches) 248 | / (static_cast(capacity) * contrib); // E [0, 1] 249 | } 250 | 251 | //! \brief Cluster members contribution 252 | //! 253 | //! \return Count - total contribution from the members 254 | Count cont() const noexcept 255 | { 256 | // return is_same::value ? members.size() : mbscont; 257 | return mbscont; 258 | } 259 | }; 260 | 261 | //! Automatic storage for the Cluster; 262 | //! \tparam Count - arithmetic counting type 263 | template 264 | using ClusterHolder = unique_ptr>; 265 | 266 | //! Cluster pointers, unordered 267 | //! \tparam Count - arithmetic counting type 268 | template 269 | using ClusterPtrs = vector*>; 270 | 271 | //! Node to clusters relations 272 | //! \tparam Count - arithmetic counting type 273 | template 274 | using NodeClusters = unordered_map>; 275 | 276 | //! Resulting greatest matches for 2 input collections of clusters in a single direction 277 | using Probs = vector; 278 | 279 | // Label-related types -------------------------------------------------------- 280 | //! Clusters Labels, Labels are ORDERED by cmpBase 281 | template 282 | using ClustersLabels = unordered_map*, ClusterPtrs>; 283 | 284 | // F1-related types ----------------------------------------------------------- 285 | using F1Base = uint8_t; 286 | 287 | //! \brief F1 kind 288 | enum struct F1: F1Base { 289 | //! Not initialized 290 | NONE = 0, 291 | //! 
Harmonic mean of the [weighted] average of the greatest (maximal) match 292 | //! by partial probabilities 293 | PARTPROB, 294 | //! Harmonic mean of the [weighted] average of the greatest (maximal) match by F1s 295 | HARMONIC, 296 | //! Arithmetic mean (average) of the [weighted] average of the greatest (maximal) 297 | //! match by F1s, i.e. F1-Score 298 | AVERAGE // Suggested by Leskovec 299 | }; 300 | 301 | //! \brief String representation of the F1 302 | //! \relates F1 303 | //! 304 | //! \param f1 F1 - the value to be converted 305 | //! \return string - string value 306 | string to_string(F1 f1); 307 | 308 | // NMI-related types ----------------------------------------------------------- 309 | //! Internal element of the Sparse Matrix with Vector Rows 310 | //! \tparam Index - index (of the column) in the row 311 | //! \tparam Value - value type 312 | template 313 | struct RowVecItem { 314 | static_assert(is_integral::value || is_pointer::value 315 | , "RowVecItem, Index should be an integral type"); 316 | 317 | using CallT = Index; //!< Type of the functor call 318 | 319 | Index pos; //!< Position (index) in the row 320 | Value val; //!< Target value (payload) 321 | 322 | //! Constructor in case of the simple value 323 | //! 324 | //! \param i=Index() Index - index of value in the row 325 | //! \param v=Value() Value - payload value 326 | template * = nullptr> 327 | RowVecItem(Index i=Index(), Value v=Value()) noexcept(static_cast(Value())) 328 | : pos(i), val(v) {} 329 | 330 | //! Constructor in case of the compound value 331 | //! 332 | //! \param i=Index() Index - index of value in the row 333 | //! \param v=Value() Value - payload value 334 | template sizeof(void*)), bool>* = nullptr> 335 | RowVecItem(Index i=Index(), Value&& v=Value()) noexcept(Value()) 336 | : pos(i), val(move(v)) {} 337 | 338 | //! \brief Functor (call) operator 339 | //! 340 | //! 
\return CallT - index of the value 341 | // Note: required to call obj() 342 | CallT operator()() const noexcept { return pos; } 343 | 344 | // // Note: required for the comparison operations with index 345 | // operator CallT() const noexcept { return this } 346 | }; 347 | 348 | //! Row vector for the SparseMatrix 349 | template 350 | using SparseMatrixRowVec = vector>; 351 | 352 | //! Base type of the SparseMatrix (can be unordered_map, map, vector) 353 | template 354 | using SparseMatrixBase = unordered_map>; 355 | 356 | //! Sparse Matrix 357 | //! \tparam Index - index type 358 | //! \tparam Value - value type 359 | template 360 | struct SparseMatrix: SparseMatrixBase { 361 | static_assert((is_integral::value || is_pointer::value) 362 | && is_arithmetic::value, "SparseMatrix(), invalid parameter types"); 363 | 364 | using IndexT = Index; //!< Indexes type, integral 365 | using ValueT = Value; //!< Value type, arithmetic 366 | using BaseT = SparseMatrixBase; //!< SparseMatrixBase type 367 | using RowT = typename BaseT::mapped_type; //!< Matrix row type 368 | //! Matrix row element type, which contains the value and might have 369 | //! additional attributes 370 | using RowItemT = typename RowT::value_type; 371 | 372 | //! \brief Default constructor 373 | //! 374 | //! \param rows=0 Id - initial number of rows 375 | SparseMatrix(Id rows=0); 376 | 377 | //! \brief Access specified element inserting it if not exists 378 | //! 379 | //! \param i Index - row index 380 | //! \param j Index - column index 381 | //! \return Value& operator - value of the element to be set 382 | Value& operator ()(Index i, Index j); 383 | 384 | //! \brief Access specified element without bounds checking 385 | //! \note fast, but unsafe 386 | //! 387 | //! \param i Index - row index 388 | //! \param j Index - column index 389 | //! 
\return Value& operator - value of the element 390 | template * = nullptr> 391 | Value operator ()(Index i, Index j) const noexcept; // { return this->at(i) } 392 | 393 | //! \brief Access specified element without bounds checking 394 | //! \note fast, but unsafe 395 | //! 396 | //! \param i Index - row index 397 | //! \param j Index - column index 398 | //! \return Value& operator - value of the element 399 | template sizeof(void*)), bool>* = nullptr> 400 | const Value& operator ()(Index i, Index j) const noexcept; // { return this->at(i) } 401 | 402 | //! \brief Access specified element checking the bounds 403 | //! 404 | //! \param i Index - row index 405 | //! \param j Index - column index 406 | //! \return Value& operator - value of the element 407 | template * = nullptr> 408 | Value at(Index i, Index j); // { return this->at(i) } 409 | 410 | //! \brief Access specified element checking the bounds 411 | //! 412 | //! \param i Index - row index 413 | //! \param j Index - column index 414 | //! \return Value& operator - value of the element 415 | template sizeof(void*)), bool>* = nullptr> 416 | const Value& at(Index i, Index j); // { return this->at(i) } 417 | 418 | using BaseT::at; //!< Provide direct access to the matrix row 419 | }; 420 | 421 | //using EvalBase = uint8_t; //!< Base type for the Evaluation 422 | // 423 | ////! \brief Evaluation type 424 | //enum struct Evaluation: EvalBase { 425 | // NONE = 0, 426 | //// HARD = 0 427 | // MULTIRES = 1, //!< Multi-resolution non-overlapping clusters, compatible with hard partitioning 428 | // OVERLAPPING = 2, //!< Overlapping clusters, compatible with hard partitioning 429 | // MULRES_OVP = 3 //!< Multi-resolution clusters with possible overlaps on each resolution level 430 | //}; 431 | // 432 | ////! \brief Convert Evaluation to string 433 | ////! \relates Evaluation 434 | ////! 435 | ////! \param flag Evaluation - the flag to be converted 436 | ////! 
\param bitstr=false bool - convert to bits string or to Evaluation captions 437 | ////! \return string - resulting flag as a string 438 | //string to_string(Evaluation eval, bool bitstr=false); 439 | 440 | struct RawNmi { 441 | Prob mi; //!< Mutual information of two collections 442 | Prob h1; //!< Information content of the 1-st collection 443 | Prob h2; //!< Information content of the 2-nd collection 444 | //Evaluation eval; //!< Evaluation type 445 | 446 | static_assert(is_floating_point::value, "RawNmi, Prob should be a floating point type"); 447 | RawNmi() noexcept: mi(0), h1(numeric_limits::quiet_NaN()) 448 | , h2(numeric_limits::quiet_NaN()) {} 449 | 450 | void operator() (Prob mutinf, Prob cn1h, Prob cn2h) noexcept 451 | { 452 | mi = mutinf; 453 | h1 = cn1h; 454 | h2 = cn2h; 455 | }; 456 | }; 457 | 458 | // Collection ------------------------------------------------------------------ 459 | //! Unique ids (node ids) 460 | using UniqIds = unordered_set; 461 | 462 | //! Node base interface 463 | struct NodeBaseI { 464 | //! \brief Default virtual destructor 465 | virtual ~NodeBaseI()=default; 466 | 467 | //! \brief Whether the node base is actual (non-empty) 468 | //! 469 | //! \return bool - the node base is non-empty 470 | operator bool() const noexcept { return ndsnum(); }; 471 | 472 | //! \brief The number of nodes 473 | //! 474 | //! \return Id - the number of nodes in the collection 475 | virtual Id ndsnum() const noexcept = 0; 476 | 477 | //! \brief Whether exists the specified node 478 | //! 479 | //! \param nid - node id 480 | //! \return bool - specified node id exists 481 | virtual bool nodeExists(Id nid) const noexcept = 0; 482 | 483 | //! \brief Nodebase content 484 | //! 485 | //! \return virtual const UniqIds& - nodebase content 486 | virtual const UniqIds& nodes() const noexcept = 0; 487 | }; 488 | 489 | //! 
Node base 490 | struct NodeBase: protected UniqIds, NodeBaseI { 491 | using UniqIds::clear; 492 | using UniqIds::reserve; 493 | using UniqIds::insert; 494 | using UniqIds::begin; 495 | using UniqIds::end; 496 | 497 | //! \copydoc NodeBaseI::nodeExists(Id nid) const noexcept 498 | Id ndsnum() const noexcept override { return size(); } 499 | 500 | //! \copydoc NodeBaseI::nodeExists(Id nid) const noexcept 501 | bool nodeExists(Id nid) const noexcept override { return count(nid); } 502 | 503 | //! \copydoc NodeBaseI::nodes() const noexcept 504 | const UniqIds& nodes() const noexcept override { return *this; } 505 | 506 | #ifndef NO_FILEIO 507 | //! \brief Load all unique nodes from the CNL file with optional filtering by the cluster size 508 | //! 509 | //! \param filename const char* - name of the input file 510 | //! \param ahash=nullptr AggHash* - resulting aggregated hash of the loaded 511 | //! node ids if not nullptr 512 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 513 | //! Used only for the node container preallocation to estimate the number of nodes 514 | //! if not specified in the file header 515 | //! \param cmin=0 size_t - min allowed cluster size 516 | //! \param cmax=0 size_t - max allowed cluster size, 0 means any size 517 | //! \param verbose=false bool - print intermediate results to the stdout 518 | //! \return bool - the collection is loaded successfully 519 | static NodeBase load(const char* filename, float membership=1 520 | , AggHash* ahash=nullptr, size_t cmin=0, size_t cmax=0, bool verbose=false); 521 | #endif // NO_FILEIO 522 | }; 523 | 524 | //template 525 | //Id iterValSimple(Iter it) noexcept { return *it; } 526 | // 527 | //template 528 | //Id iterValFirst(Iter it) noexcept { return it->first; } 529 | // 530 | ////! \brief Identify external nodes that are complementary (do not belong) to the node base 531 | ////! 532 | ////! \tparam Iter - iterator type for the external nodes 533 | ////! 
\tparam IterValF - function, obtaining node value from the iterator 534 | ////! 535 | ////! \param begin - begin of the external nodes 536 | ////! \param end - end of the external nodes 537 | ////! \param size=0 Id - size of the external nodes, 0 means not specified; used to pre-allocate data 538 | ////! \param itval=iterValSimple IterValF - iterator value extracting function 539 | ////! \return RawIds - external nodes that are complementary (do not belong) to the node base 540 | //template 541 | //virtual RawIds complementary(Iter begin, Iter end, Id size=0, IterValF itval=iterValSimple) const noexcept = 0; 542 | // 543 | //template 544 | //RawIds NodeBase::complementary(Iter begin, Iter end, Id size, IterValF itval) const override noexcept 545 | //{ 546 | // RawIds ndcpl; // Return using NRVO, named return value optimization 547 | // UniqIds ndext; // External nodes, whose complementary values should be extracted 548 | // if(size) 549 | // ndext.reserve(size); 550 | // for(Iter it = begin; it != end; ++it) 551 | // ndext.insert(ndext.end(), itval(it)); 552 | // ndcpl.reserve(ndext.size() - this->size()); 553 | // for(auto nid: ndext) 554 | // if(!count(nid)) 555 | // ndcpl.push_back(nid); 556 | // 557 | // return ndcpl; 558 | //} 559 | 560 | //! Collection matching kind base 561 | using MatchBase = uint8_t; 562 | 563 | //! \brief Collection matching kind 564 | enum struct Match: MatchBase { 565 | NONE = 0, //!< Note initialized 566 | WEIGHTED, //!< Weighted matching by the number of members in each cluster (macro weighting) 567 | UNWEIGHTED, //!< Unweighted matching of each cluster (micro weighting) 568 | COMBINED //!< Combined of macro and micro weightings using geometric mean 569 | }; 570 | 571 | //! \brief String representation of the Match 572 | //! \relates Match 573 | //! 574 | //! \param mkind Match - the value to be converted 575 | //! \return string - string value 576 | string to_string(Match mkind); 577 | 578 | //! 
\brief The matching includes weighted match 579 | //! \relates Match 580 | //! 581 | //! \param m Match - matching kind 582 | //! \return bool - weighted matching included 583 | bool xwmatch(Match m) noexcept; 584 | 585 | //! \brief The matching includes unweighted match 586 | //! \relates Match 587 | //! 588 | //! \param m Match - matching kind 589 | //! \return bool - unweighted matching included 590 | bool xumatch(Match m) noexcept; 591 | 592 | //! Precision and recall 593 | struct PrcRec { 594 | Prob prc; //!< Precision 595 | Prob rec; //!< Recall 596 | 597 | // Explicit members initialization by value to avoid uninitialized members 598 | PrcRec(Prob prc=0, Prob rec=0): prc(prc), rec(rec) {} 599 | }; 600 | 601 | #ifdef C_API 602 | template 603 | class Collection; 604 | 605 | Collection loadCollection(const ClusterCollection rcn, bool makeunique 606 | , float membership, ::AggHash* ahash, const NodeBaseI* nodebase, RawIds* lostcls, bool verbose); 607 | #endif // C_API 608 | 609 | //! Collection describing cluster-node relations 610 | //! \tparam Count - arithmetic counting type 611 | template 612 | class Collection: public NodeBaseI { 613 | public: 614 | using CollectionT = Collection; 615 | //! Overlaps / multi-resolutions evaluation flag 616 | constexpr static bool m_overlaps = is_floating_point::value; 617 | //! Accumulated contribution 618 | using AccCont = conditional_t; 619 | //! 
Clusters matching matrix 620 | using ClustersMatching = SparseMatrix*, AccCont>; // Used only for NMI 621 | using ClsLabels = ClustersLabels; 622 | 623 | #ifdef C_API 624 | friend Collection loadCollection(const ClusterCollection rcn, bool makeunique, float membership 625 | , ::AggHash* ahash, const NodeBaseI* nodebase, bool reduce, RawIds* lostcls, bool verbose); 626 | #endif // C_API 627 | private: 628 | // ATTENTNION: Collection manages the memory of the m_cls 629 | ClusterPtrs m_cls; //!< Clusters 630 | NodeClusters m_ndcs; //!< Node clusters relations 631 | size_t m_ndshash; //!< Nodes hash (of unique node ids only, not all members), 0 means was not evaluated 632 | //mutable bool m_dirty; //!< The cluster members contribution is not zero (should be reseted on reprocessing) 633 | //! Sum of contributions of all members in each cluster 634 | mutable AccCont m_contsum; // Used by NMI only, marked also by overlapping F1 635 | 636 | //! \copydoc NodeBaseI::nodes() const noexcept 637 | const UniqIds& nodes() const noexcept override // Make a stub and close it 638 | { 639 | assert(0 && "Nodes should not be accessed in collection via the NodeBaseI interface"); 640 | static UniqIds nds; 641 | return nds; // Stub output 642 | } 643 | protected: 644 | //! Default constructor 645 | Collection(): m_cls(), m_ndcs(), m_ndshash(0), m_contsum(0) {} //, m_dirty(false) {} 646 | 647 | // Note: Actual for NMI and overlapping F1 648 | //! \brief Initialized cluster members contributions 649 | //! 650 | //! \param cn const CollectionT& - target collection to initialize cluster 651 | //! members contributions 652 | //! \return void 653 | static void initconts(const CollectionT& cn) noexcept; 654 | public: 655 | ~Collection(); 656 | 657 | //! \brief The number of clusters 658 | //! 659 | //! \return Id - the number of clusters in the collection 660 | Id clsnum() const noexcept { return m_cls.size(); } 661 | 662 | //! 
\copydoc NodeBaseI::ndsnum() const noexcept 663 | Id ndsnum() const noexcept override { return m_ndcs.size(); } 664 | 665 | //! \copydoc NodeBaseI::nodeExists(Id nid) const noexcept 666 | bool nodeExists(Id nid) const noexcept override { return m_ndcs.count(nid); } 667 | 668 | #ifndef NO_FILEIO 669 | //! \brief Load collection from the CNL file 670 | //! \pre All clusters in the file are expected to be unique and not validated for 671 | //! the mutual match until makeunique is set 672 | //! 673 | //! \param filename const char* - name of the input file 674 | //! \param makeunique=false bool - ensure that clusters contain unique members by 675 | //! removing the duplicates 676 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 677 | //! Used only for the node container preallocation to estimate the number of nodes 678 | //! if not specified in the file header 679 | //! \param ahash=nullptr AggHash* - resulting hash of the loaded 680 | //! member ids base (unique ids only are hashed, not all ids) if not nullptr 681 | //! \param const nodebase=nullptr NodeBaseI* - node base to filter-out nodes if required 682 | //! \param lostcls=nullptr RawIds* - indices of the lost clusters during the node base 683 | //! synchronization 684 | //! \param verbose=false bool - print the number of loaded nodes to the stdout 685 | //! \return CollectionT - the collection is loaded successfully 686 | static CollectionT load(const char* filename, bool makeunique=false 687 | , float membership=1, AggHash* ahash=nullptr, const NodeBaseI* nodebase=nullptr 688 | , RawIds* lostcls=nullptr, bool verbose=false); 689 | #endif // NO_FILEIO 690 | 691 | //! \brief Transfer collection data 692 | //! \post This collection becomes empty 693 | //! 694 | //! \tparam FIRST bool - fill first of second node clusters relations container 695 | //! 696 | //! \param cls RawClusters& - raw clusters to be extended 697 | //! 
\param nds NodeRClusters& - node raw clusters relations to be extended 698 | //! \return void 699 | template 700 | void transfer(RawClusters& cls, NodeRClusters& ndrcs); 701 | 702 | //! \brief Clear cluster counters 703 | //! 704 | //! \return void 705 | void clearcounts() const noexcept; 706 | 707 | // //! \brief Synchronize the node base of the cluster collections 708 | // //! 709 | // //! \tparam REDUCE bool - whether to reduce collections by removing the non-matching nodes 710 | // //! or extend collections by appending those nodes them to a single "noise" cluster 711 | // /// 712 | // /// \param cn1 CollectionT& - first collection 713 | // /// \param cn2 CollectionT& - second collection 714 | // /// \return Prob - harmonic mean of the nodebase correction (complement or reduction) for both collections 715 | // template 716 | // static Prob syncCollections(CollectionT& cn1, CollectionT& cn2); 717 | 718 | //! \brief Label collection of clusters according to the ground-truth cluster indices 719 | //! 720 | //! \param gt const CollectionT& - ground-truth cluster collection 721 | //! \param cn const CollectionT& - processing cluster collection 722 | // //! \param lostcls const RawIds& - indices of the lost clusters during the node base 723 | // //! synchronization 724 | //! \param prob bool - Partial Probabilities or F1 (harmonic) matching policy 725 | //! \param weighted=true bool - weight labels by the number of instances or 726 | //! treat each label equally 727 | //! \param flname=nullptr const char* - resulting label indices filename (.cll format) 728 | // //! \param verbose=false bool - print intermediate results to the stdout 729 | //! \return PrcRec - resulting precision and recall for the labeled items 730 | static PrcRec label(const CollectionT& gt, const CollectionT& cn //, const RawIds& lostcls 731 | , bool prob, bool weighted=true, const char* flname=nullptr); //, bool verbose=false); 732 | 733 | //! 
\brief Specified F1 evaluation of the Greatest (Max) Match for the 734 | //! multi-resolution clustering with possibly unequal node base 735 | //! 736 | //! Supported F1 measures are F1p <= F1h <= F1s, where: 737 | //! - F1p - Harmonic mean of the [weighted] average of partial probabilities, 738 | //! the most discriminative and satisfies the largest number of the Formal 739 | //! Constraints (homogeneity, completeness, rag bag, size/quantity, balance); 740 | //! - F1h - Harmonic mean of the [weighted] average of F1s; 741 | //! - F1a - Average F1-Score, i.e. arithmetic mean (average) of the [weighted] 742 | //! average of F1s, the least discriminative and satisfies the lowest number 743 | //! of the Formal Constraints. 744 | //! 745 | //! of the Greatest (Max) Match [Weighted] Average Harmonic Mean evaluation 746 | //! \note Undirected (symmetric) evaluation 747 | //! 748 | //! \param cn1 const CollectionT& - first collection 749 | //! \param cn2 const CollectionT& - second collection 750 | //! \param kind F1 - kind of F1 to be evaluated 751 | //! \param rec Prob& - recall of cn2 relative to the ground-truth cn1 or 752 | //! 0 if the matching strategy does not have the precision/recall notations 753 | //! \param prc Prob& - precision of cn2 relative to the ground-truth cn1 or 754 | //! 0 if the matching strategy does not have the precision/recall notations 755 | //! \param mkind=Match::WEIGHTED Match - matching kind 756 | //! \param verbose=false bool - print intermediate results to the stdout 757 | //! \return Prob - resulting F1_gm 758 | static Prob f1(const CollectionT& cn1, const CollectionT& cn2, F1 kind 759 | , Prob& rec, Prob& prc, Match mkind=Match::WEIGHTED, bool verbose=false); 760 | 761 | //! \brief NMI evaluation 762 | //! \note Undirected (symmetric) evaluation 763 | //! 764 | //! \param cn1 const CollectionT& - first collection 765 | //! \param cn2 const CollectionT& - second collection 766 | //! 
\param expbase=false bool - use ln (exp base) or log2 (Shannon entropy, bits) 767 | //! for the information measuring 768 | //! \param verbose=false bool - perform additional verification and print details 769 | //! \return RawNmi - resulting NMI 770 | static RawNmi nmi(const CollectionT& cn1, const CollectionT& cn2, bool expbase=false 771 | , bool verbose=false); 772 | protected: 773 | // Label related functions ------------------------------------------------- 774 | //! \brief Mark clusters of the argument collection with the labels 775 | //! \note For EACH label the best matching cluster is identified. Mutual match 776 | //! is not applied to guarantee coverage of the all ground-truth clusters to 777 | //! have meaningful F1 778 | //! 779 | //! \param cn const CollectionT& - the collection to be labeled 780 | //! \param prob bool - match labels by the Partial Probabilities or F1; 781 | //! prob maximizes gain otherwise loss is minimized and F1 is maximized 782 | //! \param weighted=true bool - weight labels by the number of instances or 783 | //! treat each label equally 784 | //! \param csls=nullptr ClsLabels* - resulting labels as clusters of the 785 | //! ground-truth collection if not nullptr 786 | //! \return PrcRec - resulting average over all labels Precision and Recall 787 | //! for all nodes of the marked clusters, where each label can be assigned 788 | //! to multiple cn clusters and then all nodes of that clusters are matched 789 | //! to the ground truth cluster (label) nodes 790 | PrcRec mark(const CollectionT& cn, bool prob, bool weighted=true, ClsLabels* csls=nullptr) const; 791 | 792 | // F1-related functions ---------------------------------------------------- 793 | //! \brief Average of the maximal matches (by F1 or partial probabilities) 794 | //! relative to the specified collection FROM this one 795 | //! \note External cn collection can have unequal node base and overlapping 796 | //! clusters on multiple resolutions. 
Small collection relative to the average 797 | //! or average relative to huge might yield best matching F1 equal to 1, but 798 | //! then the back match should be small. 799 | //! \attention Directed (non-symmetric) evaluation 800 | //! 801 | //! \param gmats const Probs& - greatest (max) matching with another collection 802 | //! \param weighted bool - weighted average by cluster size 803 | //! \return AccProb - resulting max average match value from this collection 804 | //! to the specified one (DIRECTED) 805 | inline AccProb avggms(const Probs& gmats, bool weighted) const; // const CollectionT& cn 806 | 807 | //! \brief Greatest (Max) matching value (F1 or partial probability) for each cluster 808 | //! to the corresponding clusters of the specified collection 809 | //! \note External cn collection can have unequal node base and overlapping 810 | //! clusters on multiple resolutions 811 | //! \attention Directed (non-symmetric) evaluation 812 | //! \post Modifies internal state of the collection 813 | //! 814 | //! \param cn const CollectionT& - collection to compare with 815 | //! \param prob bool - evaluate partial probability instead of F1 816 | //! \return Probs - resulting max F1 or partial probability for cluster 817 | //! (all member nodes are considered in the cluster) 818 | Probs gmatches(const CollectionT& cn, bool prob) const; 819 | 820 | // NMI-related functions --------------------------------------------------- 821 | //! \brief NMI evaluation considering overlaps, multi-resolution and possibly 822 | //! unequal node base 823 | //! \note Undirected (symmetric) evaluation 824 | //! 825 | //! \param cn const CollectionT& - collection to compare with 826 | //! \param expbase bool - use ln (exp base) or log2 (Shannon entropy, bits) 827 | //! for the information measuring 828 | //! \return RawNmi - resulting NMI 829 | RawNmi nmi(const CollectionT& cn, bool expbase) const; 830 | 831 | //! 
\brief Clear contributions in each cluster and optionally 832 | //! evaluate the clusters matching 833 | //! 834 | //! \param cn const CollectionT& - foreign collection to be processed with this one 835 | //! \param[out] clsmm=nullptr ClustersMatchingT* - clusters matching matrix to be filled 836 | //! \return AccCont - sum of all values of the clsmm matrix if specified 837 | AccCont evalconts(const CollectionT& cn, ClustersMatching* clsmm=nullptr) const; 838 | 839 | //! \brief Clear contributions in each cluster 840 | //! 841 | //! \return void 842 | void clearconts() const noexcept; 843 | }; 844 | 845 | // Accessory functions --------------------------------------------------------- 846 | //! \brief Compile time pair selector 847 | //! 848 | //! \tparam FIRST bool - select .first or .second 849 | //! 850 | //! \param pr P& - pair 851 | //! \return selected field 852 | template 853 | enable_if_t& pairsel(P& pr) noexcept { return pr.first; } 854 | 855 | template 856 | enable_if_t& pairsel(P& pr) noexcept { return pr.second; } 857 | 858 | //! \brief Parse decimal c-string as id 859 | //! 860 | //! \param str char* - id string 861 | //! \return Id - id value 862 | Id parseId(char* str); 863 | 864 | //! \brief Harmonic mean 865 | //! \note a + b = 0 are threated correctly resulting 0 866 | //! 867 | //! \param a AccProb - first item 868 | //! \param b AccProb - second item 869 | //! \return AccProb - resulting mean 870 | AccProb hmean(AccProb a, AccProb b) noexcept; 871 | 872 | //! \brief Geometric mean 873 | //! 874 | //! \param a AccProb - first item 875 | //! \param b AccProb - second item 876 | //! \return AccProb - resulting mean 877 | AccProb gmean(AccProb a, AccProb b) noexcept; 878 | 879 | //! \brief Arithmetic mean (average) 880 | //! 881 | //! \param a AccProb - first item 882 | //! \param b AccProb - second item 883 | //! 
\return AccProb - resulting mean 884 | AccProb amean(AccProb a, AccProb b) noexcept; 885 | 886 | #endif // INTERFACE_H 887 | --------------------------------------------------------------------------------