├── data ├── distinct1.cnl ├── 2cls3nds.cnl ├── 2cls3nds_novp.cnl ├── 3cls5nds_novp.cnl ├── 3cls5nds_novp2.cnl ├── 1lev4nds3cls1s.cnl ├── 1lev4nds3cls2.cnl ├── 1lev4nds4cls.cnl ├── 4nds_1lev_novp1.cnl ├── 4nds_1lev_novp2.cnl ├── 15nds_novp2.cnl ├── 15nds_novp3.cnl ├── 15nds_novp1.cnl ├── 4nds_2lev_novp1.cnl ├── 4nds_2lev_novp2.cnl ├── 3cls5nds_2lev.cnl ├── omega_c2-1.cnl ├── omega_c2-2.cnl ├── omega_c4.3-2.cnl ├── 3cls5nds_novp_3lev.cnl ├── omega_c4.3-1.cnl ├── 4cls5nds.cnl ├── 4cls6nds.cnl ├── 1lev4nds2cls.cnl ├── 4cls5nds_2lev.cnl ├── 4cls6nds_2lev.cnl └── 3cls5nds.cnl ├── docs └── xmeasures.pdf ├── images └── CPU-Timings-DBLP.png ├── GenerateArgparser.sh ├── .gitignore ├── shared ├── macrodef.h ├── agghash.hpp ├── fileio.cpp └── fileio.hpp ├── xmeasures.py ├── xmeasures.cbp ├── Makefile_lib ├── libxmeasures.cbp ├── Makefile ├── src ├── interface.cpp ├── main.cpp └── interface_c.cpp ├── include ├── interface_c.h └── interface.h ├── args.ggo ├── LICENSE ├── README.md └── autogen └── cmdline.h /data/distinct1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 1, Nodes: 1, Fuzzy: 0, Numbered: 0 2 | 9 3 | -------------------------------------------------------------------------------- /data/2cls3nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 2, Nodes:3,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | -------------------------------------------------------------------------------- /data/2cls3nds_novp.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 2, Nodes:3,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 4 | -------------------------------------------------------------------------------- /data/3cls5nds_novp.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:0,Fuzzy: 0, Numbered: 0 2 | 1 2 3 3 | 4 4 | 5 5 | 
-------------------------------------------------------------------------------- /data/3cls5nds_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:0,Fuzzy: 0, Numbered: 0 2 | 1 2 3 3 | 4 5 4 | -------------------------------------------------------------------------------- /data/1lev4nds3cls1s.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:4,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | 4 5 | -------------------------------------------------------------------------------- /data/1lev4nds3cls2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:4,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | 4 2 5 | -------------------------------------------------------------------------------- /docs/xmeasures.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eXascaleInfolab/xmeasures/HEAD/docs/xmeasures.pdf -------------------------------------------------------------------------------- /data/1lev4nds4cls.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3, Nodes:4,Fuzzy: 0, Numbered: 0 2 | 1 2 3 | 3 2 4 | 4 2 5 | 3 4 6 | -------------------------------------------------------------------------------- /data/4nds_1lev_novp1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes: 4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 4 5 | -------------------------------------------------------------------------------- /data/4nds_1lev_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 6, Nodes:4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 5 | 4 6 | -------------------------------------------------------------------------------- /images/CPU-Timings-DBLP.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eXascaleInfolab/xmeasures/HEAD/images/CPU-Timings-DBLP.png -------------------------------------------------------------------------------- /data/15nds_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:15,Fuzzy: 0, Numbered: 0 2 | 1 2 3 11 12 13 3 | 4 5 4 | 6 7 8 14 15 5 | 9 10 6 | -------------------------------------------------------------------------------- /data/15nds_novp3.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:15,Fuzzy: 0, Numbered: 0 2 | 1 2 3 8 4 5 3 | 6 7 4 | 11 12 13 14 15 5 | 9 10 6 | -------------------------------------------------------------------------------- /data/15nds_novp1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 5, Nodes:15,Fuzzy: 0, Numbered: 0 2 | 1 2 3 3 | 4 5 14 4 | 6 7 8 5 | 9 10 11 12 13 6 | 15 7 | -------------------------------------------------------------------------------- /data/4nds_2lev_novp1.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes: 4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 4 5 | # Level 2 6 | 1 2 3 4 7 | -------------------------------------------------------------------------------- /data/4nds_2lev_novp2.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 6, Nodes:4, Fuzzy: 0, Numbered: 0 2 | # Level 1 3 | 1 2 4 | 3 5 | 4 6 | # Level 2 7 | 1 2 3 8 | 4 9 | -------------------------------------------------------------------------------- /data/3cls5nds_2lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 0 Nodes:5 2 | # Multilevel and overlapping collection 3 | # First level 4 | 0> 1 2:0.2 3:1 5 | 1> 2:0.3 4 6 | 2> 5 2:0.3 7 | # Second 
level 8 | 1 3 9 | 2 4 5 10 | -------------------------------------------------------------------------------- /data/omega_c2-1.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 236 (8) 3 | # Pairs 15 (6, 9) => 6*5, i.e. 6 nodes 4 | 0 1 2 5 | 3 4 5 6 | -------------------------------------------------------------------------------- /data/omega_c2-2.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 236 (8) 3 | # Pairs 15 (6, 9) => 6*5, i.e. 6 nodes 4 | 0 1 5 | 2 3 4 6 | 5 7 | -------------------------------------------------------------------------------- /data/omega_c4.3-2.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 240 (12) 3 | # Nodes: 10, Clusters: 3 4 | 0 1 2 3 5 | 2 3 4 5 6 6 | 7 8 9 7 | -------------------------------------------------------------------------------- /data/3cls5nds_novp_3lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 0, Nodes:5,Fuzzy: 0, Numbered: 0 2 | # Multilevel flattened collection 3 | # First level 4 | 1 2 3 5 | 4 6 | 5 7 | # Second level 8 | 1 2 3 9 | 4 5 10 | # Third level 11 | 1 2 3 4 5 12 | -------------------------------------------------------------------------------- /data/omega_c4.3-1.cnl: -------------------------------------------------------------------------------- 1 | # Omega-a_general_formulation_of_the_Rand_index_of_cluster_recovery_suitable_for_non-disjoint_solutions-1988.pdf 2 | # Solution V, p. 
240 (12) 3 | # Nodes: 10, Clusters: 4 4 | 0 1 2 3 5 | 3 4 5 6 6 | 7 8 7 | 9 8 | -------------------------------------------------------------------------------- /data/4cls5nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | 7 | # Cleared version: 8 | # 1 2 [Matches in 3cls5nds: 1+3=4] 9 | # 2 4 [3+1=4] 10 | # 5 2 3 [1+3+1=5] 11 | # 3 [1] 12 | -------------------------------------------------------------------------------- /data/4cls6nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:6, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 6 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | 7 | # Cleared version: 8 | # 1 2 6 [Matches in 3cls5nds: 1+3+0=4] 9 | # 2 4 [3+1=4] 10 | # 5 2 3 [1+3+1=5] 11 | # 3 [1] 12 | -------------------------------------------------------------------------------- /data/1lev4nds2cls.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 2, Nodes:4,Fuzzy: 0, Numbered: 0 2 | # Note: 1lev4nds2cls matching with 1lev4nds3cls1s vs 1lev4nds3cls2 is an example 3 | # where standard novp NMI (not gecmi NMIovp) fails to evaluate overlapping 4 | # matching unlike F1. 
5 | 1 2 6 | 3 4 2 7 | -------------------------------------------------------------------------------- /data/4cls5nds_2lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | # Lev 2 7 | 1 2 4 8 | 5 2 3 9 | 10 | # Cleared version: 11 | # 1 2 [Matches in 3cls5nds: 1+3=4] 12 | # 2 4 [3+1=4] 13 | # 5 2 3 [1+3+1=5] 14 | # 3 [1] 15 | -------------------------------------------------------------------------------- /data/4cls6nds_2lev.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 4, Nodes:6, Fuzzy: 0, Numbered: 0 2 | 1 2:0.2 6 3 | 2:0.3 4 4 | 5 2:0.3 3 5 | 3 6 | # Lev 2 7 | 1 2 6 3 8 | 5 2 3 4 9 | 10 | # Cleared version: 11 | # 1 2 6 [Matches in 3cls5nds: 1+3+0=4] 12 | # 2 4 [3+1=4] 13 | # 5 2 3 [1+3+1=5] 14 | # 3 [1] 15 | -------------------------------------------------------------------------------- /GenerateArgparser.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Generate Arguments parser 3 | 4 | sh -c 'mkdir autogen' 2> /dev/null 5 | gengetopt --output-dir autogen -i args.ggo 6 | 7 | if [ $? 
-eq 0 ]; then 8 | echo "The arguments parser is generated" 9 | else 10 | echo "The arguments parser generation is FAILED" 11 | fi 12 | -------------------------------------------------------------------------------- /data/3cls5nds.cnl: -------------------------------------------------------------------------------- 1 | # Clusters: 3 Nodes:5,Fuzzy: 1, Numbered: 1 2 | # Note that the number of clusters corresponds to the number of payload lines in the file 3 | 0> 1 2:0.2 3:1 4 | # Empty lines and comments are allowed 5 | 1> 2:0.3 4 6 | 2> 5 2:0.3 7 | 8 | # Cleared version: 9 | # 1 2 3 [Mathces in 4cls6nds: 1+3+2=6] 10 | # 2 4 [3+1=4] 11 | # 5 2 [1+3=4] 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | 31 | # Codeblocks files 32 | *.depend 33 | *.layout 34 | -------------------------------------------------------------------------------- /shared/macrodef.h: -------------------------------------------------------------------------------- 1 | //! \brief Global macro definitions. 2 | //! The Dao (Deterministic Agglomerative Overlapping) of Clustering library: 3 | //! Robust & Fine-grained Deterministic Clustering for Large Networks. 4 | //! 5 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 6 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 7 | //! 8 | //! Copyright (c) 9 | //! \authr Artem Lutov 10 | //! \email luart@ya.ru 11 | //! 
\date 2016-07-25 12 | 13 | #ifndef MACRODEF_H 14 | #define MACRODEF_H 15 | 16 | // Global MACROSES: 17 | // - VALIDATE - use alternative evaluations to validate results 18 | // - 0 - turn off heavy validation 19 | // - 1 - default value for the heavy validation 20 | // - 2 - extra heavy validation (might duplicate already performed heavy validation) 21 | // - 3 - cross validation of functions (executed on each call, but only once is enough) 22 | // 23 | // - TRACE, TRACE_EXTRA - detailed tracing under debug (trace nodes weights) 24 | // - 0 - turn off the tracing 25 | // - 1 - brief tracing that can be used in release to show warnings, etc. 26 | // - 2 - detailed tracing for DEBUG 27 | // - 3 - extra detailed tracing 28 | // 29 | // - FTRACE_GLOBAL - use global ftrace file for the whole project, or "shared/" headers 30 | // define it locally 31 | // 32 | // - UTEST - build [also] unit tests, requires installation and linking of the unit test library. 33 | // 34 | // - NO_FILEIO - omit STL file I/O related routines 35 | // 36 | // NOTE: undefined macro definition is interpreted as having value 0 37 | 38 | #ifndef TRACE 39 | #ifdef DEBUG 40 | #define TRACE 2 41 | #elif !defined(NDEBUG) // RELEASE, !NDEBUG 42 | #define TRACE 1 43 | //#else // RELEASE, NDEBUG 44 | // #define TRACE 0 45 | #endif // DEBUG 46 | #endif // TRACE 47 | 48 | #ifndef VALIDATE 49 | #ifdef DEBUG 50 | #define VALIDATE 2 51 | #elif !defined(NDEBUG) // RELEASE, !NDEBUG 52 | #define VALIDATE 1 53 | //#else // ELEASE, NDEBUG 54 | // #define VALIDATE 0 55 | #endif // DEBUG 56 | #endif // VALIDATE 57 | 58 | // SWIG related macro definitions 59 | // Swig 3.0.12 does not understand some structures, workarounds are applied 60 | // Note: defined only for SWIG interfaces 61 | #ifdef SWIG 62 | // Just skip the static assert 63 | #define static_assert(a, b) 64 | #endif // SWIG 65 | 66 | // Note: SWIG_VERSION is not defined for SWIGJAVA and SWIGCSHARP 67 | // Note: defined both for the SWIG interfaces and 
implementation 68 | #if defined(SWIG_VERSION) || defined(SWIGJAVA) || defined(SWIGCSHARP) 69 | // Defined automatically when any SWIG processing is performed 70 | // (either the included as SWIG interface or implementation) 71 | #define DAOC_SWIGPROC 72 | #endif // SWIG processing 73 | 74 | // Define macros for the case when SWIG supports functions overloading 75 | #if defined(SWIGCSHARP) || defined(SWIGD) || defined(SWIGJAVA) 76 | #define SWIG_OVERLOADS 77 | #endif // OVERLOADS 78 | 79 | 80 | #endif // MACRODEF_H 81 | -------------------------------------------------------------------------------- /xmeasures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | :Description: Python API and usage example for C API of the xmeasures library 5 | Note that items in collections are allowed to be non-contiguous (i.e., hashes can be used as ids) 6 | 7 | :Authors: (c) Artem Lutov 8 | :Date: 2020-03-12 9 | """ 10 | 11 | import pathlib 12 | import numpy as np 13 | from ctypes import Structure, CDLL, POINTER, c_uint, c_float #, c_void_p 14 | from collections.abc import Iterable 15 | 16 | # Python wrappers for C types ------------------------------------------------------------------------------------------ 17 | c_uint_p = POINTER(c_uint) 18 | c_float_p = POINTER(c_float) 19 | # null_ptr = c_void_p() 20 | 21 | 22 | class ClusterNodes(Structure): 23 | _fields_ = [('num', c_uint), 24 | ('ids', c_uint_p), 25 | ('weights', c_float_p)] 26 | ClusterNodesPtr = POINTER(ClusterNodes) 27 | 28 | def clusterNodes(ids, weights=None): 29 | """ClusterNodes initialization 30 | 31 | ids: iterable(uint) - cluster node ids 32 | weights: iterable - cluster node weights 33 | 34 | return ClusterNodes 35 | """ 36 | assert isinstance(ids, Iterable) and (weights is None or isinstance(weights, Iterable)), 'Invalid argument types' 37 | cnIds = (c_uint * len(ids))(*ids) 38 | cnWeights = c_float_p() if not 
weights else (c_float * len(weights))(*weights) 39 | return ClusterNodes(len(ids), cnIds, cnWeights) 40 | 41 | 42 | class ClusterCollection(Structure): 43 | _fields_ = [('num', c_uint), 44 | ('nodes', ClusterNodesPtr)] 45 | 46 | def clusterCollection(clusters): 47 | """ClusterCollection initialization 48 | 49 | clusters: iterable(iterable(uint)) - clusters (collection of nodes) 50 | 51 | return ClusterCollection 52 | """ 53 | assert isinstance(clusters, Iterable) and isinstance(clusters[0], Iterable), 'Invalid argument type' 54 | cc = (ClusterNodes * len(clusters))(*(clusterNodes(nds) for nds in clusters)) 55 | return ClusterCollection(len(clusters), cc) 56 | 57 | 58 | def weightedClusterCollection(clusters): 59 | """ClusterCollection initialization 60 | 61 | nodes: iterable((iterable(uint), iterable(float))) - weighted clusters (collections of nodes and their weights) 62 | 63 | return ClusterCollection 64 | """ 65 | assert isinstance(clusters, Iterable) and len(clusters[0]) == 2 and isinstance(clusters[0][0], Iterable), 'Invalid argument type' 66 | cc = (c_uint * len(clusters))(*(clusterNodes(nds, wgs) for nds, wgs in clusters)) 67 | return ClusterCollection(len(clusters), cc) 68 | 69 | 70 | # Example of xmeasures usage from Python ------------------------------------------------------------------------------- 71 | if __name__ == "__main__": 72 | # Load the shared library into ctypes 73 | libXms = pathlib.Path().absolute() / "bin/Release/libxmeasures.so" 74 | xms = CDLL(libXms) 75 | # Set proper return types for the importing functions 76 | xms.f1p.restype = c_float 77 | xms.f1h.restype = c_float 78 | xms.omegaExt.restype = c_float 79 | xms.omega.restype = c_float 80 | # Perform evaluations 81 | nc1 = clusterCollection(((9,2,4), (2,13))) 82 | nc2 = clusterCollection([[9,13,2], [2,4]]) 83 | print('F1p: {}, F1h: {}, omegaExt: {}, omega: {}'.format( 84 | xms.f1p(nc1, nc2), 85 | xms.f1h(nc1, nc2), 86 | xms.omegaExt(nc1, nc2), 87 | xms.omega(nc1, nc2) 88 | )) 89 | 
-------------------------------------------------------------------------------- /xmeasures.cbp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 106 | 107 | -------------------------------------------------------------------------------- /Makefile_lib: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # This makefile was generated by 'cbp2make' tool rev.147 # 3 | #------------------------------------------------------------------------------# 4 | 5 | 6 | WORKDIR = `pwd` 7 | 8 | CC = gcc 9 | CXX = g++ 10 | AR = ar 11 | LD = g++ 12 | WINDRES = windres 13 | 14 | INC = -Iinclude -Iautogen -Ishared 15 | CFLAGS = -Wnon-virtual-dtor -Winit-self -Wcast-align -Wundef -Wfloat-equal -Wunreachable-code -Weffc++ -std=c++14 -Wmissing-include-dirs -Wzero-as-null-pointer-constant -fpic -fexceptions -fstack-protector-strong -fstack-clash-protection -Werror=format-security -Wold-style-cast -Wno-unused-function -Wno-noexcept-type -D_FORTIFY_SOURCE=2 -DNO_FILEIO -DC_API 16 | RESINC = 17 | LIBDIR = 18 | LIB = 19 | LDFLAGS = -Wl,-z,defs 20 | 21 | INC_DEBUG = $(INC) 22 | CFLAGS_DEBUG = $(CFLAGS) -Wredundant-decls -Winline -Wswitch-enum -Wswitch-default -Wmain -Wall -fPIC -Og -g -Wsuggest-final-types -Wsuggest-final-methods -ftemplate-backtrace-limit=32 -fasynchronous-unwind-tables -fsanitize=leak -fsanitize=address -DDEBUG -D_GLIBCXX_DEBUG -D_GLIBCXX_ASSERTIONS -DTRACE=2 -DVALIDATE=2 23 | RESINC_DEBUG = $(RESINC) 24 | RCFLAGS_DEBUG = $(RCFLAGS) 25 | LIBDIR_DEBUG = $(LIBDIR) 26 | LIB_DEBUG = $(LIB)-lasan 27 | LDFLAGS_DEBUG = $(LDFLAGS) 28 | OBJDIR_DEBUG = obj/Debug 29 | DEP_DEBUG = 30 | OUT_DEBUG = bin/Debug/libxmeasures.so 31 | 32 | INC_RELEASE = $(INC) 33 | CFLAGS_RELEASE = $(CFLAGS) -fomit-frame-pointer -O3 -march=core2 -ftemplate-backtrace-limit=32 -Wno-strict-aliasing -DTRACE=1 -DVALIDATE=1 34 | RESINC_RELEASE = 
$(RESINC) 35 | RCFLAGS_RELEASE = $(RCFLAGS) 36 | LIBDIR_RELEASE = $(LIBDIR) 37 | LIB_RELEASE = $(LIB) 38 | LDFLAGS_RELEASE = $(LDFLAGS) -s 39 | OBJDIR_RELEASE = obj/Release 40 | DEP_RELEASE = 41 | OUT_RELEASE = bin/Release/libxmeasures.so 42 | 43 | OBJ_DEBUG = $(OBJDIR_DEBUG)/src/interface.o $(OBJDIR_DEBUG)/src/interface_c.o 44 | 45 | OBJ_RELEASE = $(OBJDIR_RELEASE)/src/interface.o $(OBJDIR_RELEASE)/src/interface_c.o 46 | 47 | all: debug release 48 | 49 | clean: clean_debug clean_release 50 | 51 | before_debug: 52 | test -d bin/Debug || mkdir -p bin/Debug 53 | test -d $(OBJDIR_DEBUG)/src || mkdir -p $(OBJDIR_DEBUG)/src 54 | 55 | after_debug: 56 | 57 | debug: before_debug out_debug after_debug 58 | 59 | out_debug: before_debug $(OBJ_DEBUG) $(DEP_DEBUG) 60 | $(LD) -shared $(LIBDIR_DEBUG) $(OBJ_DEBUG) -o $(OUT_DEBUG) $(LDFLAGS_DEBUG) $(LIB_DEBUG) 61 | 62 | $(OBJDIR_DEBUG)/src/interface.o: src/interface.cpp 63 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/interface.cpp -o $(OBJDIR_DEBUG)/src/interface.o 64 | 65 | $(OBJDIR_DEBUG)/src/interface_c.o: src/interface_c.cpp 66 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/interface_c.cpp -o $(OBJDIR_DEBUG)/src/interface_c.o 67 | 68 | clean_debug: 69 | rm -f $(OBJ_DEBUG) $(OUT_DEBUG) 70 | rm -rf bin/Debug 71 | rm -rf $(OBJDIR_DEBUG)/src 72 | 73 | before_release: 74 | test -d bin/Release || mkdir -p bin/Release 75 | test -d $(OBJDIR_RELEASE)/src || mkdir -p $(OBJDIR_RELEASE)/src 76 | 77 | after_release: 78 | 79 | release: before_release out_release after_release 80 | 81 | out_release: before_release $(OBJ_RELEASE) $(DEP_RELEASE) 82 | $(LD) -shared $(LIBDIR_RELEASE) $(OBJ_RELEASE) -o $(OUT_RELEASE) $(LDFLAGS_RELEASE) $(LIB_RELEASE) 83 | 84 | $(OBJDIR_RELEASE)/src/interface.o: src/interface.cpp 85 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/interface.cpp -o $(OBJDIR_RELEASE)/src/interface.o 86 | 87 | $(OBJDIR_RELEASE)/src/interface_c.o: src/interface_c.cpp 88 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/interface_c.cpp -o 
$(OBJDIR_RELEASE)/src/interface_c.o 89 | 90 | clean_release: 91 | rm -f $(OBJ_RELEASE) $(OUT_RELEASE) 92 | rm -rf bin/Release 93 | rm -rf $(OBJDIR_RELEASE)/src 94 | 95 | .PHONY: before_debug after_debug clean_debug before_release after_release clean_release 96 | 97 | -------------------------------------------------------------------------------- /libxmeasures.cbp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 108 | 109 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # This makefile was generated by 'cbp2make' tool rev.147 # 3 | #------------------------------------------------------------------------------# 4 | 5 | 6 | WORKDIR = `pwd` 7 | 8 | CC = gcc 9 | CXX = g++ 10 | AR = ar 11 | LD = g++ 12 | WINDRES = windres 13 | 14 | INC = -Iinclude -Iautogen -Ishared 15 | CFLAGS = -Wnon-virtual-dtor -Winit-self -Wcast-align -Wundef -Wfloat-equal -Wunreachable-code -Wmissing-include-dirs -Weffc++ -Wzero-as-null-pointer-constant -Wall -std=c++14 -fexceptions -fstack-protector-strong -fstack-clash-protection -Werror=format-security -Wold-style-cast -Wno-float-equal -D_FORTIFY_SOURCE=2 16 | RESINC = 17 | LIBDIR = 18 | LIB = -lstdc++fs 19 | LDFLAGS = -Wl,-z,defs 20 | 21 | INC_DEBUG = $(INC) 22 | CFLAGS_DEBUG = $(CFLAGS) -Wredundant-decls -Wswitch-default -Wmain -Wextra -Og -g -fasynchronous-unwind-tables -fsanitize=address -fsanitize=leak -DDEBUG -D_GLIBCXX_DEBUG -D_GLIBCXX_ASSERTIONS -DTRACE=2 -DVALIDATE=2 23 | RESINC_DEBUG = $(RESINC) 24 | RCFLAGS_DEBUG = $(RCFLAGS) 25 | LIBDIR_DEBUG = $(LIBDIR) 26 | LIB_DEBUG = $(LIB) -lasan 27 | LDFLAGS_DEBUG = $(LDFLAGS) 28 | OBJDIR_DEBUG = obj/Debug 29 | DEP_DEBUG = 30 | OUT_DEBUG = bin/Debug/xmeasures 31 | 32 | INC_RELEASE = $(INC) 33 | CFLAGS_RELEASE = $(CFLAGS) 
-march=core2 -fomit-frame-pointer -O3 -Wfatal-errors -DTRACE=1 -DVALIDATE=1 34 | RESINC_RELEASE = $(RESINC) 35 | RCFLAGS_RELEASE = $(RCFLAGS) 36 | LIBDIR_RELEASE = $(LIBDIR) 37 | LIB_RELEASE = $(LIB) 38 | LDFLAGS_RELEASE = $(LDFLAGS) -s -Wl,-z,relro,-z,now 39 | OBJDIR_RELEASE = obj/Release 40 | DEP_RELEASE = 41 | OUT_RELEASE = bin/Release/xmeasures 42 | 43 | OBJ_DEBUG = $(OBJDIR_DEBUG)/autogen/cmdline.o $(OBJDIR_DEBUG)/shared/fileio.o $(OBJDIR_DEBUG)/src/interface.o $(OBJDIR_DEBUG)/src/main.o 44 | 45 | OBJ_RELEASE = $(OBJDIR_RELEASE)/autogen/cmdline.o $(OBJDIR_RELEASE)/shared/fileio.o $(OBJDIR_RELEASE)/src/interface.o $(OBJDIR_RELEASE)/src/main.o 46 | 47 | all: debug release 48 | 49 | clean: clean_debug clean_release 50 | 51 | before_debug: 52 | test -d bin/Debug || mkdir -p bin/Debug 53 | test -d $(OBJDIR_DEBUG)/autogen || mkdir -p $(OBJDIR_DEBUG)/autogen 54 | test -d $(OBJDIR_DEBUG)/shared || mkdir -p $(OBJDIR_DEBUG)/shared 55 | test -d $(OBJDIR_DEBUG)/src || mkdir -p $(OBJDIR_DEBUG)/src 56 | 57 | after_debug: 58 | 59 | debug: before_debug out_debug after_debug 60 | 61 | out_debug: before_debug $(OBJ_DEBUG) $(DEP_DEBUG) 62 | $(LD) $(LIBDIR_DEBUG) -o $(OUT_DEBUG) $(OBJ_DEBUG) $(LDFLAGS_DEBUG) $(LIB_DEBUG) 63 | 64 | $(OBJDIR_DEBUG)/autogen/cmdline.o: autogen/cmdline.c 65 | $(CC) $(CFLAGS_DEBUG) $(INC_DEBUG) -c autogen/cmdline.c -o $(OBJDIR_DEBUG)/autogen/cmdline.o 66 | 67 | $(OBJDIR_DEBUG)/shared/fileio.o: shared/fileio.cpp 68 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c shared/fileio.cpp -o $(OBJDIR_DEBUG)/shared/fileio.o 69 | 70 | $(OBJDIR_DEBUG)/src/interface.o: src/interface.cpp 71 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/interface.cpp -o $(OBJDIR_DEBUG)/src/interface.o 72 | 73 | $(OBJDIR_DEBUG)/src/main.o: src/main.cpp 74 | $(CXX) $(CFLAGS_DEBUG) $(INC_DEBUG) -c src/main.cpp -o $(OBJDIR_DEBUG)/src/main.o 75 | 76 | clean_debug: 77 | rm -f $(OBJ_DEBUG) $(OUT_DEBUG) 78 | rm -rf bin/Debug 79 | rm -rf $(OBJDIR_DEBUG)/autogen 80 | rm -rf $(OBJDIR_DEBUG)/shared 81 | 
rm -rf $(OBJDIR_DEBUG)/src 82 | 83 | before_release: 84 | test -d bin/Release || mkdir -p bin/Release 85 | test -d $(OBJDIR_RELEASE)/autogen || mkdir -p $(OBJDIR_RELEASE)/autogen 86 | test -d $(OBJDIR_RELEASE)/shared || mkdir -p $(OBJDIR_RELEASE)/shared 87 | test -d $(OBJDIR_RELEASE)/src || mkdir -p $(OBJDIR_RELEASE)/src 88 | 89 | after_release: 90 | 91 | release: before_release out_release after_release 92 | 93 | out_release: before_release $(OBJ_RELEASE) $(DEP_RELEASE) 94 | $(LD) $(LIBDIR_RELEASE) -o $(OUT_RELEASE) $(OBJ_RELEASE) $(LDFLAGS_RELEASE) $(LIB_RELEASE) 95 | 96 | $(OBJDIR_RELEASE)/autogen/cmdline.o: autogen/cmdline.c 97 | $(CC) $(CFLAGS_RELEASE) $(INC_RELEASE) -c autogen/cmdline.c -o $(OBJDIR_RELEASE)/autogen/cmdline.o 98 | 99 | $(OBJDIR_RELEASE)/shared/fileio.o: shared/fileio.cpp 100 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c shared/fileio.cpp -o $(OBJDIR_RELEASE)/shared/fileio.o 101 | 102 | $(OBJDIR_RELEASE)/src/interface.o: src/interface.cpp 103 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/interface.cpp -o $(OBJDIR_RELEASE)/src/interface.o 104 | 105 | $(OBJDIR_RELEASE)/src/main.o: src/main.cpp 106 | $(CXX) $(CFLAGS_RELEASE) $(INC_RELEASE) -c src/main.cpp -o $(OBJDIR_RELEASE)/src/main.o 107 | 108 | clean_release: 109 | rm -f $(OBJ_RELEASE) $(OUT_RELEASE) 110 | rm -rf bin/Release 111 | rm -rf $(OBJDIR_RELEASE)/autogen 112 | rm -rf $(OBJDIR_RELEASE)/shared 113 | rm -rf $(OBJDIR_RELEASE)/src 114 | 115 | .PHONY: before_debug after_debug clean_debug before_release after_release clean_release 116 | 117 | -------------------------------------------------------------------------------- /src/interface.cpp: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation interface implementation. 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 
6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! \date 2017-12-15 10 | 11 | #include 12 | //#include 13 | #include 14 | 15 | #include "operations.hpp" 16 | #include "interface.h" 17 | 18 | 19 | using std::overflow_error; 20 | using std::invalid_argument; 21 | using namespace daoc; 22 | 23 | 24 | // Omega Index related types and functions ------------------------------------- 25 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b, const Id nmax) noexcept 26 | { 27 | #if VALIDATE >= 2 28 | assert(a && b && "mutualnum(), valid containers are expected"); 29 | #endif // VALIDATE 30 | Id num = 0; 31 | if(b->size() < a->size()) { 32 | auto t = b; 33 | b = a; 34 | a = t; 35 | } 36 | const auto eb = b->end(); 37 | auto ib = b->begin(); 38 | for(auto acp: *a) { 39 | while(ib != eb && cmpBase(*ib, acp)) 40 | ++ib; 41 | if(ib == eb 42 | || (*ib == acp && ++num >= nmax)) 43 | break; 44 | } 45 | return num; 46 | } 47 | 48 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b) noexcept 49 | { 50 | #if VALIDATE >= 2 51 | assert(a && b && "mutualnum(), valid containers are expected"); 52 | #endif // VALIDATE 53 | Id num = 0; 54 | if(b->size() < a->size()) { 55 | auto t = b; 56 | b = a; 57 | a = t; 58 | } 59 | const auto eb = b->end(); 60 | auto ib = b->begin(); 61 | for(auto acp: *a) { 62 | while(ib != eb && cmpBase(*ib, acp)) 63 | ++ib; 64 | if(ib == eb) 65 | break; 66 | if(*ib == acp) 67 | ++num; 68 | } 69 | return num; 70 | } 71 | 72 | // Other Measures related functions -------------------------------------------- 73 | //string to_string(Evaluation eval, bool bitstr) 74 | //{ 75 | // static_assert(sizeof(Evaluation) == sizeof(EvalBase) 76 | // , "to_string(), Evaluation type must be the same size as EvalBase"); 77 | // // Convert to bit string 78 | // if(bitstr) 79 | // return bitset(static_cast(eval)) 80 | // .to_string().insert(0, "0b"); 81 | // 82 | // // Convert to semantic string 83 | // string val; 84 | // 
switch(eval) { 85 | // case Evaluation::MULTIRES: 86 | // val = "MULTIRES"; 87 | // break; 88 | // case Evaluation::OVERLAPPING: 89 | // val = "OVERLAPPING"; 90 | // break; 91 | // case Evaluation::MULRES_OVP: 92 | // val = "MULRES_OVP"; 93 | // break; 94 | // case Evaluation::NONE: 95 | // default: 96 | // val = "NONE"; 97 | // } 98 | // return val; 99 | //} 100 | 101 | string to_string(F1 f1) 102 | { 103 | // Convert to semantic string 104 | string val; 105 | switch(f1) { 106 | case F1::PARTPROB: 107 | val = "PARTPROB"; 108 | break; 109 | case F1::HARMONIC: 110 | val = "HARMONIC"; 111 | break; 112 | case F1::AVERAGE: 113 | val = "AVERAGE"; // Suggested by Leskovec 114 | break; 115 | case F1::NONE: 116 | default: 117 | val = "NONE"; 118 | } 119 | return val; 120 | } 121 | 122 | string to_string(Match mkind) 123 | { 124 | // Convert to semantic string 125 | string val; 126 | switch(mkind) { 127 | case Match::WEIGHTED: 128 | val = "WEIGHTED"; 129 | break; 130 | case Match::UNWEIGHTED: 131 | val = "UNWEIGHTED"; 132 | break; 133 | case Match::COMBINED: 134 | val = "COMBINED"; 135 | break; 136 | case Match::NONE: 137 | default: 138 | val = "NONE"; 139 | } 140 | return val; 141 | } 142 | 143 | bool xwmatch(Match m) noexcept 144 | { 145 | return m == Match::WEIGHTED || m == Match::COMBINED; 146 | } 147 | 148 | 149 | bool xumatch(Match m) noexcept 150 | { 151 | return m == Match::UNWEIGHTED || m == Match::COMBINED; 152 | } 153 | 154 | #ifndef NO_FILEIO 155 | NodeBase NodeBase::load(const char* filename, float membership 156 | , ::AggHash* ahash, size_t cmin, size_t cmax, bool verbose) 157 | { 158 | NodeBase nb; // Return using NRVO optimization 159 | NamedFileWrapper finp(filename, "r"); 160 | if(finp) 161 | static_cast(nb) = loadNodes(finp, membership 162 | , ahash, cmin, cmax, verbose); 163 | else perror((string("WARNING load(), can't open ") += filename).c_str()); 164 | 165 | return nb; 166 | } 167 | #endif // NO_FILEIO 168 | 169 | // Accessory functions 
--------------------------------------------------------- 170 | Id parseId(char* str) 171 | { 172 | #if VALIDATE >= 2 173 | assert(!errno && "Initial errno should be zero"); 174 | #endif // VALIDATE 175 | auto nid = strtoul(str, nullptr, 10); 176 | static_assert(sizeof(nid) >= sizeof(Id), "Parsing value type is too small for Id"); 177 | if(nid > numeric_limits::max() || (!nid && errno != 0)) { 178 | if(nid > numeric_limits::max()) 179 | throw overflow_error("Loaded value of id is too large: " + std::to_string(nid) + "\n"); 180 | else if(errno != 0) 181 | throw invalid_argument(string("Conversion to id can't be performed: ").append(str) 182 | + ", errno: " + std::to_string(errno).append("\n")); 183 | } 184 | return nid; 185 | } 186 | 187 | AccProb hmean(AccProb a, AccProb b) noexcept 188 | { 189 | static_assert(is_floating_point::value, "AccProb should be a floating point type"); 190 | // Note: both a = b = 0 and a = -b are considered and yield 0 191 | return a + b != 0 ? 2 * a / (a + b) * b : 0; 192 | } 193 | 194 | AccProb gmean(AccProb a, AccProb b) noexcept 195 | { 196 | #ifdef DEBUG 197 | assert(a >= 0 && b >= 0 && "gmean(), the probabilities should E [0, 1]"); 198 | #endif // DEBUG 199 | return sqrt(a * b); 200 | } 201 | 202 | AccProb amean(AccProb a, AccProb b) noexcept 203 | { 204 | return (a + b) / 2; 205 | } 206 | -------------------------------------------------------------------------------- /include/interface_c.h: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation interface. 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! 
\date 2021-03-11 10 | 11 | #ifndef INTERFACE_C_H_INCLUDED 12 | #define INTERFACE_C_H_INCLUDED 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif // __cplusplus 17 | 18 | #include // uintX_t 19 | 20 | 21 | typedef uint32_t NodeId; //!< Node Id type 22 | typedef uint64_t AccNodeId; //!< Accumulated Node Id type 23 | typedef float LinkWeight; ///< Link weight 24 | 25 | //! \brief Node relations 26 | typedef struct { 27 | NodeId num; //!< The number of cluster nodes 28 | NodeId* ids; //!< Node ids 29 | LinkWeight* weights; //!< Node weights in this cluster, can be NULL which means equal weights = 1 30 | } ClusterNodes; 31 | 32 | //! \brief Node collection (clusters) 33 | typedef struct { 34 | NodeId num; //!< The number of node relations (clusters) in a collection 35 | ClusterNodes* nodes; //!< Relations of nodes 36 | } ClusterCollection; 37 | 38 | //! \brief F1 Kind 39 | typedef enum { 40 | //! Not initialized 41 | F1_NONE = 0, 42 | //! Harmonic mean of the [weighted] average of the greatest (maximal) match 43 | //! by partial probabilities 44 | F1_PARTPROB, 45 | //! Harmonic mean of the [weighted] average of the greatest (maximal) match by F1s 46 | F1_HARMONIC, 47 | //! Arithmetic mean (average) of the [weighted] average of the greatest (maximal) 48 | //! match by F1s, i.e. F1-Score 49 | F1_AVERAGE // Suggested by Leskovec 50 | } F1Kind; 51 | 52 | //! \brief Collection matching kind 53 | typedef enum { 54 | MATCH_NONE = 0, //!< Note initialized 55 | MATCH_WEIGHTED, //!< Weighted matching by the number of members in each cluster (macro weighting) 56 | MATCH_UNWEIGHTED, //!< Unweighted matching of each cluster (micro weighting) 57 | MATCH_COMBINED //!< Combined of macro and micro weightings using geometric mean 58 | } MatchKind; 59 | 60 | typedef float Probability; 61 | 62 | //! \brief Specified F1 evaluation of the Greatest (Max) Match for the 63 | //! multi-resolution clustering with possibly unequal node base 64 | //! 65 | //! 
Supported F1 measures are F1p <= F1h <= F1s, where: 66 | //! - F1p - Harmonic mean of the [weighted] average of partial probabilities, 67 | //! the most discriminative and satisfies the largest number of the Formal 68 | //! Constraints (homogeneity, completeness, rag bag, size/quantity, balance); 69 | //! - F1h - Harmonic mean of the [weighted] average of F1s; 70 | //! - F1a - Average F1-Score, i.e. arithmetic mean (average) of the [weighted] 71 | //! average of F1s, the least discriminative and satisfies the lowest number 72 | //! of the Formal Constraints. 73 | //! 74 | //! of the Greatest (Max) Match [Weighted] Average Harmonic Mean evaluation 75 | //! \note Undirected (symmetric) evaluation 76 | //! 77 | //! \param cn1 const ClusterCollection - first collection of clusters (node relations) 78 | //! \param cn2 const ClusterCollection - second collection 79 | //! \param kind F1Kind - kind of F1 to be evaluated 80 | //! \param[out] rec Probability* - recall of cn2 relative to the ground-truth cn1 or 81 | //! 0 if the matching strategy does not have the precision/recall notations 82 | //! \param[out] prc Probability* - precision of cn2 relative to the ground-truth cn1 or 83 | //! 0 if the matching strategy does not have the precision/recall notations 84 | //! \param mkind=MATCH_WEIGHTED MatchKind - matching kind 85 | //! \param sync uint8_t - synchronize node base of the input collections, by appending 86 | //! the lacking single-node clusters 87 | //! \param makeunique uint8_t - ensure that clusters contain unique members by 88 | //! removing the duplicates 89 | //! \param verbose=0 uint8_t - print intermediate results to the stdout 90 | //! 
\return Probability - resulting F1_gm 91 | Probability f1x(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 92 | , Probability* rec, Probability* prc, MatchKind mkind, uint8_t sync, uint8_t makeunique, uint8_t verbose); 93 | Probability f1(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 94 | , Probability* rec, Probability* prc); // MATCH_WEIGHTED, false 95 | Probability f1p(const ClusterCollection cn1, const ClusterCollection cn2); // MATCH_WEIGHTED, false 96 | Probability f1h(const ClusterCollection cn1, const ClusterCollection cn2); // MATCH_WEIGHTED, false 97 | 98 | //! \brief (Extended) Omega Index evaluation 99 | //! 100 | //! \param cn1 const ClusterCollection - first collection of clusters (node relations) 101 | //! \param cn2 const ClusterCollection - second collection 102 | //! \return Probability - omega index 103 | Probability omega(const ClusterCollection cn1, const ClusterCollection cn2); 104 | Probability omegaExt(const ClusterCollection cn1, const ClusterCollection cn2); 105 | 106 | //! \brief (Extended) Omega Index evaluation 107 | //! 108 | //! \param cn1 const ClusterCollection - first collection of clusters (node relations) 109 | //! \param cn2 const ClusterCollection - second collection 110 | //! \param ext uint8_t - evaluate extended omega index 111 | //! \param sync uint8_t - synchronize node base of the input collections, by appending the lacking single-node clusters 112 | //! \param makeunique uint8_t - ensure that clusters contain unique members by 113 | //! removing the duplicates 114 | //! 
\return Probability - omega index 115 | Probability omegax(const ClusterCollection cn1, const ClusterCollection cn2, uint8_t ext, uint8_t sync, uint8_t makeunique); 116 | 117 | #ifdef __cplusplus 118 | }; 119 | #endif // __cplusplus 120 | 121 | #endif // INTERFACE_C_H_INCLUDED 122 | -------------------------------------------------------------------------------- /shared/agghash.hpp: -------------------------------------------------------------------------------- 1 | //! \brief AggHash simple (Aggregating Order Invariant Hashing) of the DAOC clustering library 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! \date 2017-02-21 10 | 11 | #ifndef CODING_HPP 12 | #define CODING_HPP 13 | 14 | #include // uintX_t 15 | //#include // size_t 16 | #include // uintX_t 17 | #include // hash 18 | //#include // memcmp 19 | #include // is_integral 20 | #include // numeric_limits 21 | #include // numeric_limits 22 | 23 | 24 | namespace daoc { 25 | 26 | using std::string; 27 | using std::is_integral; 28 | using std::numeric_limits; 29 | using std::domain_error; 30 | 31 | // Type Declarations --------------------------------------------------- 32 | //! \brief Aggregation hash of ids 33 | //! \pre Template types should be integral 34 | //! 35 | //! \tparam Id - type of the member ids 36 | //! \tparam AccId - type of the accumulated Ids and accumulated squares of Ids 37 | //! should have at least twice magnitude of the Id type (i.e. 
squared) 38 | template 39 | class AggHash { 40 | static_assert(is_integral::value && is_integral::value 41 | && sizeof(AccId) >= 2*sizeof(Id), "AggHash, types constraints are violated"); 42 | 43 | // ATTENTION: type of the m_size should not be less than of m_idsum to 44 | // avoid gaps filled with trash on memory alignment 45 | // Note: size should be first as the most discriminative attribute, which 46 | // can be potentially used for the ordering 47 | // Note: the size is redundant and does not have any impact except for the structured ordering 48 | // if the sum does not increase AccId_MAX or if zero value of id is NOT allowed. The size is 49 | // necessary if id=0 may be present in the clusters. 50 | AccId m_size; //!< Size of the container 51 | AccId m_idsum; //!< Sum of the member ids 52 | AccId m_id2sum; //!< Sum of the squared member ids 53 | protected: 54 | //! Id correction to prevent collisions 55 | constexpr static Id idcor = sqrt(numeric_limits::max()); 56 | public: 57 | // Export the template parameter types 58 | using IdT = Id; //!< Type of the member ids 59 | using AccIdT = AccId; //!< Type of the accumulated Ids and accumulated squares of Ids 60 | 61 | //! \brief Default constructor 62 | AggHash() noexcept 63 | : m_size(0), m_idsum(0), m_id2sum(0) {} 64 | 65 | //! \brief Add id to the aggregation 66 | //! \note In case correction is used and id becomes out of range (initial id > IDMAX - IDCORR) 67 | //! then an exception is thrown, which crashes the whole application, which is OK 68 | //! 69 | //! \param id Id - id to be included into the hash 70 | //! \return void 71 | void add(Id id) noexcept; 72 | 73 | //! \brief Clear/reset the aggregation 74 | //! 75 | //! \return void 76 | void clear() noexcept; 77 | 78 | //! \brief Number of the aggregated ids 79 | //! 80 | //! \return size_t - number of the aggregated ids 81 | size_t size() const noexcept { return m_size; } 82 | 83 | //! \brief Sum of the aggregated ids 84 | //! 85 | //! 
\return size_t - sum of the aggregated ids 86 | size_t idsum() const noexcept { return m_idsum; } 87 | 88 | //! \brief Sum of squares of the aggregated ids 89 | //! 90 | //! \return size_t - sum of squares of the aggregated ids 91 | size_t id2sum() const noexcept { return m_id2sum; } 92 | 93 | // //! \brief The hash is empty 94 | // //! 95 | // //! \return bool - the hash is empty 96 | // bool empty() const noexcept { return !m_size; } 97 | 98 | //! \brief Evaluate hash of the aggregation 99 | //! 100 | //! \return size_t - resulting hash 101 | size_t hash() const; 102 | 103 | //! \brief Operator less 104 | //! 105 | //! \param ah const AggHash& - comparing object 106 | //! \return bool operator - result of the comparison 107 | inline bool operator <(const AggHash& ah) const noexcept; 108 | 109 | //! \brief Operator less or equal 110 | //! 111 | //! \param ah const AggHash& - comparing object 112 | //! \return bool operator - result of the comparison 113 | inline bool operator <=(const AggHash& ah) const noexcept; 114 | 115 | //! \brief Operator greater 116 | //! 117 | //! \param ah const AggHash& - comparing object 118 | //! \return bool operator - result of the comparison 119 | bool operator >(const AggHash& ah) const noexcept { return !(*this <= ah); } 120 | 121 | //! \brief Operator greater or equal 122 | //! 123 | //! \param ah const AggHash& - comparing object 124 | //! \return bool operator - result of the comparison 125 | bool operator >=(const AggHash& ah) const noexcept { return !(*this < ah); } 126 | 127 | //! \brief Operator equal 128 | //! 129 | //! \param ah const AggHash& - comparing object 130 | //! \return bool operator - result of the comparison 131 | inline bool operator ==(const AggHash& ah) const noexcept; 132 | 133 | //! \brief Operator unequal (not equal) 134 | //! 135 | //! \param ah const AggHash& - comparing object 136 | //! 
\return bool operator - result of the comparison 137 | bool operator !=(const AggHash& ah) const noexcept { return !(*this == ah); } 138 | }; 139 | 140 | // Type Definitions ---------------------------------------------------- 141 | #pragma GCC diagnostic push 142 | #pragma GCC diagnostic ignored "-Wterminate" // Disable the warning about the exception throwing function marked as noexcept 143 | template 144 | void AggHash::add(Id id) noexcept 145 | { 146 | id += idcor; // Correct id to prevent collisions (see AgordiHash for details) 147 | // Check for the overflow after the correction 148 | // Note: the exception will crash the whole app since noexcept is used but it is fine 149 | if(id < idcor) 150 | throw domain_error(string("The corrected value of ").append(std::to_string(id)) 151 | .append(" is too large and causes the overflow\n")); 152 | ++m_size; 153 | m_idsum += id; 154 | m_id2sum += id * id; 155 | } 156 | #pragma GCC diagnostic pop 157 | 158 | template 159 | void AggHash::clear() noexcept 160 | { 161 | m_size = 0; 162 | m_idsum = 0; 163 | m_id2sum = 0; 164 | } 165 | 166 | template 167 | size_t AggHash::hash() const 168 | { 169 | // ATTENTION: requires filling with zero memory alignment trash or avoid the padding 170 | return std::hash()(string(reinterpret_cast(this), sizeof *this)); 171 | } 172 | 173 | template 174 | bool AggHash::operator <(const AggHash& ah) const noexcept 175 | { 176 | return m_size < ah.m_size || (m_size == ah.m_size 177 | && (m_idsum < ah.m_idsum || (m_idsum == ah.m_idsum && m_id2sum < ah.m_id2sum))); 178 | } 179 | 180 | template 181 | bool AggHash::operator <=(const AggHash& ah) const noexcept 182 | { 183 | return m_size < ah.m_size || (m_size == ah.m_size 184 | && (m_idsum < ah.m_idsum || (m_idsum == ah.m_idsum && m_id2sum <= ah.m_id2sum))); 185 | } 186 | 187 | template 188 | bool AggHash::operator ==(const AggHash& ah) const noexcept 189 | { 190 | return m_size == ah.m_size && m_idsum == ah.m_idsum && m_id2sum == ah.m_id2sum; 191 | 
//return !memcmp(this, &ah, sizeof(AggHash)); // Note: memcmp returns 0 on full match 192 | } 193 | 194 | } // daoc 195 | 196 | #endif // CODING_HPP 197 | -------------------------------------------------------------------------------- /args.ggo: -------------------------------------------------------------------------------- 1 | # Configuration file for the automatic generation of the input options parsing 2 | 3 | package "xmeasures" 4 | version "4.0.4" 5 | versiontext "Author: (c) Artem Lutov 6 | Sources: https://github.com/eXascaleInfolab/xmeasures 7 | Paper: \"Accuracy Evaluation of Overlapping and Multi-resolution Clustering Algorithms on Large Datasets\" by Artem Lutov, Mourad Khayati and Philippe Cudré-Mauroux, BigComp 2019 8 | " 9 | 10 | purpose "Extrinsic measures evaluation: Omega Index (a fuzzy version of the\ 11 | Adjusted Rand Index, identical to the Fuzzy Rand Index) and [mean] F1-score\ 12 | (prob, harm and avg) for the overlapping multi-resolution clusterings,\ 13 | and standard NMI for the non-overlapping clustering on a single resolution.\ 14 | Unequal node base is allowed in the evaluating clusterings and optionally can\ 15 | be synchronized removing nodes from the clusters missed in one of the clusterings (collections)." 16 | 17 | usage "xmeasures [OPTIONS] clustering1 clustering2 18 | 19 | clustering - input file, collection of the clusters to be evaluated. 20 | 21 | Examples: 22 | $ ./xmeasures -fp -kc networks/5K25.cnl tests/5K25_l0.825/5K25_l0.825_796.cnl 23 | $ ./xmeasures -fh -kc -i tests/5K25.cll -ph -l networks/5K25.cnl tests/5K25_l0.825/5K25_l0.825_796.cnl 24 | $ ./xmeasures -ox tests/clsevalsx/omega_c4.3-1.cnl tests/clsevalsx/omega_c4.3-2.cnl 25 | " 26 | 27 | description "Extrinsic measures are evaluated, i.e. two input clusterings\ 28 | (collections of clusters) are compared to each other. Optionally, a labeling\ 29 | of the evaluating clusters with the specified ground-truth clusters is performed. 
30 | NOTE: 31 | - Multiple evaluating measures can be specified. 32 | - Each cluster should contain unique members, which is ensured only if the\ 33 | 'unique' option is specified. 34 | - All clusters should be unique to not affect Omega Index evaluation, which\ 35 | can be ensured by the [resmerge](https://github.com/eXascaleInfolab/resmerge) utility. 36 | - Non-corrected unequal node base in the clusterings is allowed, it penalizes the match.\ 37 | Use [OvpNMI](https://github.com/eXascaleInfolab/OvpNMI) or\ 38 | [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) for NMI evaluation\ 39 | in the arbitrary collections (still each cluster should contain unique members). 40 | 41 | Evaluating measures are: 42 | - OI - Omega Index (a fuzzy version of the Adjusted Rand Index, identical to\ 43 | the Fuzzy Rand Index), which yields the same value as Adjusted Rand Index when\ 44 | applied to the non-overlapping clusterings. 45 | - [M]F1 - various [mean] F1 measures of the Greatest (Max) Match including\ 46 | the Average F1-Score (suggested by J. Leskovec) with the optional weighting. 47 | NOTE: There are 3 matching policies available for each kind of F1. The most\ 48 | representative evaluation is performed by the F1p with combined matching\ 49 | policy (considers both micro and macro weighting). 50 | - NMI - Normalized Mutual Information, normalized by either max or also sqrt,\ 51 | avg and min information content denominators. 52 | ATTENTION: This is a standard NMI, which should be used ONLY for the HARD\ 53 | partitioning evaluation (non-overlapping clustering on a single resolution).\ 54 | It penalizes overlapping and multi-resolution structures. 
55 | " 56 | 57 | option "ovp" O "evaluate overlapping instead of the multi-resolution clusters,\ 58 | where max matching for any shared member between R overlapping clusters is 1/R\ 59 | (the member is shared) instead of 1 (the member fully belongs to each [hierarchical\ 60 | sub]group) for the member belonging to R distinct clusters on R resolutions. 61 | NOTE: It has no effect for the Omega Index evaluation." 62 | flag off 63 | #NOTE: Multi-resolution mode can be used as approximation of the overlapping\ 64 | # clusters evaluation, but not vice verse" flag off 65 | # Note: ovp option requires shares evaluation/reading and processing of the directory 66 | # of collections in case of both multi-resolution and overlapping clustering evaluation 67 | option "unique" q "ensure on loading that all cluster members are unique by\ 68 | removing all duplicates." 69 | flag off 70 | option "sync" s "synchronize with the specified node base omitting the non-matching nodes. 71 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the\ 72 | latter case this option should precede the evaluating filename not repeating it" 73 | string typestr="filename" 74 | option "membership" m "average expected membership of the nodes in the clusters,\ 75 | > 0, typically >= 1. Used only to facilitate estimation of the nodes number on\ 76 | the containers preallocation if this number is not specified in the file header." 77 | float default="1" 78 | option "detailed" d "detailed (verbose) results output" flag off 79 | 80 | section "Omega Index" 81 | option "omega" o "evaluate Omega Index (a fuzzy version of the Adjusted Rand Index,\ 82 | identical to the Fuzzy Rand Index and on the non-overlapping clusterings equals to ARI)." 83 | flag off 84 | option "extended" x "evaluate extended (Soft) Omega Index, which does not excessively\ 85 | penalize distinctly shared nodes." 
flag off dependon="omega" 86 | 87 | section "Mean F1" 88 | option "f1" f "evaluate mean F1 of the [weighted] average of the greatest (maximal)\ 89 | match by F1 or partial probability. 90 | NOTE: F1h <= F1a, where: 91 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities,\ 92 | the most indicative as satisfies the largest number of the Formal Constraints\ 93 | (homogeneity, completeness and size/quantity except the rag bag in some cases); 94 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1\ 95 | (harmonic means of the Precision and Recall of the best matches of the clusters); 96 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1,\ 97 | the least discriminative and satisfies the lowest number of the Formal Constraints. 98 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 99 | " 100 | values="partprob","harmonic","average" enum default="partprob" argoptional 101 | option "kind" k "kind of the matching policy: 102 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 103 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 104 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 105 | " 106 | values ="weighted","unweighed","combined" enum default="weighted" argoptional 107 | dependon="f1" 108 | 109 | section "Clusters Labeling & F1 evaluation with Precision and Recall" 110 | option "label" l "label evaluating clusters with the specified ground-truth (gt)\ 111 | cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED\ 112 | labeled clusters only (without the probable subclusters). 
113 | NOTE: If 'sync' option is specified then the file name of the clusters labels\ 114 | should be the same as the node base (if specified) and should be in the .cnl format.\ 115 | The file name can be either a separate or an evaluating CNL file, in the\ 116 | latter case this option should precede the evaluating filename not repeating it. 117 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 118 | " 119 | string typestr="gt_filename" 120 | option "policy" p "Labels matching policy: 121 | - p - Partial Probabilities (maximizes gain) 122 | - h - Harmonic Mean (minimizes loss, maximizes F1) 123 | " 124 | values="partprob","harmonic" enum default="harmonic" argoptional dependon="label" 125 | option "unweighted" u "Labels weighting policy on F1 evaluation: weighted by the number\ 126 | of instances in each label by default (micro weighting, F1_micro) or unweighed,\ 127 | where each label is treated equally (i.e. macro weighting, F1_macro)" 128 | flag off dependon="label" 129 | option "identifiers" i "output labels (identifiers) of the evaluating clusters\ 130 | as lines of space-separated indices of the ground-truth clusters (.cll - clusters\ 131 | labels list) 132 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the\ 133 | .cnl besides the 134 | " string typestr="labels_filename" dependon="label" 135 | 136 | section "NMI" 137 | option "nmi" n "evaluate NMI (Normalized Mutual Information), applicable only\ 138 | to the non-overlapping clusters" flag off 139 | option "all" a "evaluate all NMIs using sqrt, avg and min denominators besides\ 140 | the max one" flag off dependon="nmi" 141 | option "ln" e "use ln (exp base) instead of log2 (Shannon entropy, bits)\ 142 | for the information measuring" flag off dependon="nmi" 143 | # Note: log2 vs ln have no any influence on the resulting value 144 | 145 | # Set optional options by default, allow input files to be unnamed parameters 
146 | args "--default-optional --unamed-opts=clusterings" 147 | #args "--unamed-opts=clusterings" # Allow input files to be unnamed parameters 148 | 149 | 150 | # = Changelog = 151 | # v4.0.4 - Precision and recall added to the MF1 output, mixed Prc, Rec in F1 fixed 152 | # v4.0.3 - Renamed F1s -> F1a to be synced with the paper, description refined 153 | # v4.0.2 - Description and output measures notations refined 154 | # v4.0.1 - Aggregated output for multiple measures added 155 | # v4.0.0 - Omega index added and bound to the "-o" argument 156 | # - the former "-o" argument (overlaps) renamed to "-O" 157 | # - F1s renamed to F1a (average), option "-s" renamed to "-a" 158 | # v3.2.2 - F1 weighting considered for the labels 159 | # v3.2.1 - Interpretation of F1p modified to have semantic of geometric mean, now F1h < F1p < F1s 160 | # v3.2.0 - Clusters labeling, labels F1 (with precision and recall) evaluation 161 | # v3.1.0 - Matching policy for the F measures parameterized (weighted, unweighted, combined) 162 | # v3.0.2 - NMI_sqrt added 163 | # v3.0 - Command line interface changed for F1 164 | # - Standard F1-Score added 165 | # - Option for the detailed results output added 166 | # v2.3 - Node base synchronization added 167 | # v2.2 - Input arguments redesigned 168 | # v2.1 - F1 of partial probabilities implemented besides F1 of f1s 169 | # v2.0 - Standard NMI implemented and evaluation base parameterized (exp, 2) 170 | # v1.1 - Weighted F1 implemented 171 | # v1.0 - Initial Release 172 | 173 | #TODO: 174 | # NF1 measure 175 | 176 | 177 | # Basic structure, see https://www.gnu.org/software/gengetopt/gengetopt.html#Basic-Usage 178 | # package "" 179 | # version "" 180 | # purpose "" 181 | # usage "" 182 | # description "" 183 | # versiontext "" 184 | # 185 | # args "" 186 | # 187 | # option "" 188 | # {details=""} 189 | # {argtype} {typestr=""} 190 | # {values="","",...} 191 | # {default=""} 192 | # {dependon=""} 193 | # {required} {argoptional} {multiple} 194 | 
# {hidden} 195 | # 196 | # option "" flag 197 | # 198 | # section "section name" {sectiondesc="optional section description"} 199 | # 200 | # text "a textual sentence" 201 | # 202 | # 203 | # Mutually exclusive options should belong to a group: 204 | # 205 | # defgroup "" {groupdesc=""} {required} 206 | # groupoption "" group="" \ 207 | # {argoptional} {multiple} 208 | # 209 | # defgroup "my grp2" 210 | # defgroup "grp1" groupdesc="an option of this group is required" required 211 | # groupoption "opta" a "string a" group="grp1" multiple 212 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation for overlapping multi-resolution clusterings 2 | //! with possible unequal node base. 3 | //! 4 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 5 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 6 | //! 7 | //! Copyright (c) 8 | //! \authr Artem Lutov 9 | //! \email luart@ya.ru 10 | //! \date 2017-02-13 11 | 12 | #include 13 | #include 14 | #include 15 | #include "cmdline.h" // Arguments parsing 16 | #include "macrodef.h" 17 | #include "interface.hpp" 18 | 19 | using std::stringstream; 20 | 21 | 22 | //! 
\brief Arguments parser 23 | struct ArgParser: gengetopt_args_info { 24 | ArgParser(int argc, char **argv) { 25 | auto err = cmdline_parser(argc, argv, this); 26 | if(err) 27 | throw std::invalid_argument("Arguments parsing failed: " + to_string(err)); 28 | } 29 | 30 | ~ArgParser() { 31 | cmdline_parser_free(this); 32 | } 33 | }; 34 | 35 | 36 | int main(int argc, char **argv) 37 | { 38 | ArgParser args_info(argc, argv); 39 | 40 | // Validate required xmeasure 41 | if(!args_info.omega_flag && !args_info.nmi_flag && !args_info.f1_given && !args_info.label_given) { 42 | fputs("WARNING, no any measures to evaluate are specified\n", stderr); 43 | cmdline_parser_print_help(); 44 | return EINVAL; 45 | } 46 | 47 | if(args_info.membership_arg <= 0) { 48 | fprintf(stderr, "ERROR, positive membership is expected: %G\n", args_info.membership_arg); 49 | return EDOM; 50 | } 51 | 52 | { // Validate the number of input files 53 | // Note: sync_arg is specified if sync_given 54 | const auto inpfiles = args_info.inputs_num + (args_info.sync_given || args_info.label_given); // The number of input files 55 | if(inpfiles < 2 || inpfiles > 2 + args_info.sync_given + args_info.label_given) { 56 | fputs("ERROR, 2 input clusterings are required with possibly additional" 57 | " node base and clusters labels, i.e. 
2-4 input files in total\n", stderr); 58 | cmdline_parser_print_help(); 59 | return EINVAL; 60 | } 61 | } 62 | 63 | // Verify that labeled clusters correspond to the node base if any of them is specified 64 | if(args_info.sync_given && args_info.label_given && (strcmp(args_info.sync_arg, args_info.label_arg) 65 | || (args_info.inputs_num == 2 && strcmp(args_info.sync_arg, args_info.inputs[0])))) 66 | throw invalid_argument("ERROR, node base file should correspond to the labeled clusters and" 67 | " represent the first evaluating collection if both are specified\n"); 68 | 69 | // Load node base if required 70 | NodeBase ndbase; 71 | ::AggHash nbhash; 72 | // Note: if label_given then either inputs_num < 2 or inputs_num[0] = sync_arg = label_arg 73 | if(args_info.sync_given && args_info.inputs_num == 2 && !args_info.label_given) 74 | ndbase = NodeBase::load(args_info.sync_arg, args_info.membership_arg 75 | , &nbhash, 0, 0, args_info.detailed_flag); 76 | 77 | auto process = [&](auto evaluation) -> int { 78 | using Count = decltype(evaluation); 79 | using Collection = Collection; 80 | // Load collections as relations 81 | ::AggHash cn1hash, cn2hash; 82 | // Note: cn1 is nodebase if specified and not in the separated file 83 | const bool cn1base = (args_info.sync_given || args_info.label_given) && args_info.inputs_num < 2; 84 | //const char* nbfile = args_info.sync_given 85 | auto cn1 = Collection::load(cn1base ? args_info.sync_given ? args_info.sync_arg 86 | : args_info.label_arg : args_info.inputs[0] 87 | , args_info.unique_flag, args_info.membership_arg, &cn1hash 88 | , ndbase ? 
&ndbase : nullptr, nullptr, args_info.detailed_flag); 89 | if(ndbase) { 90 | if(nbhash != cn1hash) { 91 | fprintf(stderr, "ERROR, nodebase hash %lu (%lu nodes) != filtered" 92 | " collection nodes hash %lu (%lu)\n", nbhash.hash(), nbhash.size() 93 | , cn1hash.hash(), cn1hash.size()); 94 | return EINVAL; 95 | } 96 | ndbase.clear(); 97 | } 98 | RawIds lostcls; 99 | auto cn2 = Collection::load(args_info.inputs[!cn1base] 100 | , args_info.unique_flag, args_info.membership_arg, &cn2hash 101 | , args_info.sync_given ? &cn1 : nullptr 102 | , args_info.sync_given && args_info.label_given ? &lostcls : nullptr 103 | , args_info.detailed_flag); 104 | 105 | if(!cn1.ndsnum() || ! cn2.ndsnum()) { 106 | fprintf(stderr, "WARNING, at least one of the collections is empty, there is nothing" 107 | " to evaluate. Collection nodes sizes: %u, %u\n", cn1.ndsnum(), cn2.ndsnum()); 108 | return EINVAL; 109 | } 110 | 111 | // Check the collections' nodebase 112 | if(cn1hash != cn2hash) { 113 | fprintf(stderr, "WARNING, the nodes in the collections differ (the quality will be penalized)" 114 | ": %u nodes with hash %lu, size: %lu, ids: %lu, id2s: %lu) !=" 115 | " %u nodes with hash %lu, size: %lu, ids: %lu, id2s: %lu); synchronize: %s, label: %s\n" 116 | , cn1.ndsnum(), cn1hash.hash(), cn1hash.size(), cn1hash.idsum(), cn1hash.id2sum() 117 | , cn2.ndsnum(), cn2hash.hash(), cn2hash.size(), cn2hash.idsum(), cn2hash.id2sum() 118 | , daoc::toYesNo(args_info.sync_given), daoc::toYesNo(args_info.label_given)); 119 | //if(args_info.sync_given) { 120 | // fputs("ERROR, the nodes base should be synchronized\n", stderr); 121 | // return EINVAL; 122 | //} 123 | } 124 | 125 | // The number of outputting measures (1 .. 
4) 126 | uint8_t outsnum = args_info.omega_flag + args_info.nmi_flag 127 | + args_info.f1_given + args_info.label_given; 128 | stringstream aggouts; // Aggregated outputs 129 | // Evaluate and output measures 130 | // Note: evaluation of overlapping F1 after NMI allows to reuse some 131 | // calculations, for other cases the order of evaluations does not matter 132 | puts(string("= ").append(is_floating_point::value 133 | ? "Overlaps" : "Multi-resolution").append(" Evaluation =").c_str()); 134 | if(args_info.nmi_flag) { 135 | auto rnmi = Collection::nmi(cn1, cn2, args_info.ln_flag, args_info.detailed_flag); 136 | // Set NMI to NULL if collections have no any mutual information 137 | // ATTENTION: for some cases, for example when one of the collections is a single cluster, 138 | // NMI will always yield 0 for any clusters in the second collection, which is limitation 139 | // of the original NMI measure. Similar issues possible in more complex configurations. 140 | if(rnmi.mi <= precision_limit()) { // Note: strict ! 
is fine here 141 | throw domain_error("NMI is not applicable to the specified collections: 0, which says nothing about the similarity\n"); 142 | rnmi.h1 = rnmi.h2 = 1; 143 | } 144 | const auto nmix = rnmi.mi / std::max(rnmi.h1, rnmi.h2); 145 | if(args_info.all_flag) { 146 | printf("NMI_max: %G, NMI_sqrt: %G, NMI_avg: %G, NMI_min: %G\n" 147 | , nmix, rnmi.mi / sqrt(rnmi.h1 * rnmi.h2) 148 | , 2 * rnmi.mi / (rnmi.h1 + rnmi.h2) 149 | , rnmi.mi / std::min(rnmi.h1, rnmi.h2)); 150 | if(--outsnum || aggouts.tellp()) 151 | aggouts << "NMI_max: " << nmix 152 | << ", NMI_sqrt: " << rnmi.mi / sqrt(rnmi.h1 * rnmi.h2) 153 | << ", NMI_avg: " << 2 * rnmi.mi / (rnmi.h1 + rnmi.h2) 154 | << ", NMI_min: " << rnmi.mi / std::min(rnmi.h1, rnmi.h2); 155 | } else { 156 | printf("NMI_max:\n%G\n", nmix); 157 | if(--outsnum || aggouts.tellp()) 158 | aggouts << "NMI_max: " << nmix; 159 | } 160 | } 161 | if(args_info.f1_given) { 162 | // Assign required F1 type 163 | F1 f1kind = F1::NONE; 164 | // Note: args_info.f1_orig is empty if default value is used 165 | char f1suf = '-'; // Suffix char of the selected F1 measure 166 | switch(args_info.f1_arg) { 167 | case f1_arg_partprob: 168 | f1kind = F1::PARTPROB; 169 | f1suf = 'p'; 170 | break; 171 | case f1_arg_harmonic: 172 | f1kind = F1::HARMONIC; 173 | f1suf = 'h'; 174 | break; 175 | case f1_arg_average: 176 | f1kind = F1::AVERAGE; // Suggested by Leskovec 177 | f1suf = 'a'; 178 | break; 179 | default: 180 | throw invalid_argument("main(), UNKNOWN F1 policy specified\n"); 181 | } 182 | // Assign matching kind 183 | Match mkind = Match::NONE; 184 | // Note: args_info.kind_orig is empty if default value is used 185 | char kindsuf = '-'; // Suffix char of the selected F1 measure 186 | switch(args_info.kind_arg) { 187 | case kind_arg_weighted: 188 | mkind = Match::WEIGHTED; 189 | kindsuf = 'w'; 190 | break; 191 | case kind_arg_unweighed: 192 | mkind = Match::UNWEIGHTED; 193 | kindsuf = 'u'; 194 | break; 195 | case kind_arg_combined: 196 | mkind = 
Match::COMBINED; 197 | kindsuf = 'c'; 198 | break; 199 | default: 200 | throw invalid_argument("main(), UNKNOWN Matching policy specified\n"); 201 | } 202 | 203 | //if(args_info.nmi_flag) 204 | // fputs("; ", stdout); 205 | Prob prc, rec; // Precision and recall of cn2 relative to ground-truth cn1 206 | const auto f1val = Collection::f1(cn1, cn2, f1kind, rec, prc, mkind, args_info.detailed_flag); 207 | printf("MF1%c_%c (%s, %s):\n%G", f1suf, kindsuf, to_string(f1kind).c_str() 208 | , to_string(mkind).c_str(), f1val); 209 | if(prc || rec) 210 | printf(" (Prc: %G, Rec: %G)", prc, rec); 211 | fputc('\n', stdout); 212 | if(--outsnum || aggouts.tellp()) { 213 | if(aggouts.tellp()) 214 | aggouts << "; "; 215 | aggouts << "MF1" << f1suf << '_' << kindsuf << ": " << f1val; 216 | // Note: prc and rec are zeroized if the matching strategy does not support them 217 | if(prc || rec) 218 | aggouts << " (Prc: " << prc << ", Rec: " << rec << ')'; 219 | } 220 | } 221 | // Label clusters with the ground-truth clusters indices and output F1 for the labels if required 222 | if(args_info.label_given) { 223 | if(args_info.policy_arg == policy__NULL) { 224 | fputs("WARNING f1(), labels matching policy is not specified, the evaluation is skipped\n", stderr); 225 | return 0; 226 | } 227 | // Reset cluster counters if they were set (could be set only by F1) 228 | if(args_info.f1_given) { 229 | cn1.clearcounts(); 230 | cn2.clearcounts(); 231 | } 232 | const bool prob = args_info.policy_arg == policy_arg_partprob; // Partial Probabilities matching policy 233 | const bool weighted = !args_info.unweighted_flag; 234 | PrcRec pr = Collection::label(cn1, cn2 //, lostcls 235 | , prob, weighted, args_info.identifiers_arg); //, args_info.detailed_flag); 236 | // Note: each measure name should form a single world to be properly parsed in a uniform way (see Clubmark), 237 | // that is why doubled underscore is used rather than a single space. 
238 | printf("F1%c_%c__labels: %G (Prc: %G, Rec: %G)\n" 239 | , prob ? 'p' : 'h', weighted ? 'w' : 'u' 240 | , hmean(pr.prc, pr.rec), pr.prc, pr.rec); 241 | if(--outsnum || aggouts.tellp()) { 242 | if(aggouts.tellp()) 243 | aggouts << "; "; 244 | aggouts << "F1" << (prob ? 'p' : 'h') << '_' << (weighted ? 'w' : 'u') 245 | << "__labels: " << hmean(pr.prc, pr.rec) 246 | << " (Prc: " << pr.prc << ", Rec: " << pr.rec << ')'; 247 | } 248 | } 249 | if(args_info.omega_flag) { 250 | // Transform loaded and pre-processed collection to the representation 251 | // suitable for Omega Index evaluation 252 | RawClusters cls1; 253 | RawClusters cls2; 254 | NodeRClusters ndrcs; 255 | 256 | cn1.template transfer(cls1, ndrcs); 257 | cn2.template transfer(cls2, ndrcs); 258 | const auto oi = args_info.extended_flag 259 | ? omega(ndrcs, cls1, cls2) 260 | : omega(ndrcs, cls1, cls2) 261 | ; 262 | printf("OI%s:\n%G\n", args_info.extended_flag ? "x" : "", oi); 263 | if(--outsnum || aggouts.tellp()) { 264 | if(aggouts.tellp()) 265 | aggouts << "; "; 266 | aggouts << "OI" << (args_info.extended_flag ? "x" : "") << ": " << oi; 267 | } 268 | } 269 | if(aggouts.tellp()) 270 | puts(aggouts.str().c_str()); 271 | 272 | return 0; 273 | }; 274 | 275 | 276 | return args_info.ovp_flag ? process(AccProb()) : process(Id()); 277 | } 278 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016, Artem Lutov 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /shared/fileio.cpp: -------------------------------------------------------------------------------- 1 | //! \brief File IO utils 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! 
\date 2017-02-13 10 | 11 | #include 12 | #include // error_code 13 | //#include 14 | 15 | #ifdef __unix__ 16 | #include 17 | #endif // __unix__ 18 | 19 | #define INCLUDE_STL_FS 20 | #include "fileio.hpp" 21 | 22 | 23 | using std::error_code; 24 | using std::to_string; 25 | using fs::path; 26 | using fs::create_directories; 27 | using fs::is_directory; 28 | using fs::exists; 29 | using fs::status; 30 | using std::logic_error; 31 | using namespace daoc; 32 | 33 | // File IO Types definitions --------------------------------------------------- 34 | size_t NamedFileWrapper::size() const noexcept 35 | { 36 | size_t cmsbytes = -1; // Return -1 on error 37 | #ifdef __unix__ // sqrt(cmsbytes) lines => linebuf = max(4-8Kb, sqrt(cmsbytes) * 2) with dynamic realloc 38 | struct stat filest; 39 | int fd = fileno(m_file); 40 | if(fd != -1 && !fstat(fd, &filest)) 41 | return filest.st_size; 42 | #endif // __unix 43 | error_code err; 44 | cmsbytes = fs::file_size(m_name, err); 45 | if(cmsbytes == size_t(-1)) 46 | fprintf(stderr, "WARNING size(), file size evaluation failed: %s\n" 47 | , err.message().c_str()); 48 | 49 | // // Get length of the file 50 | // fseek(m_file, 0, SEEK_END); 51 | // cmsbytes = ftell(m_file); // The number of bytes in the input communities 52 | // if(cmsbytes == size_t(-1)) 53 | // perror("WARNING size(), file size evaluation failed"); 54 | // //fprintf(stderr, " %s: %lu bytes\n", fname, cmsbytes); 55 | // rewind(m_file); // Set position to the begin of the file 56 | 57 | return cmsbytes; 58 | } 59 | 60 | NamedFileWrapper& NamedFileWrapper::reset(const char* filename, const char* mode) 61 | { 62 | if(filename) { 63 | m_file.reset(fopen(filename, mode)); 64 | m_name = filename; 65 | } else m_file.reset(); 66 | return *this; 67 | } 68 | 69 | // File Reading Types ---------------------------------------------------------- 70 | StringBuffer::StringBuffer(size_t size) 71 | : StringBufferBase(size), m_cur(0), m_length(0) 72 | { 73 | if(size <= 2) 74 | size = 2; 
75 | *data() = 0; // Set first element to 0 76 | data()[size-2] = 0; // Set prelast reserved element to 0 77 | // Note: data()[size-1] is set to 0 automatically on file read if 78 | // the reading data size >= size - 1 bytes 79 | } 80 | 81 | void StringBuffer::reset(size_t size) 82 | { 83 | // Reset writing position 84 | m_cur = 0; 85 | m_length = 0; 86 | // Reset the buffer 87 | resize(size); // Note: can throw bad_alloc 88 | shrink_to_fit(); // Free reserved memory 89 | *data() = 0; // Set first element to 0 90 | data()[size-2] = 0; // Set prelast reserved element to 0 91 | // Note: data()[size-1] is set to 0 automatically on file read if 92 | // the reading data size >= size - 1 bytes 93 | } 94 | 95 | //size_t StringBuffer::length() const 96 | //#if VALIDATE < 2 97 | // noexcept 98 | //#endif // VALIDATE 99 | //{ 100 | //#if VALIDATE >= 2 101 | // const auto slen = strlen(data()); 102 | // if(m_length != slen) { 103 | //#if TRACE >= 2 104 | // fprintf(stderr, "length(), string: %s\n", data()); 105 | //#endif // TRACE 106 | // throw logic_error("ERROR length(), m_length (" + to_string(m_length) 107 | // + ") != actual string length (" + to_string(slen) + ")\n"); 108 | // } 109 | //#endif // VALIDATE 110 | // return m_length; 111 | //} 112 | 113 | bool StringBuffer::empty() const 114 | #if VALIDATE < 2 115 | noexcept 116 | #endif // VALIDATE 117 | { 118 | #if VALIDATE >= 2 119 | if((!front() || front() == '\n') && m_length >= 2) 120 | throw logic_error("ERROR empty(), m_length (" + to_string(m_length) 121 | + ") != actual string length (" + to_string(int(front() != 0)) + ")\n"); 122 | #endif // VALIDATE 123 | return !front() || front() == '\n'; 124 | } 125 | 126 | bool StringBuffer::readline(FILE* input) 127 | { 128 | #if VALIDATE >= 2 129 | assert(input && !m_cur 130 | && "readline(), valid file stream should be specified and have initial m_cur = 0"); 131 | #endif // VALIDATE 132 | *data() = 0; // Set first element to 0 as an initialization to have the empty 
string on errors 133 | const auto ibeg = ftell(input); 134 | // Read data from file until the string is read or an error occurs 135 | while(fgets(data() + m_cur, size() - m_cur, input) && data()[size()-2]) { 136 | #if TRACE >= 3 // Verified 137 | fprintf(stderr, "readline(), resizing buffer of %lu bytes, %lu pos: %s\n" 138 | , size(), m_cur, data()); 139 | #endif // TRACE 140 | m_cur = size() - 1; // Start overwriting ending '0' of the string 141 | resize(size() + (size() / (spagesize * 2) + 1) * spagesize); 142 | data()[size() - 2] = 0; // Set prelast element to 0 143 | } 144 | const auto iend = ftell(input); 145 | #if VALIDATE >= 2 146 | if(iend == -1 || ibeg == -1) 147 | perror("ERROR, file position reading error"); 148 | const size_t slen = strlen(data()); 149 | if(!((!m_cur || slen >= m_cur) && size_t(iend - ibeg) == slen)) { 150 | fprintf(stderr, "readline(), m_cur: %lu, slen: %lu, dpos: %li, str: %s\n" 151 | , m_cur, slen, iend - ibeg, data()); 152 | assert(0 && "readline(), string size validation failed"); 153 | } 154 | #endif // VALIDATE 155 | m_cur = 0; // Reset the writing (appending) position 156 | // Note: prelast and last elements of the buffer will be always zero 157 | 158 | // Set string length 159 | m_length = iend != -1 && ibeg != -1 ? 
iend - ibeg : strlen(data()); 160 | 161 | // Check for errors 162 | if((!m_length && feof(input)) || ferror(input)) { 163 | if(ferror(input)) 164 | perror("ERROR readline(), file reading error"); 165 | return false; // No more lines can be read 166 | } 167 | 168 | return true; // More lines can be read 169 | } 170 | 171 | // File I/O functions ---------------------------------------------------------- 172 | namespace daoc { 173 | 174 | void ensureDir(const string& dir) 175 | { 176 | #if TRACE >= 3 177 | fprintf(stderr, "ensureDir(), ensuring existence of: %s\n", dir.c_str()); 178 | #endif // TRACE 179 | // Check whether the output directory exists and create it otherwise 180 | path outdir = dir; 181 | if(!exists(outdir)) { 182 | error_code err; 183 | if(!create_directories(outdir, err)) 184 | // fputs(string("ERROR ensureDir(), target directory '").append(dir) 185 | // .append("' can't be created: ").append(err.message()) 186 | // .append("\n").c_str(), stderr); 187 | throw std::ios_base::failure(string("ERROR ensureDir(), target directory '") 188 | .append(dir).append("' can't be created: ") += err.message()); 189 | } else if(!is_directory(outdir)) 190 | // fputs(string("ERROR ensureDir(), target entry '").append(dir) 191 | // .append("' already exists as a non-directory path\n").c_str(), stderr); 192 | throw std::ios_base::failure(string("ERROR ensureDir(), target entry '").append(dir) 193 | += "' already exists as a non-directory path\n"); 194 | } 195 | 196 | void parseCnlHeader(NamedFileWrapper& fcls, StringBuffer& line, size_t& clsnum 197 | , size_t& ndsnum, [[maybe_unused]] bool verbose) 198 | { 199 | //! Parse count value 200 | //! 
\return - id value of 0 in case of parsing errors 201 | auto parseCount = []() noexcept -> size_t { 202 | char* tok = strtok(nullptr, " \t,"); // Note: the value can't be ended with ':' 203 | //errno = 0; 204 | const auto val = strtoul(tok, nullptr, 10); 205 | if(errno) 206 | perror(string("WARNING parseCount(), id value parsing error for the tok '") 207 | .append(tok).append("'").c_str()); 208 | return val; 209 | }; 210 | 211 | errno = 0; // Reset errno 212 | // Process the header, which is a special initial comment 213 | // The target header is: # Clusters: [,] Nodes: 214 | constexpr char clsmark[] = "clusters"; 215 | constexpr char ndsmark[] = "nodes"; 216 | constexpr char attrnameDelim[] = " \t:,"; 217 | #if TRACE >= 2 218 | size_t lnum = 0; // The number of lines read 219 | #endif // TRACE 220 | while(line.readline(fcls)) { 221 | #if TRACE >= 2 222 | ++lnum; 223 | #endif // TRACE 224 | // Skip empty lines 225 | if(line.empty()) 226 | continue; 227 | // Consider only subsequent comments 228 | if(line[0] != '#') 229 | break; 230 | 231 | // Tokenize the line 232 | char *tok = strtok(line + 1, attrnameDelim); // Note: +1 to skip the leading '#' 233 | // Skip comment without the string continuation and continuous comment 234 | if(!tok || tok[0] == '#') 235 | continue; 236 | uint8_t attrs = 0; // The number of read attributes 237 | do { 238 | // Lowercase the token 239 | for(char* pos = tok; *pos; ++pos) 240 | *pos = tolower(*pos); 241 | 242 | // Identify the attribute and read it's value 243 | if(!strcmp(tok, clsmark)) { 244 | clsnum = parseCount(); 245 | ++attrs; 246 | #if TRACE >= 2 247 | fprintf(stderr, "parseCnlHeader(), clusters: %lu\n", clsnum); 248 | #endif // TRACE 249 | } else if(!strcmp(tok, ndsmark)) { 250 | ndsnum = parseCount(); 251 | ++attrs; 252 | #if TRACE >= 2 253 | fprintf(stderr, "parseCnlHeader(), nodes: %lu\n", ndsnum); 254 | #endif // TRACE 255 | } else { 256 | #if TRACE >= 1 257 | #if TRACE < 2 258 | if(verbose) 259 | #endif // TRACE 2 260 | 
fprintf( 261 | #if TRACE >= 2 262 | stderr 263 | #else 264 | stdout 265 | #endif // TRACE 2 266 | , "WARNING parseCnlHeader(), the header parsing is omitted" 267 | " because of the unexpected attribute: %s\n", tok); 268 | #endif // TRACE 1 269 | break; 270 | } 271 | } while((tok = strtok(nullptr, attrnameDelim)) && attrs < 2); 272 | 273 | // Validate and correct the number of clusters if required 274 | // Note: it's better to reallocate a container a few times than too much overconsume the memory 275 | if(ndsnum && clsnum > ndsnum) { 276 | fprintf(stderr, "WARNING parseCnlHeader(), clsnum (%lu) typically should be" 277 | " less than ndsnum (%lu)\n", clsnum, ndsnum); 278 | clsnum = ndsnum; 279 | //assert(0 && "parseCnlHeader(), clsnum typically should be less than ndsnum"); 280 | } 281 | // Get following line for the unified subsequent processing 282 | line.readline(fcls); 283 | break; 284 | } 285 | #if TRACE >= 2 286 | fprintf(stderr, "parseCnlHeader(), processed %lu lines of '%s'\n" 287 | , lnum, fcls.name().c_str()); 288 | #endif // TRACE 289 | } 290 | 291 | size_t estimateCnlNodes(size_t filesize, float membership) noexcept 292 | { 293 | if(membership <= 0) { 294 | fprintf(stderr, "WARNING estimateCnlNodes(), invalid membership = %G specified" 295 | ", reseted to 1\n", membership); 296 | membership = 1; 297 | //throw invalid_argument("estimateCnlNodes(), membership = " 298 | // + to_string(membership) + " should be positive\n"); 299 | } 300 | 301 | size_t ndsnum = 0; // The estimated number of nodes 302 | if(filesize) { 303 | size_t magn = 10; // Decimal ids magnitude 304 | unsigned img = 1; // Index of the magnitude (10^1) 305 | size_t reminder = filesize % magn; // Reminder in bytes 306 | ndsnum = reminder / ++img; // img digits + 1 delimiter for each element 307 | while(filesize >= magn) { 308 | magn *= 10; 309 | ndsnum += (filesize - reminder) % magn / ++img; 310 | reminder = filesize % magn; 311 | } 312 | } 313 | return ndsnum / membership; 314 | } 315 | 
//! \brief Estimate the number of clusters from the number of nodes
//!
//! \param ndsnum size_t  - the number of nodes; 0 skips the estimation
//! \param membership float  - expected membership of the nodes, > 0;
//! 	non-positive values are reset to 1 with a warning
//! \return size_t  - the estimated number of clusters (0 if ndsnum is 0)
size_t estimateClusters(size_t ndsnum, float membership) noexcept
{
	if(membership <= 0) {
		fprintf(stderr, "WARNING estimateClusters(), invalid membership = %G specified"
			", reseted to 1\n", membership);
		membership = 1;
		//throw invalid_argument("estimateClusters(), membership = "
		//	+ to_string(membership) + " should be positive\n");
	}

	size_t clsnum = 0;  // The estimated number of clusters
	// Usually the number of clusters does not increase square root of the number of nodes
	// Note: do not estimate in case the number of nodes is not specified
	if(ndsnum)
		clsnum = sqrt(ndsnum * membership) + 1;  // Note: +1 to consider rounding down
	return clsnum;
}

} // daoc

--------------------------------------------------------------------------------
/src/interface_c.cpp:
--------------------------------------------------------------------------------
//! \brief Extrinsic measures evaluation interface implementation.
//!
//! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html
//! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0)
//!
//! Copyright (c)
//! \authr Artem Lutov
//! \email luart@ya.ru
//! \date 2021-03-11

#include  // move
#include
#include
#include
// For the template definitions
#include  // sqrt
#include  // sort
#include
#include "agghash.hpp"
#include "interface.hpp"
#include "interface_c.h"

using std::move;
using std::string;
using std::vector;
using std::unordered_set;

// Accessory routines ----------------------------------------------------------

// Note: a dedicated declaration id required to define default parameters
//! \brief Load collection from the provided raw collection
//!
\note This is an accessory routine for C API 33 | //! \pre All clusters in the collection are expected to be unique and not validated for 34 | //! the mutual match until makeunique is set; 35 | //! (reduce == (nodebase->ndsnum() < rcn.num)) || nodebase->ndsnum() == rcn.num 36 | //! 37 | //! \param rcn const ClusterCollection - raw collection of clusters 38 | //! \param makeunique=false bool - ensure that clusters contain unique members by 39 | //! removing the duplicates 40 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 41 | //! Used only for the node container preallocation to estimate the number of nodes 42 | //! if not specified in the file header 43 | //! \param ahash=nullptr AggHash* - resulting hash of the loaded 44 | //! member ids base (unique ids only are hashed, not all ids) if not nullptr 45 | //! \param const nodebase=nullptr NodeBaseI* - node base to filter-out or complement nodes if required 46 | //! \param reduce=false bool - whether to reduce collections by removing the non-matching nodes 47 | //! or extend collections by appending those nodes them to a single "noise" cluster 48 | //! \param lostcls=nullptr RawIds* - indices of the lost clusters during the node base 49 | //! synchronization 50 | //! \param verbose=false bool - print the number of loaded nodes to the stdout 51 | //! 
\return CollectionT - the collection is loaded successfully 52 | Collection loadCollection(const ClusterCollection rcn, bool makeunique=false 53 | , float membership=1, ::AggHash* ahash=nullptr, const NodeBaseI* nodebase=nullptr 54 | , bool reduce=false, RawIds* lostcls=nullptr, bool verbose=false); 55 | 56 | Collection loadCollection(const ClusterCollection rcn, bool makeunique, float membership 57 | , ::AggHash* ahash, const NodeBaseI* nodebase, bool reduce, RawIds* lostcls, bool verbose) 58 | { 59 | Collection cn; // Return using NRVO, named return value optimization 60 | 61 | #ifdef DEBUG 62 | // Note: asserts break libraries (=> may crash a webservice), and, hence, should be avoided in the release mode 63 | assert(((reduce == (nodebase->ndsnum() < rcn.num)) || nodebase->ndsnum() == rcn.num) 64 | && "Nodebase is not synced with the reduce argument"); 65 | #else 66 | if(!((reduce == (nodebase->ndsnum() < rcn.num)) || nodebase->ndsnum() == rcn.num)) { 67 | fprintf(stderr, "ERROR: loadCollection(). 
Nodebase is not synced with the reduce argument (reduce: %u, nodebase: %u, rcn: %u)\n" 68 | , reduce, nodebase->ndsnum(), rcn.num); 69 | return cn; 70 | } 71 | #endif // DEBUG 72 | 73 | if(!rcn.nodes) { 74 | fputs("WARNING loadCollection(), the empty input collection is omitted\n", stderr); 75 | return cn; 76 | } 77 | 78 | // Preallocate space for the clusters and nodes 79 | size_t nsnum = rcn.num * 2; // The (estimated) number of nodes 80 | if(cn.m_cls.capacity() < rcn.num) // * cn.m_cls.max_load_factor() 81 | cn.m_cls.reserve(rcn.num); 82 | if(cn.m_ndcs.bucket_count() * cn.m_ndcs.max_load_factor() < nsnum) 83 | cn.m_ndcs.reserve(nsnum); 84 | 85 | // Load clusters 86 | #if TRACE >= 2 87 | fprintf(stderr, "loadCollection(), expected %lu clusters, %lu nodes from %u raw node relations\n" 88 | , rcn.num, nsnum, rcn.num); 89 | if(nodebase) 90 | fprintf(stderr, "loadCollection(), nodebase provided with %u nodes\n", nodebase->ndsnum()); 91 | #endif // TRACE 92 | 93 | // Parse clusters 94 | ClusterHolder chd(new Cluster()); 95 | for(NodeId i = 0; i < rcn.num; ++i) { 96 | Cluster* const pcl = chd.get(); 97 | auto& members = pcl->members; 98 | const auto& ndrels = rcn.nodes[i]; 99 | members.reserve(ndrels.num); 100 | for(NodeId j = 0; j < ndrels.num; ++j) { 101 | #ifdef DEBUG 102 | assert(ndrels.ids && "Invalid (non-allocated) node relations"); 103 | #else 104 | if(!ndrels.ids) { 105 | fputs("ERROR: loadCollection(). 
Invalid (non-allocated) node relations\n", stderr); 106 | cn = Collection(); 107 | return cn; 108 | } 109 | #endif // DEBUG 110 | const auto did = ndrels.ids[j]; 111 | // Filter out nodes if required 112 | if(nodebase && reduce && !nodebase->nodeExists(did)) 113 | continue; 114 | members.push_back(did); 115 | auto& ncs = cn.m_ndcs[did]; 116 | ncs.push_back(pcl); 117 | } 118 | if(!members.empty()) { 119 | members.shrink_to_fit(); // Free over reserved space 120 | if(makeunique) { 121 | // Ensure or validate that members are unique 122 | std::sort(members.begin(), members.end()); 123 | const auto im = unique(members.begin(), members.end()); 124 | //const auto im = adjacent_find(members.begin(), members.end()); 125 | if(im != members.end()) { 126 | fprintf(stderr, "WARNING loadCollection(), #%lu cluster contained %lu duplicated members, corrected.\n" 127 | , cn.m_cls.size(), distance(im, members.end())); 128 | // Remove associated clusters 129 | for(auto jm = im; jm != members.end(); ++jm) 130 | cn.m_ndcs[*jm].pop_back(); 131 | // Remove the tail of duplicated node ids 132 | members.erase(im, members.end()); 133 | //fprintf(stderr, "WARNING loadCollection(), #%lu cluster contains duplicated member #%lu: %u\n" 134 | // , cn.m_cls.size(), distance(members.begin(), im), *im); 135 | //throw invalid_argument("loadCollection(), the cluster contains duplicated members\n"); 136 | } 137 | } 138 | members.shrink_to_fit(); // Free over reserved space 139 | //for(auto v: members) 140 | // printf(" %u", v); 141 | //puts(""); 142 | cn.m_cls.push_back(chd.release()); 143 | // Start filling a new cluster 144 | chd.reset(new Cluster()); 145 | } else if(lostcls) 146 | lostcls->push_back(lostcls->size() + cn.m_cls.size()); 147 | } 148 | 149 | // Extend collection with a single "noise" cluster containing missed nodes if required 150 | if(nodebase && !reduce && cn.m_ndcs.size() < nodebase->ndsnum()) { 151 | // Fetch complementary nodes 152 | RawIds nids; 153 | 
nids.reserve(nodebase->ndsnum() - cn.m_ndcs.size()); 154 | for(auto nid: nodebase->nodes()) 155 | if(!cn.m_ndcs.count(nid)) 156 | nids.push_back(nid); 157 | // Add complementary nodes to the 158 | Cluster* const pcl = chd.get(); 159 | pcl->members.insert(pcl->members.end(), nids.begin(), nids.end()); 160 | for(auto nid: nids) 161 | cn.m_ndcs[nid].push_back(pcl); 162 | cn.m_cls.push_back(chd.release()); 163 | } 164 | 165 | // Save some space if it is essential 166 | if(cn.m_cls.size() < cn.m_cls.capacity() / 2) 167 | cn.m_cls.shrink_to_fit(); 168 | // Rehash the clusters and nodes for faster traversing if required 169 | //if(cn.m_cls.size() < cn.m_cls.bucket_count() * cn.m_cls.max_load_factor() / 2) 170 | // cn.m_cls.reserve(cn.m_cls.size()); 171 | if(cn.m_ndcs.size() < cn.m_ndcs.bucket_count() * cn.m_ndcs.max_load_factor() / 2) 172 | cn.m_ndcs.reserve(cn.m_ndcs.size()); 173 | 174 | // Evaluate the node hash 175 | ::AggHash mbhash; // Nodes hash (only unique nodes, not all the members) 176 | for(const auto& ndcl: cn.m_ndcs) 177 | mbhash.add(ndcl.first); 178 | // Assign hash to the results 179 | cn.m_ndshash = mbhash.hash(); // Note: required to identify the unequal node base in the processing collections 180 | if(ahash) 181 | *ahash = move(mbhash); 182 | #if TRACE >= 2 183 | printf("loadCollection(), loaded %lu clusters (capacity: %lu, overhead: %0.2f %%) and" 184 | " %lu nodes (reserved %lu buckets, overhead: %0.2f %%) with hash %lu from %u raw node relations\n" 185 | , cn.m_cls.size(), cn.m_cls.capacity() 186 | , cn.m_cls.size() ? float(cn.m_cls.capacity() - cn.m_cls.size()) / cn.m_cls.size() * 100 187 | : numeric_limits::infinity() 188 | , cn.m_ndcs.size(), cn.m_ndcs.bucket_count() 189 | , cn.m_ndcs.size() ? 
float(cn.m_ndcs.bucket_count() - cn.m_ndcs.size()) / cn.m_ndcs.size() * 100 190 | : numeric_limits::infinity() 191 | , cn.m_ndshash, rcn.num); 192 | #elif TRACE >= 1 193 | if(verbose) 194 | printf("loadCollection(), loaded %lu clusters %lu nodes from %u raw node relations\n", cn.m_cls.size() 195 | , cn.m_ndcs.size(), rcn.num); 196 | #endif 197 | 198 | return cn; 199 | } 200 | 201 | /// \brief Fetch nodes from the raw collection of clusters 202 | /// 203 | /// \param cn const ClusterCollection - raw collection of clusters 204 | /// \return UniqIds - cluster nodes 205 | UniqIds fetchNodes(const ClusterCollection cn) 206 | { 207 | UniqIds nodes; // Uses NRVO return value optimization 208 | nodes.reserve(cn.num * 2); 209 | 210 | if(cn.nodes) { 211 | for(NodeId i = 0; i < cn.num; ++i) { 212 | const auto& ndrs = cn.nodes[i]; 213 | if(!ndrs.ids) { 214 | fprintf(stderr, "WARNING %s(), the empty node ids (nominally: %u ids) is omitted\n", __FUNCTION__, ndrs.num); 215 | continue; 216 | } 217 | for(NodeId j = 0; j < ndrs.num; ++j) 218 | nodes.insert(nodes.end(), ndrs.ids[j]); 219 | } 220 | } else fprintf(stderr, "WARNING %s(), the empty input collection (nominally: %u nodes) is omitted\n", __FUNCTION__, cn.num); 221 | 222 | return nodes; 223 | } 224 | 225 | /// \brief Fetch nodebase from collection of clusters, reduced (intersection) or extended (union) one 226 | /// 227 | /// \param cn1 ClusterCollection const - first raw collection of clusters 228 | /// \param cn2 ClusterCollection const - second raw collection of clusters 229 | /// \param reduced=false bool - whether reduce or extend tham 230 | /// \return NodeBase - resulting nodebase 231 | NodeBase fetchNodebase(const ClusterCollection cn1, const ClusterCollection cn2, bool reduced=false) 232 | { 233 | NodeBase nodes; // Uses NRVO return value optimization 234 | if(reduced) { 235 | UniqIds nds1 = fetchNodes(cn1); 236 | UniqIds nds2 = fetchNodes(cn2); 237 | nodes.reserve(abs(static_cast(nds1.size()) - 
static_cast(nds2.size()))); 238 | for(auto nid: nds1) 239 | if(!nds2.count(nid)) 240 | nodes.insert(nodes.end(), nid); 241 | for(auto nid: nds2) 242 | if(!nds1.count(nid)) 243 | nodes.insert(nodes.end(), nid); 244 | #if VALIDATE >= 2 245 | assert((nodes.ndsnum() <= min(nds1.size(), nds2.size())) && "Unexpected size of resulting nodes"); 246 | #endif // VALIDATE 247 | } else for(const auto& cn: {cn1, cn2}) { 248 | const auto partnds = fetchNodes(cn); 249 | nodes.insert(partnds.begin(), partnds.end()); 250 | } 251 | return nodes; 252 | } 253 | 254 | // Interface implementation ---------------------------------------------------- 255 | Probability f1p(const ClusterCollection cn1, const ClusterCollection cn2) 256 | { 257 | return f1(cn1, cn2, F1_PARTPROB, nullptr, nullptr); 258 | } 259 | 260 | Probability f1h(const ClusterCollection cn1, const ClusterCollection cn2) 261 | { 262 | return f1(cn1, cn2, F1_HARMONIC, nullptr, nullptr); 263 | } 264 | 265 | Probability f1(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 266 | , Probability* rec, Probability* prc) 267 | { 268 | Probability tmp; // Temporary buffer, a placeholder 269 | return f1x(cn1, cn2, kind, rec ? rec : &tmp, prc ? 
prc : &tmp, MATCH_WEIGHTED, 1, 1, 0); 270 | } 271 | 272 | Probability f1x(const ClusterCollection cn1, const ClusterCollection cn2, F1Kind kind 273 | , Probability* rec, Probability* prc, MatchKind mkind, uint8_t sync, uint8_t makeunique, uint8_t verbose) 274 | { 275 | #if TRACE >= 2 276 | if(verbose) 277 | printf("%s(), loading clustering collections of size: %u, %u\n", __FUNCTION__ 278 | , cn1.num, cn2.num); 279 | #endif // TRACE 280 | assert(rec && prc && "Invalid output arguments"); 281 | // Load nodes 282 | const bool reduce = false; // Whether to reduce or extend collections of clusters 283 | Probability res = 0; 284 | if(sync) { 285 | NodeBase ndbase = fetchNodebase(cn1, cn2, reduce); 286 | Collection c1 = loadCollection(cn1, makeunique, 1, nullptr, &ndbase, reduce); 287 | Collection c2 = loadCollection(cn2, makeunique, 1, nullptr, &ndbase, reduce); 288 | res = Collection::f1(c1, c2, static_cast(kind), *rec, *prc, static_cast(mkind), verbose); 289 | } else { 290 | Collection c1 = loadCollection(cn1); 291 | Collection c2 = loadCollection(cn2); 292 | res = Collection::f1(c1, c2, static_cast(kind), *rec, *prc, static_cast(mkind), verbose); 293 | } 294 | return res; 295 | } 296 | 297 | Probability omega(const ClusterCollection cn1, const ClusterCollection cn2) 298 | { 299 | return omegax(cn1, cn2, 0, 1, 1); 300 | } 301 | 302 | Probability omegaExt(const ClusterCollection cn1, const ClusterCollection cn2) 303 | { 304 | return omegax(cn1, cn2, 1, 1, 1); 305 | } 306 | 307 | Probability omegax(const ClusterCollection cn1, const ClusterCollection cn2, uint8_t ext, uint8_t sync, uint8_t makeunique) 308 | { 309 | // Transform loaded and pre-processed collection to the representation 310 | // suitable for Omega Index evaluation 311 | RawClusters cls1; 312 | RawClusters cls2; 313 | NodeRClusters ndrcs; 314 | 315 | const bool reduce = false; // Whether to reduce or expand collections of clusters 316 | if(sync) { 317 | NodeBase ndbase = fetchNodebase(cn1, cn2, reduce); 318 
| Collection c1 = loadCollection(cn1, makeunique, 1, nullptr, &ndbase, reduce); 319 | Collection c2 = loadCollection(cn2, makeunique, 1, nullptr, &ndbase, reduce); 320 | c1.template transfer(cls1, ndrcs); 321 | c2.template transfer(cls2, ndrcs); 322 | } else { 323 | Collection c1 = loadCollection(cn1); 324 | Collection c2 = loadCollection(cn2); 325 | c1.template transfer(cls1, ndrcs); 326 | c2.template transfer(cls2, ndrcs); 327 | } 328 | return ext ? omega(ndrcs, cls1, cls2) 329 | : omega(ndrcs, cls1, cls2); 330 | } 331 | -------------------------------------------------------------------------------- /shared/fileio.hpp: -------------------------------------------------------------------------------- 1 | //! \brief File IO utils 2 | //! 3 | //! Interface macro definitions: 4 | //! INCLUDE_STL_FS - include STL filesystem library under fs namespace. This macros is 5 | //! defined to avoid repetitive conditional inclusion of the STL FS. 6 | //! 7 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 8 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 9 | //! 10 | //! Copyright (c) 11 | //! \authr Artem Lutov 12 | //! \email luart@ya.ru 13 | //! \date 2017-02-13 14 | 15 | #ifndef FILEIO_H 16 | #define FILEIO_H 17 | 18 | #include // uintX_t 19 | #include // FILE 20 | #include // move 21 | #include 22 | #include 23 | #include 24 | // For the template definitions 25 | #include // strtok 26 | #include // sqrt 27 | 28 | #ifdef INCLUDE_STL_FS 29 | #if defined(__has_include) && __has_include() && __cplusplus >= 201703L // C++17+ 30 | #include 31 | namespace fs = std::filesystem; 32 | #elif defined(__has_include) && __has_include() 33 | #include 34 | namespace fs = std::experimental::filesystem; 35 | #else 36 | #error "STL filesystem is not available. The native alternative is not implemented." 
37 | #endif // __has_include 38 | #endif // INCLUDE_STL_FS 39 | 40 | #include "agghash.hpp" 41 | 42 | //#include "types.h" 43 | 44 | 45 | namespace daoc { 46 | 47 | using std::move; 48 | using std::string; 49 | using std::vector; 50 | using std::unordered_set; 51 | 52 | // File Wrapping Types --------------------------------------------------------- 53 | //! \brief Wrapper around the FILE* to prevent hanging file descriptors 54 | class FileWrapper { 55 | FILE* m_dsc; 56 | bool m_tidy; 57 | public: 58 | //! \brief Constructor 59 | //! 60 | //! \param fd FILE* - the file descriptor to be held 61 | //! \param cleanup=true bool - close the file descriptor on destruction 62 | //! (typically false if stdin/out is supplied) 63 | FileWrapper(FILE* fd=nullptr, bool cleanup=true) noexcept 64 | : m_dsc(fd), m_tidy(cleanup) {} 65 | 66 | //! \brief Copy constructor 67 | //! \note Any file descriptor should have a single owner 68 | FileWrapper(const FileWrapper&)=delete; 69 | 70 | //! \brief Move constructor 71 | // ATTENTION: fw.m_dsc is not set to nullptr by the default move operation 72 | // ATTENTION: std::vector will move their elements if the elements' move constructor 73 | // is noexcept, and copy otherwise (unless the copy constructor is not accessible) 74 | FileWrapper(FileWrapper&& fw) noexcept 75 | : FileWrapper(fw.m_dsc, fw.m_tidy) 76 | { 77 | fw.m_dsc = nullptr; 78 | } 79 | 80 | //! \brief Copy assignment 81 | //! \note Any file descriptor should have the single owner 82 | FileWrapper& operator= (const FileWrapper&)=delete; 83 | 84 | //! \brief Move assignment 85 | // ATTENTION: fw.m_dsc is not set to nullptr by the default move operation 86 | FileWrapper& operator= (FileWrapper&& fw) noexcept 87 | { 88 | reset(fw.m_dsc, fw.m_tidy); 89 | fw.m_dsc = nullptr; 90 | return *this; 91 | } 92 | 93 | //! \brief Destructor 94 | ~FileWrapper() // noexcept by default 95 | { 96 | if(m_dsc && m_tidy) { 97 | fclose(m_dsc); 98 | m_dsc = nullptr; 99 | } 100 | } 101 | 102 | //! 
\brief Implicit conversion to the file descriptor 103 | //! 104 | //! \return FILE* - self as a file descriptor 105 | operator FILE*() const noexcept { return m_dsc; } 106 | 107 | //! \brief Reset the wrapper 108 | //! 109 | //! \param fd FILE* - the file descriptor to be held 110 | //! \param cleanup=true bool - close the file descriptor on destruction 111 | //! (typically false if stdin/out is supplied) 112 | //! \return void 113 | void reset(FILE* fd=nullptr, bool cleanup=true) noexcept 114 | { 115 | if(m_dsc && m_tidy && m_dsc != fd) 116 | fclose(m_dsc); 117 | m_dsc = fd; 118 | m_tidy = cleanup; 119 | } 120 | 121 | //! \brief Release ownership of the holding file 122 | //! 123 | //! \return FILE* - file descriptor 124 | FILE* release() noexcept 125 | { 126 | auto fd = m_dsc; 127 | m_dsc = nullptr; 128 | return fd; 129 | } 130 | }; 131 | 132 | //! \brief Wrapper around the FILE* that holds also the filename giving ability 133 | //! to reopen it and perform meaningful 134 | // Note: we can't inherit from the FileWrapper because semantic of reset differs 135 | class NamedFileWrapper { 136 | FileWrapper m_file; //!< File descriptor 137 | string m_name; //!< File name 138 | public: 139 | //! \brief Default Constructor 140 | // Note: Required tor return empty objects using NRVO optimization 141 | NamedFileWrapper() noexcept: m_file(), m_name() {} 142 | 143 | //! \brief Constructor 144 | //! \pre Parent directory must exists 145 | //! 146 | //! \param filename const char* - new file name to be opened 147 | //! \param mode const char* - opening mode, the same as fopen() has 148 | NamedFileWrapper(const char* filename, const char* mode) 149 | : m_file(filename && mode ? fopen(filename, mode) : nullptr) 150 | , m_name(filename ? filename : "") {} 151 | 152 | //! \brief Copy constructor 153 | //! \note Any file descriptor should have a single owner 154 | NamedFileWrapper(const NamedFileWrapper&)=delete; 155 | 156 | //! 
\brief Move constructor 157 | // ATTENTION: std::vector will move their elements if the elements' move constructor 158 | // is noexcept, and copy otherwise (unless the copy constructor is not accessible) 159 | NamedFileWrapper(NamedFileWrapper&& fw) noexcept 160 | : m_file(move(fw.m_file)), m_name(move(fw.m_name)) {} 161 | 162 | //! \brief Copy assignment 163 | //! \note Any file descriptor should have the single owner 164 | NamedFileWrapper& operator= (const NamedFileWrapper&)=delete; 165 | 166 | //! \brief Move assignment 167 | NamedFileWrapper& operator= (NamedFileWrapper&& fw) noexcept 168 | { 169 | m_file = move(fw.m_file); 170 | m_name = move(fw.m_name); 171 | return *this; 172 | } 173 | 174 | //! \brief File name 175 | //! 176 | //! \return const string& - file name 177 | const string& name() const noexcept { return m_name; } 178 | 179 | //! \brief File size 180 | //! 181 | //! \return size_t - file size or -1 on error 182 | size_t size() const noexcept; 183 | 184 | //! \brief Implicit conversion to the file descriptor 185 | //! 186 | //! \return FILE* - file descriptor 187 | operator FILE*() const noexcept { return m_file; } 188 | 189 | //! \brief Reopen the file under another mode 190 | //! 191 | //! \param mode const char* - the mode of operations, the same as in fopen() 192 | //! \return NamedFileWrapper& - the reopened file or closed (if can't be opened) 193 | NamedFileWrapper& reopen(const char* mode) 194 | { 195 | m_file.reset(freopen(nullptr, mode, m_file)); // m_name.c_str() 196 | return *this; 197 | } 198 | 199 | //! \brief Reset the file, closes current file and opens another one 200 | //! \pre Parent directory must exists 201 | //! 202 | //! \param filename const char* - new file name to be opened 203 | //! \param mode const char* - opening mode, the same as fopen() has 204 | //! \return NamedFileWrapper& - the newly opened file or just the old one closed 205 | NamedFileWrapper& reset(const char* filename, const char* mode); 206 | 207 | //! 
\brief Release ownership of the holding file 208 | //! 209 | //! \return FILE* - file descriptor 210 | FILE* release() noexcept { return m_file.release(); } 211 | }; 212 | 213 | // File Reading Types ---------------------------------------------------------- 214 | //! \brief Base of the StringBuffer 215 | using StringBufferBase = vector; 216 | 217 | //! \brief String buffer to real file by lines using c-strings 218 | //! \note The last symbol in the string is always set to 0 automatically 219 | class StringBuffer: protected StringBufferBase { 220 | constexpr static size_t spagesize = 4096; // Small page size on x64 221 | 222 | size_t m_cur; //! Current position for the writing 223 | size_t m_length; //! Current length of the holding c-string 224 | //protected: 225 | // StringBufferBase::size(); 226 | public: 227 | //! \brief 228 | //! \post the allocated buffer will have size >= 2 229 | //! 230 | //! \param size=spagesize size_t - size of the buffer 231 | // Note: can throw bad_alloc 232 | StringBuffer(size_t size=spagesize); 233 | 234 | //! \brief Reset the string and it's shrink the allocated buffer 235 | //! 236 | //! \param size=spagesize size_t - new initial size of the string buffer 237 | //! \return void 238 | void reset(size_t size=spagesize); 239 | 240 | //! \brief Length of the string including the terminating '\n' if present, 241 | //! but without the terminating '0' 242 | //! 243 | //! \return size_t - length of the holding c-string without the null terminator 244 | size_t length() const noexcept { return m_length; } 245 | 246 | //! \brief Whether the string is empty or starts with the newline symbol 247 | //! \attention empty() is true for '\n' when length() == 1 248 | //! 249 | //! \return bool - the string is empty or starts with the '\n' 250 | bool empty() const 251 | #if VALIDATE < 2 252 | noexcept 253 | #endif // VALIDATE 254 | ; 255 | 256 | //! 
\brief C-string including '\n' if it was present in the file 257 | operator char*() noexcept { return data(); } 258 | 259 | //! \brief Const C-string including '\n' if it was present in the file 260 | operator const char*() const noexcept { return data(); } 261 | 262 | //! \brief Make public indexing operators 263 | using StringBufferBase::operator[]; 264 | using StringBufferBase::at; 265 | 266 | //! \brief Read line from the file and store including the terminating '\n' symbol 267 | //! \attention The read string contains the trailing '\n' if exist in the file 268 | //! \note The buffer might contain [part of] the read line on reading error 269 | //! 270 | //! \param input FILE* - processing file 271 | //! \return bool - whether the current line is read without any errors or 272 | //! the all lines already read (and the current one is empty) 273 | bool readline(FILE* input); 274 | }; 275 | 276 | #ifndef NO_FILEIO // Turn off file I/O 277 | 278 | // File I/O functions declaration ---------------------------------------------- 279 | //! \brief Ensure existence of the specified directory 280 | //! 281 | //! \param dir const string& - directory to be created if has not existed 282 | //! \return void 283 | void ensureDir(const string& dir); 284 | 285 | //! \brief Parse the header of CNL file and validate the results 286 | //! \post clsnum <= ndsnum if ndsnum > 0. 0 means not specified 287 | //! 288 | //! \param fcls NamedFileWrapper& - the reading file 289 | //! \param line StringBuffer& - processing line (string, header) being read from the file 290 | //! \param[out] clsnum size_t& - resulting number of clusters if specified, 0 in case of parsing errors 291 | //! \param[out] ndsnum size_t& - resulting number of nodes if specified, 0 in case of parsing errors 292 | //! \param verbose=false bool - print information about the header parsing issue to the stdout 293 | //! 
\return void 294 | void parseCnlHeader(NamedFileWrapper& fcls, StringBuffer& line, size_t& clsnum 295 | , size_t& ndsnum, bool verbose=false); 296 | 297 | //! \brief Load all unique nodes from the CNL file with optional filtering by the cluster size 298 | //! 299 | //! \tparam Id - Node id type 300 | //! \tparam AccId - Accumulated node ids type 301 | //! 302 | //! \param file NamedFileWrapper& - input collection of clusters in the CNL format 303 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 304 | //! Used only for the node container preallocation to estimate the number of nodes 305 | //! if not specified in the file header 306 | //! \param ahash=nullptr AggHash* - resulting aggregated hash of the loaded 307 | //! node ids if not nullptr 308 | //! \param cmin=0 size_t - min allowed cluster size 309 | //! \param cmax=0 size_t - max allowed cluster size, 0 means any size 310 | //! \param verbose=true bool - print the number of loaded nodes to the stdout 311 | //! \return unordered_set - the loaded collection of nodes 312 | template 313 | unordered_set loadNodes(NamedFileWrapper& file, float membership=1 314 | , AggHash* ahash=nullptr, size_t cmin=0, size_t cmax=0, bool verbose=true); 315 | 316 | //! \brief Estimate the number of nodes from the CNL file size 317 | //! 318 | //! \param filesize size_t - the number of bytes in the CNL file 319 | //! \param membership=1.f float - average membership of the node, 320 | //! > 0, typically ~= 1 321 | //! \return size_t - estimated number of nodes 322 | size_t estimateCnlNodes(size_t filesize, float membership=1.f) noexcept; 323 | 324 | //! \brief Estimate the number of clusters from the number of nodes 325 | //! 326 | //! \param ndsnum size_t - the number of nodes 327 | //! \param membership=1.f float - average membership of the node, 328 | //! > 0, typically ~= 1 329 | //! 
\return size_t - estimated number of clusters 330 | size_t estimateClusters(size_t ndsnum, float membership=1.f) noexcept; 331 | 332 | //! \brief Convert value to yes/no c-string 333 | //! 334 | //! \param val bool - value to be converted 335 | //! \return constexpr const char* - resulting c-string 336 | constexpr const char* toYesNo(bool val) noexcept { return val ? "yes" : "no"; } 337 | 338 | // File I/O templates definition ----------------------------------------------- 339 | template 340 | unordered_set loadNodes(NamedFileWrapper& file, float membership 341 | , AggHash* ahash, size_t cmin, size_t cmax, bool verbose) 342 | { 343 | unordered_set nodebase; // Node base; Note: returned using NRVO optimization 344 | 345 | if(!file) 346 | return nodebase; 347 | 348 | // Note: CNL [CSN] format only is supported 349 | size_t clsnum = 0; // The number of clusters 350 | size_t ndsnum = 0; // The number of nodes 351 | 352 | // Note: strings defined out of the cycle to avoid reallocations 353 | StringBuffer line; // Reading line 354 | // Parse header and read the number of clusters if specified 355 | // Note: line includes terminating '\n' 356 | parseCnlHeader(file, line, clsnum, ndsnum, verbose); 357 | 358 | // Estimate the number of nodes in the file if not specified 359 | if(!ndsnum) { 360 | size_t cmsbytes = file.size(); 361 | if(cmsbytes != size_t(-1)) // File length fetching failed 362 | ndsnum = estimateCnlNodes(cmsbytes, membership); 363 | else if(clsnum) 364 | ndsnum = 2 * clsnum; // / membership; // Note: use optimistic estimate instead of pessimistic (square / membership) to not overuse the memory 365 | #if TRACE >= 2 366 | fprintf(stderr, "loadNodes(), estimated %lu nodes\n", ndsnum); 367 | #endif // TRACE 368 | } 369 | #if TRACE >= 2 370 | else fprintf(stderr, "loadNodes(), specified %lu nodes\n", ndsnum); 371 | #endif // TRACE 372 | 373 | // Preallocate space for nodes 374 | if(ndsnum) 375 | nodebase.reserve(ndsnum); 376 | 377 | // Load clusters 378 | // 
ATTENTION: without '\n' delimiter the terminating '\n' is read as an item 379 | constexpr char mbdelim[] = " \t\n"; // Delimiter for the members 380 | vector cnds; // Cluster nodes. Note: a dedicated container is required to filter clusters by size 381 | cnds.reserve(sqrt(ndsnum)); // Note: typically cluster size does not increase the square root of the number of nodes 382 | #if TRACE >= 2 383 | size_t totmbs = 0; // The number of read member nodes from the file including repetitions 384 | size_t fclsnum = 0; // The number of read clusters from the file 385 | #endif // TRACE 386 | do { 387 | #if TRACE >= 3 388 | // Note: line includes terminating '\n' 389 | fprintf(stderr, "%lu> %s", fclsnum, static_cast(line)); 390 | #endif // TRACE 391 | char *tok = strtok(line, mbdelim); // const_cast(line.data()) 392 | 393 | // Skip comments 394 | if(!tok || tok[0] == '#') 395 | continue; 396 | // Skip the cluster id if present 397 | if(tok[strlen(tok) - 1] == '>') { 398 | const char* cidstr = tok; 399 | tok = strtok(nullptr, mbdelim); 400 | // Skip empty clusters, which actually should not exist 401 | if(!tok) { 402 | fprintf(stderr, "WARNING loadNodes(), empty cluster" 403 | " exists: '%s', skipped\n", cidstr); 404 | continue; 405 | } 406 | } 407 | do { 408 | // Note: only node id is parsed, share part is skipped if exists, 409 | // but potentially can be considered in NMI and F1 evaluation. 410 | // In the latter case abs diff of shares instead of co occurrence 411 | // counting should be performed. 
412 | Id nid = strtoul(tok, nullptr, 10); 413 | #if VALIDATE >= 2 414 | if(!nid && tok[0] != '0') { 415 | fprintf(stderr, "WARNING loadNodes(), conversion error of '%s' into 0: %s\n" 416 | , tok, strerror(errno)); 417 | continue; 418 | } 419 | #endif // VALIDATE 420 | #if TRACE >= 2 421 | ++totmbs; // Update the total number of read members 422 | #endif // TRACE 423 | cnds.push_back(nid); 424 | } while((tok = strtok(nullptr, mbdelim))); 425 | #if TRACE >= 2 426 | ++fclsnum; // The number of valid read lines, i.e. clusters 427 | #endif // TRACE 428 | 429 | // Filter read cluster by size 430 | if(cnds.size() >= cmin && (!cmax || cnds.size() <= cmax)) 431 | nodebase.insert(cnds.begin(), cnds.end()); 432 | // Prepare outer vars for the next iteration 433 | cnds.clear(); 434 | } while(line.readline(file)); 435 | // // Rehash the nodes decreasing the allocated space if required 436 | // if(nodebase.size() <= nodebase.bucket_count() * nodebase.max_load_factor() / 3) 437 | // nodebase.reserve(nodebase.size()); 438 | #if TRACE >= 2 439 | printf("loadNodes(), the loaded base has %lu nodes from the input %lu members of %lu clusters\n" 440 | , nodebase.size(), totmbs, fclsnum); 441 | #else 442 | if(verbose) 443 | printf("loadNodes(), nodebase nodes loaded: %lu\n", nodebase.size()); 444 | #endif // TRACE 2 445 | 446 | // Evaluate nodes hash if required 447 | if(ahash && nodebase.size()) { 448 | AggHash ndsh; 449 | for(auto nid: nodebase) 450 | ndsh.add(nid); 451 | *ahash = move(ndsh); 452 | } 453 | 454 | return nodebase; 455 | } 456 | 457 | #endif // NO_FILEIO 458 | 459 | } // daoc 460 | 461 | #endif // FILEIO_H 462 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xmeasures - Extrinsic Clustering Measures 2 | Extremely fast evaluation of accuracy (extrinsic quality) measures for the [overlapping/fuzzy] clusterings (collections of groups of items): 
3 | family of **[mean] F1 measures** (including Average F1-Score) and **Omega Index** *(fuzzy version of the Adjusted Rand Index)* for overlapping multi-resolution clusterings with unequal node base (and optional node base synchronization) using various matching policies (micro, macro and combined weighting), 4 | and standard **NMI** for non-overlapping clustering on a single resolution. `xmeasures` also provides cluster labeling with the indices of the ground-truth clusters considering 1:n match and evaluating F1, precision and recall of the labeled clusters. 5 | 6 | `xmeasures` evaluates F1 and NMI for collections of hundreds of thousands of [overlapping] clusters (covers, communities) within a dozen seconds on an ordinary laptop using a single CPU core. The computational time is O(N) 7 | unlike O(N \* C) 8 | of the existing state-of-the-art implementations, where N is the number of nodes in the network and C is the number of clusters. Computational complexity for Omega Index is standard and equals O(N^2 \* s/2), where s is the average sharing ratio (membership) of the nodes, typically s -> 1. 9 | `xmeasures` is one of the utilities designed for the [PyCaBeM](https://github.com/eXascaleInfolab/PyCABeM) clustering benchmark to evaluate clusterings of large networks. 
10 | 11 | `The paper:` [Accuracy Evaluation of Overlapping and Multi-resolution Clustering Algorithms on Large Datasets](https://github.com/eXascaleInfolab/xmeasures/blob/master/docs/xmeasures.pdf) 12 | ```bibtex 13 | @inproceedings{Xms19, 14 | author={Artem Lutov and Mourad Khayati and Philippe Cudr{\'e}-Mauroux}, 15 | title={Accuracy Evaluation of Overlapping and Multi-resolution Clustering Algorithms on Large Datasets}, 16 | booktitle={6th IEEE International Conference on Big Data and Smart Computing (BigComp 2019)}, 17 | year={2019}, 18 | keywords={accuracy metrics, overlapping community evaluation, multi-resolution clustering evaluation, Generalized NMI, Omega Index, MF1, similarity of collections of sets} 19 | } 20 | ``` 21 | 22 | Related papers about the implemented measures: 23 | - [Omega Index](http://dx.doi.org/10.1207/s15327906mbr2302_6) ([fuzzy version of the Adjusted Rand Index](http://iopscience.iop.org/article/10.1088/1742-5468/2011/02/P02017/meta)), which equal to ARI when applied for the non-overlapping clusterings; 24 | - Mean F1 measures: [F1a (Average F1-Score)](https://cs.stanford.edu/people/jure/pubs/bigclam-wsdm13.pdf), F1p is much more indicative and discriminative than the presented there F1a but the respective paper has not been published yet; 25 | - [NMI measure](http://www.jmlr.org/papers/volume11/vinh10a/vinh10a.pdf). 26 | > Standard NMI is implemented considering overlapping and multi-resolution clustering only to demonstrate non-applicability of the standard NMI for such cases, where it yields unfair results. See [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) for the fair generalized NMI evaluation. 
27 | 28 | The execution time and the total processing time (relative power consumption) of `xmeasures` on a single CPU core vs [ParallelComMetric](https://github.com/eXascaleInfolab/ParallelComMetric) on multiple SMP cores evaluated on the SNAP DBLP dataset and shown in the log scale demonstrates that `xmeasures` evaluates F1 family measures multiple orders of magnitude faster than other state-of-the-art solutions: 29 | ![Clubmark_Poster-w1024](images/CPU-Timings-DBLP.png) 30 | 31 | 32 | Author: (c) Artem Lutov 33 | 34 | ## Content 35 | - [Deployment](#deployment) 36 | - [Requirements](#requirements) 37 | - [Compilation](#compilation) 38 | - [Application Compilation](#application-compilation) 39 | - [Library Compilation](#library-compilation) 40 | - [Usage](#usage) 41 | - [Related Projects](#related-projects) 42 | 43 | # Deployment 44 | 45 | The target platform is NIX/Posix, the binary is compiled for Linux Ubuntu x64 and also should work on Windows 10+ x64 (see details in [this article](https://www.howtogeek.com/249966/how-to-install-and-use-the-linux-bash-shell-on-windows-10/)). 46 | 47 | ## Requirements 48 | There are no requirements for the execution or compilation except the *standard C++ library*. 49 | 50 | To run the *prebuilt executable* on Linux Ubuntu 16.04 x64, the standard library can be installed by: `$ sudo apt-get install libstdc++6`. 51 | 52 | ## Compilation 53 | ### Application Compilation 54 | ``` 55 | $ make release 56 | ``` 57 | 58 | The following **build errors** might occur on some platforms and should be resolved as outlined. 59 | * If your default compiler is *g++/gcc < 5.x*, then `g++-5` or higher should be installed and `Makefile` might need to be edited replacing `g++`, `gcc` with `g++-5`, `gcc-5`. 60 | * `-fstack-clash-protection` compiler flag is added since `xmeasures v4.0.5`, which might not be supported by Clang/LLVM and *GCC < 8.2*.
This flag just hardens the application against some stack overflow attacks and should be excluded from the `Makefile` if not supported on your platform. 61 | 62 | To **update/extend the input parameters**, modify `args.ggo` and run `GenerateArgparser.sh` (calls `gengetopt`) before running `make`. To install [*gengetopt*](https://www.gnu.org/software/gengetopt), execute: `$ sudo apt-get install gengetopt`. 63 | 64 | 65 | ### Library Compilation 66 | Some core functionality of xmeasures is available as a library with C API, making it possible to link the library from Python and other scripting languages. 67 | The interface is defined in `include/interface_c.h`. 68 | To build the library, execute: 69 | ``` 70 | $ make -f Makefile_lib release 71 | ``` 72 | 73 | # Usage 74 | Execution Options: 75 | ``` 76 | $ ../xmeasures -h 77 | xmeasures 4.0.4 78 | 79 | Extrinsic measures evaluation: Omega Index (a fuzzy version of the Adjusted 80 | Rand Index, identical to the Fuzzy Rand Index) and [mean] F1-score (prob, harm 81 | and avg) for the overlapping multi-resolution clusterings, and standard NMI for 82 | the non-overlapping clustering on a single resolution. Unequal node base is 83 | allowed in the evaluating clusterings and optionally can be synchronized 84 | removing nodes from the clusters missed in one of the clusterings 85 | (collections). 86 | 87 | Usage: xmeasures [OPTIONS] clustering1 clustering2 88 | 89 | clustering - input file, collection of the clusters to be evaluated. 90 | 91 | Examples: 92 | $ ./xmeasures -fp -kc networks/5K25.cnl tests/5K25_l0.825/5K25_l0.825_796.cnl 93 | $ ./xmeasures -fh -kc -i tests/5K25.cll -ph -l networks/5K25.cnl 94 | tests/5K25_l0.825/5K25_l0.825_796.cnl 95 | $ ./xmeasures -ox tests/clsevalsx/omega_c4.3-1.cnl 96 | tests/clsevalsx/omega_c4.3-2.cnl 97 | 98 | 99 | Extrinsic measures are evaluated, i.e. two input clusterings (collections of 100 | clusters) are compared to each other. 
Optionally, a labeling of the evaluating 101 | clusters with the specified ground-truth clusters is performed. 102 | NOTE: 103 | - Multiple evaluating measures can be specified. 104 | - Each cluster should contain unique members, which is ensured only if the 105 | 'unique' option is specified. 106 | - All clusters should be unique to not affect Omega Index evaluation, which 107 | can be ensured by the [resmerge](https://github.com/eXascaleInfolab/resmerge) 108 | utility. 109 | - Non-corrected unequal node base in the clusterings is allowed, it penalizes 110 | the match.Use [OvpNMI](https://github.com/eXascaleInfolab/OvpNMI) or 111 | [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) for NMI evaluation 112 | in the arbitrary collections (still each cluster should contain unique 113 | members). 114 | 115 | Evaluating measures are: 116 | - OI - Omega Index (a fuzzy version of the Adjusted Rand Index, identical to 117 | the Fuzzy Rand Index), which yields the same value as Adjusted Rand Index when 118 | applied to the non-overlapping clusterings. 119 | - [M]F1 - various [mean] F1 measures of the Greatest (Max) Match including 120 | the Average F1-Score (suggested by J. Leskovec) with the optional weighting. 121 | NOTE: There are 3 matching policies available for each kind of F1. The most 122 | representative evaluation is performed by the F1p with combined matching 123 | policy (considers both micro and macro weighting). 124 | - NMI - Normalized Mutual Information, normalized by either max or also 125 | sqrt, avg and min information content denominators. 126 | ATTENTION: This is a standard NMI, which should be used ONLY for the HARD 127 | partitioning evaluation (non-overlapping clustering on a single resolution). 128 | It penalizes overlapping and multi-resolution structures. 
129 | 130 | 131 | -h, --help Print help and exit 132 | -V, --version Print version and exit 133 | -O, --ovp evaluate overlapping instead of the 134 | multi-resolution clusters, where max matching 135 | for any shared member between R overlapping 136 | clusters is 1/R (the member is shared) 137 | instead of 1 (the member fully belongs to 138 | each [hierarchical sub]group) for the member 139 | belonging to R distinct clusters on R 140 | resolutions. 141 | NOTE: It has no effect for the Omega Index 142 | evaluation. (default=off) 143 | -q, --unique ensure on loading that all cluster members are 144 | unique by removing all duplicates. 145 | (default=off) 146 | -s, --sync=filename synchronize with the specified node base 147 | omitting the non-matching nodes. 148 | NOTE: The node base can be either a separate, 149 | or an evaluating CNL file, in the latter case 150 | this option should precede the evaluating 151 | filename not repeating it 152 | -m, --membership=FLOAT average expected membership of the nodes in the 153 | clusters, > 0, typically >= 1. Used only to 154 | facilitate estimation of the nodes number on 155 | the containers preallocation if this number 156 | is not specified in the file header. 157 | (default=`1') 158 | -d, --detailed detailed (verbose) results output 159 | (default=off) 160 | 161 | Omega Index: 162 | -o, --omega evaluate Omega Index (a fuzzy version of the 163 | Adjusted Rand Index, identical to the Fuzzy 164 | Rand Index and on the non-overlapping 165 | clusterings equals to ARI). (default=off) 166 | -x, --extended evaluate extended (Soft) Omega Index, which 167 | does not excessively penalize distinctly 168 | shared nodes. (default=off) 169 | 170 | Mean F1: 171 | -f, --f1[=ENUM] evaluate mean F1 of the [weighted] average of 172 | the greatest (maximal) match by F1 or partial 173 | probability. 
174 | NOTE: F1h <= F1a, where: 175 | - p (F1p or Ph) - Harmonic mean (F1) of two 176 | [weighted] averages of the Partial 177 | Probabilities, the most indicative as 178 | satisfies the largest number of the Formal 179 | Constraints (homogeneity, completeness and 180 | size/quantity except the rag bag in some 181 | cases); 182 | - h (F1h) - Harmonic mean (F1) of two 183 | [weighted] averages of all local F1 (harmonic 184 | means of the Precision and Recall of the best 185 | matches of the clusters); 186 | - a (F1a) - Arithmetic mean (average) of 187 | two [weighted] averages of all local F1, the 188 | least discriminative and satisfies the lowest 189 | number of the Formal Constraints. 190 | Precision and recall are evaluated relative 191 | to the FIRST clustering dataset 192 | (ground-truth, gold standard). 193 | (possible values="partprob", 194 | "harmonic", "average" default=`partprob') 195 | -k, --kind[=ENUM] kind of the matching policy: 196 | - w - Weighted by the number of nodes in 197 | each cluster (known as micro weighting, 198 | MF1_micro) 199 | - u - Unweighed, where each cluster is 200 | treated equally (known as macro weighting, 201 | MF1_macro) 202 | - c - Combined(w, u) using geometric mean 203 | (drops the value not so much as harmonic 204 | mean) 205 | (possible values="weighted", 206 | "unweighed", "combined" 207 | default=`weighted') 208 | 209 | Clusters Labeling & F1 evaluation with Precision and Recall: 210 | -l, --label=gt_filename label evaluating clusters with the specified 211 | ground-truth (gt) cluster indices and 212 | evaluate F1 (including Precision and Recall) 213 | of the (best) MATCHED labeled clusters only 214 | (without the probable subclusters). 215 | NOTE: If 'sync' option is specified then the 216 | file name of the clusters labels should be 217 | the same as the node base (if specified) and 218 | should be in the .cnl format. 
The file name 219 | can be either a separate or an evaluating CNL 220 | file, in the latter case this option should 221 | precede the evaluating filename not repeating 222 | it. 223 | Precision and recall are evaluated relative 224 | to the FIRST clustering dataset 225 | (ground-truth, gold standard). 226 | 227 | -p, --policy[=ENUM] Labels matching policy: 228 | - p - Partial Probabilities (maximizes 229 | gain) 230 | - h - Harmonic Mean (minimizes loss, 231 | maximizes F1) 232 | (possible values="partprob", "harmonic" 233 | default=`harmonic') 234 | -u, --unweighted Labels weighting policy on F1 evaluation: 235 | weighted by the number of instances in each 236 | label by default (micro weighting, F1_micro) 237 | or unweighed, where each label is treated 238 | equally (i.e. macro weighting, F1_macro) 239 | (default=off) 240 | -i, --identifiers=labels_filename 241 | output labels (identifiers) of the evaluating 242 | clusters as lines of space-separated indices 243 | of the ground-truth clusters (.cll - clusters 244 | labels list) 245 | NOTE: If 'sync' option is specified then the 246 | reduced collection is outputted to the 247 | .cnl besides the 248 | 249 | 250 | NMI: 251 | -n, --nmi evaluate NMI (Normalized Mutual Information), 252 | applicable only to the non-overlapping 253 | clusters (default=off) 254 | -a, --all evaluate all NMIs using sqrt, avg and min 255 | denominators besides the max one 256 | (default=off) 257 | -e, --ln use ln (exp base) instead of log2 (Shannon 258 | entropy, bits) for the information measuring 259 | (default=off) 260 | ``` 261 | 262 | > Empty lines and comments (lines starting with #) in the input file (cnl format) are omitted. 
263 | 264 | **Examples** 265 | Evaluate harmonic mean of the weighted average of the greatest (maximal) match by partial probabilities (the most discriminative F1-measure) using macro weighting (default as the most frequently used, though combined weighting is the most indicative one): 266 | ``` 267 | $ ./xmeasures -f data/3cls5nds.cnl data/4cls6nds.cnl 268 | ``` 269 | 270 | Evaluate harmonic mean of the weighted average (by the cluster size) of the greatest (maximal) match by F1s and ensure that all cluster members are unique (the duplicated members are removed): 271 | ``` 272 | $ ./xmeasures -fh -q data/3cls5nds.cnl data/4cls6nds.cnl 273 | ``` 274 | 275 | Evaluate harmonic mean of the [unweighted] average of the greatest (maximal) match by partial probabilities and synchronize the node base with the first evaluating collection, and considering overlapping clusters instead of multi-resolutions (`-O` does not matter for the case of non-overlapping single resolution collections): 276 | ``` 277 | $ ./xmeasures -sku -fp -O data/3cls5nds.cnl data/4cls6nds.cnl 278 | ``` 279 | 280 | Evaluate arithmetic mean of the weighted average (by the cluster size) of the greatest (maximal) match by F1s and NMI with all denominators synchronizing node base of the evaluating collections with `1lev4nds2cls.cnl`: 281 | ``` 282 | $ ./xmeasures -fa -na -s data/1lev4nds2cls.cnl data/3cls5nds.cnl data/4cls6nds.cnl 283 | ``` 284 | 285 | Evaluate combined weighed and unweighted F1h (harmonic mean of the average F1s), label the clusters with the indices of provided labels, evaluate standard F1, precision and recall of the labeled clusters and output the labels to the `clslbs.cll`: 286 | ``` 287 | $ ./xmeasures -fh -kc -i clslbs.cll -l labels.cnl clusters.cnl 288 | ``` 289 | 290 | Evaluate extended Omega Index and mean F1h (harmonic mean of the weighted average of the greatest (maximal) match by F1): 291 | ``` 292 | $ ./xmeasures -ox -fh omega_c4.3-1.cnl omega_c4.3-2.cnl 293 | ``` 294 | 295 | 
**Note:** Please, [star this project](https://github.com/eXascaleInfolab/xmeasures) if you use it. 296 | 297 | # Related Projects 298 | - [GenConvNMI](https://github.com/eXascaleInfolab/GenConvNMI) - Overlapping NMI evaluation that is compatible with the original NMI and suitable for both overlapping and multi resolution (hierarchical) clustering evaluation. 299 | - [OvpNMI](https://github.com/eXascaleInfolab/OvpNMI) - NMI evaluation for the overlapping clusters (communities) that is not compatible with the standard NMI value unlike GenConvNMI, but it is much faster than GenConvNMI. 300 | - [Clubmark](https://github.com/eXascaleInfolab/clubmark) - A parallel isolation framework for benchmarking and profiling clustering (community detection) algorithms considering overlaps (covers). 301 | - [ParallelComMetric](https://github.com/eXascaleInfolab/ParallelComMetric) - A parallel toolkit implemented with Pthreads (or MPI) to calculate various extrinsic and intrinsic quality metrics (with and without ground truth community structure) for non-overlapping (hard, single membership) clusterings. 302 | - [CluSim](https://github.com/Hoosier-Clusters/clusim) - A Python module that evaluates (slowly) various extrinsic quality metrics (accuracy) for non-overlapping (hard, single membership) clusterings. 303 | - [resmerge](https://github.com/eXascaleInfolab/resmerge) - Resolution levels clustering merger with filtering. Flattens hierarchy/list of multiple resolutions levels (clusterings) into the single flat clustering with clusters on various resolution levels synchronizing the node base. 304 | - [ExecTime](https://bitbucket.org/lumais/exectime/) - A lightweight resource consumption profiler. 305 | - [TInfES](https://github.com/eXascaleInfolab/TInfES) - Type inference evaluation scripts and accessory apps used for the benchmarking. 
306 | -------------------------------------------------------------------------------- /autogen/cmdline.h: -------------------------------------------------------------------------------- 1 | /** @file cmdline.h 2 | * @brief The header file for the command line option parser 3 | * generated by GNU Gengetopt version 2.23 4 | * http://www.gnu.org/software/gengetopt. 5 | * DO NOT modify this file, since it can be overwritten 6 | * @author GNU Gengetopt */ 7 | 8 | #ifndef CMDLINE_H 9 | #define CMDLINE_H 10 | 11 | /* If we use autoconf. */ 12 | #ifdef HAVE_CONFIG_H 13 | #include "config.h" 14 | #endif 15 | 16 | #include /* for FILE */ 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif /* __cplusplus */ 21 | 22 | #ifndef CMDLINE_PARSER_PACKAGE 23 | /** @brief the program name (used for printing errors) */ 24 | #define CMDLINE_PARSER_PACKAGE "xmeasures" 25 | #endif 26 | 27 | #ifndef CMDLINE_PARSER_PACKAGE_NAME 28 | /** @brief the complete program name (used for help and version) */ 29 | #define CMDLINE_PARSER_PACKAGE_NAME "xmeasures" 30 | #endif 31 | 32 | #ifndef CMDLINE_PARSER_VERSION 33 | /** @brief the program version */ 34 | #define CMDLINE_PARSER_VERSION "4.0.4" 35 | #endif 36 | 37 | enum enum_f1 { f1__NULL = -1, f1_arg_partprob = 0, f1_arg_harmonic, f1_arg_average }; 38 | enum enum_kind { kind__NULL = -1, kind_arg_weighted = 0, kind_arg_unweighed, kind_arg_combined }; 39 | enum enum_policy { policy__NULL = -1, policy_arg_partprob = 0, policy_arg_harmonic }; 40 | 41 | /** @brief Where the command line options are stored */ 42 | struct gengetopt_args_info 43 | { 44 | const char *help_help; /**< @brief Print help and exit help description. */ 45 | const char *version_help; /**< @brief Print version and exit help description. 
*/ 46 | int ovp_flag; /**< @brief evaluate overlapping instead of the multi-resolution clusters, where max matching for any shared member between R overlapping clusters is 1/R (the member is shared) instead of 1 (the member fully belongs to each [hierarchical sub]group) for the member belonging to R distinct clusters on R resolutions. 47 | NOTE: It has no effect for the Omega Index evaluation. (default=off). */ 48 | const char *ovp_help; /**< @brief evaluate overlapping instead of the multi-resolution clusters, where max matching for any shared member between R overlapping clusters is 1/R (the member is shared) instead of 1 (the member fully belongs to each [hierarchical sub]group) for the member belonging to R distinct clusters on R resolutions. 49 | NOTE: It has no effect for the Omega Index evaluation. help description. */ 50 | int unique_flag; /**< @brief ensure on loading that all cluster members are unique by removing all duplicates. (default=off). */ 51 | const char *unique_help; /**< @brief ensure on loading that all cluster members are unique by removing all duplicates. help description. */ 52 | char * sync_arg; /**< @brief synchronize with the specified node base omitting the non-matching nodes. 53 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. */ 54 | char * sync_orig; /**< @brief synchronize with the specified node base omitting the non-matching nodes. 55 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it original value given at command line. */ 56 | const char *sync_help; /**< @brief synchronize with the specified node base omitting the non-matching nodes. 
57 | NOTE: The node base can be either a separate, or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it help description. */ 58 | float membership_arg; /**< @brief average expected membership of the nodes in the clusters, > 0, typically >= 1. Used only to facilitate estimation of the nodes number on the containers preallocation if this number is not specified in the file header. (default='1'). */ 59 | char * membership_orig; /**< @brief average expected membership of the nodes in the clusters, > 0, typically >= 1. Used only to facilitate estimation of the nodes number on the containers preallocation if this number is not specified in the file header. original value given at command line. */ 60 | const char *membership_help; /**< @brief average expected membership of the nodes in the clusters, > 0, typically >= 1. Used only to facilitate estimation of the nodes number on the containers preallocation if this number is not specified in the file header. help description. */ 61 | int detailed_flag; /**< @brief detailed (verbose) results output (default=off). */ 62 | const char *detailed_help; /**< @brief detailed (verbose) results output help description. */ 63 | int omega_flag; /**< @brief evaluate Omega Index (a fuzzy version of the Adjusted Rand Index, identical to the Fuzzy Rand Index and on the non-overlapping clusterings equals to ARI). (default=off). */ 64 | const char *omega_help; /**< @brief evaluate Omega Index (a fuzzy version of the Adjusted Rand Index, identical to the Fuzzy Rand Index and on the non-overlapping clusterings equals to ARI). help description. */ 65 | int extended_flag; /**< @brief evaluate extended (Soft) Omega Index, which does not excessively penalize distinctly shared nodes. (default=off). */ 66 | const char *extended_help; /**< @brief evaluate extended (Soft) Omega Index, which does not excessively penalize distinctly shared nodes. help description. 
*/ 67 | enum enum_f1 f1_arg; /**< @brief evaluate mean F1 of the [weighted] average of the greatest (maximal) match by F1 or partial probability. 68 | NOTE: F1h <= F1a, where: 69 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities, the most indicative as satisfies the largest number of the Formal Constraints (homogeneity, completeness and size/quantity except the rag bag in some cases); 70 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1 (harmonic means of the Precision and Recall of the best matches of the clusters); 71 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1, the least discriminative and satisfies the lowest number of the Formal Constraints. 72 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 73 | (default='partprob'). */ 74 | char * f1_orig; /**< @brief evaluate mean F1 of the [weighted] average of the greatest (maximal) match by F1 or partial probability. 75 | NOTE: F1h <= F1a, where: 76 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities, the most indicative as satisfies the largest number of the Formal Constraints (homogeneity, completeness and size/quantity except the rag bag in some cases); 77 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1 (harmonic means of the Precision and Recall of the best matches of the clusters); 78 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1, the least discriminative and satisfies the lowest number of the Formal Constraints. 79 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 80 | original value given at command line. */ 81 | const char *f1_help; /**< @brief evaluate mean F1 of the [weighted] average of the greatest (maximal) match by F1 or partial probability. 
82 | NOTE: F1h <= F1a, where: 83 | - p (F1p or Ph) - Harmonic mean (F1) of two [weighted] averages of the Partial Probabilities, the most indicative as satisfies the largest number of the Formal Constraints (homogeneity, completeness and size/quantity except the rag bag in some cases); 84 | - h (F1h) - Harmonic mean (F1) of two [weighted] averages of all local F1 (harmonic means of the Precision and Recall of the best matches of the clusters); 85 | - a (F1a) - Arithmetic mean (average) of two [weighted] averages of all local F1, the least discriminative and satisfies the lowest number of the Formal Constraints. 86 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 87 | help description. */ 88 | enum enum_kind kind_arg; /**< @brief kind of the matching policy: 89 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 90 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 91 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 92 | (default='weighted'). */ 93 | char * kind_orig; /**< @brief kind of the matching policy: 94 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 95 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 96 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 97 | original value given at command line. */ 98 | const char *kind_help; /**< @brief kind of the matching policy: 99 | - w - Weighted by the number of nodes in each cluster (known as micro weighting, MF1_micro) 100 | - u - Unweighed, where each cluster is treated equally (known as macro weighting, MF1_macro) 101 | - c - Combined(w, u) using geometric mean (drops the value not so much as harmonic mean) 102 | help description. 
*/ 103 | char * label_arg; /**< @brief label evaluating clusters with the specified ground-truth (gt) cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED labeled clusters only (without the probable subclusters). 104 | NOTE: If 'sync' option is specified then the file name of the clusters labels should be the same as the node base (if specified) and should be in the .cnl format. The file name can be either a separate or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. 105 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 106 | . */ 107 | char * label_orig; /**< @brief label evaluating clusters with the specified ground-truth (gt) cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED labeled clusters only (without the probable subclusters). 108 | NOTE: If 'sync' option is specified then the file name of the clusters labels should be the same as the node base (if specified) and should be in the .cnl format. The file name can be either a separate or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. 109 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 110 | original value given at command line. */ 111 | const char *label_help; /**< @brief label evaluating clusters with the specified ground-truth (gt) cluster indices and evaluate F1 (including Precision and Recall) of the (best) MATCHED labeled clusters only (without the probable subclusters). 112 | NOTE: If 'sync' option is specified then the file name of the clusters labels should be the same as the node base (if specified) and should be in the .cnl format. The file name can be either a separate or an evaluating CNL file, in the latter case this option should precede the evaluating filename not repeating it. 
113 | Precision and recall are evaluated relative to the FIRST clustering dataset (ground-truth, gold standard). 114 | help description. */ 115 | enum enum_policy policy_arg; /**< @brief Labels matching policy: 116 | - p - Partial Probabilities (maximizes gain) 117 | - h - Harmonic Mean (minimizes loss, maximizes F1) 118 | (default='harmonic'). */ 119 | char * policy_orig; /**< @brief Labels matching policy: 120 | - p - Partial Probabilities (maximizes gain) 121 | - h - Harmonic Mean (minimizes loss, maximizes F1) 122 | original value given at command line. */ 123 | const char *policy_help; /**< @brief Labels matching policy: 124 | - p - Partial Probabilities (maximizes gain) 125 | - h - Harmonic Mean (minimizes loss, maximizes F1) 126 | help description. */ 127 | int unweighted_flag; /**< @brief Labels weighting policy on F1 evaluation: weighted by the number of instances in each label by default (micro weighting, F1_micro) or unweighed, where each label is treated equally (i.e. macro weighting, F1_macro) (default=off). */ 128 | const char *unweighted_help; /**< @brief Labels weighting policy on F1 evaluation: weighted by the number of instances in each label by default (micro weighting, F1_micro) or unweighed, where each label is treated equally (i.e. macro weighting, F1_macro) help description. */ 129 | char * identifiers_arg; /**< @brief output labels (identifiers) of the evaluating clusters as lines of space-separated indices of the ground-truth clusters (.cll - clusters labels list) 130 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the .cnl besides the 131 | . */ 132 | char * identifiers_orig; /**< @brief output labels (identifiers) of the evaluating clusters as lines of space-separated indices of the ground-truth clusters (.cll - clusters labels list) 133 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the .cnl besides the 134 | original value given at command line. 
*/ 135 | const char *identifiers_help; /**< @brief output labels (identifiers) of the evaluating clusters as lines of space-separated indices of the ground-truth clusters (.cll - clusters labels list) 136 | NOTE: If 'sync' option is specified then the reduced collection is outputted to the .cnl besides the 137 | help description. */ 138 | int nmi_flag; /**< @brief evaluate NMI (Normalized Mutual Information), applicable only to the non-overlapping clusters (default=off). */ 139 | const char *nmi_help; /**< @brief evaluate NMI (Normalized Mutual Information), applicable only to the non-overlapping clusters help description. */ 140 | int all_flag; /**< @brief evaluate all NMIs using sqrt, avg and min denominators besides the max one (default=off). */ 141 | const char *all_help; /**< @brief evaluate all NMIs using sqrt, avg and min denominators besides the max one help description. */ 142 | int ln_flag; /**< @brief use ln (exp base) instead of log2 (Shannon entropy, bits) for the information measuring (default=off). */ 143 | const char *ln_help; /**< @brief use ln (exp base) instead of log2 (Shannon entropy, bits) for the information measuring help description. */ 144 | 145 | unsigned int help_given ; /**< @brief Whether help was given. */ 146 | unsigned int version_given ; /**< @brief Whether version was given. */ 147 | unsigned int ovp_given ; /**< @brief Whether ovp was given. */ 148 | unsigned int unique_given ; /**< @brief Whether unique was given. */ 149 | unsigned int sync_given ; /**< @brief Whether sync was given. */ 150 | unsigned int membership_given ; /**< @brief Whether membership was given. */ 151 | unsigned int detailed_given ; /**< @brief Whether detailed was given. */ 152 | unsigned int omega_given ; /**< @brief Whether omega was given. */ 153 | unsigned int extended_given ; /**< @brief Whether extended was given. */ 154 | unsigned int f1_given ; /**< @brief Whether f1 was given. */ 155 | unsigned int kind_given ; /**< @brief Whether kind was given. 
*/ 156 | unsigned int label_given ; /**< @brief Whether label was given. */ 157 | unsigned int policy_given ; /**< @brief Whether policy was given. */ 158 | unsigned int unweighted_given ; /**< @brief Whether unweighted was given. */ 159 | unsigned int identifiers_given ; /**< @brief Whether identifiers was given. */ 160 | unsigned int nmi_given ; /**< @brief Whether nmi was given. */ 161 | unsigned int all_given ; /**< @brief Whether all was given. */ 162 | unsigned int ln_given ; /**< @brief Whether ln was given. */ 163 | 164 | char **inputs ; /**< @brief unnamed options (options without names) */ 165 | unsigned inputs_num ; /**< @brief unnamed options number */ 166 | } ; 167 | 168 | /** @brief The additional parameters to pass to parser functions */ 169 | struct cmdline_parser_params 170 | { 171 | int override; /**< @brief whether to override possibly already present options (default 0) */ 172 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 173 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 174 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 175 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 176 | } ; 177 | 178 | /** @brief the purpose string of the program */ 179 | extern const char *gengetopt_args_info_purpose; 180 | /** @brief the usage string of the program */ 181 | extern const char *gengetopt_args_info_usage; 182 | /** @brief the description string of the program */ 183 | extern const char *gengetopt_args_info_description; 184 | /** @brief all the lines making the help output */ 185 | extern const char *gengetopt_args_info_help[]; 186 | 187 | /** 188 | * The command line parser 189 | * @param argc the number of command line options 190 | * @param argv the command line options 191 | 
* @param args_info the structure where option information will be stored 192 | * @return 0 if everything went fine, NON 0 if an error took place 193 | */ 194 | int cmdline_parser (int argc, char **argv, 195 | struct gengetopt_args_info *args_info); 196 | 197 | /** 198 | * The command line parser (version with additional parameters - deprecated) 199 | * @param argc the number of command line options 200 | * @param argv the command line options 201 | * @param args_info the structure where option information will be stored 202 | * @param override whether to override possibly already present options 203 | * @param initialize whether to initialize the option structure my_args_info 204 | * @param check_required whether to check that all required options were provided 205 | * @return 0 if everything went fine, NON 0 if an error took place 206 | * @deprecated use cmdline_parser_ext() instead 207 | */ 208 | int cmdline_parser2 (int argc, char **argv, 209 | struct gengetopt_args_info *args_info, 210 | int override, int initialize, int check_required); 211 | 212 | /** 213 | * The command line parser (version with additional parameters) 214 | * @param argc the number of command line options 215 | * @param argv the command line options 216 | * @param args_info the structure where option information will be stored 217 | * @param params additional parameters for the parser 218 | * @return 0 if everything went fine, NON 0 if an error took place 219 | */ 220 | int cmdline_parser_ext (int argc, char **argv, 221 | struct gengetopt_args_info *args_info, 222 | struct cmdline_parser_params *params); 223 | 224 | /** 225 | * Save the contents of the option struct into an already open FILE stream. 
226 | * @param outfile the stream where to dump options 227 | * @param args_info the option struct to dump 228 | * @return 0 if everything went fine, NON 0 if an error took place 229 | */ 230 | int cmdline_parser_dump(FILE *outfile, 231 | struct gengetopt_args_info *args_info); 232 | 233 | /** 234 | * Save the contents of the option struct into a (text) file. 235 | * This file can be read by the config file parser (if generated by gengetopt) 236 | * @param filename the file where to save 237 | * @param args_info the option struct to save 238 | * @return 0 if everything went fine, NON 0 if an error took place 239 | */ 240 | int cmdline_parser_file_save(const char *filename, 241 | struct gengetopt_args_info *args_info); 242 | 243 | /** 244 | * Print the help 245 | */ 246 | void cmdline_parser_print_help(void); 247 | /** 248 | * Print the version 249 | */ 250 | void cmdline_parser_print_version(void); 251 | 252 | /** 253 | * Initializes all the fields a cmdline_parser_params structure 254 | * to their default values 255 | * @param params the structure to initialize 256 | */ 257 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 258 | 259 | /** 260 | * Allocates dynamically a cmdline_parser_params structure and initializes 261 | * all its fields to their default values 262 | * @return the created and initialized cmdline_parser_params structure 263 | */ 264 | struct cmdline_parser_params *cmdline_parser_params_create(void); 265 | 266 | /** 267 | * Initializes the passed gengetopt_args_info structure's fields 268 | * (also set default values for options that have a default) 269 | * @param args_info the structure to initialize 270 | */ 271 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 272 | /** 273 | * Deallocates the string fields of the gengetopt_args_info structure 274 | * (but does not deallocate the structure itself) 275 | * @param args_info the structure to deallocate 276 | */ 277 | void cmdline_parser_free (struct 
gengetopt_args_info *args_info); 278 | 279 | /** 280 | * Checks that all the required options were specified 281 | * @param args_info the structure to check 282 | * @param prog_name the name of the program that will be used to print 283 | * possible errors 284 | * @return 285 | */ 286 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 287 | const char *prog_name); 288 | 289 | extern const char *cmdline_parser_f1_values[]; /**< @brief Possible values for f1. */ 290 | extern const char *cmdline_parser_kind_values[]; /**< @brief Possible values for kind. */ 291 | extern const char *cmdline_parser_policy_values[]; /**< @brief Possible values for policy. */ 292 | 293 | 294 | #ifdef __cplusplus 295 | } 296 | #endif /* __cplusplus */ 297 | #endif /* CMDLINE_H */ 298 | -------------------------------------------------------------------------------- /include/interface.h: -------------------------------------------------------------------------------- 1 | //! \brief Extrinsic measures evaluation interface. 2 | //! 3 | //! \license Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.html 4 | //! > Simple explanation: https://tldrlegal.com/license/apache-license-2.0-(apache-2.0) 5 | //! 6 | //! Copyright (c) 7 | //! \authr Artem Lutov 8 | //! \email luart@ya.ru 9 | //! 
\date 2017-02-13 10 | 11 | #ifndef INTERFACE_H 12 | #define INTERFACE_H 13 | 14 | #include 15 | #include // unique_ptr 16 | #include 17 | #include 18 | #include 19 | #if VALIDATE >= 1 20 | #include 21 | #endif // VALIDATE 22 | 23 | #define INCLUDE_STL_FS 24 | #include "fileio.hpp" 25 | #if VALIDATE >= 2 26 | #include "operations.hpp" 27 | #endif // VALIDATE 2 28 | 29 | #ifdef C_API 30 | #include "interface_c.h" 31 | #endif // C_API 32 | 33 | 34 | using std::vector; 35 | using std::unordered_set; 36 | using std::unordered_map; 37 | using std::unique_ptr; 38 | using std::string; 39 | using std::pair; 40 | using std::is_integral; 41 | using std::is_pointer; 42 | using std::is_floating_point; 43 | using std::is_arithmetic; 44 | using std::is_same; 45 | //using std::enable_if; 46 | using std::enable_if_t; 47 | using std::conditional_t; 48 | using std::numeric_limits; 49 | #if VALIDATE >= 2 50 | using std::invalid_argument; 51 | #endif // VALIDATE 52 | 53 | // Data Types ------------------------------------------------------------------ 54 | using Id = uint32_t; //!< Node id type 55 | // Note: Size should a magnitude larger than Id to hold Id*Id 56 | using AccId = uint64_t; //!< Accumulated Id type 57 | 58 | using Prob = float; //!< Probability 59 | using AccProb = double; //!< Accumulated Probability 60 | 61 | //! Aggregated Hash of the loading cluster member ids 62 | using AggHash = daoc::AggHash; 63 | 64 | using RawIds = vector; //!< Node ids, unordered 65 | 66 | // Omega Index related types and functions ------------------------------------- 67 | using RawCluster = RawIds; //!< Raw cluster of member node ids 68 | using RawClusters = vector; //!< Raw clustering, container of the raw clusters 69 | using RawClusterPtrs = vector; 70 | using NodeRClusters = unordered_map>; //!< Raw node membership in the clusters 71 | 72 | //! \brief Omega Index evaluation 73 | //! 74 | //! \tparam EXT bool - extended Omega Index, which does not excessively penalize 75 | //! 
distinct node shares 76 | //! 77 | //! \param ndrcs const NodeRClusters& - node raw clusters relations 78 | //! \param cls1 const RawClusters& - clusters of the first collection 79 | //! \param cls2 const RawClusters& - clusters of the second collection 80 | //! \return Prob - omega index 81 | template 82 | Prob omega(const NodeRClusters& ndrcs, const RawClusters& cls1, const RawClusters& cls2); 83 | 84 | //! \brief Evaluate the number of mutual raw cluster pointers in the containers 85 | //! 86 | //! \pre Input raw clusters pointer containers are ordered by the cmpBase 87 | //! 88 | //! \param a const RawClusterPtrs* - first raw cluster pointers 89 | //! \param b const RawClusterPtrs* - second raw cluster pointers 90 | //! \param nmax const Id - max number of matches for the early termination, 91 | //! 0 is allowed but senseless. 92 | //! \return Id - the number of mutual members 93 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b, const Id nmax) noexcept; 94 | 95 | Id mutualnum(const RawClusterPtrs* a, const RawClusterPtrs* b) noexcept; 96 | 97 | // F1 & NMI related data types ------------------------------------------------- 98 | template 99 | struct Cluster; 100 | 101 | //! Cluster matching counter 102 | //! \note Required only for F1 evaluation 103 | //! \tparam Count - arithmetic counting type 104 | template 105 | class Counter { 106 | public: 107 | static_assert(is_arithmetic::value 108 | , "Counter(), Count should be an arithmetic type"); 109 | using CountT = Count; //!< Count type, arithmetic 110 | using ClusterT = Cluster; 111 | private: 112 | // Note: it's OK to copy this pointer on assignment since it is never 113 | // allocated in this class 114 | ClusterT* m_orig; //!< Originator cluster 115 | CountT m_count; //!< Occurrences counter, <= members size 116 | public: 117 | //! Default constructor 118 | Counter(): m_orig(nullptr), m_count(0) {} 119 | 120 | //! \brief Update the counter from the specified origin 121 | //! 122 | //! 
\param orig ClusterT* - counter origin 123 | //! \param cont Count - contribution or share, actual only for the floating point counter 124 | //! \return void 125 | void operator()(ClusterT* orig, Count cont) 126 | #if VALIDATE < 2 127 | noexcept 128 | #endif // VALIDATE 129 | { 130 | if(m_orig != orig) { 131 | m_orig = orig; 132 | m_count = 0; 133 | } 134 | if(is_integral::value) 135 | ++m_count; 136 | else { 137 | static_assert(!is_floating_point::value || sizeof(m_count) >= sizeof(double) 138 | , "operator(), types validation failed"); 139 | #if VALIDATE >= 2 140 | if(cont <= 0 || cont > 1) 141 | throw invalid_argument("operator(), cont should E (0, 1]\n"); 142 | #endif // VALIDATE 143 | m_count += cont; 144 | } 145 | } 146 | 147 | //! \brief Get counted value 148 | //! 149 | //! \return CountT - counted value 150 | CountT operator()() const noexcept { return m_count; } 151 | 152 | //! \brief Get counter origin 153 | //! 154 | //! \return ClusterT* - counter origin 155 | ClusterT* origin() const noexcept { return m_orig; } 156 | 157 | //! \brief Clear (reset) the counter 158 | void clear() noexcept 159 | { 160 | m_orig = nullptr; 161 | m_count = 0; 162 | } 163 | }; 164 | 165 | //! Cluster 166 | //! \tparam Count - nodes contribution counter type 167 | template 168 | struct Cluster { 169 | static_assert(is_arithmetic::value 170 | , "Counter(), Count should be an arithmetic type"); 171 | using CountT = Count; //!< Count type, arithmetic 172 | 173 | RawIds members; //!< Node ids, unordered 174 | // Note: used by F1 only and always 175 | Counter counter; //!< Cluster matching counter 176 | ////! 
Accumulated contribution 177 | //using AccCont = conditional_t; 178 | //!< Contribution from members 179 | // Note: used only in case of a) overlaps (by all measures) or 180 | // b) multiple resolutions (by NMI only) 181 | Count mbscont; 182 | static_assert(!is_floating_point::value || sizeof(mbscont) >= sizeof(double) 183 | , "operator(), types validation failed"); 184 | 185 | //! Default constructor 186 | Cluster(); 187 | 188 | //! \brief F1 measure 189 | //! \pre Clusters should be valid, i.e. non-empty 190 | //! 191 | //! \param matches Count - the number of matched members 192 | //! \param capacity Count - contributions capacity of the matching foreign cluster 193 | //! \return AccProb - resulting F1 194 | AccProb f1(Count matches, Count capacity) const 195 | #if VALIDATE < 2 196 | noexcept 197 | #endif // VALIDATE 198 | { 199 | // F1 = 2 * pr * rc / (pr + rc) 200 | // pr = m / c1 201 | // rc = m / c2 202 | // F1 = 2 * m/c1 * m/c2 / (m/c1 + m/c2) = 2 * m / (c2 + c1) 203 | // ATTENTION: F1 compares clusters per-pair, so it is much simpler and has another 204 | // semantics of contribution for the multi-resolution case 205 | const Count contrib = is_floating_point::value ? 
cont() : members.size(); 206 | #if VALIDATE >= 2 207 | if(matches < 0 || daoc::less::value 208 | , Prob, Count>>(capacity, matches) || contrib <= 0) 209 | throw invalid_argument(string("f1(), both clusters should be non-empty, matches: ") 210 | .append(std::to_string(matches)).append(", capacity: ").append(std::to_string(capacity)) 211 | .append(", contrib: ").append(std::to_string(contrib)) += '\n'); 212 | #endif // VALIDATE 213 | return 2 * matches / AccProb(capacity + contrib); // E [0, 1] 214 | // Note that partial probability (non-normalized to the remained matches, 215 | // it says only how far this match from the full match) of the match is: 216 | // P = AccProb(matches * matches) / AccProb(size * members.size()), 217 | // where nodes contribution instead of the size should be use for overlaps. 218 | // The probability is more discriminative than F1 for high values 219 | } 220 | 221 | //! \brief Partial probability of the match (non-normalized to the other matches) 222 | //! \pre Clusters should be valid, i.e. non-empty 223 | //! 224 | //! \param matches Count - the number of matched members 225 | //! \param capacity Count - contributions capacity of the matching foreign cluster 226 | //! \return AccProb - resulting probability 227 | AccProb pprob(Count matches, Count capacity) const 228 | #if VALIDATE < 2 229 | noexcept 230 | #endif // VALIDATE 231 | { 232 | // P = P1 * P2 = m/c1 * m/c2 = m*m / (c1*c2), 233 | // where nodes contribution instead of the size should be used for overlaps. 234 | // ATTENTION: F1 compares clusters per-pair, so it is much simpler and has another 235 | // semantics of contribution for the multi-resolution case comparing to NMI 236 | // that also uses cont() 237 | constexpr bool floating = is_floating_point::value; 238 | const Count contrib = floating ? 
cont() : members.size(); 239 | #if VALIDATE >= 2 240 | if(matches < 0 || daoc::less> 241 | (capacity, matches) || contrib <= 0) 242 | throw invalid_argument(string("pprob(), both clusters should be non-empty, matches: ") 243 | .append(std::to_string(matches)).append(", capacity: ").append(std::to_string(capacity)) 244 | .append(", contrib: ").append(std::to_string(contrib)) += '\n'); 245 | #endif // VALIDATE 246 | return floating ? static_cast(matches) * matches / (static_cast(capacity) * contrib) 247 | : static_cast(static_cast(matches) * matches) 248 | / (static_cast(capacity) * contrib); // E [0, 1] 249 | } 250 | 251 | //! \brief Cluster members contribution 252 | //! 253 | //! \return Count - total contribution from the members 254 | Count cont() const noexcept 255 | { 256 | // return is_same::value ? members.size() : mbscont; 257 | return mbscont; 258 | } 259 | }; 260 | 261 | //! Automatic storage for the Cluster; 262 | //! \tparam Count - arithmetic counting type 263 | template 264 | using ClusterHolder = unique_ptr>; 265 | 266 | //! Cluster pointers, unordered 267 | //! \tparam Count - arithmetic counting type 268 | template 269 | using ClusterPtrs = vector*>; 270 | 271 | //! Node to clusters relations 272 | //! \tparam Count - arithmetic counting type 273 | template 274 | using NodeClusters = unordered_map>; 275 | 276 | //! Resulting greatest matches for 2 input collections of clusters in a single direction 277 | using Probs = vector; 278 | 279 | // Label-related types -------------------------------------------------------- 280 | //! Clusters Labels, Labels are ORDERED by cmpBase 281 | template 282 | using ClustersLabels = unordered_map*, ClusterPtrs>; 283 | 284 | // F1-related types ----------------------------------------------------------- 285 | using F1Base = uint8_t; 286 | 287 | //! \brief F1 kind 288 | enum struct F1: F1Base { 289 | //! Not initialized 290 | NONE = 0, 291 | //! 
Harmonic mean of the [weighted] average of the greatest (maximal) match 292 | //! by partial probabilities 293 | PARTPROB, 294 | //! Harmonic mean of the [weighted] average of the greatest (maximal) match by F1s 295 | HARMONIC, 296 | //! Arithmetic mean (average) of the [weighted] average of the greatest (maximal) 297 | //! match by F1s, i.e. F1-Score 298 | AVERAGE // Suggested by Leskovec 299 | }; 300 | 301 | //! \brief String representation of the F1 302 | //! \relates F1 303 | //! 304 | //! \param f1 F1 - the value to be converted 305 | //! \return string - string value 306 | string to_string(F1 f1); 307 | 308 | // NMI-related types ----------------------------------------------------------- 309 | //! Internal element of the Sparse Matrix with Vector Rows 310 | //! \tparam Index - index (of the column) in the row 311 | //! \tparam Value - value type 312 | template 313 | struct RowVecItem { 314 | static_assert(is_integral::value || is_pointer::value 315 | , "RowVecItem, Index should be an integral type"); 316 | 317 | using CallT = Index; //!< Type of the functor call 318 | 319 | Index pos; //!< Position (index) in the row 320 | Value val; //!< Target value (payload) 321 | 322 | //! Constructor in case of the simple value 323 | //! 324 | //! \param i=Index() Index - index of value in the row 325 | //! \param v=Value() Value - payload value 326 | template * = nullptr> 327 | RowVecItem(Index i=Index(), Value v=Value()) noexcept(static_cast(Value())) 328 | : pos(i), val(v) {} 329 | 330 | //! Constructor in case of the compound value 331 | //! 332 | //! \param i=Index() Index - index of value in the row 333 | //! \param v=Value() Value - payload value 334 | template sizeof(void*)), bool>* = nullptr> 335 | RowVecItem(Index i=Index(), Value&& v=Value()) noexcept(Value()) 336 | : pos(i), val(move(v)) {} 337 | 338 | //! \brief Functor (call) operator 339 | //! 340 | //! 
\return CallT - index of the value 341 | // Note: required to call obj() 342 | CallT operator()() const noexcept { return pos; } 343 | 344 | // // Note: required for the comparison operations with index 345 | // operator CallT() const noexcept { return this } 346 | }; 347 | 348 | //! Row vector for the SparseMatrix 349 | template 350 | using SparseMatrixRowVec = vector>; 351 | 352 | //! Base type of the SparseMatrix (can be unordered_map, map, vector) 353 | template 354 | using SparseMatrixBase = unordered_map>; 355 | 356 | //! Sparse Matrix 357 | //! \tparam Index - index type 358 | //! \tparam Value - value type 359 | template 360 | struct SparseMatrix: SparseMatrixBase { 361 | static_assert((is_integral::value || is_pointer::value) 362 | && is_arithmetic::value, "SparseMatrix(), invalid parameter types"); 363 | 364 | using IndexT = Index; //!< Indexes type, integral 365 | using ValueT = Value; //!< Value type, arithmetic 366 | using BaseT = SparseMatrixBase; //!< SparseMatrixBase type 367 | using RowT = typename BaseT::mapped_type; //!< Matrix row type 368 | //! Matrix row element type, which contains the value and might have 369 | //! additional attributes 370 | using RowItemT = typename RowT::value_type; 371 | 372 | //! \brief Default constructor 373 | //! 374 | //! \param rows=0 Id - initial number of rows 375 | SparseMatrix(Id rows=0); 376 | 377 | //! \brief Access specified element inserting it if not exists 378 | //! 379 | //! \param i Index - row index 380 | //! \param j Index - column index 381 | //! \return Value& operator - value of the element to be set 382 | Value& operator ()(Index i, Index j); 383 | 384 | //! \brief Access specified element without bounds checking 385 | //! \note fast, but unsafe 386 | //! 387 | //! \param i Index - row index 388 | //! \param j Index - column index 389 | //! 
\return Value& operator - value of the element 390 | template * = nullptr> 391 | Value operator ()(Index i, Index j) const noexcept; // { return this->at(i) } 392 | 393 | //! \brief Access specified element without bounds checking 394 | //! \note fast, but unsafe 395 | //! 396 | //! \param i Index - row index 397 | //! \param j Index - column index 398 | //! \return Value& operator - value of the element 399 | template sizeof(void*)), bool>* = nullptr> 400 | const Value& operator ()(Index i, Index j) const noexcept; // { return this->at(i) } 401 | 402 | //! \brief Access specified element checking the bounds 403 | //! 404 | //! \param i Index - row index 405 | //! \param j Index - column index 406 | //! \return Value& operator - value of the element 407 | template * = nullptr> 408 | Value at(Index i, Index j); // { return this->at(i) } 409 | 410 | //! \brief Access specified element checking the bounds 411 | //! 412 | //! \param i Index - row index 413 | //! \param j Index - column index 414 | //! \return Value& operator - value of the element 415 | template sizeof(void*)), bool>* = nullptr> 416 | const Value& at(Index i, Index j); // { return this->at(i) } 417 | 418 | using BaseT::at; //!< Provide direct access to the matrix row 419 | }; 420 | 421 | //using EvalBase = uint8_t; //!< Base type for the Evaluation 422 | // 423 | ////! \brief Evaluation type 424 | //enum struct Evaluation: EvalBase { 425 | // NONE = 0, 426 | //// HARD = 0 427 | // MULTIRES = 1, //!< Multi-resolution non-overlapping clusters, compatible with hard partitioning 428 | // OVERLAPPING = 2, //!< Overlapping clusters, compatible with hard partitioning 429 | // MULRES_OVP = 3 //!< Multi-resolution clusters with possible overlaps on each resolution level 430 | //}; 431 | // 432 | ////! \brief Convert Evaluation to string 433 | ////! \relates Evaluation 434 | ////! 435 | ////! \param flag Evaluation - the flag to be converted 436 | ////! 
\param bitstr=false bool - convert to bits string or to Evaluation captions 437 | ////! \return string - resulting flag as a string 438 | //string to_string(Evaluation eval, bool bitstr=false); 439 | 440 | struct RawNmi { 441 | Prob mi; //!< Mutual information of two collections 442 | Prob h1; //!< Information content of the 1-st collection 443 | Prob h2; //!< Information content of the 2-nd collection 444 | //Evaluation eval; //!< Evaluation type 445 | 446 | static_assert(is_floating_point::value, "RawNmi, Prob should be a floating point type"); 447 | RawNmi() noexcept: mi(0), h1(numeric_limits::quiet_NaN()) 448 | , h2(numeric_limits::quiet_NaN()) {} 449 | 450 | void operator() (Prob mutinf, Prob cn1h, Prob cn2h) noexcept 451 | { 452 | mi = mutinf; 453 | h1 = cn1h; 454 | h2 = cn2h; 455 | }; 456 | }; 457 | 458 | // Collection ------------------------------------------------------------------ 459 | //! Unique ids (node ids) 460 | using UniqIds = unordered_set; 461 | 462 | //! Node base interface 463 | struct NodeBaseI { 464 | //! \brief Default virtual destructor 465 | virtual ~NodeBaseI()=default; 466 | 467 | //! \brief Whether the node base is actual (non-empty) 468 | //! 469 | //! \return bool - the node base is non-empty 470 | operator bool() const noexcept { return ndsnum(); }; 471 | 472 | //! \brief The number of nodes 473 | //! 474 | //! \return Id - the number of nodes in the collection 475 | virtual Id ndsnum() const noexcept = 0; 476 | 477 | //! \brief Whether exists the specified node 478 | //! 479 | //! \param nid - node id 480 | //! \return bool - specified node id exists 481 | virtual bool nodeExists(Id nid) const noexcept = 0; 482 | 483 | //! \brief Nodebase content 484 | //! 485 | //! \return virtual const UniqIds& - nodebase content 486 | virtual const UniqIds& nodes() const noexcept = 0; 487 | }; 488 | 489 | //! 
Node base 490 | struct NodeBase: protected UniqIds, NodeBaseI { 491 | using UniqIds::clear; 492 | using UniqIds::reserve; 493 | using UniqIds::insert; 494 | using UniqIds::begin; 495 | using UniqIds::end; 496 | 497 | //! \copydoc NodeBaseI::nodeExists(Id nid) const noexcept 498 | Id ndsnum() const noexcept override { return size(); } 499 | 500 | //! \copydoc NodeBaseI::nodeExists(Id nid) const noexcept 501 | bool nodeExists(Id nid) const noexcept override { return count(nid); } 502 | 503 | //! \copydoc NodeBaseI::nodes() const noexcept 504 | const UniqIds& nodes() const noexcept override { return *this; } 505 | 506 | #ifndef NO_FILEIO 507 | //! \brief Load all unique nodes from the CNL file with optional filtering by the cluster size 508 | //! 509 | //! \param filename const char* - name of the input file 510 | //! \param ahash=nullptr AggHash* - resulting aggregated hash of the loaded 511 | //! node ids if not nullptr 512 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 513 | //! Used only for the node container preallocation to estimate the number of nodes 514 | //! if not specified in the file header 515 | //! \param cmin=0 size_t - min allowed cluster size 516 | //! \param cmax=0 size_t - max allowed cluster size, 0 means any size 517 | //! \param verbose=false bool - print intermediate results to the stdout 518 | //! \return bool - the collection is loaded successfully 519 | static NodeBase load(const char* filename, float membership=1 520 | , AggHash* ahash=nullptr, size_t cmin=0, size_t cmax=0, bool verbose=false); 521 | #endif // NO_FILEIO 522 | }; 523 | 524 | //template 525 | //Id iterValSimple(Iter it) noexcept { return *it; } 526 | // 527 | //template 528 | //Id iterValFirst(Iter it) noexcept { return it->first; } 529 | // 530 | ////! \brief Identify external nodes that are complementary (do not belong) to the node base 531 | ////! 532 | ////! \tparam Iter - iterator type for the external nodes 533 | ////! 
\tparam IterValF - function, obtaining node value from the iterator 534 | ////! 535 | ////! \param begin - begin of the external nodes 536 | ////! \param end - end of the external nodes 537 | ////! \param size=0 Id - size of the external nodes, 0 means not specified; used to pre-allocate data 538 | ////! \param itval=iterValSimple IterValF - iterator value extracting function 539 | ////! \return RawIds - external nodes that are complementary (do not belong) to the node base 540 | //template 541 | //virtual RawIds complementary(Iter begin, Iter end, Id size=0, IterValF itval=iterValSimple) const noexcept = 0; 542 | // 543 | //template 544 | //RawIds NodeBase::complementary(Iter begin, Iter end, Id size, IterValF itval) const override noexcept 545 | //{ 546 | // RawIds ndcpl; // Return using NRVO, named return value optimization 547 | // UniqIds ndext; // External nodes, whose complementary values should be extracted 548 | // if(size) 549 | // ndext.reserve(size); 550 | // for(Iter it = begin; it != end; ++it) 551 | // ndext.insert(ndext.end(), itval(it)); 552 | // ndcpl.reserve(ndext.size() - this->size()); 553 | // for(auto nid: ndext) 554 | // if(!count(nid)) 555 | // ndcpl.push_back(nid); 556 | // 557 | // return ndcpl; 558 | //} 559 | 560 | //! Collection matching kind base 561 | using MatchBase = uint8_t; 562 | 563 | //! \brief Collection matching kind 564 | enum struct Match: MatchBase { 565 | NONE = 0, //!< Note initialized 566 | WEIGHTED, //!< Weighted matching by the number of members in each cluster (macro weighting) 567 | UNWEIGHTED, //!< Unweighted matching of each cluster (micro weighting) 568 | COMBINED //!< Combined of macro and micro weightings using geometric mean 569 | }; 570 | 571 | //! \brief String representation of the Match 572 | //! \relates Match 573 | //! 574 | //! \param mkind Match - the value to be converted 575 | //! \return string - string value 576 | string to_string(Match mkind); 577 | 578 | //! 
\brief The matching includes weighted match 579 | //! \relates Match 580 | //! 581 | //! \param m Match - matching kind 582 | //! \return bool - weighted matching included 583 | bool xwmatch(Match m) noexcept; 584 | 585 | //! \brief The matching includes unweighted match 586 | //! \relates Match 587 | //! 588 | //! \param m Match - matching kind 589 | //! \return bool - unweighted matching included 590 | bool xumatch(Match m) noexcept; 591 | 592 | //! Precision and recall 593 | struct PrcRec { 594 | Prob prc; //!< Precision 595 | Prob rec; //!< Recall 596 | 597 | // Explicit members initialization by value to avoid uninitialized members 598 | PrcRec(Prob prc=0, Prob rec=0): prc(prc), rec(rec) {} 599 | }; 600 | 601 | #ifdef C_API 602 | template 603 | class Collection; 604 | 605 | Collection loadCollection(const ClusterCollection rcn, bool makeunique 606 | , float membership, ::AggHash* ahash, const NodeBaseI* nodebase, RawIds* lostcls, bool verbose); 607 | #endif // C_API 608 | 609 | //! Collection describing cluster-node relations 610 | //! \tparam Count - arithmetic counting type 611 | template 612 | class Collection: public NodeBaseI { 613 | public: 614 | using CollectionT = Collection; 615 | //! Overlaps / multi-resolutions evaluation flag 616 | constexpr static bool m_overlaps = is_floating_point::value; 617 | //! Accumulated contribution 618 | using AccCont = conditional_t; 619 | //! 
Clusters matching matrix 620 | using ClustersMatching = SparseMatrix*, AccCont>; // Used only for NMI 621 | using ClsLabels = ClustersLabels; 622 | 623 | #ifdef C_API 624 | friend Collection loadCollection(const ClusterCollection rcn, bool makeunique, float membership 625 | , ::AggHash* ahash, const NodeBaseI* nodebase, bool reduce, RawIds* lostcls, bool verbose); 626 | #endif // C_API 627 | private: 628 | // ATTENTNION: Collection manages the memory of the m_cls 629 | ClusterPtrs m_cls; //!< Clusters 630 | NodeClusters m_ndcs; //!< Node clusters relations 631 | size_t m_ndshash; //!< Nodes hash (of unique node ids only, not all members), 0 means was not evaluated 632 | //mutable bool m_dirty; //!< The cluster members contribution is not zero (should be reseted on reprocessing) 633 | //! Sum of contributions of all members in each cluster 634 | mutable AccCont m_contsum; // Used by NMI only, marked also by overlapping F1 635 | 636 | //! \copydoc NodeBaseI::nodes() const noexcept 637 | const UniqIds& nodes() const noexcept override // Make a stub and close it 638 | { 639 | assert(0 && "Nodes should not be accessed in collection via the NodeBaseI interface"); 640 | static UniqIds nds; 641 | return nds; // Stub output 642 | } 643 | protected: 644 | //! Default constructor 645 | Collection(): m_cls(), m_ndcs(), m_ndshash(0), m_contsum(0) {} //, m_dirty(false) {} 646 | 647 | // Note: Actual for NMI and overlapping F1 648 | //! \brief Initialized cluster members contributions 649 | //! 650 | //! \param cn const CollectionT& - target collection to initialize cluster 651 | //! members contributions 652 | //! \return void 653 | static void initconts(const CollectionT& cn) noexcept; 654 | public: 655 | ~Collection(); 656 | 657 | //! \brief The number of clusters 658 | //! 659 | //! \return Id - the number of clusters in the collection 660 | Id clsnum() const noexcept { return m_cls.size(); } 661 | 662 | //! 
\copydoc NodeBaseI::ndsnum() const noexcept 663 | Id ndsnum() const noexcept override { return m_ndcs.size(); } 664 | 665 | //! \copydoc NodeBaseI::nodeExists(Id nid) const noexcept 666 | bool nodeExists(Id nid) const noexcept override { return m_ndcs.count(nid); } 667 | 668 | #ifndef NO_FILEIO 669 | //! \brief Load collection from the CNL file 670 | //! \pre All clusters in the file are expected to be unique and not validated for 671 | //! the mutual match until makeunique is set 672 | //! 673 | //! \param filename const char* - name of the input file 674 | //! \param makeunique=false bool - ensure that clusters contain unique members by 675 | //! removing the duplicates 676 | //! \param membership=1 float - expected membership of the nodes, >0, typically >= 1. 677 | //! Used only for the node container preallocation to estimate the number of nodes 678 | //! if not specified in the file header 679 | //! \param ahash=nullptr AggHash* - resulting hash of the loaded 680 | //! member ids base (unique ids only are hashed, not all ids) if not nullptr 681 | //! \param const nodebase=nullptr NodeBaseI* - node base to filter-out nodes if required 682 | //! \param lostcls=nullptr RawIds* - indices of the lost clusters during the node base 683 | //! synchronization 684 | //! \param verbose=false bool - print the number of loaded nodes to the stdout 685 | //! \return CollectionT - the collection is loaded successfully 686 | static CollectionT load(const char* filename, bool makeunique=false 687 | , float membership=1, AggHash* ahash=nullptr, const NodeBaseI* nodebase=nullptr 688 | , RawIds* lostcls=nullptr, bool verbose=false); 689 | #endif // NO_FILEIO 690 | 691 | //! \brief Transfer collection data 692 | //! \post This collection becomes empty 693 | //! 694 | //! \tparam FIRST bool - fill first of second node clusters relations container 695 | //! 696 | //! \param cls RawClusters& - raw clusters to be extended 697 | //! 
\param nds NodeRClusters& - node raw clusters relations to be extended 698 | //! \return void 699 | template 700 | void transfer(RawClusters& cls, NodeRClusters& ndrcs); 701 | 702 | //! \brief Clear cluster counters 703 | //! 704 | //! \return void 705 | void clearcounts() const noexcept; 706 | 707 | // //! \brief Synchronize the node base of the cluster collections 708 | // //! 709 | // //! \tparam REDUCE bool - whether to reduce collections by removing the non-matching nodes 710 | // //! or extend collections by appending those nodes them to a single "noise" cluster 711 | // /// 712 | // /// \param cn1 CollectionT& - first collection 713 | // /// \param cn2 CollectionT& - second collection 714 | // /// \return Prob - harmonic mean of the nodebase correction (complement or reduction) for both collections 715 | // template 716 | // static Prob syncCollections(CollectionT& cn1, CollectionT& cn2); 717 | 718 | //! \brief Label collection of clusters according to the ground-truth cluster indices 719 | //! 720 | //! \param gt const CollectionT& - ground-truth cluster collection 721 | //! \param cn const CollectionT& - processing cluster collection 722 | // //! \param lostcls const RawIds& - indices of the lost clusters during the node base 723 | // //! synchronization 724 | //! \param prob bool - Partial Probabilities or F1 (harmonic) matching policy 725 | //! \param weighted=true bool - weight labels by the number of instances or 726 | //! treat each label equally 727 | //! \param flname=nullptr const char* - resulting label indices filename (.cll format) 728 | // //! \param verbose=false bool - print intermediate results to the stdout 729 | //! \return PrcRec - resulting precision and recall for the labeled items 730 | static PrcRec label(const CollectionT& gt, const CollectionT& cn //, const RawIds& lostcls 731 | , bool prob, bool weighted=true, const char* flname=nullptr); //, bool verbose=false); 732 | 733 | //! 
\brief Specified F1 evaluation of the Greatest (Max) Match for the 734 | //! multi-resolution clustering with possibly unequal node base 735 | //! 736 | //! Supported F1 measures are F1p <= F1h <= F1s, where: 737 | //! - F1p - Harmonic mean of the [weighted] average of partial probabilities, 738 | //! the most discriminative and satisfies the largest number of the Formal 739 | //! Constraints (homogeneity, completeness, rag bag, size/quantity, balance); 740 | //! - F1h - Harmonic mean of the [weighted] average of F1s; 741 | //! - F1a - Average F1-Score, i.e. arithmetic mean (average) of the [weighted] 742 | //! average of F1s, the least discriminative and satisfies the lowest number 743 | //! of the Formal Constraints. 744 | //! 745 | //! of the Greatest (Max) Match [Weighted] Average Harmonic Mean evaluation 746 | //! \note Undirected (symmetric) evaluation 747 | //! 748 | //! \param cn1 const CollectionT& - first collection 749 | //! \param cn2 const CollectionT& - second collection 750 | //! \param kind F1 - kind of F1 to be evaluated 751 | //! \param rec Prob& - recall of cn2 relative to the ground-truth cn1 or 752 | //! 0 if the matching strategy does not have the precision/recall notations 753 | //! \param prc Prob& - precision of cn2 relative to the ground-truth cn1 or 754 | //! 0 if the matching strategy does not have the precision/recall notations 755 | //! \param mkind=Match::WEIGHTED Match - matching kind 756 | //! \param verbose=false bool - print intermediate results to the stdout 757 | //! \return Prob - resulting F1_gm 758 | static Prob f1(const CollectionT& cn1, const CollectionT& cn2, F1 kind 759 | , Prob& rec, Prob& prc, Match mkind=Match::WEIGHTED, bool verbose=false); 760 | 761 | //! \brief NMI evaluation 762 | //! \note Undirected (symmetric) evaluation 763 | //! 764 | //! \param cn1 const CollectionT& - first collection 765 | //! \param cn2 const CollectionT& - second collection 766 | //! 
\param expbase=false bool - use ln (exp base) or log2 (Shannon entropy, bits) 767 | //! for the information measuring 768 | //! \param verbose=false bool - perform additional verification and print details 769 | //! \return RawNmi - resulting NMI 770 | static RawNmi nmi(const CollectionT& cn1, const CollectionT& cn2, bool expbase=false 771 | , bool verbose=false); 772 | protected: 773 | // Label related functions ------------------------------------------------- 774 | //! \brief Mark clusters of the argument collection with the labels 775 | //! \note For EACH label the best matching cluster is identified. Mutual match 776 | //! is not applied to guarantee coverage of the all ground-truth clusters to 777 | //! have meaningful F1 778 | //! 779 | //! \param cn const CollectionT& - the collection to be labeled 780 | //! \param prob bool - match labels by the Partial Probabilities or F1; 781 | //! prob maximizes gain otherwise loss is minimized and F1 is maximized 782 | //! \param weighted=true bool - weight labels by the number of instances or 783 | //! treat each label equally 784 | //! \param csls=nullptr ClsLabels* - resulting labels as clusters of the 785 | //! ground-truth collection if not nullptr 786 | //! \return PrcRec - resulting average over all labels Precision and Recall 787 | //! for all nodes of the marked clusters, where each label can be assigned 788 | //! to multiple cn clusters and then all nodes of that clusters are matched 789 | //! to the ground truth cluster (label) nodes 790 | PrcRec mark(const CollectionT& cn, bool prob, bool weighted=true, ClsLabels* csls=nullptr) const; 791 | 792 | // F1-related functions ---------------------------------------------------- 793 | //! \brief Average of the maximal matches (by F1 or partial probabilities) 794 | //! relative to the specified collection FROM this one 795 | //! \note External cn collection can have unequal node base and overlapping 796 | //! clusters on multiple resolutions. 
Small collection relative to the average 797 | //! or average relative to huge might yield best matching F1 equal to 1, but 798 | //! then the back match should be small. 799 | //! \attention Directed (non-symmetric) evaluation 800 | //! 801 | //! \param gmats const Probs& - greatest (max) matching with another collection 802 | //! \param weighted bool - weighted average by cluster size 803 | //! \return AccProb - resulting max average match value from this collection 804 | //! to the specified one (DIRECTED) 805 | inline AccProb avggms(const Probs& gmats, bool weighted) const; // const CollectionT& cn 806 | 807 | //! \brief Greatest (Max) matching value (F1 or partial probability) for each cluster 808 | //! to the corresponding clusters of the specified collection 809 | //! \note External cn collection can have unequal node base and overlapping 810 | //! clusters on multiple resolutions 811 | //! \attention Directed (non-symmetric) evaluation 812 | //! \post Modifies internal state of the collection 813 | //! 814 | //! \param cn const CollectionT& - collection to compare with 815 | //! \param prob bool - evaluate partial probability instead of F1 816 | //! \return Probs - resulting max F1 or partial probability for cluster 817 | //! (all member nodes are considered in the cluster) 818 | Probs gmatches(const CollectionT& cn, bool prob) const; 819 | 820 | // NMI-related functions --------------------------------------------------- 821 | //! \brief NMI evaluation considering overlaps, multi-resolution and possibly 822 | //! unequal node base 823 | //! \note Undirected (symmetric) evaluation 824 | //! 825 | //! \param cn const CollectionT& - collection to compare with 826 | //! \param expbase bool - use ln (exp base) or log2 (Shannon entropy, bits) 827 | //! for the information measuring 828 | //! \return RawNmi - resulting NMI 829 | RawNmi nmi(const CollectionT& cn, bool expbase) const; 830 | 831 | //! 
\brief Clear contributions in each cluster and optionally 832 | //! evaluate the clusters matching 833 | //! 834 | //! \param cn const CollectionT& - foreign collection to be processed with this one 835 | //! \param[out] clsmm=nullptr ClustersMatchingT* - clusters matching matrix to be filled 836 | //! \return AccCont - sum of all values of the clsmm matrix if specified 837 | AccCont evalconts(const CollectionT& cn, ClustersMatching* clsmm=nullptr) const; 838 | 839 | //! \brief Clear contributions in each cluster 840 | //! 841 | //! \return void 842 | void clearconts() const noexcept; 843 | }; 844 | 845 | // Accessory functions --------------------------------------------------------- 846 | //! \brief Compile time pair selector 847 | //! 848 | //! \tparam FIRST bool - select .first or .second 849 | //! 850 | //! \param pr P& - pair 851 | //! \return selected field 852 | template 853 | enable_if_t& pairsel(P& pr) noexcept { return pr.first; } 854 | 855 | template 856 | enable_if_t& pairsel(P& pr) noexcept { return pr.second; } 857 | 858 | //! \brief Parse decimal c-string as id 859 | //! 860 | //! \param str char* - id string 861 | //! \return Id - id value 862 | Id parseId(char* str); 863 | 864 | //! \brief Harmonic mean 865 | //! \note a + b = 0 are threated correctly resulting 0 866 | //! 867 | //! \param a AccProb - first item 868 | //! \param b AccProb - second item 869 | //! \return AccProb - resulting mean 870 | AccProb hmean(AccProb a, AccProb b) noexcept; 871 | 872 | //! \brief Geometric mean 873 | //! 874 | //! \param a AccProb - first item 875 | //! \param b AccProb - second item 876 | //! \return AccProb - resulting mean 877 | AccProb gmean(AccProb a, AccProb b) noexcept; 878 | 879 | //! \brief Arithmetic mean (average) 880 | //! 881 | //! \param a AccProb - first item 882 | //! \param b AccProb - second item 883 | //! 
\return AccProb - resulting mean 884 | AccProb amean(AccProb a, AccProb b) noexcept; 885 | 886 | #endif // INTERFACE_H 887 | --------------------------------------------------------------------------------