├── .coveragerc ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── LICENSE.rst ├── MANIFEST.in ├── README.rst ├── beard ├── __init__.py ├── clustering │ ├── __init__.py │ ├── blocking.py │ ├── blocking_funcs.py │ └── wrappers.py ├── ext │ ├── __init__.py │ └── metaphone.py ├── metrics │ ├── __init__.py │ ├── clustering.py │ └── text.py ├── similarity │ ├── __init__.py │ └── pairs.py └── utils │ ├── __init__.py │ ├── misc.py │ ├── names.py │ ├── strings.py │ └── transformers.py ├── doc ├── Makefile ├── _build │ └── .keep ├── _static │ └── .keep ├── _templates │ └── .keep ├── conf.py ├── index.rst └── make.bat ├── examples ├── README.rst ├── applications │ └── author-disambiguation │ │ ├── README.rst │ │ ├── clustering.py │ │ ├── distance.py │ │ ├── ethnicity.py │ │ ├── sampling.py │ │ └── utils.py ├── author_disambiguation.py └── data │ ├── README.rst │ ├── author-disambiguation.npz │ ├── wang_clusters.json │ ├── wang_records.json │ └── wang_signatures.json ├── miniconda.sh ├── pytest.ini ├── run-tests.sh ├── setup.py ├── tests ├── clustering │ ├── test_block.py │ ├── test_blocking.py │ ├── test_blocking_funcs.py │ └── test_wrappers.py ├── metrics │ ├── test_clustering.py │ └── test_text.py ├── similarity │ └── test_pairs.py └── utils │ ├── test_names.py │ ├── test_strings.py │ └── test_transformers.py └── travis-install.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | # This file is part of Beard. 2 | # Copyright (C) 2014 CERN. 3 | # 4 | # Beard is a free software; you can redistribute it and/or modify it 5 | # under the terms of the Revised BSD License; see LICENSE file for 6 | # more details. 7 | 8 | [run] 9 | source = beard 10 | omit = beard/ext/* 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | *.eggs 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | .python-version 58 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # This file is part of Beard. 2 | # Copyright (C) 2014 CERN. 3 | # 4 | # Beard is a free software; you can redistribute it and/or modify it 5 | # under the terms of the Revised BSD License; see LICENSE file for 6 | # more details. 
7 | 8 | language: python 9 | 10 | sudo: false 11 | 12 | matrix: 13 | include: 14 | - python: "2.7" 15 | env: PYTHON_VERSION="2.7" NUMPY_VERSION="1.10" SCIPY_VERSION="0.17" SKLEARN_VERSION="0.17" 16 | - python: "3.6" 17 | env: PYTHON_VERSION="3.6" 18 | 19 | install: 20 | - if [[ "$PYTHON_VERSION" == "2.7" ]]; then 21 | source travis-install.sh; 22 | fi 23 | - pip install check-manifest coveralls pydocstyle pytest-cov 24 | - python setup.py install 25 | - python setup.py clean --all 26 | 27 | script: 28 | - source run-tests.sh 29 | 30 | after_success: 31 | - coveralls 32 | 33 | notifications: 34 | email: false 35 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Authors 2 | ======= 3 | 4 | Contributors: 5 | 6 | * Gilles Louppe 7 | * Mateusz Susik 8 | * Petros Ioannidis 9 | * Evangelos Tzemis 10 | * Hussein Al-Natsheh 11 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Beard is free software; you can redistribute it and/or modify it 2 | under the terms of the Revised BSD License quoted below. 3 | 4 | Copyright (c) 2014 CERN. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name of Beard nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | In applying this license, CERN does not waive the privileges and 33 | immunities granted to it by virtue of its status as an 34 | Intergovernmental Organization or submit itself to any jurisdiction. 
35 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include *.sh 3 | include .coveragerc 4 | include pytest.ini 5 | recursive-include doc *.bat 6 | recursive-include doc *.keep 7 | recursive-include doc *.py 8 | recursive-include doc *.rst 9 | recursive-include doc Makefile 10 | recursive-include examples *.json 11 | recursive-include examples *.npz 12 | recursive-include examples *.py 13 | recursive-include examples *.rst 14 | recursive-include tests *.py 15 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Beard 3 | ===== 4 | 5 | Beard is a Python library of machine learning tools for Bibliographic Entity 6 | Automatic Recognition and Disambiguation. 7 | 8 | The project is currently in stable stage of development. 9 | 10 | .. image:: https://travis-ci.org/inspirehep/beard.svg?branch=master 11 | :target: https://travis-ci.org/inspirehep/beard 12 | .. image:: https://coveralls.io/repos/inspirehep/beard/badge.png 13 | :target: https://coveralls.io/r/inspirehep/beard 14 | 15 | Installation 16 | ============ 17 | 18 | ``python setup.py install`` 19 | 20 | Examples 21 | ======== 22 | 23 | In the ``examples/applications/author-disambiguation`` directory there are files 24 | that present how to use the library for the author disambiguation problem. 25 | Check the ``README.rst`` in this directory for details. 26 | -------------------------------------------------------------------------------- /beard/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Bibliographic Entity Automatic Recognition and Disambiguation.""" 11 | 12 | __version__ = "0.2.2" 13 | -------------------------------------------------------------------------------- /beard/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Clustering algorithms.""" 11 | 12 | from .blocking import BlockClustering 13 | from .blocking_funcs import block_phonetic 14 | from .blocking_funcs import block_last_name_first_initial 15 | from .blocking_funcs import block_single 16 | from .wrappers import ScipyHierarchicalClustering 17 | 18 | __all__ = ("BlockClustering", 19 | "block_phonetic", 20 | "block_last_name_first_initial", 21 | "block_single", 22 | "ScipyHierarchicalClustering") 23 | -------------------------------------------------------------------------------- /beard/clustering/blocking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 
5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Blocking for clustering estimators. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | from __future__ import print_function 18 | 19 | import numpy as np 20 | import time 21 | import structlog 22 | 23 | from sklearn.base import BaseEstimator 24 | from sklearn.base import clone 25 | from sklearn.base import ClusterMixin 26 | from sklearn.utils import column_or_1d 27 | 28 | from .blocking_funcs import block_single 29 | 30 | LOGGER = structlog.getLogger() 31 | 32 | 33 | class _SingleClustering(BaseEstimator, ClusterMixin): 34 | def fit(self, X, y=None): 35 | self.labels_ = block_single(X) 36 | return self 37 | 38 | def partial_fit(self, X, y=None): 39 | self.labels_ = block_single(X) 40 | return self 41 | 42 | def predict(self, X): 43 | return block_single(X) 44 | 45 | 46 | def _parallel_fit(fit_, partial_fit_, estimator, verbose, data_queue, 47 | result_queue): 48 | """Run clusterer's fit function.""" 49 | # Status can be one of: 'middle', 'end' 50 | # 'middle' means that there is a block to compute and the process should 51 | # continue 52 | # 'end' means that the process should finish as all the data was sent 53 | # by the main process 54 | status, block, existing_clusterer = data_queue.get() 55 | 56 | while status != 'end': 57 | 58 | b, X, y = block 59 | 60 | if len(X) == 1: 61 | clusterer = _SingleClustering() 62 | elif existing_clusterer and partial_fit_ and not fit_: 63 | clusterer = existing_clusterer 64 | else: 65 | clusterer = clone(estimator) 66 | 67 | if verbose > 1: 68 | print("Clustering %d samples on block '%s'..." % (len(X), b)) 69 | LOGGER.info("Clustering %d samples on block '%s'..." % (len(X), b)) 70 | 71 | if fit_ or not hasattr(clusterer, "partial_fit"): 72 | try: 73 | clusterer.fit(X, y=y) 74 | except TypeError: 75 | clusterer.fit(X) 76 | elif partial_fit_: 77 | try: 78 | clusterer.partial_fit(X, y=y) 79 | except TypeError: 80 | clusterer.partial_fit(X) 81 | 82 | result_queue.put((b, clusterer)) 83 | status, block, existing_clusterer = data_queue.get() 84 | 85 | data_queue.put(('end', None, None)) 86 | return 87 | 88 | 89 | def _single_fit(fit_, partial_fit_, estimator, verbose, data): 90 | """Run clusterer's fit function.""" 91 | block, existing_clusterer = data 92 | b, X, y = block 93 | 94 | if len(X) == 1: 95 | clusterer = _SingleClustering() 96 | elif existing_clusterer and partial_fit_ and not fit_: 97 | clusterer = existing_clusterer 98 | else: 99 | clusterer = clone(estimator) 100 | 101 | if verbose > 1: 102 | print("Clustering %d samples on block '%s'..." % (len(X), b)) 103 | LOGGER.info("Clustering %d samples on block '%s'..." % (len(X), b)) 104 | 105 | if fit_ or not hasattr(clusterer, "partial_fit"): 106 | try: 107 | clusterer.fit(X, y=y) 108 | except TypeError: 109 | clusterer.fit(X) 110 | elif partial_fit_: 111 | try: 112 | clusterer.partial_fit(X, y=y) 113 | except TypeError: 114 | clusterer.partial_fit(X) 115 | 116 | return (b, clusterer) 117 | 118 | 119 | class BlockClustering(BaseEstimator, ClusterMixin): 120 | """Implements blocking for clustering estimators. 121 | 122 | Meta-estimator for grouping samples into blocks, within each of which 123 | a clustering base estimator is fit. 
This allows to reduce the cost of 124 | pairwise distance computation from O(N^2) to O(sum_b N_b^2), where 125 | N_b <= N is the number of samples in block b. 126 | 127 | Attributes 128 | ---------- 129 | labels_ : ndarray, shape (n_samples,) 130 | Array of labels assigned to the input data. 131 | if partial_fit is used instead of fit, they are assigned to the 132 | last batch of data. 133 | 134 | blocks_ : ndarray, shape (n_samples,) 135 | Array of keys mapping input data to blocks. 136 | """ 137 | 138 | def __init__(self, affinity=None, blocking="single", base_estimator=None, 139 | verbose=0, n_jobs=1): 140 | """Initialize. 141 | 142 | Parameters 143 | ---------- 144 | :param affinity: string or None 145 | If affinity == 'precomputed', then assume that X is a distance 146 | matrix. 147 | 148 | :param blocking: string or callable, default "single" 149 | The blocking strategy, for mapping samples X to blocks. 150 | - "single": group all samples X[i] into the same block; 151 | - "precomputed": use `blocks[i]` argument (in `fit`, `partial_fit` 152 | or `predict`) as a key for mapping sample X[i] to a block; 153 | - callable: use blocking(X)[i] as a key for mapping sample X[i] to 154 | a block. 155 | 156 | :param base_estimator: estimator 157 | Clustering estimator to fit within each block. 158 | 159 | :param verbose: int, default=0 160 | Verbosity of the fitting procedure. 161 | 162 | :param n_jobs: int 163 | Number of processes to use. 164 | """ 165 | self.affinity = affinity 166 | self.blocking = blocking 167 | self.base_estimator = base_estimator 168 | self.verbose = verbose 169 | self.n_jobs = n_jobs 170 | 171 | def _validate(self, X, blocks): 172 | """Validate hyper-parameters and input data.""" 173 | if self.blocking == "single": 174 | blocks = block_single(X) 175 | elif self.blocking == "precomputed": 176 | if blocks is not None and len(blocks) == len(X): 177 | blocks = column_or_1d(blocks).ravel() 178 | else: 179 | raise ValueError("Invalid value for blocks. When " 180 | "blocking='precomputed', blocks needs to be " 181 | "an array of size len(X).") 182 | elif callable(self.blocking): 183 | blocks = self.blocking(X) 184 | else: 185 | raise ValueError("Invalid value for blocking. Allowed values are " 186 | "'single', 'precomputed' or callable.") 187 | 188 | return X, blocks 189 | 190 | def _blocks(self, X, y, blocks): 191 | """Chop the training data into smaller chunks. 192 | 193 | A chunk is demarcated by the corresponding block. Each chunk contains 194 | only the training examples relevant to given block and a clusterer 195 | which will be used to fit the data. 196 | 197 | Returns 198 | ------- 199 | :returns: generator 200 | Quadruples in the form of ``(block, X, y, clusterer)`` where 201 | X and y are the training examples for given block and clusterer is 202 | an object with a ``fit`` method. 
203 | """ 204 | unique_blocks = np.unique(blocks) 205 | 206 | for b in unique_blocks: 207 | mask = (blocks == b) 208 | X_mask = X[mask, :] 209 | if y is not None: 210 | y_mask = y[mask] 211 | else: 212 | y_mask = None 213 | if self.affinity == "precomputed": 214 | X_mask = X_mask[:, mask] 215 | 216 | yield (b, X_mask, y_mask) 217 | 218 | def _fit(self, X, y, blocks): 219 | """Fit base clustering estimators on X.""" 220 | self.blocks_ = blocks 221 | if self.n_jobs == 1: 222 | LOGGER.info("fitting data with 1 job") 223 | blocks_computed = 0 224 | blocks_all = len(np.unique(blocks)) 225 | LOGGER.info( 226 | "%s blocks computed out of %s" % ( 227 | blocks_computed, blocks_all 228 | ) 229 | ) 230 | for block in self._blocks(X, y, blocks): 231 | if self.partial_fit_ and block[0] in self.clusterers_: 232 | data = (block, self.clusterers_[block[0]]) 233 | else: 234 | data = (block, None) 235 | 236 | b, clusterer = _single_fit(self.fit_, self.partial_fit_, 237 | self.base_estimator, self.verbose, 238 | data) 239 | 240 | if clusterer: 241 | self.clusterers_[b] = clusterer 242 | 243 | if blocks_computed < blocks_all: 244 | print("%s blocks computed out of %s" % (blocks_computed, 245 | blocks_all)) 246 | LOGGER.info( 247 | "%s blocks computed out of %s" % ( 248 | blocks_computed, blocks_all 249 | ) 250 | ) 251 | blocks_computed += 1 252 | else: 253 | LOGGER.info( 254 | "fitting data with {0} parallel jobs".format( 255 | self.n_jobs 256 | ) 257 | ) 258 | try: 259 | from multiprocessing import SimpleQueue 260 | except ImportError: 261 | from multiprocessing.queues import SimpleQueue 262 | 263 | # Here the blocks will be passed to subprocesses 264 | data_queue = SimpleQueue() 265 | # Here the results will be passed back 266 | result_queue = SimpleQueue() 267 | 268 | for x in range(self.n_jobs): 269 | import multiprocessing as mp 270 | processes = [] 271 | 272 | processes.append(mp.Process(target=_parallel_fit, args=( 273 | self.fit_, self.partial_fit_, 274 | self.base_estimator, self.verbose, 275 | data_queue, result_queue))) 276 | processes[-1].start() 277 | 278 | # First n_jobs blocks are sent into the queue without waiting 279 | # for the results. This variable is a counter that takes care of 280 | # this. 281 | presend = 0 282 | blocks_computed = 0 283 | blocks_all = len(np.unique(blocks)) 284 | 285 | for block in self._blocks(X, y, blocks): 286 | if presend >= self.n_jobs: 287 | b, clusterer = result_queue.get() 288 | blocks_computed += 1 289 | if clusterer: 290 | self.clusterers_[b] = clusterer 291 | else: 292 | presend += 1 293 | if self.partial_fit_: 294 | if block[0] in self.clusterers_: 295 | data_queue.put(('middle', block, self.clusterers_[b])) 296 | continue 297 | 298 | data_queue.put(('middle', block, None)) 299 | 300 | # Get the last results and tell the subprocesses to finish 301 | for x in range(self.n_jobs): 302 | if blocks_computed < blocks_all: 303 | print("%s blocks computed out of %s" % (blocks_computed, 304 | blocks_all)) 305 | LOGGER.info( 306 | "%s blocks computed out of %s" % ( 307 | blocks_computed, blocks_all 308 | ) 309 | ) 310 | b, clusterer = result_queue.get() 311 | blocks_computed += 1 312 | if clusterer: 313 | self.clusterers_[b] = clusterer 314 | 315 | data_queue.put(('end', None, None)) 316 | 317 | time.sleep(1) 318 | 319 | return self 320 | 321 | def fit(self, X, y=None, blocks=None): 322 | """Fit individual base clustering estimators for each block. 
323 | 324 | Parameters 325 | ---------- 326 | :param X: {array-like, sparse matrix}, shape (n_samples, n_features) 327 | or (n_samples, n_samples) 328 | Input data, as an array of samples or as a distance matrix if 329 | affinity == 'precomputed'. 330 | 331 | :param y: array-like, shape (n_samples, ) 332 | Input labels, in case of (semi-)supervised clustering. 333 | Labels equal to -1 stand for unknown labels. 334 | 335 | :param blocks: array-like, shape (n_samples, ) 336 | Block labels, if `blocking == 'precomputed'`. 337 | 338 | Returns 339 | ------- 340 | :returns: self 341 | """ 342 | # Validate parameters 343 | X, blocks = self._validate(X, blocks) 344 | 345 | # Reset attributes 346 | self.clusterers_ = {} 347 | self.fit_, self.partial_fit_ = True, False 348 | 349 | return self._fit(X, y, blocks) 350 | 351 | def partial_fit(self, X, y=None, blocks=None): 352 | """Resume fitting of base clustering estimators, for each block. 353 | 354 | This calls `partial_fit` whenever supported by the base estimator. 355 | Otherwise, this calls `fit`, on given blocks only. 356 | 357 | Parameters 358 | ---------- 359 | :param X: {array-like, sparse matrix}, shape (n_samples, n_features) 360 | or (n_samples, n_samples) 361 | Input data, as an array of samples or as a distance matrix if 362 | affinity == 'precomputed'. 363 | 364 | :param y: array-like, shape (n_samples, ) 365 | Input labels, in case of (semi-)supervised clustering. 366 | Labels equal to -1 stand for unknown labels. 367 | 368 | :param blocks: array-like, shape (n_samples, ) 369 | Block labels, if `blocking == 'precomputed'`. 370 | 371 | Returns 372 | ------- 373 | :returns: self 374 | """ 375 | # Validate parameters 376 | X, blocks = self._validate(X, blocks) 377 | 378 | # Set attributes if first call 379 | if not hasattr(self, "clusterers_"): 380 | self.clusterers_ = {} 381 | 382 | self.fit_, self.partial_fit_ = False, True 383 | 384 | return self._fit(X, y, blocks) 385 | 386 | def predict(self, X, blocks=None): 387 | """Predict data. 388 | 389 | Parameters 390 | ---------- 391 | :param X: {array-like, sparse matrix}, shape (n_samples, n_features) 392 | Input data. 393 | 394 | :param blocks: array-like, shape (n_samples, ) 395 | Block labels, if `blocking == 'precomputed'`. 396 | 397 | Returns 398 | ------- 399 | :returns: array-like, shape (n_samples) 400 | The labels. 401 | """ 402 | # Validate parameters 403 | X, blocks = self._validate(X, blocks) 404 | 405 | # Predict 406 | labels = -np.ones(len(X), dtype=np.int) 407 | offset = 0 408 | 409 | for b in np.unique(blocks): 410 | # Predict on the block, if known 411 | if b in self.clusterers_: 412 | mask = (blocks == b) 413 | clusterer = self.clusterers_[b] 414 | 415 | pred = np.array(clusterer.predict(X[mask])) 416 | pred[(pred != -1)] += offset 417 | labels[mask] = pred 418 | offset += np.max(clusterer.labels_) + 1 419 | 420 | return labels 421 | 422 | @property 423 | def labels_(self): 424 | """Compute the labels assigned to the input data. 425 | 426 | Note that labels are computed on-the-fly. 
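A minimal usage sketch with toy data (the feature values and block keys
below are made up for illustration; they are not taken from the package's
examples)::

    import numpy as np

    from beard.clustering import BlockClustering
    from beard.clustering import ScipyHierarchicalClustering

    X = np.array([[0.0], [0.1], [5.0], [5.2]])
    blocks = np.array(["a", "a", "b", "b"])

    clusterer = BlockClustering(
        blocking="precomputed",
        base_estimator=ScipyHierarchicalClustering(threshold=0.5))
    clusterer.fit(X, blocks=blocks)
    clusterer.labels_  # e.g. array([0, 0, 1, 1])

Each block is clustered independently and the per-block labels are then
offset so that labels are unique across blocks.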
427 | """ 428 | labels = -np.ones(len(self.blocks_), dtype=np.int) 429 | offset = 0 430 | 431 | for b in self.clusterers_: 432 | mask = (self.blocks_ == b) 433 | clusterer = self.clusterers_[b] 434 | 435 | pred = np.array(clusterer.labels_) 436 | pred[(pred != -1)] += offset 437 | labels[mask] = pred 438 | offset += np.max(clusterer.labels_) + 1 439 | 440 | return labels 441 | -------------------------------------------------------------------------------- /beard/clustering/blocking_funcs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """The algorithms for blocking. 11 | 12 | .. codeauthor:: Mateusz Susik 13 | 14 | """ 15 | 16 | import numpy as np 17 | import six 18 | 19 | from beard.utils import normalize_name 20 | from beard.utils.names import phonetic_tokenize_name 21 | from beard.utils.names import given_name_initial 22 | 23 | 24 | class _Block: 25 | """Representation of a block. 26 | 27 | Block stores information about different variation of names and the 28 | quantities of their appearances on the papers. 29 | 30 | Example of a block _content: 31 | 32 | .. code:: python 33 | 34 | { 35 | ('JNS',): { 36 | ('P',): 2, ('P', 'PL'): 3, ('P', 'JH'): 2 37 | }, 38 | ('JNS', 'SM0'): { 39 | ('PAL', 'JH'): 5, ('JH',): 3, ('SMN',): 2 40 | }, 41 | ('RCN', 'JNS'): { 42 | ('A',): 34 43 | } 44 | } 45 | 46 | From the example above, one can see that the block stores information 47 | about 5 signatures of 'JNS' 'SM0', 'PAL' 'JH'. Those strings are results 48 | of the phonetic algorithm. Such signature might correspond, for 49 | example, to Jones-Smith, Paul John. 50 | """ 51 | 52 | def __init__(self, surnames, given_names): 53 | """Create a block. Add given names from the first signature. 54 | 55 | Parameters 56 | ---------- 57 | :param surnames: tuple 58 | Strings representing surnames on a signature. 59 | :param given_names: tuple 60 | Strings representing given names on a signature. 61 | """ 62 | self._content = {surnames: {given_names: 1}} 63 | 64 | self._name = surnames[-1] 65 | 66 | def add_signature(self, surnames, given_names): 67 | """Add a signature to the block. 68 | 69 | Parameters 70 | ---------- 71 | :param surnames: tuple 72 | Strings representing surnames on a signature. 73 | :param given_names: tuple 74 | Strings representing given_names on a signature. 75 | """ 76 | if surnames in self._content: 77 | if given_names in self._content[surnames]: 78 | self._content[surnames][given_names] += 1 79 | else: 80 | self._content[surnames][given_names] = 1 81 | else: 82 | self._content[surnames] = {given_names: 1} 83 | 84 | def compare_tokens_from_last(self, first_surnames, last_surname): 85 | """Check if a part of the surname matches with given names in block. 86 | 87 | For example, ``Sanchez-Gomez, Juan`` can appear on a signature as 88 | ``Gomez, Juan Sanchez``. This function checks if there is a match 89 | between surnames like ``Sanchez`` and the given names in the block. 90 | In this case, a signature like ``Gomez, J. Sanchez`` will create a 91 | match, while ``Gomez, Juan S.`` won't. 92 | 93 | Full names have to match. Only the signatures with single surname 94 | are used for matching. 
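For illustration, with made-up phonetic tokens (not necessarily the exact
Double Metaphone output for these names), a block created from a
``Gomez, Juan Sanchez`` signature matches a query on the ``Sanchez`` part::

    block = _Block(('GMS',), ('JN', 'SNXS'))
    block.compare_tokens_from_last(('SNXS',), ('GMS',))  # True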
95 | 96 | Parameters 97 | ---------- 98 | :param first_surnames: tuple 99 | Tokens which represent few first surnames. In form of a tuple of 100 | strings. 101 | :param last_surname: tuple 102 | Tokens, usually one, representing last surname(s) of the author. 103 | 104 | Raises 105 | ------ 106 | :raises: KeyError 107 | When the last name is not included in the cluster 108 | 109 | Returns 110 | ------- 111 | :returns: boolean 112 | Information whether cluster contains this author if some of the 113 | first last names are treated as the last given names. 114 | """ 115 | if last_surname in self._content: 116 | for given_names in six.iterkeys(self._content[last_surname]): 117 | given_names_left = len(given_names) 118 | for reversed_index, name in \ 119 | enumerate(reversed(first_surnames)): 120 | if given_names_left == 0: 121 | return True 122 | elif given_names[-(reversed_index + 1)] != name: 123 | break 124 | given_names_left -= 1 125 | if reversed_index == len(first_surnames) - 1: 126 | return True 127 | return False 128 | self._raise_keyerror(last_surname) 129 | 130 | def contains(self, surnames): 131 | """Check if there is at least one signature with given surnames. 132 | 133 | Parameters 134 | ---------- 135 | :param surnames: tuple 136 | Strings representing surnames on a signature. 137 | 138 | Returns 139 | ------- 140 | :returns: boolean 141 | True if there is at least one sinature with given surnames. 142 | """ 143 | return surnames in self._content 144 | 145 | def _raise_keyerror(self, key): 146 | raise KeyError("The cluster doesn't contain a key %s" % key) 147 | 148 | 149 | def _split_blocks(blocks, X, threshold): 150 | splitted_blocks = [] 151 | id_to_size = {} 152 | 153 | for block in blocks: 154 | if block._name in id_to_size: 155 | id_to_size[block._name] += 1 156 | else: 157 | id_to_size[block._name] = 1 158 | 159 | for index, block in enumerate(blocks): 160 | if id_to_size[block._name] > threshold: 161 | 162 | splitted_blocks.append(block._name + 163 | given_name_initial(X[index 164 | ][0]['author_name'])) 165 | else: 166 | splitted_blocks.append(block._name) 167 | 168 | return splitted_blocks 169 | 170 | 171 | def block_phonetic(X, threshold=1000, phonetic_algorithm="double_metaphone"): 172 | """Block the signatures. 173 | 174 | This blocking algorithm takes into consideration the cases, where 175 | author has more than one surname. Such a signature can be assigned 176 | to a block for the first author surname or the last one. 177 | 178 | The names are preprocessed by ``phonetic_tokenize_name`` function. As a 179 | result, here the algorithm operates on ``Double Metaphone`` tokens which 180 | are previously normalized. 181 | 182 | The algorithm has two phases. In the first phase, all the signatures with 183 | one surname are clustered together. Every different surname token creates 184 | a new block. In the second phase, the signatures 185 | with multiple surnames are compared with the blocks for the first and 186 | last surname. 187 | 188 | If the first surnames of author were already used as the last given names 189 | on some of the signatures, the new signature will be assigned to the block 190 | of the last surname. 191 | 192 | Otherwise, the signature will be assigned to the block of 193 | the first surname. 194 | 195 | To prevent creation of too big clusters, the ``threshold`` parameter can 196 | be set. The algorithm will split every block which size is bigger than 197 | ``threshold`` into smaller ones using given names initials as the 198 | condition. 
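A minimal sketch of the expected call, using toy signatures (the returned
block ids are phonetic tokens and depend on the chosen algorithm)::

    import numpy as np

    from beard.clustering import block_phonetic

    X = np.array([[{'author_name': 'Smith, John'}],
                  [{'author_name': 'Smith, J.'}],
                  [{'author_name': 'Doe, Jane'}]], dtype=object)
    block_phonetic(X)
    # one block id string per signature, in the same order as X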
199 | 200 | Parameters 201 | ---------- 202 | :param X: numpy array 203 | Array of one element arrays of dictionaries. Each dictionary 204 | represents a signature. The algorithm needs ``author_name`` field in 205 | the dictionaries in order to work. 206 | :param threshold: integer 207 | Size above which the blocks will be split into smaller ones. 208 | :param phonetic algorithm: string 209 | Which phonetic algorithm will be used. Options: 210 | - "double_metaphone" 211 | - "nysiis" (only for Python 2) 212 | - "soundex" (only for Python 2) 213 | 214 | Returns 215 | ------- 216 | :returns: numpy array 217 | Array with ids of the blocks. The ids are strings. The order of the 218 | array is the same as in the ``X`` input parameter. 219 | """ 220 | # Stores all clusters. It is the only way to access them. 221 | # Every cluster can be accessed by the token that was used to create it. 222 | # It is the last token from the surnames tokens passed to the constructor. 223 | id_to_block = {} 224 | 225 | # List of tuples. Used as the in-between state of the algorithm between 226 | # the first and the second states. The tuple contain the block name 227 | # if the signature has been already blocked or None otherwise, and the 228 | # tokens. 229 | ordered_tokens = [] 230 | 231 | # First phase. 232 | # Create blocks for signatures with single surname 233 | 234 | for signature_array in X[:, 0]: 235 | tokens = phonetic_tokenize_name(signature_array['author_name'], 236 | phonetic_algorithm=phonetic_algorithm) 237 | surname_tokens = tokens[0] 238 | if len(surname_tokens) == 1: 239 | # Single surname case 240 | surname = surname_tokens[0] 241 | if surname not in id_to_block: 242 | id_to_block[surname] = _Block(*tokens) 243 | else: 244 | id_to_block[surname].add_signature(*tokens) 245 | ordered_tokens.append((surname, tokens)) 246 | else: 247 | # Multiple surnames 248 | ordered_tokens.append((None, tokens)) 249 | 250 | # Second phase. 251 | # Assign every signature with multiple surnames to the block of the 252 | # first surname or the block of the last surname. 253 | 254 | blocks = [] 255 | 256 | for token_tuple in ordered_tokens: 257 | 258 | if token_tuple[0] is not None: 259 | 260 | # There is already a block 261 | blocks.append(id_to_block[token_tuple[0]]) 262 | 263 | else: 264 | 265 | # Case of multiple surnames 266 | tokens = token_tuple[1] 267 | surnames, given_names = tokens 268 | 269 | # Check if this combination of surnames was already included 270 | try: 271 | # First surname 272 | 273 | cluster = id_to_block[surnames[0]] 274 | if cluster.contains(surnames): 275 | cluster.add_signature(*tokens) 276 | blocks.append(cluster) 277 | continue 278 | except KeyError: 279 | # No such block 280 | pass 281 | 282 | try: 283 | # Last surname 284 | 285 | cluster = id_to_block[surnames[-1]] 286 | if cluster.contains(surnames): 287 | cluster.add_signature(*tokens) 288 | blocks.append(cluster) 289 | continue 290 | 291 | # # No match, compute heuristically the match over initials 292 | 293 | # Firstly, check if some of the surnames were used as the 294 | # last given names on some of the signatures. 295 | index = len(surnames) - 1 296 | match_found = False 297 | 298 | while index > 0: 299 | token_prefix = surnames[:index] 300 | if cluster.compare_tokens_from_last(token_prefix, 301 | (surnames[-1],)): 302 | cluster.add_signature(*tokens) 303 | match_found = True 304 | break 305 | index -= 1 306 | 307 | if match_found: 308 | # There was a full name match, so it must be the same 309 | # author. 
310 | blocks.append(cluster) 311 | continue 312 | 313 | except KeyError: 314 | # No such block 315 | pass 316 | 317 | try: 318 | # No match with last surname. Match with the first one. 319 | cluster = id_to_block[surnames[0]] 320 | cluster.add_signature(*tokens) 321 | blocks.append(cluster) 322 | 323 | continue 324 | 325 | except KeyError: 326 | # No such block 327 | pass 328 | 329 | # No block for the first surname and no good match for the 330 | # last surname. 331 | if surnames[-1] not in id_to_block: 332 | # Create new block. 333 | id_to_block[surnames[-1]] = _Block(*tokens) 334 | blocks.append(id_to_block[surnames[-1]]) 335 | 336 | return np.array(_split_blocks(blocks, X, threshold)) 337 | 338 | 339 | def block_single(X): 340 | """Block the signatures into only one block. 341 | 342 | Parameters 343 | ---------- 344 | :param X: numpy array 345 | Array of singletons of dictionaries. 346 | 347 | Returns 348 | ------- 349 | :returns: numpy array 350 | Array with ids of the blocks. As there is only one block, every element 351 | equals zero. 352 | """ 353 | return np.zeros(len(X), dtype=np.int) 354 | 355 | 356 | def block_last_name_first_initial(X): 357 | """Blocking function using last name and first initial as key. 358 | 359 | The names are normalized before assigning to a block. 360 | 361 | Parameters 362 | ---------- 363 | :param X: numpy array 364 | Array of singletons of dictionaries. 365 | 366 | Returns 367 | ------- 368 | :returns: numpy array 369 | Array with ids of the blocks. The order of the 370 | array is the same as in the ``X`` input parameter. 371 | """ 372 | def last_name_first_initial(name): 373 | names = normalize_name(name).split(" ", 1) 374 | 375 | try: 376 | name = "%s %s" % (names[0], names[1].strip()[0]) 377 | except IndexError: 378 | name = names[0] 379 | 380 | return name 381 | 382 | blocks = [] 383 | 384 | for signature in X[:, 0]: 385 | blocks.append(last_name_first_initial(signature["author_name"])) 386 | 387 | return np.array(blocks) 388 | -------------------------------------------------------------------------------- /beard/clustering/wrappers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Scikit-Learn compatible wrappers of clustering algorithms. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Hussein Al-Natsheh 14 | 15 | """ 16 | import numpy as np 17 | 18 | import scipy.cluster.hierarchy as hac 19 | 20 | from sklearn.base import BaseEstimator 21 | from sklearn.base import ClusterMixin 22 | 23 | 24 | class ScipyHierarchicalClustering(BaseEstimator, ClusterMixin): 25 | """Wrapper for Scipy's hierarchical clustering implementation. 26 | 27 | Attributes 28 | ---------- 29 | labels_ : ndarray, shape (n_samples,) 30 | Array of labels assigned to the input data. 31 | 32 | linkage_ : ndarray 33 | The linkage matrix. 34 | """ 35 | 36 | def __init__(self, method="single", affinity="euclidean", 37 | threshold=None, n_clusters=None, criterion="distance", 38 | depth=2, R=None, monocrit=None, unsupervised_scoring=None, 39 | supervised_scoring=None, scoring_data=None, 40 | best_threshold_precedence=True): 41 | """Initialize. 42 | 43 | Parameters 44 | ---------- 45 | :param method: string 46 | The linkage algorithm to use. 
47 | See scipy.cluster.hierarchy.linkage for further details. 48 | 49 | :param affinity: string or callable 50 | The distance metric to use. 51 | - "precomputed": assume that X is a distance matrix; 52 | - callable: a function returning a distance matrix. 53 | - Otherwise, any value supported by 54 | scipy.cluster.hierarchy.linkage. 55 | 56 | :param n_clusters: int 57 | The number of flat clusters to form. 58 | 59 | :param threshold: float or None 60 | The thresold to apply when forming flat clusters, if 61 | n_clusters=None. 62 | See scipy.cluster.hierarchy.fcluster for further details. 63 | 64 | :param criterion: string 65 | The criterion to use in forming flat clusters. 66 | See scipy.cluster.hierarchy.fcluster for further details. 67 | 68 | :param depth: int 69 | The maximum depth to perform the inconsistency calculation. 70 | See scipy.cluster.hierarchy.fcluster for further details. 71 | 72 | :param R: array-like or None 73 | The inconsistency matrix to use for the 'inconsistent' criterion. 74 | See scipy.cluster.hierarchy.fcluster for further details. 75 | 76 | :param monocrit: array-like or None 77 | The statistics upon which non-singleton i is thresholded. 78 | See scipy.cluster.hierarchy.fcluster for further details. 79 | 80 | :param scoring_data: string or None 81 | The type of input data to pass to the scoring function: 82 | - "raw": for passing the original X array; 83 | - "affinity": for passing an affinity matrix Xa; 84 | - None: for not passing anything but the labels. 85 | 86 | :param supervised_scoring: callable or None 87 | The scoring function to maximize in order to estimate the best 88 | threshold. Labels must be provided in y for this scoring function. 89 | There are 3 possible cases based on the value of `scoring_data`: 90 | - scoring_data == "raw": 91 | supervised_scoring(X_raw, label_true, label_pred); 92 | - scoring_data == "affinity": 93 | supervised_scoring(X_affinity, label_true, label_pred); 94 | - scoring_data is None: 95 | supervised_scoring(label_true, label_pred). 96 | 97 | :param unsupervised_scoring: callable or None 98 | The scoring function to maximize in order to estimate the best 99 | threshold. Labels must not be provided in y for this scoring 100 | function.There are 3 possible cases based on the value of 101 | `scoring_data`: 102 | - scoring_data == "raw": 103 | unsupervised_scoring(X_raw, label_pred); 104 | - scoring_data == "affinity": 105 | unsupervised_scoring(X_affinity, label_pred); 106 | - scoring_data is None: 107 | unsupervised_scoring(label_pred). 108 | 109 | """ 110 | self.method = method 111 | self.affinity = affinity 112 | self.threshold = threshold 113 | self.n_clusters = n_clusters 114 | self.criterion = criterion 115 | self.depth = depth 116 | self.R = R 117 | self.monocrit = monocrit 118 | self.unsupervised_scoring = unsupervised_scoring 119 | self.supervised_scoring = supervised_scoring 120 | self.scoring_data = scoring_data 121 | self.best_threshold_precedence = best_threshold_precedence 122 | 123 | def fit(self, X, y=None): 124 | """Perform hierarchical clustering on input data. 125 | 126 | Parameters 127 | ---------- 128 | :param X: array-like, shape (n_samples, n_features) or 129 | (n_samples, n_samples) 130 | Input data, as an array of samples or as a distance matrix if 131 | affinity == 'precomputed'. 132 | 133 | :param y: array-like, shape (n_samples, ) 134 | Input labels, in case of (semi-)supervised clustering. 135 | Labels equal to -1 stand for unknown labels. 
136 | 137 | Returns 138 | ------- 139 | :returns: self 140 | """ 141 | X = np.array(X) 142 | X_raw = X 143 | n_samples = X.shape[0] 144 | 145 | # Build linkage matrix 146 | if self.affinity == "precomputed" or callable(self.affinity): 147 | if callable(self.affinity): 148 | X = self.affinity(X) 149 | X_affinity = X 150 | if X.ndim == 2: 151 | i, j = np.triu_indices(X.shape[0], k=1) 152 | X = X[i, j] 153 | self.linkage_ = hac.linkage(X, method=self.method) 154 | else: 155 | X_affinity = None 156 | self.linkage_ = hac.linkage(X, 157 | method=self.method, 158 | metric=self.affinity) 159 | 160 | if self.scoring_data == "affinity" and X_affinity is None: 161 | raise ValueError("The scoring function expects an affinity matrix," 162 | " which cannot be computed from the combination" 163 | " of parameters you provided.") 164 | 165 | # Estimate threshold in case of semi-supervised or unsupervised 166 | # As default value we use the highest so we obtain only 1 cluster. 167 | best_threshold = (self.linkage_[-1, 2] if self.threshold is None 168 | else self.threshold) 169 | 170 | n_clusters = self.n_clusters 171 | supervised_scoring = self.supervised_scoring 172 | unsupervised_scoring = self.unsupervised_scoring 173 | ground_truth = (y is not None) and np.any(np.array(y) != -1) 174 | scoring = supervised_scoring is not None or \ 175 | unsupervised_scoring is not None 176 | 177 | if n_clusters is None and scoring: 178 | best_score = -np.inf 179 | thresholds = np.concatenate(([0], 180 | self.linkage_[:, 2], 181 | [self.linkage_[-1, 2]])) 182 | 183 | for i in range(len(thresholds) - 1): 184 | t1, t2 = thresholds[i:i + 2] 185 | threshold = (t1 + t2) / 2.0 186 | labels = hac.fcluster(self.linkage_, threshold, 187 | criterion=self.criterion, 188 | depth=self.depth, R=self.R, 189 | monocrit=self.monocrit) 190 | 191 | if ground_truth and supervised_scoring is not None: 192 | train = (y != -1) 193 | 194 | if self.scoring_data == "raw": 195 | score = supervised_scoring(X_raw, y[train], 196 | labels[train]) 197 | 198 | elif self.scoring_data == "affinity": 199 | score = supervised_scoring(X_affinity, y[train], 200 | labels[train]) 201 | 202 | else: 203 | score = supervised_scoring(y[train], 204 | labels[train]) 205 | 206 | elif unsupervised_scoring is not None: 207 | if self.scoring_data == "raw": 208 | score = unsupervised_scoring(X_raw, labels) 209 | 210 | elif self.scoring_data == "affinity": 211 | score = unsupervised_scoring(X_affinity, labels) 212 | 213 | else: 214 | score = unsupervised_scoring(labels) 215 | 216 | else: 217 | break 218 | 219 | if score >= best_score: 220 | best_score = score 221 | best_threshold = threshold 222 | 223 | self.best_threshold_ = best_threshold 224 | self.n_samples_ = n_samples 225 | 226 | return self 227 | 228 | @property 229 | def labels_(self): 230 | """Compute the labels assigned to the input data. 231 | 232 | Note that labels are computed on-the-fly from the linkage matrix, 233 | based on the value of self.threshold or self.n_clusters. 
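A minimal sketch with a toy precomputed distance matrix::

    import numpy as np

    from beard.clustering import ScipyHierarchicalClustering

    distance = np.array([[0.0, 0.1, 0.9],
                         [0.1, 0.0, 0.8],
                         [0.9, 0.8, 0.0]])
    clusterer = ScipyHierarchicalClustering(affinity="precomputed",
                                            method="average",
                                            n_clusters=2)
    clusterer.fit(distance)
    clusterer.labels_  # e.g. array([0, 0, 1])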
234 | """ 235 | n_clusters = self.n_clusters 236 | 237 | if n_clusters is not None: 238 | if n_clusters < 1 or n_clusters > self.n_samples_: 239 | raise ValueError("n_clusters must be within [1; n_samples].") 240 | 241 | else: 242 | thresholds = np.concatenate(([0], 243 | self.linkage_[:, 2], 244 | [self.linkage_[-1, 2]])) 245 | 246 | for i in range(len(thresholds) - 1): 247 | t1, t2 = thresholds[i:i + 2] 248 | threshold = (t1 + t2) / 2.0 249 | labels = hac.fcluster(self.linkage_, threshold, 250 | criterion=self.criterion, 251 | depth=self.depth, R=self.R, 252 | monocrit=self.monocrit) 253 | 254 | if len(np.unique(labels)) <= n_clusters: 255 | _, labels = np.unique(labels, return_inverse=True) 256 | return labels 257 | 258 | else: 259 | threshold = self.threshold 260 | 261 | if self.best_threshold_precedence: 262 | threshold = self.best_threshold_ 263 | 264 | labels = hac.fcluster(self.linkage_, threshold, 265 | criterion=self.criterion, depth=self.depth, 266 | R=self.R, monocrit=self.monocrit) 267 | 268 | _, labels = np.unique(labels, return_inverse=True) 269 | 270 | return labels 271 | -------------------------------------------------------------------------------- /beard/ext/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """External libraries and source files used by Beard.""" 11 | -------------------------------------------------------------------------------- /beard/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Scoring metrics.""" 11 | 12 | from .clustering import b3_precision_recall_fscore 13 | from .clustering import b3_precision_score 14 | from .clustering import b3_recall_score 15 | from .clustering import b3_f_score 16 | from .clustering import paired_precision_recall_fscore 17 | from .clustering import paired_precision_score 18 | from .clustering import paired_recall_score 19 | from .clustering import paired_f_score 20 | from .clustering import silhouette_score 21 | from .text import jaro 22 | from .text import jaro_winkler 23 | from .text import levenshtein 24 | 25 | __all__ = ("b3_precision_recall_fscore", 26 | "b3_precision_score", 27 | "b3_recall_score", 28 | "b3_f_score", 29 | "paired_precision_recall_fscore", 30 | "paired_precision_score", 31 | "paired_recall_score", 32 | "paired_f_score", 33 | "silhouette_score", 34 | "jaro", 35 | "jaro_winkler", 36 | "levenshtein") 37 | -------------------------------------------------------------------------------- /beard/metrics/clustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Clustering evaluation metrics. 11 | 12 | .. 
codeauthor:: Evangelos Tzemis 13 | .. codeauthor:: Gilles Louppe 14 | .. codeauthor:: Hussein Al-Natsheh 15 | 16 | """ 17 | from __future__ import division 18 | 19 | import numpy as np 20 | from operator import mul 21 | from itertools import groupby 22 | 23 | from sklearn.metrics import silhouette_score as sklearn_silhouette_score 24 | from sklearn.metrics.cluster.supervised import check_clusterings 25 | 26 | 27 | def silhouette_score(X, labels, metric="precomputed"): 28 | """Compute the silhouette score. 29 | 30 | The silhouette coefficent is only defined if number of clusters if 31 | 1 < n_clusters < n_samples. 32 | 33 | Parameters: 34 | ----------- 35 | :param X : array [n_samples_a, n_samples_a] if metric == "precomputed", 36 | or [n_samples_a, n_features] otherwise 37 | Array of pairwise distances between samples, or a feature array. 38 | :param labels : array, shape = [n_samples] 39 | Predicted labels for each sample. 40 | :param metric : string, or callable 41 | The metric to use when calculating distance between instances in a 42 | feature array. If metric is a string, it must be one of the options 43 | allowed by `sklearn.metrics.pairwise.pairwise_distances`. If X is the 44 | distance array itself, use metric="precomputed". 45 | 46 | Returns: 47 | -------- 48 | :return floate: mean silhouette coefficient for all samples or 49 | -1.0 if n_clusters <= 1 or n_clusters >= n_samples. 50 | """ 51 | n_samples = X.shape[0] 52 | n_clusters = len(np.unique(labels)) 53 | 54 | if 1 < n_clusters < n_samples: 55 | return sklearn_silhouette_score(X, labels, metric) 56 | else: 57 | return -1.0 58 | 59 | 60 | def b3_precision_recall_fscore(labels_true, labels_pred): 61 | """Compute the B^3 variant of precision, recall and F-score. 62 | 63 | Parameters 64 | ---------- 65 | :param labels_true: 1d array containing the ground truth cluster labels. 66 | :param labels_pred: 1d array containing the predicted cluster labels. 67 | 68 | Returns 69 | ------- 70 | :return float precision: calculated precision 71 | :return float recall: calculated recall 72 | :return float f_score: calculated f_score 73 | 74 | Reference 75 | --------- 76 | Amigo, Enrique, et al. "A comparison of extrinsic clustering evaluation 77 | metrics based on formal constraints." Information retrieval 12.4 78 | (2009): 461-486. 
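Example
-------
A small worked example (toy labels, values rounded)::

    labels_true = [0, 0, 1, 1]
    labels_pred = [0, 0, 0, 1]
    b3_precision_recall_fscore(labels_true, labels_pred)
    # (0.667, 0.75, 0.706)

Samples 0 and 1 contribute full precision and recall; sample 2 sits in a
predicted cluster that mixes both true clusters, so it is penalised on both
sides; sample 3 is fully precise but only half-recalled.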
79 | """ 80 | # Check that labels_* are 1d arrays and have the same size 81 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 82 | 83 | # Check that input given is not the empty set 84 | if labels_true.shape == (0, ): 85 | raise ValueError( 86 | "input labels must not be empty.") 87 | 88 | # Compute P/R/F scores 89 | n_samples = len(labels_true) 90 | true_clusters = {} # true cluster_id => set of sample indices 91 | pred_clusters = {} # pred cluster_id => set of sample indices 92 | 93 | for i in range(n_samples): 94 | true_cluster_id = labels_true[i] 95 | pred_cluster_id = labels_pred[i] 96 | 97 | if true_cluster_id not in true_clusters: 98 | true_clusters[true_cluster_id] = set() 99 | if pred_cluster_id not in pred_clusters: 100 | pred_clusters[pred_cluster_id] = set() 101 | 102 | true_clusters[true_cluster_id].add(i) 103 | pred_clusters[pred_cluster_id].add(i) 104 | 105 | for cluster_id, cluster in true_clusters.items(): 106 | true_clusters[cluster_id] = frozenset(cluster) 107 | for cluster_id, cluster in pred_clusters.items(): 108 | pred_clusters[cluster_id] = frozenset(cluster) 109 | 110 | precision = 0.0 111 | recall = 0.0 112 | 113 | intersections = {} 114 | 115 | for i in range(n_samples): 116 | pred_cluster_i = pred_clusters[labels_pred[i]] 117 | true_cluster_i = true_clusters[labels_true[i]] 118 | 119 | if (pred_cluster_i, true_cluster_i) in intersections: 120 | intersection = intersections[(pred_cluster_i, true_cluster_i)] 121 | else: 122 | intersection = pred_cluster_i.intersection(true_cluster_i) 123 | intersections[(pred_cluster_i, true_cluster_i)] = intersection 124 | 125 | precision += len(intersection) / len(pred_cluster_i) 126 | recall += len(intersection) / len(true_cluster_i) 127 | 128 | precision /= n_samples 129 | recall /= n_samples 130 | 131 | f_score = 2 * precision * recall / (precision + recall) 132 | 133 | return precision, recall, f_score 134 | 135 | 136 | def b3_precision_score(labels_true, labels_pred): 137 | """Compute the B^3 variant of precision. 138 | 139 | Parameters 140 | ---------- 141 | :param labels_true: 1d array containing the ground truth cluster labels. 142 | :param labels_pred: 1d array containing the predicted cluster labels. 143 | 144 | Returns 145 | ------- 146 | :return float precision: calculated precision 147 | """ 148 | p, _, _ = b3_precision_recall_fscore(labels_true, labels_pred) 149 | return p 150 | 151 | 152 | def b3_recall_score(labels_true, labels_pred): 153 | """Compute the B^3 variant of recall. 154 | 155 | Parameters 156 | ---------- 157 | :param labels_true: 1d array containing the ground truth cluster labels. 158 | :param labels_pred: 1d array containing the predicted cluster labels. 159 | 160 | Returns 161 | ------- 162 | :return float recall: calculated recall 163 | """ 164 | _, r, _ = b3_precision_recall_fscore(labels_true, labels_pred) 165 | return r 166 | 167 | 168 | def b3_f_score(labels_true, labels_pred): 169 | """Compute the B^3 variant of F-score. 170 | 171 | Parameters 172 | ---------- 173 | :param labels_true: 1d array containing the ground truth cluster labels. 174 | :param labels_pred: 1d array containing the predicted cluster labels. 175 | 176 | Returns 177 | ------- 178 | :return float f_score: calculated F-score 179 | """ 180 | _, _, f = b3_precision_recall_fscore(labels_true, labels_pred) 181 | return f 182 | 183 | 184 | def paired_precision_recall_fscore(labels_true, labels_pred): 185 | """Compute the pairwise variant of precision, recall and F-score. 
186 | 187 | Precision is the ability not to label as positive a sample 188 | that is negative. The best value is 1 and the worst is 0. 189 | 190 | Recall is the ability to successfully find all the positive samples. 191 | The best value is 1 and the worst is 0. 192 | 193 | F-score (Harmonic mean) can be thought as a weighted harmonic mean of 194 | the precision and recall, where an F-score reaches its best value at 1 195 | and worst at 0. 196 | 197 | Parameters 198 | ---------- 199 | :param labels_true: 1d array containing the ground truth cluster labels. 200 | :param labels_pred: 1d array containing the predicted cluster labels. 201 | 202 | Returns 203 | ------- 204 | :return float precision: calculated precision 205 | :return float recall: calculated recall 206 | :return float f_score: calculated f_score 207 | 208 | Reference 209 | --------- 210 | Levin, Michael et al., "Citation-based bootstrapping for large-scale 211 | author disambiguation", Journal of the American Society for Information 212 | Science and Technology 63.5 (2012): 1030-1047. 213 | """ 214 | # Check that labels_* are 1d arrays and have the same size 215 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 216 | 217 | # Check that input given is not the empty set 218 | if labels_true.shape == (0, ): 219 | raise ValueError( 220 | "input labels must not be empty.") 221 | 222 | # Assigns each label to its own cluster 223 | default_clustering = range(len(labels_pred)) 224 | 225 | # Calculate precision 226 | numerator = _general_merge_distance(labels_true, labels_pred, 227 | fm=_zero, fs=mul) 228 | denominator = _general_merge_distance(default_clustering, 229 | labels_pred, 230 | fm=_zero, fs=mul) 231 | try: 232 | precision = 1.0 - numerator / denominator 233 | except ZeroDivisionError: 234 | precision = 1.0 235 | 236 | # Calculate recall 237 | numerator = _general_merge_distance(labels_true, labels_pred, 238 | fm=mul, fs=_zero) 239 | denominator = _general_merge_distance(labels_true, 240 | default_clustering, 241 | fm=mul, fs=_zero) 242 | try: 243 | recall = 1.0 - numerator / denominator 244 | except ZeroDivisionError: 245 | recall = 1.0 246 | 247 | # Calculate f_score 248 | 249 | # If both are zero (minimum score) then f_score is also zero 250 | if precision + recall == 0.0: 251 | f_score = 0.0 252 | else: 253 | f_score = 2.0 * precision * recall / (precision + recall) 254 | 255 | return precision, recall, f_score 256 | 257 | 258 | def paired_precision_score(labels_true, labels_pred): 259 | """Compute the pairwise variant of precision. 260 | 261 | Precision is the ability not to label as positive a sample 262 | that is negative. The best value is 1 and the worst is 0. 263 | 264 | Parameters 265 | ---------- 266 | :param labels_true: 1d array containing the ground truth cluster labels. 267 | :param labels_pred: 1d array containing the predicted cluster labels. 268 | 269 | Returns 270 | ------- 271 | :return float precision: calculated precision 272 | """ 273 | p, _, _ = paired_precision_recall_fscore(labels_true, labels_pred) 274 | return p 275 | 276 | 277 | def paired_recall_score(labels_true, labels_pred): 278 | """Compute the pairwise variant of recall. 279 | 280 | Recall is the ability to succesfully find all the positive samples. 281 | The best value is 1 and the worst is 0. 282 | 283 | Parameters 284 | ---------- 285 | :param labels_true: 1d array containing the ground truth labels. 286 | :param labels_pred: 1d array containing the predicted labels. 
287 | 288 | Returns 289 | ------- 290 | :return float recall: calculated recall 291 | """ 292 | _, r, _ = paired_precision_recall_fscore(labels_true, labels_pred) 293 | return r 294 | 295 | 296 | def paired_f_score(labels_true, labels_pred): 297 | """Compute the pairwise variant of F-score. 298 | 299 | F score can be thought as a weighted harmonic mean of the precision 300 | and recall, where an F score reaches its best value at 1 301 | and worst at 0. 302 | 303 | Parameters 304 | ---------- 305 | :param labels_true: 1d array containing the ground truth cluster labels. 306 | :param labels_pred: 1d array containing the predicted cluster labels. 307 | 308 | Returns 309 | ------- 310 | :return float f_score: calculated harmonic mean (f_score) 311 | 312 | """ 313 | _, _, f = paired_precision_recall_fscore(labels_true, labels_pred) 314 | return f 315 | 316 | 317 | def _zero(x, y): 318 | return 0.0 319 | 320 | 321 | def _cluster_samples(labels): 322 | """Group input to sets that belong to the same cluster. 323 | 324 | Parameters 325 | ---------- 326 | :param labels: array with the cluster labels 327 | 328 | Returns 329 | ------- 330 | :return: dictionary with keys the cluster ids and values a tuple containing 331 | the ids of elements tha belong to this cluster. 332 | """ 333 | groupped_samples = groupby(np.argsort(labels), lambda i: labels[i]) 334 | 335 | return {k: tuple(values) for k, values in groupped_samples} 336 | 337 | 338 | def _general_merge_distance(y_true, y_pred, 339 | fs=lambda x, y: 1.0, fm=lambda x, y: 1.0): 340 | """Slice algorithm for computing generalized merge distance. 341 | 342 | Slice is a linear time algorithm. 343 | 344 | Merge Distance is the minimum number of splits and merges 345 | to get from R-flat to y_true. 346 | 347 | Parameters 348 | ---------- 349 | :param y_true: array with the ground truth cluster labels. 350 | :param y_pred: array with the predicted cluster labels. 351 | :param fs: Optional. Function defining the cost of split. 352 | :param fm: Optional. Function defining the cost of merge. 353 | 354 | Returns 355 | ------- 356 | :return float: Cost of getting from y_pred to y_true. 357 | 358 | Reference 359 | --------- 360 | Menestrina, David Michael., "Matching and unifying records in a 361 | distributed system", Department of Computer Science Thesis, Ph.D. 362 | dissertation, Stanford University (2010). 363 | """ 364 | r = _cluster_samples(y_pred) 365 | s = _cluster_samples(y_true) 366 | r_sizes = {k: len(v) for k, v in r.items()} 367 | 368 | cost = 0.0 369 | for si in s.values(): 370 | # determine which clusters in r contain the records of si 371 | p_map = {} 372 | for element in si: 373 | cl = y_pred[element] 374 | if cl not in p_map: 375 | p_map[cl] = 0 376 | p_map[cl] += 1 377 | 378 | # Compute cost to generate si 379 | si_cost = 0.0 380 | total_recs = 0 381 | for i, count in p_map.items(): 382 | # add the cost to split ri 383 | if r_sizes[i] > count: 384 | si_cost += fs(count, r_sizes[i] - count) 385 | r_sizes[i] -= count 386 | if total_recs != 0: 387 | # Cost to merge into si 388 | si_cost += fm(count, total_recs) 389 | total_recs += count 390 | cost += si_cost 391 | 392 | return cost 393 | -------------------------------------------------------------------------------- /beard/metrics/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 
5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Text metrics. 11 | 12 | .. codeauthor:: Petros Ioannidis 13 | .. codeauthor:: Evangelos Tzemis 14 | 15 | """ 16 | 17 | from __future__ import division 18 | import numpy as np 19 | import re 20 | 21 | 22 | def _find_all(s, pattern): 23 | """Find all occurences of the given pattern. 24 | 25 | Parameters 26 | ---------- 27 | :param s: string 28 | String to be searched 29 | 30 | :param letter: string 31 | Substring we are searching for 32 | 33 | Returns 34 | ------- 35 | :returns: generator 36 | A generator that holds the indexes of the patterns 37 | """ 38 | for match in re.finditer(pattern, s): 39 | yield match.start() 40 | 41 | 42 | def _jaro_matching(s1, s2): 43 | """Return the number of matching letters and transpositions. 44 | 45 | Parameters 46 | ---------- 47 | :param s1: string 48 | First string 49 | 50 | :param s2: string 51 | Second string 52 | 53 | Returns 54 | ------- 55 | :returns: (int, int) 56 | The number of matching letters and transpositions 57 | """ 58 | H = min(len(s1), len(s2)) // 2 59 | 60 | letters_cache = {} 61 | matches = 0 62 | transpositions = 0 63 | s1_matching_letters = [] 64 | s2_matching_letters = [] 65 | s1_matched_positions = [] 66 | s2_matched_positions = [] 67 | 68 | for letter in s1: 69 | if letter not in letters_cache: 70 | letters_cache[letter] = (tuple(_find_all(s1, letter)), 71 | tuple(_find_all(s2, letter))) 72 | 73 | for letter, (s1_positions, s2_positions) in letters_cache.items(): 74 | for i in s1_positions: 75 | for j in s2_positions: 76 | if i - H <= j <= i + H: 77 | if j not in s2_matched_positions: 78 | matches += 1 79 | s2_matched_positions.append(j) 80 | s1_matching_letters.append((i, letter)) 81 | break 82 | 83 | for letter, (s1_positions, s2_positions) in letters_cache.items(): 84 | for j in s2_positions: 85 | for i in s1_positions: 86 | if j - H <= i <= j + H: 87 | if i not in s1_matched_positions: 88 | s1_matched_positions.append(i) 89 | s2_matching_letters.append((j, letter)) 90 | break 91 | 92 | s1_matching_letters.sort() 93 | s2_matching_letters.sort() 94 | transpositions = len(tuple(filter(lambda x: x[0][1] != x[1][1], 95 | zip(s1_matching_letters, 96 | s2_matching_letters)))) 97 | 98 | return matches, transpositions 99 | 100 | 101 | def jaro(s1, s2): 102 | """Return the Jaro similarity of the strings s1 and s2. 103 | 104 | Parameters 105 | ---------- 106 | :param s1: string 107 | First string 108 | 109 | :param s2: string 110 | Second string 111 | 112 | Returns 113 | ------- 114 | :returns: float 115 | Similarity of s1 and s2 116 | 117 | Reference 118 | --------- 119 | Jaro, M. A., "Advances in record-linkage methodology as applied to 120 | matching the 1985 census of Tampa, Florida", Journal of the American 121 | Statistical Association, 84:414-420, 1989. 122 | """ 123 | if len(s1) == 0 or len(s2) == 0: 124 | return 0 125 | 126 | n_matches, n_transpositions = _jaro_matching(s1, s2) 127 | 128 | if n_matches == 0: 129 | return 0 130 | 131 | return 1 / 3 * (n_matches / len(s1) + 132 | n_matches / len(s2) + 133 | (n_matches - n_transpositions / 2) / n_matches) 134 | 135 | 136 | def jaro_winkler(s1, s2, p=0.1): 137 | """Return the Jaro-Winkler similarity of the strings s1 and s2. 
138 | 139 | Parameters 140 | ---------- 141 | :param s1: string 142 | First string 143 | 144 | :param s2: string 145 | Second string 146 | 147 | Returns 148 | ------- 149 | :returns: float 150 | Similarity of s1 and s2 151 | 152 | Reference 153 | --------- 154 | Winkler, W. E., "The state of record linkage and current research 155 | problems", Statistical Research Division, US Census Bureau. 1999. 156 | """ 157 | jaro_distance = jaro(s1, s2) 158 | 159 | common_prefix = 0 160 | for s1_letter, s2_letter in zip(s1, s2): 161 | if s1_letter == s2_letter and common_prefix < 4: 162 | common_prefix += 1 163 | else: 164 | break 165 | 166 | return jaro_distance + p * common_prefix * (1 - jaro_distance) 167 | 168 | 169 | def levenshtein(a, b): 170 | """Calculate the levenshtein distance between strings a and b. 171 | 172 | Case sensitiveness is activated, meaning that uppercase letters 173 | are treated differently than their corresponding lowercase ones. 174 | 175 | Parameters 176 | ---------- 177 | :param a: string 178 | String to be compared 179 | 180 | :param b: string 181 | String to be compared 182 | 183 | Returns 184 | ------- 185 | :returns int: 186 | The calculated levenshtein distance. 187 | """ 188 | len_a, len_b = len(a), len(b) 189 | 190 | if len_a < len_b: 191 | return levenshtein(b, a) 192 | if len_b == 0: 193 | return len_a 194 | 195 | # We use tuple() to force strings to be used as sequences. 196 | a = np.array(tuple(a)) 197 | b = np.array(tuple(b)) 198 | 199 | # Instead of calculating the whole matrix, we only keep the last 2 rows. 200 | previous_row = np.arange(len_b + 1) 201 | for character in a: 202 | # Insertion 203 | current_row = previous_row + 1 204 | # Substitution or matching 205 | current_row[1:] = np.minimum( 206 | current_row[1:], 207 | np.add(previous_row[:-1], b != character)) 208 | # Deletion 209 | current_row[1:] = np.minimum( 210 | current_row[1:], 211 | current_row[:-1] + 1) 212 | previous_row = current_row 213 | 214 | return current_row[-1] 215 | -------------------------------------------------------------------------------- /beard/similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Similarity learning algorithms.""" 11 | 12 | from .pairs import AbsoluteDifference 13 | from .pairs import CosineSimilarity 14 | from .pairs import EstimatorTransformer 15 | from .pairs import ElementMultiplication 16 | from .pairs import JaccardSimilarity 17 | from .pairs import PairTransformer 18 | from .pairs import StringDistance 19 | from .pairs import Thresholder 20 | 21 | __all__ = ("AbsoluteDifference", 22 | "CosineSimilarity", 23 | "EstimatorTransformer", 24 | "ElementMultiplication", 25 | "JaccardSimilarity", 26 | "PairTransformer", 27 | "StringDistance", 28 | "Thresholder") 29 | -------------------------------------------------------------------------------- /beard/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 
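# --- A minimal usage sketch for the string metrics defined above in
# beard/metrics/text.py. Illustrative only: the helper name and the example
# strings are invented; it assumes beard is installed.
def _text_metrics_usage_example():
    from beard.metrics.text import jaro, jaro_winkler, levenshtein

    # Jaro and Jaro-Winkler return similarities in [0, 1] (1.0 for identical
    # strings); Jaro-Winkler additionally rewards a shared prefix.
    assert jaro("smith", "smith") == 1.0
    assert jaro_winkler("smith", "smyth") >= jaro("smith", "smyth")
    # Levenshtein returns an edit distance and, as documented above, treats
    # uppercase and lowercase letters as different characters.
    assert levenshtein("smith", "smyth") == 1
    assert levenshtein("Smith", "smith") == 1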
9 | 10 | """Helper functions.""" 11 | 12 | from .misc import memoize 13 | from .names import phonetic_tokenize_name 14 | from .names import given_name_initial 15 | from .names import given_name 16 | from .names import name_initials 17 | from .names import normalize_name 18 | from .strings import asciify 19 | from .transformers import FuncTransformer 20 | from .transformers import Shaper 21 | 22 | __all__ = ("memoize", 23 | "phonetic_tokenize_name", 24 | "given_name_initial", 25 | "given_name", 26 | "normalize_name", 27 | "name_initials", 28 | "asciify", 29 | "FuncTransformer", 30 | "Shaper") 31 | -------------------------------------------------------------------------------- /beard/utils/misc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Miscellaneous helpers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | 14 | """ 15 | 16 | from functools import wraps 17 | 18 | 19 | def memoize(func): 20 | """Memoization function.""" 21 | cache = {} 22 | 23 | @wraps(func) 24 | def wrap(*args, **kwargs): 25 | 26 | frozen = frozenset(kwargs.items()) 27 | if (args, frozen) not in cache: 28 | cache[(args, frozen)] = func(*args, **kwargs) 29 | return cache[(args, frozen)] 30 | 31 | return wrap 32 | -------------------------------------------------------------------------------- /beard/utils/names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Helper functions for handling personal names. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | import functools 18 | import re 19 | import sys 20 | 21 | import fuzzy 22 | 23 | from .misc import memoize 24 | from .strings import asciify 25 | 26 | RE_NORMALIZE_WHOLE_NAME = re.compile("[^a-zA-Z,\s]+") 27 | RE_NORMALIZE_OTHER_NAMES = re.compile("(,\s(i{1,3}|iv|v|vi|jr))|[\.'\-,\s]+") 28 | RE_APOSTROPHES = re.compile('\'+') 29 | RE_REMOVE_NON_CHARACTERS = re.compile('[^a-zA-Z\',\s]+') 30 | DROPPED_AFFIXES = {'a', 'ab', 'am', 'ap', 'abu', 'al', 'auf', 'aus', 'bar', 31 | 'bath', 'bat', 'ben', 'bet', 'bin', 'bint', 'd', 'da', 32 | 'dall', 'dalla', 'das', 'de', 'degli', 'del', 'dell', 33 | 'della', 'dem', 'den', 'der', 'di', 'do', 'dos', 'ds', 'du', 34 | 'e', 'el', 'i', 'ibn', 'im', 'jr', 'l', 'la', 'las', 'le', 35 | 'los', 'm', 'mac', 'mc', 'mhic', 'mic', 'o', 'ter', 'und', 36 | 'v', 'van', 'vom', 'von', 'zu', 'zum', 'zur'} 37 | 38 | 39 | @memoize 40 | def normalize_name(name, drop_common_affixes=True): 41 | """Normalize a personal name. 42 | 43 | Parameters 44 | ---------- 45 | :param name: string 46 | Name, formatted as "Last Name, Other Names". 47 | 48 | :param drop_common_affixes: boolean 49 | If the affixes like ``della`` should be dropeed. 50 | 51 | Returns 52 | ------- 53 | :return: string 54 | Normalized name, formatted as "lastnames first names" where last names 55 | are joined. 
56 | """ 57 | name = asciify(name).lower() 58 | name = RE_NORMALIZE_WHOLE_NAME.sub(' ', name) 59 | names = name.split(",", 1) 60 | if not names: 61 | return "" 62 | if len(names) == 1: 63 | # There was no comma in the name 64 | all_names = names[0].split(" ") 65 | if len(all_names) > 1: 66 | # The last string should be the surname 67 | names = [all_names[-1], " ".join(all_names[:-1])] 68 | else: 69 | names = [all_names[0], ""] 70 | 71 | if drop_common_affixes: 72 | last_names = names[0].split(" ") 73 | without_affixes = list(filter(lambda x: x not in DROPPED_AFFIXES, 74 | last_names)) 75 | if len(without_affixes) > 0: 76 | names[0] = "".join(without_affixes) 77 | else: 78 | names[0] = re.sub('\s', '', names[0]) 79 | 80 | name = "%s, %s" % (names[0], names[1]) 81 | name = RE_NORMALIZE_OTHER_NAMES.sub(" ", name) 82 | name = name.strip() 83 | 84 | return name 85 | 86 | 87 | @memoize 88 | def name_initials(name): 89 | """Compute the set of initials of a given name.""" 90 | return set([w[0] for w in name.split()]) 91 | 92 | 93 | @memoize 94 | def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"): 95 | """Create Double Metaphone tokens from the string. 96 | 97 | Parameters 98 | ---------- 99 | :param name: string 100 | Name of the author. Usually it should be in the format: 101 | surnames, first names. 102 | 103 | :param phonetic algorithm: string 104 | Which phonetic algorithm will be used. Options: 105 | - "double_metaphone" 106 | - "nysiis" 107 | - "soundex" 108 | 109 | Returns 110 | ------- 111 | :return: tuple 112 | The first element is a tuple with the tokens for surnames, the second 113 | is a tuple with the tokens for first names. The tuple always contains 114 | exactly two elements. Only the first results of the double metaphone 115 | algorithm are included in tuples. 116 | """ 117 | if phonetic_algorithm == "soundex": 118 | error = ( 119 | "The version of the 'fuzzy' package in use has a buggy soundex" 120 | " implementation (see https://github.com/yougov/fuzzy/issues/14 )," 121 | " downgrade the package to 1.1 (compatible with Python 2 only) if" 122 | " you want to use the soundex phonetic encoding." 123 | ) 124 | try: 125 | if fuzzy.Soundex(4)("fuzzy") != "F200": 126 | raise ValueError(error) 127 | except UnicodeDecodeError: 128 | raise ValueError(error) 129 | 130 | dm = fuzzy.DMetaphone() 131 | soundex = fuzzy.Soundex(5) 132 | phonetic_algorithms = { 133 | "double_metaphone": lambda y: (dm(y)[0] or b'').decode(), 134 | "nysiis": lambda y: fuzzy.nysiis(y), 135 | "soundex": lambda y: soundex(y) 136 | } 137 | 138 | tokens = tokenize_name(name) 139 | # Use double metaphone 140 | tokens = tuple(map(lambda x: tuple(map(lambda y: phonetic_algorithms[ 141 | phonetic_algorithm](y), x)), 142 | tokens)) 143 | 144 | return tokens 145 | 146 | 147 | @memoize 148 | def tokenize_name(name, handle_soft_sign=True, drop_common_affixes=True): 149 | """Normalize the name and create tokens from it. 150 | 151 | Parameters 152 | ---------- 153 | :param name: string 154 | Name of the author. Usually it should be in the format: 155 | surnames, first names. 156 | :param handle_soft_sign: boolean 157 | Should the case of cyrillic soft sign be handled. 158 | :param drop_common_affixes: boolean 159 | Should the common affixes like ``von`` be dropped. 160 | 161 | Returns 162 | ------- 163 | :return: tuple 164 | The first element is a tuple with surnames, the second 165 | is a tuple first names. The tuple always contains 166 | exactly two elements. 
167 | """ 168 | name = asciify(name) 169 | 170 | # Get rid of non character. Leave apostrophes as they are handled in a 171 | # different way. 172 | name = RE_REMOVE_NON_CHARACTERS.sub(' ', name) 173 | 174 | if handle_soft_sign: 175 | # Handle the "miagkii znak" in russian names. 176 | matches = re.findall(r"^([^',]*)'([a-z].*)", name) 177 | if matches: 178 | name = matches[0][0] + matches[0][1] 179 | 180 | # Remove apostrophes 181 | name = RE_APOSTROPHES.sub(' ', name) 182 | 183 | # Extract surname and name 184 | tokens = name.split(',') 185 | # If there are no first names, the default value is an empty string. 186 | tokens = [tokens[0], functools.reduce(lambda x, y: x+y, tokens[1:], '')] 187 | 188 | # Remove whitespaces and split both surnames and first-names 189 | tokens = list(map(lambda x: ' '.join(x.split()).lower().split(' '), 190 | tokens)) 191 | 192 | # Special case where there is no first name, i.e. there was no comma in 193 | # the signature. 194 | if tokens[1] == [''] and len(tokens[0]) > 1: 195 | # Probably the first string is the first name 196 | tokens = [tokens[0][1:], [tokens[0][0]]] 197 | elif tokens[1] == ['']: 198 | tokens = [[tokens[0][0]], [u'']] 199 | 200 | if drop_common_affixes: 201 | # Remove common prefixes 202 | without_affixes = list(filter(lambda x: x not in DROPPED_AFFIXES, 203 | tokens[0])) 204 | if len(without_affixes) > 0: 205 | tokens[0] = without_affixes 206 | 207 | return tokens 208 | 209 | RE_CHARACTERS = re.compile('\w') 210 | 211 | 212 | @memoize 213 | def given_name_initial(name, index=0): 214 | """Get the initial from the first given name if available. 215 | 216 | Parameters 217 | ---------- 218 | :param name: string 219 | Name of the author. Usually it should be in the format: 220 | surnames, first names. 221 | :param index: integer 222 | Which given name's initial should be returned. 0 for first, 1 for 223 | second, etc. 224 | 225 | Returns 226 | ------- 227 | :return: string 228 | The given name initial. Asciified one character, lowercase if 229 | available, empty string otherwise. 230 | """ 231 | try: 232 | asciified = asciify(name.split(",")[1]).lower().strip() 233 | names = asciified.split(" ") 234 | return RE_CHARACTERS.findall(names[index])[0] 235 | except IndexError: 236 | if index > 0: 237 | return "" 238 | split_name = name.split(" ") 239 | if len(split_name) > 1: 240 | # For example "John Smith", without comma. The first string should 241 | # indicate the first given name. 242 | asciified = asciify(split_name[0]).lower().strip() 243 | try: 244 | return RE_CHARACTERS.findall(asciified)[0] 245 | except IndexError: 246 | pass 247 | return "" 248 | 249 | 250 | @memoize 251 | def given_name(full_name, index): 252 | """Get a specific given name from full name. 253 | 254 | Parameters 255 | ---------- 256 | :param full_name: string 257 | Name of the author. Usually it should be in the format: 258 | surnames, first names. 259 | :param index: integer 260 | Which given name should be returned. 0 for the first, 1 for the second, 261 | etc. 262 | 263 | Returns 264 | ------- 265 | :return: string 266 | Given name or empty string if it is not available. 
267 | """ 268 | try: 269 | given_names = full_name.split(',')[1].strip() 270 | try: 271 | return given_names.split(' ')[index] 272 | except IndexError: 273 | return "" 274 | except IndexError: 275 | names = full_name.split(' ') 276 | try: 277 | return names[index] 278 | except IndexError: 279 | return "" 280 | -------------------------------------------------------------------------------- /beard/utils/strings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Helper functions for strings. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | import sys 18 | import unicodedata 19 | 20 | from unidecode import unidecode 21 | 22 | from .misc import memoize 23 | 24 | IS_PYTHON_3 = sys.version_info[0] == 3 25 | 26 | 27 | @memoize 28 | def asciify(string): 29 | """Transliterate a string to ASCII.""" 30 | if not IS_PYTHON_3 and not isinstance(string, unicode): 31 | string = unicode(string, "utf8", errors="ignore") 32 | 33 | string = unidecode(unicodedata.normalize("NFKD", string)) 34 | string = string.encode("ascii", "ignore") 35 | string = string.decode("utf8") 36 | 37 | return string 38 | -------------------------------------------------------------------------------- /beard/utils/transformers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Generic transformers for data manipulation. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | 14 | """ 15 | import numpy as np 16 | 17 | from sklearn.base import BaseEstimator 18 | from sklearn.base import TransformerMixin 19 | 20 | 21 | class FuncTransformer(BaseEstimator, TransformerMixin): 22 | """Apply a given function element-wise.""" 23 | 24 | def __init__(self, func, dtype=None): 25 | """Initialize. 26 | 27 | Parameters 28 | ---------- 29 | :param func: callable 30 | The function to apply on each element. 31 | 32 | :param dtype: numpy dtype 33 | The type of the values returned by `func`. 34 | If None, then use X.dtype as dtype. 35 | """ 36 | self.func = func 37 | self.dtype = dtype 38 | 39 | def fit(self, X, y=None): 40 | """(Do nothing). 41 | 42 | Parameters 43 | ---------- 44 | :param X: array-like, shape (n_samples, n_features) 45 | Input data. 46 | 47 | Returns 48 | ------- 49 | :returns: self 50 | """ 51 | return self 52 | 53 | def transform(self, X): 54 | """Apply `func` on all elements of X. 55 | 56 | Parameters 57 | ---------- 58 | :param X: array-like, shape (n_samples, n_features) 59 | Input data. 60 | 61 | Returns 62 | ------- 63 | :returns Xt: array-like, shape (n_samples, n_features) 64 | The transformed data. 65 | """ 66 | dtype = self.dtype 67 | if dtype is None: 68 | dtype = X.dtype 69 | 70 | vfunc = np.vectorize(self.func, otypes=[dtype]) 71 | return vfunc(X) 72 | 73 | 74 | class Shaper(BaseEstimator, TransformerMixin): 75 | """Reshape arrays.""" 76 | 77 | def __init__(self, newshape, order="C"): 78 | """Initialize. 
79 | 80 | Parameters 81 | ---------- 82 | :param newshape: int or tuple 83 | The new shape of the array. 84 | See numpy.reshape for further details. 85 | 86 | :param order: {'C', 'F', 'A'} 87 | The index order. 88 | See numpy.reshape for further details. 89 | """ 90 | self.newshape = newshape 91 | self.order = order 92 | 93 | def fit(self, X, y=None): 94 | """(Do nothing). 95 | 96 | Parameters 97 | ---------- 98 | :param X: array-like, shape (n_samples, n_features) 99 | Input data. 100 | 101 | Returns 102 | ------- 103 | :returns: self 104 | """ 105 | return self 106 | 107 | def transform(self, X): 108 | """Reshape X. 109 | 110 | Parameters 111 | ---------- 112 | :param X: array-like, shape (n_samples, n_features) 113 | Input data. 114 | 115 | Returns 116 | ------- 117 | :returns Xt: array-like, shape (self.newshape) 118 | The transformed data. 119 | """ 120 | return X.reshape(self.newshape, order=self.order) 121 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/beard.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/beard.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/beard" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/beard" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
178 | -------------------------------------------------------------------------------- /doc/_build/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/doc/_build/.keep -------------------------------------------------------------------------------- /doc/_static/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/doc/_static/.keep -------------------------------------------------------------------------------- /doc/_templates/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/doc/_templates/.keep -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # beard documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Oct 29 10:00:05 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.doctest', 34 | 'sphinx.ext.intersphinx', 35 | 'sphinx.ext.todo', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.ifconfig', 39 | 'sphinx.ext.viewcode', 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix of source filenames. 46 | source_suffix = '.rst' 47 | 48 | # The encoding of source files. 49 | #source_encoding = 'utf-8-sig' 50 | 51 | # The master toctree document. 52 | master_doc = 'index' 53 | 54 | # General information about the project. 55 | project = u'beard' 56 | copyright = u'2014, Invenio collaboration' 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = '0.0' 64 | # The full version, including alpha/beta/rc tags. 65 | release = '0.0' 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 
69 | #language = None 70 | 71 | # There are two options for replacing |today|: either, you set today to some 72 | # non-false value, then it is used: 73 | #today = '' 74 | # Else, today_fmt is used as the format for a strftime call. 75 | #today_fmt = '%B %d, %Y' 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | exclude_patterns = ['_build'] 80 | 81 | # The reST default role (used for this markup: `text`) to use for all 82 | # documents. 83 | #default_role = None 84 | 85 | # If true, '()' will be appended to :func: etc. cross-reference text. 86 | #add_function_parentheses = True 87 | 88 | # If true, the current module name will be prepended to all description 89 | # unit titles (such as .. function::). 90 | #add_module_names = True 91 | 92 | # If true, sectionauthor and moduleauthor directives will be shown in the 93 | # output. They are ignored by default. 94 | #show_authors = False 95 | 96 | # The name of the Pygments (syntax highlighting) style to use. 97 | pygments_style = 'sphinx' 98 | 99 | # A list of ignored prefixes for module index sorting. 100 | #modindex_common_prefix = [] 101 | 102 | # If true, keep warnings as "system message" paragraphs in the built documents. 103 | #keep_warnings = False 104 | 105 | 106 | # -- Options for HTML output ---------------------------------------------- 107 | 108 | # The theme to use for HTML and HTML Help pages. See the documentation for 109 | # a list of builtin themes. 110 | html_theme = 'alabaster' 111 | 112 | # Theme options are theme-specific and customize the look and feel of a theme 113 | # further. For a list of options available for each theme, see the 114 | # documentation. 115 | #html_theme_options = {} 116 | 117 | # Add any paths that contain custom themes here, relative to this directory. 118 | #html_theme_path = [] 119 | 120 | # The name for this set of Sphinx documents. If None, it defaults to 121 | # " v documentation". 122 | #html_title = None 123 | 124 | # A shorter title for the navigation bar. Default is the same as html_title. 125 | #html_short_title = None 126 | 127 | # The name of an image file (relative to this directory) to place at the top 128 | # of the sidebar. 129 | #html_logo = None 130 | 131 | # The name of an image file (within the static path) to use as favicon of the 132 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 133 | # pixels large. 134 | #html_favicon = None 135 | 136 | # Add any paths that contain custom static files (such as style sheets) here, 137 | # relative to this directory. They are copied after the builtin static files, 138 | # so a file named "default.css" will overwrite the builtin "default.css". 139 | html_static_path = ['_static'] 140 | 141 | # Add any extra paths that contain custom files (such as robots.txt or 142 | # .htaccess) here, relative to this directory. These files are copied 143 | # directly to the root of the documentation. 144 | #html_extra_path = [] 145 | 146 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 147 | # using the given strftime format. 148 | #html_last_updated_fmt = '%b %d, %Y' 149 | 150 | # If true, SmartyPants will be used to convert quotes and dashes to 151 | # typographically correct entities. 152 | #html_use_smartypants = True 153 | 154 | # Custom sidebar templates, maps document names to template names. 
155 | #html_sidebars = {} 156 | 157 | # Additional templates that should be rendered to pages, maps page names to 158 | # template names. 159 | #html_additional_pages = {} 160 | 161 | # If false, no module index is generated. 162 | #html_domain_indices = True 163 | 164 | # If false, no index is generated. 165 | #html_use_index = True 166 | 167 | # If true, the index is split into individual pages for each letter. 168 | #html_split_index = False 169 | 170 | # If true, links to the reST sources are added to the pages. 171 | #html_show_sourcelink = True 172 | 173 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 174 | #html_show_sphinx = True 175 | 176 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 177 | #html_show_copyright = True 178 | 179 | # If true, an OpenSearch description file will be output, and all pages will 180 | # contain a tag referring to it. The value of this option must be the 181 | # base URL from which the finished HTML is served. 182 | #html_use_opensearch = '' 183 | 184 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 185 | #html_file_suffix = None 186 | 187 | # Output file base name for HTML help builder. 188 | htmlhelp_basename = 'bearddoc' 189 | 190 | 191 | # -- Options for LaTeX output --------------------------------------------- 192 | 193 | latex_elements = { 194 | # The paper size ('letterpaper' or 'a4paper'). 195 | #'papersize': 'letterpaper', 196 | 197 | # The font size ('10pt', '11pt' or '12pt'). 198 | #'pointsize': '10pt', 199 | 200 | # Additional stuff for the LaTeX preamble. 201 | #'preamble': '', 202 | } 203 | 204 | # Grouping the document tree into LaTeX files. List of tuples 205 | # (source start file, target name, title, 206 | # author, documentclass [howto, manual, or own class]). 207 | latex_documents = [ 208 | ('index', 'beard.tex', u'beard Documentation', 209 | u'Invenio collaboration', 'manual'), 210 | ] 211 | 212 | # The name of an image file (relative to this directory) to place at the top of 213 | # the title page. 214 | #latex_logo = None 215 | 216 | # For "manual" documents, if this is true, then toplevel headings are parts, 217 | # not chapters. 218 | #latex_use_parts = False 219 | 220 | # If true, show page references after internal links. 221 | #latex_show_pagerefs = False 222 | 223 | # If true, show URL addresses after external links. 224 | #latex_show_urls = False 225 | 226 | # Documents to append as an appendix to all manuals. 227 | #latex_appendices = [] 228 | 229 | # If false, no module index is generated. 230 | #latex_domain_indices = True 231 | 232 | 233 | # -- Options for manual page output --------------------------------------- 234 | 235 | # One entry per manual page. List of tuples 236 | # (source start file, name, description, authors, manual section). 237 | man_pages = [ 238 | ('index', 'beard', u'beard Documentation', 239 | [u'Invenio collaboration'], 1) 240 | ] 241 | 242 | # If true, show URL addresses after external links. 243 | #man_show_urls = False 244 | 245 | 246 | # -- Options for Texinfo output ------------------------------------------- 247 | 248 | # Grouping the document tree into Texinfo files. 
List of tuples 249 | # (source start file, target name, title, author, 250 | # dir menu entry, description, category) 251 | texinfo_documents = [ 252 | ('index', 'beard', u'beard Documentation', 253 | u'Invenio collaboration', 'beard', 'One line description of project.', 254 | 'Miscellaneous'), 255 | ] 256 | 257 | # Documents to append as an appendix to all manuals. 258 | #texinfo_appendices = [] 259 | 260 | # If false, no module index is generated. 261 | #texinfo_domain_indices = True 262 | 263 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 264 | #texinfo_show_urls = 'footnote' 265 | 266 | # If true, do not generate a @detailmenu in the "Top" node's menu. 267 | #texinfo_no_detailmenu = False 268 | 269 | 270 | # Example configuration for intersphinx: refer to the Python standard library. 271 | intersphinx_mapping = {'http://docs.python.org/': None} 272 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. beard documentation master file, created by 2 | sphinx-quickstart on Wed Oct 29 10:00:05 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to beard's documentation! 7 | ================================= 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. 
doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\beard.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\beard.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | General purpose and introductory examples of Beard. 6 | -------------------------------------------------------------------------------- /examples/applications/author-disambiguation/README.rst: -------------------------------------------------------------------------------- 1 | This example shows how to build a full author disambiguation pipeline. 
2 | The pipeline is made of several scripts: 3 | 4 | - ``sampling.py``: Build a training set of labeled pairs from a set of 5 | signatures, to be further used as input for ``distance.py``.:: 6 | 7 | python sampling.py \ 8 | --input_signatures input/signatures.json \ 9 | --input_clusters input/clusters.json \ 10 | --balanced 1 \ 11 | --sample_size 1000000 \ 12 | --output_pairs pairs/1M_nysiis_balanced.json \ 13 | --use_blocking 1 \ 14 | --blocking_function block_phonetic \ 15 | --blocking_threshold 1 \ 16 | --blocking_phonetic_alg nysiis \ 17 | --verbose 1 18 | 19 | - ``distance.py``: for inferring with supervised learning a distance or 20 | linkage function between signatures. An estimator is learned from 21 | labeled paired data and models whether two signatures belong to the same 22 | person.:: 23 | 24 | python distance.py \ 25 | --distance_pairs 1M_nysiis_balanced.json \ 26 | --distance_model linkage.dat \ 27 | --input_signatures input/signatures.json \ 28 | --input_records input/records.json \ 29 | --input_ethnicity_estimator ethnicity_estimator.pickle \ 30 | --verbose 3 31 | 32 | - ``clustering.py``: Semi-supervised block clustering, for grouping together 33 | signatures from the same author. Signatures are blocked and then clustered 34 | using hierarchical clustering together with the linkage function learned at 35 | the previous step. For each block, the best cut-off threshold is chosen so 36 | as to maximize some scoring metric on the provided labeled data.:: 37 | 38 | python clustering.py \ 39 | --distance_model linkage.dat \ 40 | --input_signatures input/signatures.json \ 41 | --input_records input/records.json \ 42 | --output_clusters predicted_clusters.json \ 43 | --blocking_function block_phonetic \ 44 | --blocking_threshold 0 \ 45 | --blocking_phonetic_alg nysiis \ 46 | --clustering_threshold 0.709 \ 47 | --verbose 3 \ 48 | --n_jobs 16 49 | 50 | If partial clusters are known, these should be specified using the 51 | ``input_clusters`` option. 52 | -------------------------------------------------------------------------------- /examples/applications/author-disambiguation/clustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Author disambiguation -- Clustering. 11 | 12 | See README.rst for further details. 13 | 14 | .. codeauthor:: Gilles Louppe 15 | .. codeauthor:: Mateusz Susik 16 | 17 | """ 18 | 19 | import argparse 20 | import pickle 21 | import json 22 | import numpy as np 23 | 24 | from functools import partial 25 | 26 | try: 27 | from sklearn.cross_validation import train_test_split 28 | except ImportError: 29 | from sklearn.model_selection import train_test_split 30 | 31 | # These imports are used during unpickling. 
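# Although none of these helpers are called directly in this script, the
# distance model unpickled below was presumably built (in distance.py) from
# transformers that reference them; pickle stores functions by module and
# attribute name, so the ``utils`` module and these names must be importable
# in this process for pickle.load to succeed.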
32 | from utils import get_author_full_name 33 | from utils import get_author_other_names 34 | from utils import get_author_initials 35 | from utils import get_surname 36 | from utils import get_first_initial 37 | from utils import get_second_initial 38 | from utils import get_author_affiliation 39 | from utils import get_title 40 | from utils import get_journal 41 | from utils import get_abstract 42 | from utils import get_coauthors_from_range 43 | from utils import get_keywords 44 | from utils import get_collaborations 45 | from utils import get_references 46 | from utils import get_topics 47 | from utils import get_year 48 | from utils import group_by_signature 49 | from utils import load_signatures 50 | 51 | from beard.clustering import BlockClustering 52 | from beard.clustering import block_last_name_first_initial 53 | from beard.clustering import block_phonetic 54 | from beard.clustering import ScipyHierarchicalClustering 55 | from beard.metrics import b3_f_score 56 | from beard.metrics import b3_precision_recall_fscore 57 | from beard.metrics import paired_precision_recall_fscore 58 | 59 | 60 | def _affinity(X, step=10000): 61 | """Custom affinity function, using a pre-learned distance estimator.""" 62 | # Assumes that 'distance_estimator' lives in global, making things fast 63 | global distance_estimator 64 | 65 | all_i, all_j = np.triu_indices(len(X), k=1) 66 | n_pairs = len(all_i) 67 | distances = np.zeros(n_pairs, dtype=np.float64) 68 | 69 | for start in range(0, n_pairs, step): 70 | end = min(n_pairs, start+step) 71 | Xt = np.empty((end-start, 2), dtype=np.object) 72 | 73 | for k, (i, j) in enumerate(zip(all_i[start:end], 74 | all_j[start:end])): 75 | Xt[k, 0], Xt[k, 1] = X[i, 0], X[j, 0] 76 | 77 | Xt = distance_estimator.predict_proba(Xt)[:, 1] 78 | distances[start:end] = Xt[:] 79 | 80 | return distances 81 | 82 | 83 | def clustering(input_signatures, input_records, distance_model, 84 | input_clusters=None, output_clusters=None, 85 | verbose=1, n_jobs=-1, clustering_method="average", 86 | train_signatures_file=None, clustering_threshold=None, 87 | results_file=None, blocking_function="block_phonetic", 88 | blocking_threshold=1, blocking_phonetic_alg="nysiis"): 89 | """Cluster signatures using a pretrained distance model. 90 | 91 | Parameters 92 | ---------- 93 | :param input_signatures: string 94 | Path to the file with signatures. The content should be a JSON array 95 | of dictionaries holding metadata about signatures. 96 | 97 | [{"signature_id": 0, 98 | "author_name": "Doe, John", 99 | "publication_id": 10, ...}, { ... }, ...] 100 | 101 | :param input_records: string 102 | Path to the file with records. The content should be a JSON array of 103 | dictionaries holding metadata about records 104 | 105 | [{"publication_id": 0, 106 | "title": "Author disambiguation using Beard", ... }, { ... }, ...] 107 | 108 | :param distance_model: string 109 | Path to the file with the distance model. The file should be a pickle 110 | created using the ``distance.py`` script. 111 | 112 | :param input_clusters: string 113 | Path to the file with knownn clusters. The file should be a dictionary, 114 | where keys are cluster labels and values are the `signature_id` of the 115 | signatures grouped in the clusters. Signatures assigned to the cluster 116 | with label "-1" are not clustered. 117 | 118 | {"0": [0, 1, 3], "1": [2, 5], ...} 119 | 120 | :param output_clusters: string 121 | Path to the file with output cluster. 
The file will be filled with 122 | clusters, using the same format as ``input_clusters``. 123 | 124 | :param verbose: int 125 | If not zero, function will output scores on stdout. 126 | 127 | :param n_jobs: int 128 | Parameter passed to joblib. Number of threads to be used. 129 | 130 | :param clustering_method: string 131 | Parameter passed to ``ScipyHierarchicalClustering``. Used only if 132 | ``clustering_test_size`` is specified. 133 | 134 | :param train_signatures_file: str 135 | Path to the file with train set signatures. Format the same as in 136 | ``input_signatures``. 137 | 138 | :param clustering_threshold: float 139 | Threshold passed to ``ScipyHierarchicalClustering``. 140 | 141 | :param results_file: str 142 | Path to the file where the results will be output. It will give 143 | additional information about pairwise variant of scores. 144 | 145 | :param blocking_function: string 146 | must be a defined blocking function. Defined functions are: 147 | - "block_last_name_first_initial" 148 | - "block_phonetic" 149 | 150 | :param blocking_threshold: int or None 151 | It determines the maximum allowed size of blocking on the last name 152 | It can only be: 153 | - None; if the blocking function is block_last_name_first_initial 154 | - int; if the blocking function is block_phonetic 155 | please check the documentation of phonetic blocking in 156 | beard.clustering.blocking_funcs.py 157 | 158 | :param blocking_phonetic_alg: string or None 159 | If not None, determines which phonetic algorithm is used. Options: 160 | - "double_metaphone" 161 | - "nysiis" (only for Python 2) 162 | - "soundex" (only for Python 2) 163 | """ 164 | # Assumes that 'distance_estimator' lives in global, making things fast 165 | global distance_estimator 166 | distance_estimator = pickle.load(open(distance_model, "rb")) 167 | 168 | try: 169 | distance_estimator.steps[-1][1].set_params(n_jobs=1) 170 | except: 171 | pass 172 | 173 | signatures, records = load_signatures(input_signatures, 174 | input_records) 175 | 176 | indices = {} 177 | X = np.empty((len(signatures), 1), dtype=np.object) 178 | for i, signature in enumerate(sorted(signatures.values(), 179 | key=lambda s: s["signature_id"])): 180 | X[i, 0] = signature 181 | indices[signature["signature_id"]] = i 182 | 183 | if blocking_function == "block_last_name_first_initial": 184 | block_function = block_last_name_first_initial 185 | else: 186 | block_function = partial(block_phonetic, 187 | threshold=blocking_threshold, 188 | phonetic_algorithm=blocking_phonetic_alg) 189 | 190 | # Semi-supervised block clustering 191 | if input_clusters: 192 | true_clusters = json.load(open(input_clusters, "r")) 193 | y_true = -np.ones(len(X), dtype=np.int) 194 | 195 | for label, signature_ids in true_clusters.items(): 196 | for signature_id in signature_ids: 197 | y_true[indices[signature_id]] = label 198 | 199 | y = -np.ones(len(X), dtype=np.int) 200 | 201 | if train_signatures_file: 202 | train_signatures = json.load(open(train_signatures_file, "r")) 203 | train_ids = [x['signature_id'] for x in train_signatures] 204 | del train_signatures 205 | y[train_ids] = y_true[train_ids] 206 | test_ids = list(set([x['signature_id'] for _, x in 207 | signatures.iteritems()]) - set(train_ids)) 208 | else: 209 | y = y_true 210 | 211 | else: 212 | y = None 213 | 214 | clusterer = BlockClustering( 215 | blocking=block_function, 216 | base_estimator=ScipyHierarchicalClustering( 217 | affinity=_affinity, 218 | threshold=clustering_threshold, 219 | method=clustering_method, 220 | 
supervised_scoring=b3_f_score), 221 | verbose=verbose, 222 | n_jobs=n_jobs).fit(X, y) 223 | 224 | labels = clusterer.labels_ 225 | 226 | # Save predicted clusters 227 | if output_clusters: 228 | clusters = {} 229 | 230 | for label in np.unique(labels): 231 | mask = (labels == label) 232 | clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]] 233 | 234 | json.dump(clusters, open(output_clusters, "w")) 235 | 236 | # Statistics 237 | if verbose and input_clusters: 238 | print("Number of blocks =", len(clusterer.clusterers_)) 239 | print("True number of clusters", len(np.unique(y_true))) 240 | print("Number of computed clusters", len(np.unique(labels))) 241 | 242 | b3_overall = b3_precision_recall_fscore(y_true, labels) 243 | print("B^3 F-score (overall) =", b3_overall[2]) 244 | 245 | if train_signatures_file: 246 | b3_train = b3_precision_recall_fscore( 247 | y_true[train_ids], 248 | labels[train_ids] 249 | ) 250 | b3_test = b3_precision_recall_fscore( 251 | y_true[test_ids], 252 | labels[test_ids] 253 | ) 254 | print("B^3 F-score (train) =", b3_train[2]) 255 | print("B^3 F-score (test) =", b3_test[2]) 256 | if results_file: 257 | paired_overall = paired_precision_recall_fscore(y_true, labels) 258 | paired_train = paired_precision_recall_fscore( 259 | y_true[train_ids], 260 | labels[train_ids] 261 | ) 262 | paired_test = paired_precision_recall_fscore( 263 | y_true[test_ids], 264 | labels[test_ids] 265 | ) 266 | 267 | json.dump({ 268 | "description": ["precision", "recall", "f_score"], 269 | "b3": {"overall": list(b3_overall), 270 | "train": list(b3_train), 271 | "test": list(b3_test) 272 | }, 273 | "paired": {"overall": list(paired_overall), 274 | "train": list(paired_train), 275 | "test": list(paired_test) 276 | } 277 | }, open(results_file, 'w')) 278 | 279 | if __name__ == "__main__": 280 | parser = argparse.ArgumentParser() 281 | parser.add_argument("--distance_model", required=True, type=str) 282 | parser.add_argument("--input_signatures", required=True, type=str) 283 | parser.add_argument("--input_records", required=True, type=str) 284 | parser.add_argument("--input_clusters", default=None, type=str) 285 | parser.add_argument("--output_clusters", required=True, type=str) 286 | parser.add_argument("--clustering_method", default="average", type=str) 287 | parser.add_argument("--clustering_threshold", default=None, type=float) 288 | parser.add_argument("--train_signatures", default=None, type=str) 289 | parser.add_argument("--results_file", default=None, type=str) 290 | parser.add_argument("--blocking_function", default="block_phonetic", 291 | type=str) 292 | parser.add_argument("--blocking_threshold", default=1, type=int) 293 | parser.add_argument("--blocking_phonetic_alg", default="nysiis", type=str) 294 | parser.add_argument("--verbose", default=1, type=int) 295 | parser.add_argument("--n_jobs", default=1, type=int) 296 | args = parser.parse_args() 297 | 298 | clustering(args.input_signatures, args.input_records, args.distance_model, 299 | args.input_clusters, args.output_clusters, 300 | args.verbose, args.n_jobs, args.clustering_method, 301 | args.train_signatures, args.clustering_threshold, 302 | args.results_file, args.blocking_function, 303 | args.blocking_threshold, args.blocking_phonetic_alg) 304 | -------------------------------------------------------------------------------- /examples/applications/author-disambiguation/ethnicity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part 
of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | """Author disambiguation -- Build an estimator for guessing an author's
11 | ethnic group from their name.
12 | 
13 | .. codeauthor:: Gilles Louppe
14 | .. codeauthor:: Hussein Al-Natsheh
15 | 
16 | """
17 | 
18 | import argparse
19 | import numpy as np
20 | import pandas as pd
21 | import pickle
22 | 
23 | from sklearn.feature_extraction.text import TfidfVectorizer
24 | from sklearn.pipeline import Pipeline
25 | from sklearn.svm import LinearSVC
26 | 
27 | from beard.utils import normalize_name
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     parser = argparse.ArgumentParser()
32 |     parser.add_argument("--input_datafile", required=True, type=str)
33 |     parser.add_argument("--output_ethnicity_estimator",
34 |                         default="ethnicity_estimator.pickle", type=str)
35 |     parser.add_argument("--C", default=4.0, type=float)
36 |     args = parser.parse_args()
37 | 
38 |     # Load data
39 |     data = pd.read_csv(args.input_datafile)
40 |     y = data.RACE.values
41 |     X = ["%s, %s" % (last, first) for last, first in zip(data.NAMELAST.values,
42 |                                                          data.NAMEFRST.values)]
43 |     X = [normalize_name(name) for name in X]
44 | 
45 |     # Train an estimator
46 |     estimator = Pipeline([
47 |         ("transformer", TfidfVectorizer(analyzer="char_wb",
48 |                                         ngram_range=(1, 5),
49 |                                         min_df=0.00005,
50 |                                         dtype=np.float32,
51 |                                         decode_error="replace")),
52 |         ("classifier", LinearSVC(C=args.C))])
53 |     estimator.fit(X, y)
54 | 
55 |     pickle.dump(estimator,
56 |                 open(args.output_ethnicity_estimator, "wb"),  # pickle needs binary mode
57 |                 protocol=pickle.HIGHEST_PROTOCOL)
58 | 
--------------------------------------------------------------------------------
/examples/applications/author-disambiguation/sampling.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is part of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | r"""Script for generating the training set.
11 | 
12 | It samples pairs of signatures labeled with 1 if they are of different authors
13 | or 0 if they are of the same author.
14 | 
15 | Examples of command line use:
16 | 
17 | Sampling without blocking
18 | 
19 | python sampling.py --input_clusters big/clusters.json \
20 |     --input_signatures train.json --output_pairs pairs.json --use_blocking 0
21 | 
22 | Sampling with blocking, without balancing
23 | 
24 | python sampling.py --input_clusters big/clusters.json \
25 |     --input_signatures train.json --output_pairs pairs.json --balanced 0
26 | 
27 | Sampling with blocking, with balancing and a smaller sample size
28 | 
29 | python sampling.py --input_clusters big/clusters.json --sample_size 500000 \
30 |     --input_signatures train.json --output_pairs pairs.json --balanced 1
31 | 
32 | 
33 | .. codeauthor:: Hussein Al-Natsheh
34 | ..
codeauthor:: Mateusz Susik 35 | """ 36 | 37 | from __future__ import print_function 38 | 39 | import argparse 40 | import json 41 | import math 42 | import numpy as np 43 | import random 44 | import six 45 | 46 | from beard.clustering import block_phonetic 47 | from beard.clustering import block_last_name_first_initial 48 | 49 | import sys 50 | 51 | # for Python 3 52 | if sys.version_info[0]==3: 53 | from functools import reduce 54 | 55 | def _noblocking_sampling(sample_size, train_signatures, clusters_reversed): 56 | pairs = [] 57 | # Pairs dict will prevent duplicates 58 | pairs_dict = {} 59 | category_size = sample_size // 2 60 | negative = 0 61 | while negative < category_size: 62 | s1 = random.choice(train_signatures)['signature_id'] 63 | s2 = random.choice(train_signatures)['signature_id'] 64 | if s1 == s2: 65 | continue 66 | elif s1 > s2: 67 | s1, s2 = s2, s1 68 | s1_cluster = clusters_reversed[s1] 69 | s2_cluster = clusters_reversed[s2] 70 | if s1_cluster != s2_cluster: 71 | if negative < category_size: 72 | if s1 in pairs_dict: 73 | if s2 in pairs_dict[s1]: 74 | continue 75 | pairs_dict[s1].append(s2) 76 | else: 77 | pairs_dict[s1] = [s2] 78 | pairs.append((s1, s2, 1)) 79 | negative += 1 80 | 81 | print("successfully sampled pairs from different authors") 82 | 83 | positive_pairs = [] 84 | for i in range(100): 85 | print("sampling positive examples: %s out of 100 folds" % (i+1)) 86 | some_signatures = random.sample(train_signatures, 87 | len(train_signatures)//20) 88 | for i, s1 in enumerate(some_signatures): 89 | for s2 in some_signatures[i+1:]: 90 | s1_id = s1['signature_id'] 91 | s2_id = s2['signature_id'] 92 | s1_cluster = clusters_reversed[s1_id] 93 | s2_cluster = clusters_reversed[s2_id] 94 | if s1_cluster == s2_cluster: 95 | positive_pairs.append((s1_id, s2_id, 0)) 96 | 97 | sampled = random.sample(positive_pairs, category_size//100) 98 | pairs += sampled 99 | for s1, s2, _ in sampled: 100 | if s1 > s2: 101 | s2, s1 = s1, s2 102 | if s1 in pairs_dict: 103 | if s2 in pairs_dict[s1]: 104 | continue 105 | pairs_dict[s1].append(s2) 106 | else: 107 | pairs_dict[s1] = [s2] 108 | 109 | print("successfully sampled pairs belonging to the same author") 110 | return pairs 111 | 112 | 113 | def pair_sampling(blocking_function, 114 | blocking_threshold, 115 | blocking_phonetic_alg, 116 | clusters_filename, 117 | train_filename, 118 | balanced=1, verbose=1, 119 | sample_size=1000000, 120 | use_blocking=1): 121 | """Sampling pairs from the ground-truth data. 122 | 123 | This function builds a pair dataset from claimed signatures. 124 | It gives the ability to specify the 125 | blocking function and whether the sampling would be balanced or not. 126 | 127 | Parameters 128 | ---------- 129 | :param blocking_function: string 130 | must be a defined blocking function. Defined functions are: 131 | - "block_last_name_first_initial" 132 | - "block_phonetic" 133 | 134 | :param blocking_threshold: int or None 135 | It determines the maximum allowed size of blocking on the last name 136 | It can only be: 137 | - None; if the blocking function is block_last_name_first_initial 138 | - int; if the blocking function is block_phonetic 139 | please check the documentation of phonetic blocking in 140 | beard.clustering.blocking_funcs.py 141 | 142 | :param blocking_phonetic_alg: string or None 143 | If not None, determines which phonetic algorithm is used. 
Options: 144 | - "double_metaphone" 145 | - "nysiis" (only for Python 2) 146 | - "soundex" (only for Python 2) 147 | 148 | :param clusters_filename: string 149 | Path to the input clusters (ground-truth) file 150 | 151 | :param train_filename: string 152 | Path to train set file 153 | 154 | :param balanced: boolean 155 | determines if the sampling would be balanced. 156 | The balance is defined as the same number of pairs with the same name 157 | on signature and pairs with different names. The balance is preserved 158 | both in the pairs belonging to one authors and in the pairs belonging 159 | to different authors. Note that if there are not enough pairs to 160 | satisfy the balance condition, some of the pairs will be replicated. 161 | 162 | :param verbose: boolean 163 | determines if some processing statistics would be shown 164 | 165 | :param sample_size: integer 166 | The desired sample size 167 | 168 | :param use_blocking: boolean 169 | determines if the signatures should be blocked before sampling 170 | 171 | Returns 172 | ------- 173 | :returns: list 174 | list of signature pairs 175 | """ 176 | # Load ground-truth 177 | true_clusters = json.load(open(clusters_filename, "r")) 178 | clusters_reversed = {v: k for k, va in six.iteritems(true_clusters) 179 | for v in va} 180 | 181 | train_signatures = json.load(open(train_filename, "r")) 182 | 183 | if not use_blocking: 184 | return _noblocking_sampling(sample_size, train_signatures, 185 | clusters_reversed) 186 | 187 | train_signatures_ids = [] 188 | for item in train_signatures: 189 | train_signatures_ids.append([item]) 190 | 191 | train_signatures_ids = np.array(train_signatures_ids) 192 | 193 | if blocking_function == "block_last_name_first_initial": 194 | blocking = block_last_name_first_initial(train_signatures_ids) 195 | elif blocking_function == "block_phonetic" and blocking_threshold: 196 | blocking = block_phonetic(train_signatures_ids, 197 | blocking_threshold, 198 | blocking_phonetic_alg) 199 | else: 200 | raise ValueError("No such blocking strategy.") 201 | 202 | category_size = sample_size // 4 203 | 204 | blocking_dict = {} 205 | 206 | for index, b in enumerate(blocking): 207 | if b in blocking_dict: 208 | blocking_dict[b].append(index) 209 | else: 210 | blocking_dict[b] = [index] 211 | 212 | # 'd' stands for different, 's' stands for same, 'a' stands for author 213 | # 'n' stands for name 214 | dasn = [] 215 | sasn = [] 216 | sadn = [] 217 | dadn = [] 218 | 219 | for _, sig_s in six.iteritems(blocking_dict): 220 | 221 | for i, s1 in enumerate(sig_s): 222 | for s2 in sig_s[i+1:]: 223 | s1_id = train_signatures[s1]['signature_id'] 224 | s2_id = train_signatures[s2]['signature_id'] 225 | s1_name = train_signatures[s1]['author_name'] 226 | s2_name = train_signatures[s2]['author_name'] 227 | s1_cluster = clusters_reversed[s1_id] 228 | s2_cluster = clusters_reversed[s2_id] 229 | 230 | if s1_cluster == s2_cluster: 231 | # Same author 232 | if s1_name == s2_name: 233 | sasn.append((s1_id, s2_id, 0)) 234 | else: 235 | sadn.append((s1_id, s2_id, 0)) 236 | else: 237 | # Different authors 238 | if s1_name == s2_name: 239 | dasn.append((s1_id, s2_id, 1)) 240 | else: 241 | dadn.append((s1_id, s2_id, 1)) 242 | 243 | if balanced: 244 | if verbose: 245 | print("len of dasn:", len(dasn)) 246 | print("len of sadn:", len(sadn)) 247 | print("len of sasn:", len(sasn)) 248 | print("len of dadn:", len(dadn)) 249 | 250 | all_pairs = map(lambda x: int(math.ceil( 251 | category_size/float(len(x)))) * x, 252 | [dasn, sasn, sadn, dadn]) 253 | 254 
| if sys.version_info[0]==3:
255 |             all_pairs = list(all_pairs)
256 | 
257 |         pairs = reduce(lambda x, y: x + random.sample(y, category_size),
258 |                        all_pairs, [])
259 | 
260 |     else:
261 |         positive = sasn + sadn
262 |         negative = dasn + dadn
263 |         pairs = random.sample(positive,
264 |                               sample_size // 2) + random.sample(negative,
265 |                                                                 sample_size // 2)
266 | 
267 |     return pairs
268 | 
269 | if __name__ == "__main__":
270 |     # Parse command line arguments
271 |     parser = argparse.ArgumentParser()
272 |     parser.add_argument("--input_signatures", required=True, type=str)
273 |     parser.add_argument("--input_clusters", default="clusters.json", type=str)
274 |     parser.add_argument("--balanced", default=1, type=int)
275 |     parser.add_argument("--sample_size", default=1000000, type=int)
276 |     parser.add_argument("--output_pairs", default="pairs.json", type=str)
277 |     parser.add_argument("--use_blocking", default=1, type=int)
278 |     parser.add_argument("--blocking_function", default="block_phonetic",
279 |                         type=str)
280 |     parser.add_argument("--blocking_threshold", default=1, type=int)
281 |     parser.add_argument("--blocking_phonetic_alg", default="nysiis", type=str)
282 |     parser.add_argument("--verbose", default=1, type=int)
283 | 
284 |     args = parser.parse_args()
285 | 
286 |     pairs = pair_sampling(
287 |         train_filename=args.input_signatures,
288 |         clusters_filename=args.input_clusters,
289 |         balanced=args.balanced,
290 |         sample_size=args.sample_size,
291 |         use_blocking=args.use_blocking,
292 |         blocking_function=args.blocking_function,
293 |         blocking_threshold=args.blocking_threshold,
294 |         blocking_phonetic_alg=args.blocking_phonetic_alg,
295 |         verbose=args.verbose
296 |     )
297 | 
298 |     if args.verbose:
299 |         print("number of pairs", len(pairs))
300 | 
301 |     json.dump(pairs, open(args.output_pairs, "w"))
302 | 
303 |     print("The sampled pairs file was successfully created")
304 | 
--------------------------------------------------------------------------------
/examples/applications/author-disambiguation/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is part of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | """Helpers for author disambiguation.
11 | 
12 | .. codeauthor:: Gilles Louppe
13 | .. codeauthor:: Mateusz Susik
14 | 
15 | """
16 | 
17 | import json
18 | 
19 | from beard.utils import given_name
20 | from beard.utils import name_initials
21 | from beard.utils import normalize_name
22 | from beard.utils import given_name_initial
23 | 
24 | 
25 | def load_signatures(signatures_filename, records_filename):
26 |     """Load signatures from JSON files.
27 | 
28 |     Parameters
29 |     ----------
30 |     :param signatures_filename: string
31 |         Path to the signatures file. The file should be in json format.
32 | 
33 |     :param records_filename: string
34 |         Path to the records file. The file should be in json format.
35 | 
36 |     Returns
37 |     -------
38 |     :returns: tuple
39 |         Signatures and records, as dictionaries keyed by signature_id and publication_id.
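        Each signature is additionally given a "publication" key holding its
        full record. For example, using the sample data shipped in
        examples/data (file names assumed to follow that directory's naming):

            signatures, records = load_signatures("wang_signatures.json",
                                                   "wang_records.json")
            publication = signatures[some_signature_id]["publication"]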
40 | """ 41 | signatures = json.load(open(signatures_filename, "r")) 42 | records = json.load(open(records_filename, "r")) 43 | 44 | if isinstance(signatures, list): 45 | signatures = {s["signature_id"]: s for s in signatures} 46 | 47 | if isinstance(records, list): 48 | records = {r["publication_id"]: r for r in records} 49 | 50 | for signature_id, signature in signatures.items(): 51 | signature["publication"] = records[signature["publication_id"]] 52 | 53 | return signatures, records 54 | 55 | 56 | def get_author_full_name(s): 57 | """Get author full name from the signature. 58 | 59 | Parameters 60 | ---------- 61 | :param s: dict 62 | Signature 63 | 64 | Returns 65 | ------- 66 | :returns: string 67 | Normalized author name 68 | """ 69 | v = s["author_name"] 70 | v = normalize_name(v) if v else "" 71 | return v 72 | 73 | 74 | def get_first_given_name(s): 75 | """Get author first given name from the signature. 76 | 77 | Parameters 78 | ---------- 79 | :param s: dict 80 | Signature 81 | 82 | Returns 83 | ------- 84 | :returns: string 85 | Author's first given name 86 | """ 87 | v = given_name(s["author_name"], 0) 88 | return v 89 | 90 | 91 | def get_second_given_name(s): 92 | """Get author second given name from the signature. 93 | 94 | Parameters 95 | ---------- 96 | :param s: dict 97 | Signature 98 | 99 | Returns 100 | ------- 101 | :returns: string 102 | Author's second given name 103 | """ 104 | v = given_name(s["author_name"], 1) 105 | return v 106 | 107 | 108 | def get_surname(s): 109 | return s['author_name'].split(" ")[0].split(",")[0] 110 | 111 | 112 | def get_first_initial(s): 113 | v = given_name_initial(s["author_name"], 0) 114 | try: 115 | return v 116 | except IndexError: 117 | return "" 118 | 119 | 120 | def get_second_initial(s): 121 | """Get author second given name's initial from the signature. 122 | 123 | Parameters 124 | ---------- 125 | :param s: dict 126 | Signature 127 | 128 | Returns 129 | ------- 130 | :returns: string 131 | Second given name's initial. Empty string in case it's not available. 132 | """ 133 | v = given_name_initial(s["author_name"], 1) 134 | try: 135 | return v 136 | except IndexError: 137 | return "" 138 | 139 | 140 | def get_author_other_names(s): 141 | """Get author other names from the signature. 142 | 143 | Parameters 144 | ---------- 145 | :param s: dict 146 | Signature 147 | 148 | Returns 149 | ------- 150 | :returns: string 151 | Normalized other author names 152 | """ 153 | v = s["author_name"] 154 | v = v.split(",", 1) 155 | v = normalize_name(v[1]) if len(v) == 2 else "" 156 | return v 157 | 158 | 159 | def get_author_initials(s): 160 | """Get author initials from the signature. 161 | 162 | Parameters 163 | ---------- 164 | :param s: dict 165 | Signature 166 | 167 | Returns 168 | ------- 169 | :returns: string 170 | Initials, not separated 171 | """ 172 | v = s["author_name"] 173 | v = v if v else "" 174 | v = "".join(name_initials(v)) 175 | return v 176 | 177 | 178 | def get_author_affiliation(s): 179 | """Get author affiliation from the signature. 180 | 181 | Parameters 182 | ---------- 183 | :param s: dict 184 | Signature 185 | 186 | Returns 187 | ------- 188 | :returns: string 189 | Normalized affiliation name 190 | """ 191 | v = s["author_affiliation"] 192 | v = normalize_name(v) if v else "" 193 | return v 194 | 195 | 196 | def get_title(s): 197 | """Get publication's title from the signature. 
198 | 199 | Parameters 200 | ---------- 201 | :param s: dict 202 | Signature 203 | 204 | Returns 205 | ------- 206 | :returns: string 207 | Title of the publication 208 | """ 209 | v = s["publication"]["title"] 210 | v = v if v else "" 211 | return v 212 | 213 | 214 | def get_journal(s): 215 | """Get journal's name from the signature. 216 | 217 | Parameters 218 | ---------- 219 | :param s: dict 220 | Signature 221 | 222 | Returns 223 | ------- 224 | :returns: string 225 | Journal's name 226 | """ 227 | v = s["publication"]["journal"] 228 | v = v if v else "" 229 | return v 230 | 231 | 232 | def get_abstract(s): 233 | """Get author full name from the signature. 234 | 235 | Parameters 236 | ---------- 237 | :param s: dict 238 | Signature 239 | 240 | Returns 241 | ------- 242 | :returns: string 243 | Normalized author name 244 | """ 245 | v = s["publication"]["abstract"] 246 | v = v if v else "" 247 | return v 248 | 249 | 250 | def get_coauthors(s): 251 | """Get coauthors from the signature. 252 | 253 | Parameters 254 | ---------- 255 | :param s: dict 256 | Signature 257 | 258 | Returns 259 | ------- 260 | :returns: string 261 | Coauthors ids separated by a space 262 | """ 263 | v = s["publication"]["authors"] 264 | v = " ".join(v) 265 | return v 266 | 267 | 268 | def get_coauthors_from_range(s, range=10): 269 | """Get coauthors from the signature. 270 | 271 | Only the signatures from the range-neighbourhood of the given signature 272 | will be selected. Signatures on the paper are ordered (although they don't 273 | have to be sorted!), and the distance between signatures is defined 274 | as absolute difference of the indices. 275 | 276 | The function was introduced due to the high memory usage of 277 | a simple version. 278 | 279 | Parameters 280 | ---------- 281 | :param s: dict 282 | Signature 283 | :param range: integer 284 | The maximum distance for the signatures between the author and his 285 | coauthor. 286 | 287 | Returns 288 | ------- 289 | :returns: string 290 | Coauthors ids separated by a space 291 | """ 292 | v = s["publication"]["authors"] 293 | try: 294 | index = v.index(s["author_name"]) 295 | v = " ".join(v[max(0, index-range):min(len(v), index+range)]) 296 | return v 297 | except ValueError: 298 | v = " ".join(v) 299 | return v 300 | 301 | 302 | def get_keywords(s): 303 | """Get keywords from the signature. 304 | 305 | Parameters 306 | ---------- 307 | :param s: dict 308 | Signature 309 | 310 | Returns 311 | ------- 312 | :returns: string 313 | Keywords separated by a space 314 | """ 315 | v = s["publication"]["keywords"] 316 | v = " ".join(v) 317 | return v 318 | 319 | 320 | def get_topics(s): 321 | """Get topics from the signature. 322 | 323 | Parameters 324 | ---------- 325 | :param s: dict 326 | Signature 327 | 328 | Returns 329 | ------- 330 | :returns: string 331 | Topics separated by a space 332 | """ 333 | v = s["publication"]["topics"] 334 | v = " ".join(v) 335 | return v 336 | 337 | 338 | def get_collaborations(s): 339 | """Get collaborations from the signature. 340 | 341 | Parameters 342 | ---------- 343 | :param s: dict 344 | Signature 345 | 346 | Returns 347 | ------- 348 | :returns: string 349 | Collaboations separated by a space 350 | """ 351 | v = s["publication"]["collaborations"] 352 | v = " ".join(v) 353 | return v 354 | 355 | 356 | def get_references(s): 357 | """Get references from the signature. 
358 |     Parameters
359 |     ----------
360 |     :param s: dict
361 |         Signature
362 |     Returns
363 |     -------
364 |     :returns: string
365 |         Ids of references separated by a space
366 |     """
367 |     v = s["publication"]["references"]
368 |     v = " ".join(str(r) for r in v)
369 |     v = v if v else ""
370 |     return v
371 | 
372 | 
373 | def get_year(s):
374 |     """Get year from the signature.
375 | 
376 |     Parameters
377 |     ----------
378 |     :param s: dict
379 |         Signature
380 | 
381 |     Returns
382 |     -------
383 |     :returns: int
384 |         Year of publication if present on the signature, -1 otherwise
385 |     """
386 |     v = s["publication"]["year"]
387 |     v = int(v) if v else -1
388 |     return v
389 | 
390 | 
391 | def group_by_signature(r):
392 |     """Grouping function for ``PairTransformer``.
393 | 
394 |     Parameters
395 |     ----------
396 |     :param r: iterable
397 |         Signature in a singleton.
398 | 
399 |     Returns
400 |     -------
401 |     :returns: string
402 |         Signature id
403 |     """
404 |     return r[0]["signature_id"]
405 | 
--------------------------------------------------------------------------------
/examples/author_disambiguation.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is part of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | """Simplified author disambiguation example.
11 | 
12 | This example shows how to use block clustering for the author
13 | disambiguation problem. The goal is to cluster together all (author name,
14 | affiliation) tuples that correspond to the same actual person.
15 | 
16 | .. codeauthor:: Gilles Louppe
17 | 
18 | """
19 | 
20 | from __future__ import print_function
21 | 
22 | import numpy as np
23 | 
24 | from beard.clustering import BlockClustering
25 | from beard.clustering import block_last_name_first_initial
26 | from beard.clustering import ScipyHierarchicalClustering
27 | from beard.metrics import paired_f_score
28 | from beard.utils import normalize_name
29 | from beard.utils import name_initials
30 | 
31 | 
32 | def affinity(X):
33 |     """Compute pairwise distances between (author, affiliation) tuples.
34 | 
35 |     Note that this function is a heuristic. It should ideally be replaced
36 |     by a more robust distance function, e.g. using a model learned over
37 |     pairs of tuples.
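    A learned alternative is sketched in
    examples/applications/author-disambiguation/: there, distance.py fits a
    pairwise distance model and clustering.py plugs its predict_proba output
    in as the affinity (see the _affinity helper in that script).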
38 | """ 39 | distances = np.zeros((len(X), len(X)), dtype=np.float) 40 | 41 | for i, j in zip(*np.triu_indices(len(X), k=1)): 42 | name_i = normalize_name(X[i, 0]) 43 | aff_i = X[i, 1] 44 | initials_i = name_initials(name_i) 45 | name_j = normalize_name(X[j, 0]) 46 | aff_j = X[j, 1] 47 | initials_j = name_initials(name_j) 48 | 49 | # Names and affiliations match 50 | if (name_i == name_j and aff_i == aff_j): 51 | distances[i, j] = 0.0 52 | 53 | # Compatible initials and affiliations match 54 | elif (len(initials_i | initials_j) == max(len(initials_i), 55 | len(initials_j)) and 56 | aff_i == aff_j and aff_i != ""): 57 | distances[i, j] = 0.0 58 | 59 | # Initials are not compatible 60 | elif (len(initials_i | initials_j) != max(len(initials_i), 61 | len(initials_j))): 62 | distances[i, j] = 1.0 63 | 64 | # We dont know 65 | else: 66 | distances[i, j] = 0.5 67 | 68 | distances += distances.T 69 | return distances 70 | 71 | if __name__ == "__main__": 72 | # Load data 73 | data = np.load("data/author-disambiguation.npz") 74 | X = data["X"] 75 | truth = data["y"] 76 | 77 | # Block clustering with fixed threshold 78 | block_clusterer = BlockClustering( 79 | blocking=block_last_name_first_initial, 80 | base_estimator=ScipyHierarchicalClustering( 81 | threshold=0.5, 82 | affinity=affinity, 83 | method="complete"), 84 | verbose=3, 85 | n_jobs=-1) 86 | block_clusterer.fit(X) 87 | labels = block_clusterer.labels_ 88 | 89 | # Print clusters 90 | for cluster in np.unique(labels): 91 | entries = set() 92 | 93 | for name, affiliation in X[labels == cluster]: 94 | entries.add((name, affiliation)) 95 | 96 | print("Cluster #%d = %s" % (cluster, entries)) 97 | print() 98 | 99 | # Statistics 100 | print("Number of blocks =", len(block_clusterer.clusterers_)) 101 | print("True number of clusters", len(np.unique(truth))) 102 | print("Number of computed clusters", len(np.unique(labels))) 103 | print("Paired F-score =", paired_f_score(truth, labels)) 104 | -------------------------------------------------------------------------------- /examples/data/README.rst: -------------------------------------------------------------------------------- 1 | This directory contains disambiguation input data from INSPIRE. 2 | All signatures of people whose names contain *wang* are extracted. 3 | Please note that ids of records/signatures in ``wang_records.json`` file 4 | under authors/references/citations keys do NOT represent positions 5 | of corresponding entities in the files in this directory. 6 | Still, the disambiguation runs finely on these files. 7 | -------------------------------------------------------------------------------- /examples/data/author-disambiguation.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/examples/data/author-disambiguation.npz -------------------------------------------------------------------------------- /miniconda.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/miniconda.sh -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # This file is part of Beard. 2 | # Copyright (C) 2014 CERN. 
3 | # 4 | # Beard is a free software; you can redistribute it and/or modify it 5 | # under the terms of the Revised BSD License; see LICENSE file for 6 | # more details. 7 | 8 | [pytest] 9 | addopts = --pep8 --ignore=doc --ignore=setup.py --ignore=examples --ignore=beard/ext --doctest-modules --cov=beard --cov-report=term-missing --cov-config=.coveragerc 10 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This file is part of Beard. 3 | # Copyright (C) 2016 CERN. 4 | # 5 | # Beard is a free software; you can redistribute it and/or modify it 6 | # under the terms of the Revised BSD License; see LICENSE file for 7 | # more details. 8 | 9 | set -e 10 | 11 | check-manifest --ignore miniconda.sh 12 | python setup.py test 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Setup file for Beard. 11 | 12 | .. codeauthor:: Mateusz Susik 13 | .. codeauthor:: Jan Aage Lavik 14 | 15 | """ 16 | 17 | from setuptools import setup, find_packages 18 | from setuptools.command.test import test as TestCommand 19 | import os 20 | import re 21 | import sys 22 | 23 | 24 | class PyTest(TestCommand): 25 | 26 | """Handle ``python setup.py test``.""" 27 | 28 | user_options = [("pytest-args=", "a", "Arguments to pass to py.test")] 29 | 30 | def initialize_options(self): 31 | """Read options from ``pytest.ini`` config file.""" 32 | TestCommand.initialize_options(self) 33 | try: 34 | from ConfigParser import ConfigParser 35 | except ImportError: 36 | from configparser import ConfigParser 37 | config = ConfigParser() 38 | config.read("pytest.ini") 39 | self.pytest_args = config.get("pytest", "addopts").split(" ") 40 | 41 | def finalize_options(self): 42 | """Finalize options.""" 43 | TestCommand.finalize_options(self) 44 | self.test_args = [] 45 | self.test_suite = True 46 | 47 | def run_tests(self): 48 | """Run tests using pytest library.""" 49 | # import here, cause outside the eggs aren't loaded 50 | import pytest 51 | errno = pytest.main(self.pytest_args) 52 | sys.exit(errno) 53 | 54 | 55 | packages = find_packages(exclude=['doc', 'examples']) 56 | # Get the version string. Cannot be done with import! 
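# (Presumably because importing the package at build time would require its
# runtime dependencies -- numpy, scipy, scikit-learn, ... -- to already be
# installed, the version is parsed out of beard/__init__.py with a regular
# expression below instead.)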
57 | with open(os.path.join("beard", "__init__.py"), "rt") as f: 58 | _version = re.search( 59 | '__version__\s*=\s*"(?P.*)"\n', 60 | f.read() 61 | ).group("version") 62 | 63 | _classifiers = [ 64 | # classifiers for PyPI 65 | "Development Status :: 4 - Beta", 66 | "Environment :: Console", 67 | "Intended Audience :: Developers", 68 | "License :: OSI Approved :: BSD License", 69 | "Operating System :: OS Independent", 70 | "Programming Language :: Python", 71 | "Programming Language :: Python :: 2", 72 | "Programming Language :: Python :: 2.7", 73 | "Programming Language :: Python :: 3", 74 | "Programming Language :: Python :: 3.6", 75 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 76 | "Topic :: Scientific/Engineering :: Information Analysis" 77 | ] 78 | 79 | _keywords = [ 80 | "author disambiguation", 81 | "machine learning", 82 | "data mining" 83 | ] 84 | 85 | _install_requires = [ 86 | "setuptools-scm<4.0.0", 87 | # jellyfish 0.7 is Python 3 only 88 | "jellyfish<=0.7", 89 | "numpy>=1.9", 90 | "scipy>=0.14", 91 | "scikit-learn>=0.15.2", 92 | "six", 93 | "structlog", 94 | "unidecode", 95 | ] 96 | 97 | if sys.version[0] == '2': 98 | # use version 1.1 due to Soundex bug in 1.2 99 | _install_requires.append("fuzzy==1.1") 100 | else: 101 | # need to use version 1.2 with buggy Soundex for Python 3 compatibility 102 | _install_requires.append("fuzzy~=1.0,>=1.2") 103 | 104 | _tests_require = [ 105 | "coverage", 106 | "pytest>=2.6.1", 107 | "pytest-cache>=1.0", 108 | "pytest-cov>=1.8.0", 109 | "pytest-pep8>=1.0.6", 110 | ] 111 | 112 | _parameters = { 113 | "author": "CERN", 114 | "author_email": "admin@inspirehep.net", 115 | "classifiers": _classifiers, 116 | "cmdclass": {"test": PyTest}, 117 | "description": "Bibliographic Entity Automatic \ 118 | Recognition and Disambiguation", 119 | "install_requires": _install_requires, 120 | "keywords": _keywords, 121 | "license": "BSD", 122 | "long_description": open("README.rst").read(), 123 | "name": "beard", 124 | "packages": packages, 125 | "platforms": "any", 126 | "tests_require": _tests_require, 127 | "url": "https://github.com/inspirehep/beard", 128 | "version": _version, 129 | } 130 | 131 | setup(**_parameters) 132 | -------------------------------------------------------------------------------- /tests/clustering/test_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of _Block class. 11 | 12 | .. codeauthor:: Mateusz Susik 13 | 14 | """ 15 | 16 | import pytest 17 | 18 | from beard.clustering.blocking_funcs import _Block 19 | 20 | 21 | @pytest.fixture 22 | def block(): 23 | """Create a block for mr Abc, D. 
Vasquez.""" 24 | return _Block(*(("ABC",), ("D", "VSQ"))) 25 | 26 | 27 | def test_add_signature(block): 28 | """Test adding signatures to the cluster.""" 29 | assert block._content[("ABC",)][("D", "VSQ")] == 1 30 | block.add_signature(*(("ABC",), ("D", "VSQ"))) 31 | assert block._content[("ABC",)][("D", "VSQ")] == 2 32 | block.add_signature(*(("ABC",), ("E",))) 33 | assert block._content[("ABC",)][("E",)] == 1 34 | block.add_signature(*(("ABD",), ("D", "VSQ",))) 35 | assert block._content[("ABD",)][("D", "VSQ")] == 1 36 | block.add_signature(*(("ABC", ""), ("D", "VSQ"))) 37 | # Check handling of multiple surnames 38 | block.add_signature(*(("ABD", "EFG"), ("D", "VSQ",))) 39 | assert block._content[("ABD", "EFG")][("D", "VSQ")] == 1 40 | assert block._content[("ABC",)][("D", "VSQ")] == 2 41 | 42 | 43 | def test_compare_tokens_from_last(block): 44 | """Test comparing tokens from the back.""" 45 | assert block.compare_tokens_from_last(("VSQ",), ("ABC",)) 46 | assert block.compare_tokens_from_last(("C", "D", "VSQ",), ("ABC",)) 47 | with pytest.raises(KeyError) as excinfo: 48 | block.compare_tokens_from_last(("VSQ",), ("DEF")) 49 | assert "cluster doesn't contain a key" in str(excinfo.value) 50 | assert not block.compare_tokens_from_last(("VSD",), ("ABC",)) 51 | assert not block.compare_tokens_from_last(("DGM", "VSQ"), ("ABC",)) 52 | 53 | 54 | def test_contains(block): 55 | """Test contains method.""" 56 | assert block.contains(("ABC",)) 57 | assert not block.contains(("DEF",)) 58 | -------------------------------------------------------------------------------- /tests/clustering/test_blocking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of blocking for clustering. 11 | 12 | .. 
codeauthor:: Gilles Louppe 13 | 14 | """ 15 | from __future__ import division 16 | 17 | import numpy as np 18 | from numpy.testing import assert_equal 19 | from numpy.testing import assert_array_equal 20 | 21 | from pytest import mark 22 | import pytest 23 | 24 | from sklearn.cluster import AgglomerativeClustering 25 | from sklearn.cluster import MiniBatchKMeans 26 | from sklearn.datasets import make_blobs 27 | from sklearn.metrics.pairwise import euclidean_distances 28 | from sklearn.utils import check_random_state 29 | 30 | from beard.clustering import BlockClustering 31 | from beard.clustering import ScipyHierarchicalClustering 32 | from beard.metrics import paired_f_score 33 | 34 | random_state = check_random_state(42) 35 | X, y = make_blobs(centers=4, shuffle=False, random_state=random_state) 36 | 37 | 38 | def _distance(X_ids): 39 | return euclidean_distances(X[X_ids.ravel()]) 40 | 41 | 42 | @mark.parametrize('n_jobs', (1, 2)) 43 | def test_fit(n_jobs): 44 | """Test fit.""" 45 | # Single block 46 | clusterer = BlockClustering( 47 | blocking="single", 48 | base_estimator=AgglomerativeClustering(n_clusters=4, 49 | linkage="complete"), 50 | n_jobs=n_jobs) 51 | clusterer.fit(X) 52 | 53 | assert_equal(len(clusterer.clusterers_), 1) 54 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 55 | 56 | # Precomputed blocks 57 | clusterer = BlockClustering( 58 | blocking="precomputed", 59 | base_estimator=AgglomerativeClustering(n_clusters=2, 60 | linkage="complete"), 61 | n_jobs=n_jobs) 62 | clusterer.fit(X, blocks=(y <= 1)) 63 | 64 | assert_equal(len(clusterer.clusterers_), 2) 65 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 66 | 67 | # Precomputed affinity 68 | clusterer = BlockClustering( 69 | affinity="precomputed", 70 | blocking="precomputed", 71 | base_estimator=ScipyHierarchicalClustering(affinity="precomputed", 72 | n_clusters=2, 73 | method="complete"), 74 | n_jobs=n_jobs) 75 | X_affinity = euclidean_distances(X) 76 | clusterer.fit(X_affinity, blocks=(y <= 1)) 77 | 78 | assert_equal(len(clusterer.clusterers_), 2) 79 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 80 | 81 | # Custom blocking function 82 | X_ids = np.arange(len(X)).reshape((-1, 1)) 83 | 84 | def _blocking(X_ids): 85 | return y[X_ids.ravel()] <= 1 # block labels into {0,1} and {2,3} 86 | 87 | clusterer = BlockClustering( 88 | blocking=_blocking, 89 | base_estimator=AgglomerativeClustering(n_clusters=2, 90 | linkage="complete", 91 | affinity=_distance)) 92 | clusterer.fit(X_ids) 93 | 94 | assert_equal(len(clusterer.clusterers_), 2) 95 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 96 | 97 | 98 | def test_partial_fit(): 99 | """Test partial_fit.""" 100 | blocks = (y <= 1) 101 | 102 | clusterer1 = BlockClustering(blocking="precomputed", 103 | base_estimator=MiniBatchKMeans(n_clusters=2)) 104 | clusterer1.partial_fit(X[y <= 1], blocks=blocks[y <= 1]) 105 | assert_equal(len(clusterer1.clusterers_), 1) 106 | clusterer1.partial_fit(X[y > 1], blocks=blocks[y > 1]) 107 | assert_equal(len(clusterer1.clusterers_), 2) 108 | 109 | clusterer2 = BlockClustering(blocking="precomputed", 110 | base_estimator=MiniBatchKMeans(n_clusters=2)) 111 | clusterer2.fit(X, blocks=blocks) 112 | 113 | c1 = clusterer1.predict(X, blocks=blocks) 114 | c2 = clusterer2.labels_ 115 | 116 | assert_equal(paired_f_score(c1, c2), 1.0) 117 | 118 | 119 | def test_onthefly_labels(): 120 | """Test assigning labels on the fly.""" 121 | clusterer = BlockClustering( 122 | 
base_estimator=ScipyHierarchicalClustering(n_clusters=1, 123 | method="complete")) 124 | clusterer.fit(X) 125 | assert_array_equal([100], np.bincount(clusterer.labels_)) 126 | clusterer.clusterers_[0].set_params(n_clusters=4) 127 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 128 | 129 | 130 | def test_predict(): 131 | """Test predict.""" 132 | clusterer = BlockClustering(blocking="precomputed", 133 | base_estimator=MiniBatchKMeans(n_clusters=2)) 134 | clusterer.fit(X, blocks=(y <= 1)) 135 | pred = clusterer.predict(X, blocks=(y <= 1)) 136 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 137 | 138 | pred = clusterer.predict(X, blocks=10 * np.ones(len(X))) 139 | assert_array_equal(-np.ones(len(X)), pred) 140 | 141 | 142 | @mark.parametrize('n_jobs', (1, 2)) 143 | def test_single_signature(n_jobs): 144 | """Test clustering of a single signature.""" 145 | import numbers 146 | clusterer = BlockClustering(base_estimator=MiniBatchKMeans(n_clusters=2)) 147 | clusterer.fit(np.array([X[0]])) 148 | assert isinstance(clusterer.predict(X[0])[0], numbers.Integral) 149 | 150 | 151 | def test_validation(): 152 | """Test the validation of hyper-parameters and input data.""" 153 | with pytest.raises(ValueError): 154 | clusterer = BlockClustering( 155 | blocking="foobar", 156 | base_estimator=MiniBatchKMeans(n_clusters=2)) 157 | clusterer.fit(X) 158 | 159 | with pytest.raises(ValueError): 160 | clusterer = BlockClustering( 161 | blocking="precomputed", 162 | base_estimator=MiniBatchKMeans(n_clusters=2)) 163 | clusterer.fit(X) 164 | 165 | with pytest.raises(ValueError): 166 | clusterer = BlockClustering( 167 | blocking="precomputed", 168 | base_estimator=MiniBatchKMeans(n_clusters=2)) 169 | clusterer.fit(X, blocks=(y <= 1)) 170 | clusterer.predict(X) 171 | -------------------------------------------------------------------------------- /tests/clustering/test_blocking_funcs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of the blocking algorithm. 11 | 12 | .. 
codeauthor:: Mateusz Susik 13 | 14 | """ 15 | 16 | import numpy as np 17 | 18 | from beard.clustering.blocking_funcs import block_phonetic 19 | from beard.clustering.blocking_funcs import block_last_name_first_initial 20 | 21 | 22 | def run_blocking(names, expected_results, threshold=100): 23 | """Run dm_blocking and assert that the results are correct.""" 24 | sigs = np.array([[{'author_name': sig}] for sig in names]) 25 | for index, value in enumerate(block_phonetic(sigs, threshold)): 26 | assert value == expected_results[index] 27 | 28 | 29 | def test_single_signature(): 30 | """Cluster one signature.""" 31 | run_blocking(['Smith, Joe'], ['SM0']) 32 | 33 | 34 | def test_first_surname_included(): 35 | """Check first surname full match.""" 36 | run_blocking(['Smith-Jones, Joe', 'Smith, Joe', 37 | 'Jones, Paul', 'Smith-Jones, Paul'], 38 | ['SM0', 'SM0', 'JNS', 'SM0']) 39 | 40 | 41 | def test_last_surname_included(): 42 | """Check last surname full match.""" 43 | run_blocking(['Jones-Smith, Joe', 'Smith, Joe', 'Jones-Smith, Paul'], 44 | ['SM0', 'SM0', 'SM0']) 45 | 46 | 47 | def test_no_suitable_block_for_multiple_surnames(): 48 | """Check if a block is created for surnames that don't match.""" 49 | run_blocking(['Jones-Smith, Joe'], ['SM0']) 50 | 51 | 52 | def test_precluster_split(): 53 | """Check if huge blocks are split.""" 54 | run_blocking(['Smith, Joe', 'Smith, Paul'], ['SM0j', 'SM0p'], 55 | threshold=1) 56 | 57 | 58 | def test_compare_tokens_from_last_usage(): 59 | """Check if the surnames are compared to the first_names.""" 60 | run_blocking(['Jones, Joe', 'Smith, Joe Jones', 'Jones, Joe', 61 | 'Jones-Smith, Joe'], ['JNS', 'SM0', 'JNS', 'SM0']) 62 | 63 | 64 | def test_block_last_name_first_initial(): 65 | """Block using LNFI strategy.""" 66 | names = ['Smith, Jonh', 'Smith, James', 'Smith, Peter', 'Smit, John'] 67 | sigs = np.array([[{'author_name': sig}] for sig in names]) 68 | lnfi_blocking = block_last_name_first_initial(sigs) 69 | assert lnfi_blocking.tolist() == ['smith j', 'smith j', 70 | 'smith p', 'smit j'] 71 | -------------------------------------------------------------------------------- /tests/clustering/test_wrappers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Test of clustering wrappers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. 
codeauthor:: Hussein Al-Natsheh 14 | 15 | """ 16 | from __future__ import division 17 | 18 | from functools import partial 19 | import numpy as np 20 | from numpy.testing import assert_equal 21 | from numpy.testing import assert_array_equal 22 | import pytest 23 | 24 | from sklearn.datasets import make_blobs 25 | from sklearn.metrics.pairwise import euclidean_distances 26 | from sklearn.utils import check_random_state 27 | 28 | from beard.metrics import b3_f_score 29 | from beard.metrics import silhouette_score 30 | from beard.clustering import ScipyHierarchicalClustering 31 | 32 | 33 | def generate_data(supervised=False, affinity=False): 34 | rng = check_random_state(42) 35 | X, y = make_blobs(centers=4, cluster_std=0.01, 36 | shuffle=False, random_state=rng) 37 | 38 | if affinity: 39 | d = euclidean_distances(X) 40 | d = (d + d.T) / 2.0 41 | d /= d.max() 42 | X = d 43 | 44 | if supervised: 45 | mask = rng.randint(2, size=len(y)).astype(np.bool) 46 | y[mask] = -1 47 | 48 | else: 49 | y[:] = -1 50 | 51 | return X, y 52 | 53 | 54 | def test_shc_semi_supervised_scoring_data_raw(): 55 | """Test semi-supervised learning for SHC when scoring_data='raw'.""" 56 | X, y = generate_data(supervised=True, affinity=False) 57 | 58 | def _scoring(X_raw, labels_true, labels_pred): 59 | assert X_raw.shape == X.shape 60 | score = b3_f_score(labels_true, labels_pred) 61 | return score 62 | 63 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring, 64 | scoring_data="raw") 65 | clusterer.fit(X, y) 66 | labels = clusterer.labels_ 67 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 68 | 69 | 70 | def test_shc_semi_supervised_scoring_data_affinity(): 71 | """Test semi-supervised learning for SHC when scoring_data='affinity'.""" 72 | # Passing feature matrix 73 | X1, y1 = generate_data(supervised=True, affinity=False) 74 | 75 | def _scoring1(X_affinity, labels_true, labels_pred): 76 | assert X_affinity.shape[0] == X_affinity.shape[1] 77 | assert X_affinity.shape != X1.shape 78 | score = b3_f_score(labels_true, labels_pred) 79 | return score 80 | 81 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring1, 82 | scoring_data="affinity", 83 | affinity=euclidean_distances) 84 | clusterer.fit(X1, y1) 85 | labels = clusterer.labels_ 86 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 87 | 88 | # Passing affinity matrix 89 | X2, y2 = generate_data(supervised=True, affinity=True) 90 | 91 | def _scoring2(X_affinity, labels_true, labels_pred): 92 | assert X_affinity.shape[0] == X_affinity.shape[1] 93 | assert X_affinity.shape == X2.shape 94 | score = b3_f_score(labels_true, labels_pred) 95 | return score 96 | 97 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring2, 98 | scoring_data="affinity", 99 | affinity="precomputed") 100 | clusterer.fit(X2, y2) 101 | labels = clusterer.labels_ 102 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 103 | 104 | 105 | def test_shc_semi_supervised_scoring_data_none(): 106 | """Test semi-supervised learning for SHC when scoring_data is None.""" 107 | X, y = generate_data(supervised=True, affinity=False) 108 | 109 | def _scoring(labels_true, labels_pred): 110 | score = b3_f_score(labels_true, labels_pred) 111 | return score 112 | 113 | # We should find all 4 clusters 114 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring) 115 | clusterer.fit(X, y) 116 | labels = clusterer.labels_ 117 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 118 | 119 | 120 | def 
test_shc_unsupervised_scoring_data_raw(): 121 | """Test unsupervised clustering for SHC when scoring_data='raw'.""" 122 | X, _ = generate_data(supervised=False, affinity=False) 123 | _scoring = partial(silhouette_score, metric="euclidean") 124 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 125 | unsupervised_scoring=_scoring, 126 | scoring_data="raw") 127 | labels = clusterer.fit_predict(X) 128 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 129 | 130 | 131 | def test_shc_unsupervised_scoring_data_affinity(): 132 | """Test unsupervised clustering for SHC when scoring_data='affinity'.""" 133 | # Passing feature matrix 134 | X, _ = generate_data(supervised=False, affinity=False) 135 | _scoring = partial(silhouette_score, metric="precomputed") 136 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 137 | unsupervised_scoring=_scoring, 138 | scoring_data="affinity") 139 | labels = clusterer.fit_predict(X) 140 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 141 | 142 | # Passing affinity matrix 143 | X, _ = generate_data(supervised=False, affinity=True) 144 | _scoring = partial(silhouette_score, metric="precomputed") 145 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 146 | unsupervised_scoring=_scoring, 147 | scoring_data="affinity") 148 | labels = clusterer.fit_predict(X) 149 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 150 | 151 | 152 | def test_shc_unsupervised_scoring_data_None(): 153 | """Test unsupervised clustering for SHC when scoring_data is None.""" 154 | X, _ = generate_data(supervised=False, affinity=False) 155 | 156 | def _scoring(labels_pred): 157 | return -np.inf 158 | 159 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 160 | unsupervised_scoring=_scoring) 161 | labels = clusterer.fit_predict(X) 162 | assert_array_equal([100], np.bincount(labels)) 163 | 164 | 165 | def test_shc_default_euclidean(): 166 | """Test default parameters of SHC, using euclidean distance.""" 167 | X, _ = generate_data(supervised=False, affinity=False) 168 | clusterer = ScipyHierarchicalClustering(n_clusters=4) 169 | labels = clusterer.fit_predict(X) 170 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 171 | 172 | 173 | def test_shc_custom_affinity(): 174 | """Test custom affinity function in SHC.""" 175 | X, _ = generate_data(supervised=False, affinity=False) 176 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 177 | n_clusters=4) 178 | labels = clusterer.fit_predict(X) 179 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 180 | 181 | 182 | def test_shc_precomputed_distance(): 183 | """Test using precomputed distances in SHC.""" 184 | X, _ = generate_data(supervised=False, affinity=True) 185 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 186 | n_clusters=4) 187 | labels = clusterer.fit_predict(X) 188 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 189 | 190 | 191 | def test_shc_n_clusters(): 192 | """Test changing number of clusters in SHC.""" 193 | X, _ = generate_data(supervised=False, affinity=True) 194 | 195 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 196 | n_clusters=4) 197 | 198 | labels = clusterer.fit_predict(X) 199 | assert_equal(len(np.unique(labels)), 4) 200 | clusterer.set_params(n_clusters=10) 201 | labels = clusterer.labels_ 202 | assert_equal(len(np.unique(labels)), 10) 203 | 204 | 205 | def test_shc_threshold(): 206 | """Test changing threshold in SHC.""" 207 | X, _ = 
generate_data(supervised=False, affinity=True) 208 | 209 | # n_clusters has precedence over threshold 210 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 211 | n_clusters=2) 212 | labels1 = clusterer.fit_predict(X) 213 | clusterer.set_params(threshold=clusterer.linkage_[-4, 2]) 214 | labels2 = clusterer.labels_ 215 | assert_array_equal(labels1, labels2) 216 | assert_equal(len(np.unique(labels1)), 2) 217 | 218 | # change threshold 219 | clusterer.set_params(best_threshold_precedence=False) 220 | clusterer.set_params(n_clusters=None, 221 | threshold=clusterer.linkage_[-5, 2]) 222 | labels = clusterer.labels_ 223 | assert_equal(len(np.unique(labels)), 5) 224 | clusterer.set_params(threshold=clusterer.linkage_[-4, 2]) 225 | labels = clusterer.labels_ 226 | assert_equal(len(np.unique(labels)), 4) 227 | 228 | 229 | def test_shc_validation(): 230 | """Test the validation of hyper-parameters and input data in SHC""" 231 | X, _ = generate_data(supervised=False, affinity=False) 232 | 233 | with pytest.raises(ValueError): 234 | clusterer = ScipyHierarchicalClustering(n_clusters=len(X) + 1) 235 | labels = clusterer.fit_predict(X) 236 | 237 | with pytest.raises(ValueError): 238 | clusterer = ScipyHierarchicalClustering(n_clusters=-1) 239 | labels = clusterer.fit_predict(X) 240 | 241 | with pytest.raises(ValueError): 242 | clusterer = ScipyHierarchicalClustering(scoring_data="affinity") 243 | labels = clusterer.fit_predict(X) 244 | -------------------------------------------------------------------------------- /tests/metrics/test_clustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Test clustering evaluation metrics. 11 | 12 | .. codeauthor:: Evangelos Tzemis 13 | .. 
codeauthor:: Gilles Louppe 14 | 15 | """ 16 | from __future__ import division 17 | 18 | import numpy as np 19 | from numpy.testing import assert_equal 20 | from numpy.testing import assert_almost_equal 21 | import pytest 22 | 23 | from beard.metrics.clustering import b3_precision_recall_fscore 24 | from beard.metrics.clustering import b3_precision_score 25 | from beard.metrics.clustering import b3_recall_score 26 | from beard.metrics.clustering import b3_f_score 27 | from beard.metrics.clustering import paired_precision_recall_fscore 28 | from beard.metrics.clustering import paired_precision_score 29 | from beard.metrics.clustering import paired_recall_score 30 | from beard.metrics.clustering import paired_f_score 31 | from beard.metrics.clustering import _cluster_samples 32 | from beard.metrics.clustering import _general_merge_distance 33 | 34 | 35 | def test_b3_precision_recall_fscore(): 36 | """Test the results of b3_precision_recall_fscore.""" 37 | # test for the border case where score maximum 38 | y = [1, 2, 1, 3, 2, 4, 5, 4] 39 | assert_equal(b3_precision_recall_fscore(y, y), (1, 1, 1)) 40 | 41 | # test for border case when predicting singletons 42 | y_true = [1, 1, 2, 2] 43 | y_pred = [1, 2, 3, 4] 44 | assert_equal(b3_precision_recall_fscore(y_true, y_pred), (1, 0.5, 2 / 3)) 45 | 46 | 47 | def test_b3_precision_score(): 48 | """Test the returned results of b3_precision_score.""" 49 | y_true = [1, 1, 2, 2, 3, 4, 5] 50 | y_pred = [1, 2, 2, 2, 3, 4, 5] 51 | assert_almost_equal(b3_precision_score(y_true, y_pred), 17 / 21) 52 | 53 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 54 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 55 | assert_equal(b3_precision_score(y_true, y_pred), 13 / 16) 56 | 57 | # test for the trivial maximum case 58 | assert_equal(b3_precision_score(y_true, y_true), 1) 59 | 60 | 61 | def test_b3_recall_score(): 62 | """Test the returned results of b3_recall_score.""" 63 | y_true = [1, 1, 2, 2, 3, 4, 5] 64 | y_pred = [1, 2, 2, 2, 3, 4, 5] 65 | assert_almost_equal(b3_recall_score(y_true, y_pred), 6 / 7) 66 | 67 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 68 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 69 | assert_equal(b3_recall_score(y_true, y_pred), 7 / 8) 70 | 71 | # test for the trivial maximum case 72 | assert_equal(b3_recall_score(y_true, y_true), 1) 73 | 74 | 75 | def test_b3_f_score(): 76 | """Test the returned results of b3_f_score.""" 77 | y_true = [1, 1, 2, 2, 3, 4, 5] 78 | y_pred = [1, 2, 2, 2, 3, 4, 5] 79 | desired_output = 2 * (17 / 21) * (6 / 7) / (17 / 21 + 6 / 7) 80 | assert_almost_equal(b3_f_score(y_true, y_pred), desired_output) 81 | 82 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 83 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 84 | desired_output = 2 * (13 / 16) * (7 / 8) / (13 / 16 + 7 / 8) 85 | assert_almost_equal(b3_f_score(y_true, y_pred), desired_output) 86 | 87 | # test for the trivial maximum case 88 | assert_equal(b3_f_score(y_true, y_true), 1) 89 | 90 | 91 | def test_b3_label_invariability(): 92 | """Test that paired P/R/F values are label invariant.""" 93 | y = [1, 2, 1, 3, 2, 4, 5, 4] 94 | y_prime_invariant = [3, 6, 6, 5, 6, 2, 4, 2] 95 | y_prime = [2, 3, 3, 4, 3, 5, 1, 5] 96 | assert_equal(b3_precision_recall_fscore(y, y_prime), 97 | b3_precision_recall_fscore(y, y_prime_invariant)) 98 | 99 | 100 | def test_b3_raise_error(): 101 | """Test the raise of the ValueError exception for paired P/R/F.""" 102 | y = np.array([1, 2, 1, 3, 2, 4, 5, 4]) 103 | 104 | # test raise when not 1d shape 105 | y = y.reshape(2, 4) 106 | with pytest.raises(ValueError): 107 | b3_precision_recall_fscore(y, y) 108 | 109 
| # test raise when different size of elements 110 | y = y.reshape(8, 1) 111 | with pytest.raises(ValueError): 112 | b3_precision_recall_fscore(y[1:], y[2:]) 113 | 114 | # test error raise when labels_true is empty 115 | with pytest.raises(ValueError): 116 | b3_precision_recall_fscore(y, []) 117 | 118 | # test error raise when labels_pred is empty 119 | with pytest.raises(ValueError): 120 | b3_precision_recall_fscore([], y) 121 | 122 | # test error raise when both inputs are empty 123 | with pytest.raises(ValueError): 124 | b3_precision_recall_fscore([], []) 125 | 126 | 127 | def test_paired_precision_recall_fscore(): 128 | """Test the results of paired_precision_recall_fscore.""" 129 | # test for border case where score is maximum 130 | y = [1, 2, 1, 3, 2, 4, 5, 4] 131 | assert_equal(paired_precision_recall_fscore(y, y), (1, 1, 1)) 132 | 133 | # test for border case where score is minimum 134 | y_true = [1, 2, 1, 3, 2, 4, 5, 4] 135 | y_pred = [1, 1, 2, 2, 3, 3, 4, 4] 136 | assert_equal(paired_precision_recall_fscore(y_true, y_pred), (0, 0, 0)) 137 | 138 | 139 | def test_paired_precision_score(): 140 | """Test the returned results of paired_precision_score.""" 141 | y_true = [1, 1, 2, 2, 3, 4, 5] 142 | y_pred = [1, 2, 2, 2, 3, 4, 5] 143 | assert_almost_equal(paired_precision_score(y_true, y_pred), 1 / 3) 144 | 145 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 146 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 147 | assert_equal(paired_precision_score(y_true, y_pred), 4 / 7) 148 | 149 | # test for the trivial maximum case 150 | assert_equal(paired_precision_score(y_true, y_true), 1) 151 | 152 | 153 | def test_paired_recall_score(): 154 | """Test the returned results of paired_recall_score.""" 155 | y_true = [1, 1, 2, 2, 3, 4, 5] 156 | y_pred = [1, 2, 2, 2, 3, 4, 5] 157 | assert_almost_equal(paired_recall_score(y_true, y_pred), 0.5) 158 | 159 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 160 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 161 | assert_equal(paired_recall_score(y_true, y_pred), 4 / 5) 162 | 163 | # test for the trivial maximum case 164 | assert_equal(paired_recall_score(y_true, y_true), 1) 165 | 166 | 167 | def test_paired_f_score(): 168 | """Test the returned results of paired_f_score.""" 169 | y_true = [1, 1, 2, 2, 3, 4, 5] 170 | y_pred = [1, 2, 2, 2, 3, 4, 5] 171 | desired_output = 2 * (1 / 3) * 0.5 / (1 / 3 + 0.5) 172 | assert_almost_equal(paired_f_score(y_true, y_pred), desired_output) 173 | 174 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 175 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 176 | desired_output = 2 * (4 / 7) * (4 / 5) / (4 / 7 + 4 / 5) 177 | assert_almost_equal(paired_f_score(y_true, y_pred), desired_output) 178 | 179 | # test for the trivial maximum case 180 | assert_equal(paired_f_score(y_true, y_true), 1) 181 | 182 | 183 | def test_paired_label_invariability(): 184 | """Test that paired P/R/F values are label invariant.""" 185 | y = [1, 2, 1, 3, 2, 4, 5, 4] 186 | y_prime_invariant = [3, 6, 6, 5, 6, 2, 4, 2] 187 | y_prime = [2, 3, 3, 4, 3, 5, 1, 5] 188 | assert_equal(paired_precision_recall_fscore(y, y_prime), 189 | paired_precision_recall_fscore(y, y_prime_invariant)) 190 | 191 | 192 | def test_paired_raise_error(): 193 | """Test the raise of the ValueError exception for paired P/R/F.""" 194 | y = np.array([1, 2, 1, 3, 2, 4, 5, 4]) 195 | 196 | # test raise when not 1d shape 197 | y = y.reshape(2, 4) 198 | with pytest.raises(ValueError): 199 | paired_precision_recall_fscore(y, y) 200 | 201 | # test raise when different size of elements 202 | y = y.reshape(8, 1) 203 | with pytest.raises(ValueError): 204 | 
paired_precision_recall_fscore(y[1:], y[2:]) 205 | 206 | # test error raise when labels_true is empty 207 | with pytest.raises(ValueError): 208 | paired_precision_recall_fscore(y, []) 209 | 210 | # test error raise when labels_pred is empty 211 | with pytest.raises(ValueError): 212 | paired_precision_recall_fscore([], y) 213 | 214 | # test error raise when both inputs are empty 215 | with pytest.raises(ValueError): 216 | paired_precision_recall_fscore([], []) 217 | 218 | 219 | def test_cluster_samples(): 220 | """Test that samples are correctly separated into appropriate groups.""" 221 | y = [1, 2, 1, 3, 2, 4, 5, 4] 222 | cls_true = {1: [0, 2], 2: [1, 4], 3: [3], 4: [5, 7], 5: [6]} 223 | 224 | assert_equal(cls_true, _cluster_samples(y)) 225 | 226 | 227 | def test_general_merge_distance(): 228 | """Test general merge distance function.""" 229 | y_true = np.array([1, 2, 1, 2, 1, 2]) 230 | y_pred = [1, 1, 1, 2, 2, 2] 231 | 232 | # test for trivial case 233 | assert_equal(_general_merge_distance(y_true, y_true), 0) 234 | 235 | # test that fm and fs have an effect on the result 236 | zero_res = _general_merge_distance(y_true, y_pred, 237 | fm=lambda x, y: 0, 238 | fs=lambda x, y: 0) 239 | assert_equal(zero_res, 0) 240 | 241 | # test for default functions 242 | assert_equal(_general_merge_distance(y_true, y_pred), 4) 243 | -------------------------------------------------------------------------------- /tests/metrics/test_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Test text metrics. 11 | 12 | .. codeauthor:: Petros Ioannidis 13 | .. 
codeauthor:: Evangelos Tzemis 14 | 15 | """ 16 | from __future__ import generators 17 | 18 | from numpy.testing import assert_almost_equal 19 | import pytest 20 | from pytest import mark 21 | 22 | from beard.metrics.text import _find_all 23 | from beard.metrics.text import _jaro_matching 24 | from beard.metrics.text import jaro 25 | from beard.metrics.text import jaro_winkler 26 | from beard.metrics.text import levenshtein 27 | 28 | 29 | @mark.parametrize('s, letter, occur', 30 | (('MARTHA', 'A', (1, 5)), 31 | ('DWAYNE', 'D', (0, )), 32 | ('A', 'A', (0, )), 33 | ('AABAA', 'AA', (0, 3)), 34 | ('ABCD', 'D', (3, )))) 35 | def test_find_all_normal_string(s, letter, occur): 36 | """Test find_all behaviour for average cases.""" 37 | assert tuple(_find_all(s, letter)) == occur 38 | 39 | 40 | @mark.parametrize('s, letter', 41 | (('MARTHA', 'Z'), 42 | ('', 'A'))) 43 | def test_find_all_none_string(s, letter): 44 | """Test find_all behaviour for empty cases.""" 45 | with pytest.raises(StopIteration): 46 | assert next(_find_all(s, letter)) 47 | 48 | 49 | @mark.parametrize('s, letter', 50 | ((set(), 'A'), 51 | (dict(), 'A'), 52 | (int(), 'A'), 53 | (float(), 'A'), 54 | (list(), 'A'))) 55 | def test_find_all_abnormal_string(s, letter): 56 | """Test find_all behaviour when called with wrong objects.""" 57 | with pytest.raises(TypeError): 58 | next(_find_all(s, letter)) 59 | 60 | 61 | @mark.parametrize('s1, s2, match', 62 | (('MARTHA', 'MARHTA', (6, 2)), 63 | ('DWAYNE', 'DUANE', (4, 0)), 64 | ('DUANE', 'DWAYNE', (4, 0)), 65 | ('MARHTA', 'MARTHA', (6, 2)))) 66 | def test_jaro_matching(s1, s2, match): 67 | """Test jaro_matching behaviour.""" 68 | assert _jaro_matching(s1, s2) == match 69 | 70 | 71 | @mark.parametrize('s1, s2, match', 72 | (('MARTHA', 'MARHTA', 0.944), 73 | ('DWAYNE', 'DUANE', 0.822), 74 | ('ABCDEFG', 'ABCDEFG', 1.0), 75 | ('', 'ABCDEFG', 0.0), 76 | ('ABCDEFG', 'HIGKLMN', 0.0), 77 | ('apple', 'apple', 1.0))) 78 | def test_jaro(s1, s2, match): 79 | """Test jaro_similarity_metric behaviour.""" 80 | assert_almost_equal(jaro(s1, s2), match, 3) 81 | 82 | 83 | @mark.parametrize('s1, s2, match', 84 | (('MARTHA', 'MARHTA', 0.961), 85 | ('DWAYNE', 'DUANE', 0.84), 86 | ('ABCDEFG', 'ABCDEFG', 1.0), 87 | ('', 'ABCDEFG', 0.0), 88 | ('ABCDEFG', 'HIGKLMN', 0.0))) 89 | def test_jaro_winkler(s1, s2, match): 90 | """Test jaro_winkler_metric behaviour.""" 91 | assert_almost_equal(jaro_winkler(s1, s2), match, 3) 92 | 93 | 94 | @mark.parametrize('string_a, string_b, distance', 95 | (('back', 'book', 2), 96 | ('weight', 'height', 1), 97 | ('Adam', 'Adams', 1), 98 | ('YES', 'yes', 3), 99 | ('weight', 'muchweigh', 5), 100 | ('grand father', '', len('grand father')), 101 | ('', 'grand father', len('grand father')), 102 | (' ', ' ', 0), 103 | ('', '', 0))) 104 | def test_levenshtein(string_a, string_b, distance): 105 | """Test levenshtein_metric behaviour.""" 106 | assert levenshtein(string_a, string_b) == distance 107 | -------------------------------------------------------------------------------- /tests/similarity/test_pairs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of transformers for paired data. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. 
codeauthor:: Hussein AL-NATSHEH 14 | 15 | """ 16 | 17 | import jellyfish 18 | import numpy as np 19 | from numpy.testing import assert_array_almost_equal 20 | import scipy.sparse as sp 21 | 22 | from sklearn.preprocessing import MinMaxScaler 23 | from sklearn.preprocessing import OneHotEncoder 24 | from sklearn.preprocessing import StandardScaler 25 | try: 26 | from sklearn.cross_validation import train_test_split 27 | except ImportError: 28 | from sklearn.model_selection import train_test_split 29 | from sklearn.datasets import load_iris 30 | from sklearn.svm import LinearSVC 31 | 32 | from beard.similarity import AbsoluteDifference 33 | from beard.similarity import CosineSimilarity 34 | from beard.similarity import ElementMultiplication 35 | from beard.similarity import EstimatorTransformer 36 | from beard.similarity import JaccardSimilarity 37 | from beard.similarity import PairTransformer 38 | from beard.similarity import StringDistance 39 | from beard.utils import FuncTransformer 40 | 41 | 42 | def test_pair_transformer(): 43 | """Test for PairTransformer.""" 44 | X = np.array([[0, 1], [2, 0], [2, 5]], dtype=np.float) 45 | tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1)) 46 | Xt = tf.fit_transform(X) 47 | assert_array_almost_equal(Xt, X + 1) 48 | 49 | X = np.array([[0, 1], [2, 0], [2, 5], 50 | [0, 1], [2, 0], [2, 5]], dtype=np.float) 51 | tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1), 52 | groupby=lambda r: r[0]) 53 | Xt = tf.fit_transform(X) 54 | assert_array_almost_equal(Xt, X + 1) 55 | 56 | X = np.array([[0, 1], [2, 3], [4, 5]], dtype=np.float) 57 | Xt = PairTransformer(element_transformer=MinMaxScaler()).fit_transform(X) 58 | assert_array_almost_equal(Xt, [[0, 0.2], [0.4, 0.6], [0.8, 1.0]]) 59 | 60 | X = np.array([[0, 1], [2, 3]], dtype=np.float) 61 | tf = PairTransformer(element_transformer=OneHotEncoder(sparse=True)) 62 | Xt = tf.fit_transform(X) 63 | assert sp.issparse(Xt) 64 | assert_array_almost_equal(Xt.todense(), [[1, 0, 0, 0, 0, 1, 0, 0], 65 | [0, 0, 1, 0, 0, 0, 0, 1]]) 66 | 67 | X = sp.csr_matrix(np.array([[0, 1], [2, 3]], dtype=np.float)) 68 | tf = PairTransformer(element_transformer=StandardScaler(with_mean=False)) 69 | Xt = tf.fit_transform(X) 70 | assert sp.issparse(Xt) 71 | assert_array_almost_equal(Xt.todense(), [[0, 0.89442719], 72 | [1.78885438, 2.68328157]]) 73 | 74 | 75 | def test_cosine_similarity(): 76 | """Test for CosineSimilarity.""" 77 | X = np.array([[1, 0, 0, 0, 0, 0], 78 | [1, 0, 1, 1, 0, 0], 79 | [1, 0, 0, 1, 0, 0], 80 | [0, 0, 0, 0, 0, 0], 81 | [1, 1, 1, 1, 1, 1]]) 82 | 83 | Xt = CosineSimilarity().fit_transform(X) 84 | assert_array_almost_equal(Xt, [[0.], [2 ** -0.5], [1.], [0.], [1.]]) 85 | 86 | Xt = CosineSimilarity().fit_transform(sp.csr_matrix(X)) 87 | assert_array_almost_equal(Xt, [[0.], [2 ** -0.5], [1.], [0.], [1.]]) 88 | 89 | 90 | def test_absolute_difference(): 91 | """Test for AbsoluteDifference.""" 92 | X = np.array([[0, 0, 0, 0], 93 | [0, 1, 1, 0], 94 | [1, 1, 1, 1], 95 | [1, 0, 0, 1]]) 96 | 97 | Xt = AbsoluteDifference().fit_transform(X) 98 | assert_array_almost_equal(Xt, [[0, 0], [1, 1], [0, 0], [1, 1]]) 99 | 100 | Xt = AbsoluteDifference().fit_transform(sp.csr_matrix(X)) 101 | assert_array_almost_equal(Xt, [[0, 0], [1, 1], [0, 0], [1, 1]]) 102 | 103 | 104 | def test_CharacterEquality(): 105 | """Test for CharacterEquality.""" 106 | X = np.array([['q', 'q'], 107 | ['q', 'a'], 108 | ['q', ''], 109 | ['', ''], 110 | ['', 'q']]) 111 | Xt = 
StringDistance(similarity_function='character_equality').transform(X) 112 | assert_array_almost_equal(Xt, [[1.], [0.], [0.], [0.5], [0.]]) 113 | 114 | 115 | def test_StringDistance(): 116 | """Test for StringDistance.""" 117 | X = np.array([[u'this', u'that'], 118 | [u'that', u't'], 119 | [u't', u't'], 120 | [u't', u'this']]) 121 | Xt = StringDistance().transform(X) 122 | assert_array_almost_equal(Xt, [[jellyfish.jaro_winkler(u'this', u'that')], 123 | [-1.], [-1.], [-1.]]) 124 | 125 | 126 | def test_JaccardSimilarity(): 127 | """Test for JaccardSimilarity.""" 128 | X = np.array([[0, 0, 0, 0, 0, 0, 0, 0], 129 | [0, 0, 1, 1, 0, 1, 0, 1], 130 | [0, 1, 0, 1, 0, 0, 1, 0], 131 | [1, 0, 1, 1, 1, 1, 0, 7], 132 | [0, 3, 0, 1, 0, 9, 0, 1]]) 133 | 134 | Xt = JaccardSimilarity().fit_transform(X) 135 | assert_array_almost_equal(Xt, [[0.], [0.33333333], [0.], [0.5], [1.]]) 136 | 137 | Xt = JaccardSimilarity().fit_transform(sp.csr_matrix(X)) 138 | assert_array_almost_equal(Xt, [[0.], [0.33333333], [0.], [0.5], [1.]]) 139 | 140 | X = np.array([[0, 0, 0, 0, 0, 0, 0, 0], 141 | [0, 0, 0, 0, 0, 0, 0, 0], 142 | [0, 0, 0, 0, 0, 0, 0, 0], 143 | [0, 0, 0, 0, 0, 0, 0, 0]]) 144 | 145 | Xt = JaccardSimilarity().fit_transform(X) 146 | assert_array_almost_equal(Xt, [[0.], [0.], [0.], [0.]]) 147 | 148 | Xt = JaccardSimilarity().fit_transform(sp.csr_matrix(X)) 149 | assert_array_almost_equal(Xt, [[0.], [0.], [0.], [0.]]) 150 | 151 | 152 | def test_EstimatorTransformer(): 153 | """Test for EstimatorTransformer.""" 154 | data = load_iris() 155 | train, test = train_test_split(np.arange(len(data.data)), 156 | test_size=0.08, random_state=42) 157 | X_train = data.data[train] 158 | y_train = data.target[train] 159 | X_test = data.data[test] 160 | 161 | clf = LinearSVC().fit(X_train, y_train) 162 | 163 | y_predict = clf.decision_function(X_test) 164 | 165 | Xt = EstimatorTransformer(clf).fit_transform(X_test) 166 | assert_array_almost_equal(Xt, y_predict) 167 | 168 | 169 | def test_ElementMultiplication(): 170 | """Test for ElementMultiplication.""" 171 | X = np.array([[1.0, 1.0, 1.0, 2.0], 172 | [0.5, 1.0, 1.0, 0.5], 173 | [2.5, 0.2, 10.0, 2.0]]) 174 | 175 | y = np.array([[1.0, 2.0], 176 | [0.5, 0.5], 177 | [25.0, 0.4]]) 178 | 179 | Xt = ElementMultiplication().fit_transform(X) 180 | assert_array_almost_equal(Xt, y) 181 | -------------------------------------------------------------------------------- /tests/utils/test_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of personal names helpers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. 
codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | import pytest 18 | import sys 19 | 20 | import fuzzy 21 | 22 | from beard.ext.metaphone import dm 23 | 24 | from beard.utils.names import phonetic_tokenize_name 25 | from beard.utils.names import given_name_initial 26 | from beard.utils.names import given_name 27 | from beard.utils.names import name_initials 28 | from beard.utils.names import normalize_name 29 | 30 | 31 | def test_name_initials(): 32 | """Test extracting name initials.""" 33 | assert name_initials("Dupont, Jean-René") == set(['D', 'J']) 34 | 35 | 36 | def test_normalize_name(): 37 | """Test of normalize_name.""" 38 | assert normalize_name("Doe, John") == "doe john" 39 | assert normalize_name("Doe, J.") == "doe j" 40 | assert normalize_name("Doe, J") == "doe j" 41 | assert normalize_name("Doe-Foe, Willem") == "doefoe willem" 42 | assert normalize_name("Doe-Foe Willem") == "willem doe foe" 43 | assert normalize_name("Dupont, René") == "dupont rene" 44 | assert normalize_name("Dupont., René") == "dupont rene" 45 | assert normalize_name("Dupont, Jean-René") == "dupont jean rene" 46 | assert normalize_name("Dupont, René, III") == "dupont rene" 47 | assert normalize_name("Dupont, René, Jr.") == "dupont rene" 48 | assert normalize_name("Dupont, J.R.") == "dupont j r" 49 | assert normalize_name("Dupont, J.-R.") == "dupont j r" 50 | assert normalize_name("Dupont") == "dupont" 51 | assert normalize_name("Dupont J.R.") == "dupont j r" 52 | assert normalize_name("von und zu Hohenstein, F.") == "hohenstein f" 53 | assert normalize_name("von und zu Hohenstein, F.", 54 | drop_common_affixes=False) == "vonundzuhohenstein f" 55 | assert normalize_name("Jakub, Ibrahim ibn") == "jakub ibrahim ibn" 56 | assert normalize_name("o'Neill, Jack") == "neill jack" 57 | assert normalize_name("o'Neill, Jack", 58 | drop_common_affixes=False) == "oneill jack" 59 | assert normalize_name("Ben, Robert") == "ben robert" 60 | assert normalize_name("Robert, L. 
W") == "robert l w" 61 | assert normalize_name("Mueller aus Auer, Peter") == \ 62 | "muellerauer peter" 63 | assert normalize_name("Mueller aus Auer, Peter", 64 | drop_common_affixes=False) == \ 65 | "muellerausauer peter" 66 | 67 | 68 | def test_phonetic_tokenize_name_simple(): 69 | """Test of tokenize_name.""" 70 | assert phonetic_tokenize_name("Doe, John") == ((dm(u"Doe")[0],), 71 | (dm(u"John")[0],)) 72 | assert phonetic_tokenize_name("Doe, J.") == \ 73 | phonetic_tokenize_name(u"Doe, J") 74 | assert phonetic_tokenize_name("Doe-Foe, Willem") == ((dm(u"Doe")[0], 75 | dm(u"Foe")[0]), 76 | (dm(u"Willem")[0],)) 77 | assert phonetic_tokenize_name("Dupont, René") == \ 78 | phonetic_tokenize_name("Dupont., René") 79 | assert phonetic_tokenize_name("Dupont, Jean-René") == \ 80 | ((dm(u"Dupont")[0],), (dm(u"Jean")[0], dm(u"René")[0])) 81 | assert phonetic_tokenize_name("Dupont, René, III") == \ 82 | ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"III")[0])) 83 | assert phonetic_tokenize_name("Dupont, René, Jr.") == \ 84 | ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"Jr")[0])) 85 | assert phonetic_tokenize_name("Dupont, J.R.") == \ 86 | phonetic_tokenize_name("Dupont, J.-R.") 87 | assert phonetic_tokenize_name("Dupont") == ((dm(u"Dupont")[0],), ('',)) 88 | assert phonetic_tokenize_name("Jean Dupont") == \ 89 | phonetic_tokenize_name("Dupont, Jean") 90 | 91 | 92 | def test_phonetic_tokenize_name_nysiis(): 93 | assert phonetic_tokenize_name("Dupont, René", "nysiis") == ( 94 | ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))) 95 | 96 | 97 | @pytest.mark.xfail(reason="soundex is broken in fuzzy 1.2.*") 98 | def test_phonetic_tokenize_name_soundex(): 99 | """Test checking if custom phonetic algorithms from fuzzy packages work.""" 100 | soundex = fuzzy.Soundex(5) 101 | assert phonetic_tokenize_name("Dupont, René", "soundex") == ( 102 | # no direct support for unicode in soundex, thus "Rene" 103 | ((soundex(u"Dupont"),), (soundex(u"Rene"),))) 104 | 105 | 106 | def test_phonetic_normalize_name_tokenize_sign(): 107 | """Test correct handling of the cyrillic soft sign.""" 108 | assert phonetic_tokenize_name("Aref'ev, M.") == ((dm(u"Arefev")[0],), 109 | (dm(u"M")[0],)) 110 | # If the following letter is uppercase, split 111 | assert phonetic_tokenize_name("An'Sun, J.") == ((dm(u"An")[0], 112 | dm(u"Sun")[0]), 113 | (dm(u"J")[0],)) 114 | 115 | 116 | def test_phonetic_normalize_name_remove_tokenizefixes(): 117 | """Test correct removal of the common affixes.""" 118 | assert phonetic_tokenize_name("von und zu Hohenstein, F.") == \ 119 | phonetic_tokenize_name("Hohenstein, F.") 120 | # If the name consists of only the common prefixes, don't drop it, as 121 | # it might actually be the correct surname. 122 | assert phonetic_tokenize_name("Ben, Robert") == ((dm(u"Ben")[0],), 123 | (dm(u"Robert")[0],)) 124 | # Don't drop affixes among the first names. 125 | assert phonetic_tokenize_name("Robert, L. 
W.") == ((dm(u"Robert")[0],), 126 | (dm(u"L")[0], 127 | dm(u"W")[0])) 128 | 129 | 130 | def test_given_name_initial(): 131 | """Test the extraction of the first initial.""" 132 | assert given_name_initial("Doe, John") == 'j' 133 | assert given_name_initial("Doe-Foe, Willem") == 'w' 134 | assert given_name_initial("Doe=Foe, Willem John", 1) == 'j' 135 | assert given_name_initial("Dupont, Jean-René") == 'j' 136 | assert given_name_initial("Dupont, René, III") == 'r' 137 | assert given_name_initial("Dupont, René Pierre", 1) == 'p' 138 | assert given_name_initial("Dupont, René, III Pierre", 1) == '' 139 | assert given_name_initial("Mieszko") == '' 140 | assert given_name_initial("John Doe") == 'j' 141 | assert given_name_initial("Dupont, .J") == 'j' 142 | 143 | 144 | def test_given_name(): 145 | """Test given name extraction.""" 146 | assert given_name("Doe, John", 0) == 'John' 147 | assert given_name("Doe, John", 1) == '' 148 | assert given_name("Doe, John William", 0) == 'John' 149 | assert given_name("Doe, John William", 1) == 'William' 150 | assert given_name("Dupont, .J", 0) == ".J" 151 | assert given_name("John Doe", 0) == 'John' 152 | assert given_name("Mieszko", 0) == 'Mieszko' 153 | assert given_name("Dupont, René, III Pierre", 0) == 'René' 154 | assert given_name("Dupont, René, III Pierre", 1) == '' 155 | assert given_name("Dupont, René, III Pierre", 2) == '' 156 | -------------------------------------------------------------------------------- /tests/utils/test_strings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of string helpers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | 14 | """ 15 | 16 | from beard.utils.strings import asciify 17 | 18 | 19 | def test_asciify(): 20 | """Test of asciify.""" 21 | assert asciify("") == "" 22 | assert asciify("foo") == "foo" 23 | assert asciify("bèård") == "beard" 24 | assert asciify("schröder") == "schroder" 25 | -------------------------------------------------------------------------------- /tests/utils/test_transformers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of generic transformers. 11 | 12 | .. 
codeauthor:: Gilles Louppe 13 | 14 | """ 15 | 16 | import numpy as np 17 | from numpy.testing import assert_array_equal 18 | from numpy.testing import assert_equal 19 | 20 | from beard.utils.transformers import FuncTransformer 21 | from beard.utils.transformers import Shaper 22 | 23 | 24 | def test_func_transformer(): 25 | """Test for FuncTransformer.""" 26 | X = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int) 27 | 28 | def myfunc(v): 29 | return v + 1 30 | 31 | Xt = FuncTransformer(myfunc).fit_transform(X) 32 | assert_array_equal(Xt, X + 1) 33 | assert_equal(X.dtype, Xt.dtype) 34 | 35 | Xt = FuncTransformer(myfunc, dtype=np.float).fit_transform(X) 36 | assert_array_equal(Xt, X + 1) 37 | assert_equal(Xt.dtype, np.float) 38 | 39 | 40 | def test_shaper(): 41 | """Test for Shaper.""" 42 | X = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int) 43 | 44 | Xt = Shaper((-1, 1)).fit_transform(X) 45 | assert_array_equal(Xt, [[0], [1], [2], [3], [4], [5]]) 46 | assert_array_equal(Xt.shape, (6, 1)) 47 | 48 | Xt = Shaper((-1,)).fit_transform(X) 49 | assert_array_equal(Xt, [0, 1, 2, 3, 4, 5]) 50 | assert_array_equal(Xt.shape, (6,)) 51 | 52 | Xt = Shaper((-1, 1), order="F").fit_transform(X) 53 | assert_array_equal(Xt, [[0], [3], [1], [4], [2], [5]]) 54 | assert_array_equal(Xt.shape, (6, 1)) 55 | # assert np.isfortran(Xt) 56 | -------------------------------------------------------------------------------- /travis-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This file is part of Beard. 3 | # Copyright (C) 2014 CERN. 4 | # 5 | # Beard is a free software; you can redistribute it and/or modify it 6 | # under the terms of the Revised BSD License; see LICENSE file for 7 | # more details. 8 | 9 | # This script is freely inspired by the Scikit-Learn integration scripts. 10 | # https://github.com/scikit-learn/scikit-learn/blob/master/continuous_integration/install.sh 11 | # License: 3-clause BSD 12 | 13 | set -e 14 | 15 | # Fix the compilers to work around having the Python 3.4 build 16 | # look up g++44 unexpectedly. 17 | export CC=gcc 18 | export CXX=g++ 19 | 20 | # Deactivate the travis-provided virtual environment and set up a 21 | # conda-based environment instead 22 | # deactivate 23 | 24 | # Use the miniconda installer for faster download / install of conda 25 | # itself 26 | 27 | wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh \ 28 | -O miniconda.sh 29 | 30 | chmod +x miniconda.sh && ./miniconda.sh -b 31 | export PATH=/home/travis/miniconda2/bin:$PATH 32 | conda update --yes conda 33 | 34 | # Configure the conda environment and put it in the path using the 35 | # provided versions 36 | conda create -n testenv --yes python=$PYTHON_VERSION pip \ 37 | numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION scikit-learn=$SKLEARN_VERSION \ 38 | pytest pytest-pep8 pytest-cache sphinx 39 | source activate testenv 40 | 41 | python --version 42 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 43 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 44 | --------------------------------------------------------------------------------