├── .coveragerc ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── LICENSE.rst ├── MANIFEST.in ├── README.rst ├── beard ├── __init__.py ├── clustering │ ├── __init__.py │ ├── blocking.py │ ├── blocking_funcs.py │ └── wrappers.py ├── ext │ ├── __init__.py │ └── metaphone.py ├── metrics │ ├── __init__.py │ ├── clustering.py │ └── text.py ├── similarity │ ├── __init__.py │ └── pairs.py └── utils │ ├── __init__.py │ ├── misc.py │ ├── names.py │ ├── strings.py │ └── transformers.py ├── doc ├── Makefile ├── _build │ └── .keep ├── _static │ └── .keep ├── _templates │ └── .keep ├── conf.py ├── index.rst └── make.bat ├── examples ├── README.rst ├── applications │ └── author-disambiguation │ │ ├── README.rst │ │ ├── clustering.py │ │ ├── distance.py │ │ ├── ethnicity.py │ │ ├── sampling.py │ │ └── utils.py ├── author_disambiguation.py └── data │ ├── README.rst │ ├── author-disambiguation.npz │ ├── wang_clusters.json │ ├── wang_records.json │ └── wang_signatures.json ├── miniconda.sh ├── pytest.ini ├── run-tests.sh ├── setup.py ├── tests ├── clustering │ ├── test_block.py │ ├── test_blocking.py │ ├── test_blocking_funcs.py │ └── test_wrappers.py ├── metrics │ ├── test_clustering.py │ └── test_text.py ├── similarity │ └── test_pairs.py └── utils │ ├── test_names.py │ ├── test_strings.py │ └── test_transformers.py └── travis-install.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | # This file is part of Beard. 2 | # Copyright (C) 2014 CERN. 3 | # 4 | # Beard is a free software; you can redistribute it and/or modify it 5 | # under the terms of the Revised BSD License; see LICENSE file for 6 | # more details. 7 | 8 | [run] 9 | source = beard 10 | omit = beard/ext/* 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | *.eggs 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | .python-version 58 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # This file is part of Beard. 2 | # Copyright (C) 2014 CERN. 3 | # 4 | # Beard is a free software; you can redistribute it and/or modify it 5 | # under the terms of the Revised BSD License; see LICENSE file for 6 | # more details. 
7 | 8 | language: python 9 | 10 | sudo: false 11 | 12 | matrix: 13 | include: 14 | - python: "2.7" 15 | env: PYTHON_VERSION="2.7" NUMPY_VERSION="1.10" SCIPY_VERSION="0.17" SKLEARN_VERSION="0.17" 16 | - python: "3.6" 17 | env: PYTHON_VERSION="3.6" 18 | 19 | install: 20 | - if [[ "$PYTHON_VERSION" == "2.7" ]]; then 21 | source travis-install.sh; 22 | fi 23 | - pip install check-manifest coveralls pydocstyle pytest-cov 24 | - python setup.py install 25 | - python setup.py clean --all 26 | 27 | script: 28 | - source run-tests.sh 29 | 30 | after_success: 31 | - coveralls 32 | 33 | notifications: 34 | email: false 35 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Authors 2 | ======= 3 | 4 | Contributors: 5 | 6 | * Gilles Louppe 7 | * Mateusz Susik 8 | * Petros Ioannidis 9 | * Evangelos Tzemis 10 | * Hussein Al-Natsheh 11 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Beard is free software; you can redistribute it and/or modify it 2 | under the terms of the Revised BSD License quoted below. 3 | 4 | Copyright (c) 2014 CERN. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name of Beard nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | In applying this license, CERN does not waive the privileges and 33 | immunities granted to it by virtue of its status as an 34 | Intergovernmental Organization or submit itself to any jurisdiction. 
35 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include *.sh 3 | include .coveragerc 4 | include pytest.ini 5 | recursive-include doc *.bat 6 | recursive-include doc *.keep 7 | recursive-include doc *.py 8 | recursive-include doc *.rst 9 | recursive-include doc Makefile 10 | recursive-include examples *.json 11 | recursive-include examples *.npz 12 | recursive-include examples *.py 13 | recursive-include examples *.rst 14 | recursive-include tests *.py 15 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Beard 3 | ===== 4 | 5 | Beard is a Python library of machine learning tools for Bibliographic Entity 6 | Automatic Recognition and Disambiguation. 7 | 8 | The project is currently in stable stage of development. 9 | 10 | .. image:: https://travis-ci.org/inspirehep/beard.svg?branch=master 11 | :target: https://travis-ci.org/inspirehep/beard 12 | .. image:: https://coveralls.io/repos/inspirehep/beard/badge.png 13 | :target: https://coveralls.io/r/inspirehep/beard 14 | 15 | Installation 16 | ============ 17 | 18 | ``python setup.py install`` 19 | 20 | Examples 21 | ======== 22 | 23 | In the ``examples/applications/author-disambiguation`` directory there are files 24 | that present how to use the library for the author disambiguation problem. 25 | Check the ``README.rst`` in this directory for details. 26 | -------------------------------------------------------------------------------- /beard/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Bibliographic Entity Automatic Recognition and Disambiguation.""" 11 | 12 | __version__ = "0.2.2" 13 | -------------------------------------------------------------------------------- /beard/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Clustering algorithms.""" 11 | 12 | from .blocking import BlockClustering 13 | from .blocking_funcs import block_phonetic 14 | from .blocking_funcs import block_last_name_first_initial 15 | from .blocking_funcs import block_single 16 | from .wrappers import ScipyHierarchicalClustering 17 | 18 | __all__ = ("BlockClustering", 19 | "block_phonetic", 20 | "block_last_name_first_initial", 21 | "block_single", 22 | "ScipyHierarchicalClustering") 23 | -------------------------------------------------------------------------------- /beard/clustering/blocking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 
5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Blocking for clustering estimators. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | from __future__ import print_function 18 | 19 | import numpy as np 20 | import time 21 | import structlog 22 | 23 | from sklearn.base import BaseEstimator 24 | from sklearn.base import clone 25 | from sklearn.base import ClusterMixin 26 | from sklearn.utils import column_or_1d 27 | 28 | from .blocking_funcs import block_single 29 | 30 | LOGGER = structlog.getLogger() 31 | 32 | 33 | class _SingleClustering(BaseEstimator, ClusterMixin): 34 | def fit(self, X, y=None): 35 | self.labels_ = block_single(X) 36 | return self 37 | 38 | def partial_fit(self, X, y=None): 39 | self.labels_ = block_single(X) 40 | return self 41 | 42 | def predict(self, X): 43 | return block_single(X) 44 | 45 | 46 | def _parallel_fit(fit_, partial_fit_, estimator, verbose, data_queue, 47 | result_queue): 48 | """Run clusterer's fit function.""" 49 | # Status can be one of: 'middle', 'end' 50 | # 'middle' means that there is a block to compute and the process should 51 | # continue 52 | # 'end' means that the process should finish as all the data was sent 53 | # by the main process 54 | status, block, existing_clusterer = data_queue.get() 55 | 56 | while status != 'end': 57 | 58 | b, X, y = block 59 | 60 | if len(X) == 1: 61 | clusterer = _SingleClustering() 62 | elif existing_clusterer and partial_fit_ and not fit_: 63 | clusterer = existing_clusterer 64 | else: 65 | clusterer = clone(estimator) 66 | 67 | if verbose > 1: 68 | print("Clustering %d samples on block '%s'..." % (len(X), b)) 69 | LOGGER.info("Clustering %d samples on block '%s'..." % (len(X), b)) 70 | 71 | if fit_ or not hasattr(clusterer, "partial_fit"): 72 | try: 73 | clusterer.fit(X, y=y) 74 | except TypeError: 75 | clusterer.fit(X) 76 | elif partial_fit_: 77 | try: 78 | clusterer.partial_fit(X, y=y) 79 | except TypeError: 80 | clusterer.partial_fit(X) 81 | 82 | result_queue.put((b, clusterer)) 83 | status, block, existing_clusterer = data_queue.get() 84 | 85 | data_queue.put(('end', None, None)) 86 | return 87 | 88 | 89 | def _single_fit(fit_, partial_fit_, estimator, verbose, data): 90 | """Run clusterer's fit function.""" 91 | block, existing_clusterer = data 92 | b, X, y = block 93 | 94 | if len(X) == 1: 95 | clusterer = _SingleClustering() 96 | elif existing_clusterer and partial_fit_ and not fit_: 97 | clusterer = existing_clusterer 98 | else: 99 | clusterer = clone(estimator) 100 | 101 | if verbose > 1: 102 | print("Clustering %d samples on block '%s'..." % (len(X), b)) 103 | LOGGER.info("Clustering %d samples on block '%s'..." % (len(X), b)) 104 | 105 | if fit_ or not hasattr(clusterer, "partial_fit"): 106 | try: 107 | clusterer.fit(X, y=y) 108 | except TypeError: 109 | clusterer.fit(X) 110 | elif partial_fit_: 111 | try: 112 | clusterer.partial_fit(X, y=y) 113 | except TypeError: 114 | clusterer.partial_fit(X) 115 | 116 | return (b, clusterer) 117 | 118 | 119 | class BlockClustering(BaseEstimator, ClusterMixin): 120 | """Implements blocking for clustering estimators. 121 | 122 | Meta-estimator for grouping samples into blocks, within each of which 123 | a clustering base estimator is fit. 
This allows to reduce the cost of 124 | pairwise distance computation from O(N^2) to O(sum_b N_b^2), where 125 | N_b <= N is the number of samples in block b. 126 | 127 | Attributes 128 | ---------- 129 | labels_ : ndarray, shape (n_samples,) 130 | Array of labels assigned to the input data. 131 | if partial_fit is used instead of fit, they are assigned to the 132 | last batch of data. 133 | 134 | blocks_ : ndarray, shape (n_samples,) 135 | Array of keys mapping input data to blocks. 136 | """ 137 | 138 | def __init__(self, affinity=None, blocking="single", base_estimator=None, 139 | verbose=0, n_jobs=1): 140 | """Initialize. 141 | 142 | Parameters 143 | ---------- 144 | :param affinity: string or None 145 | If affinity == 'precomputed', then assume that X is a distance 146 | matrix. 147 | 148 | :param blocking: string or callable, default "single" 149 | The blocking strategy, for mapping samples X to blocks. 150 | - "single": group all samples X[i] into the same block; 151 | - "precomputed": use `blocks[i]` argument (in `fit`, `partial_fit` 152 | or `predict`) as a key for mapping sample X[i] to a block; 153 | - callable: use blocking(X)[i] as a key for mapping sample X[i] to 154 | a block. 155 | 156 | :param base_estimator: estimator 157 | Clustering estimator to fit within each block. 158 | 159 | :param verbose: int, default=0 160 | Verbosity of the fitting procedure. 161 | 162 | :param n_jobs: int 163 | Number of processes to use. 164 | """ 165 | self.affinity = affinity 166 | self.blocking = blocking 167 | self.base_estimator = base_estimator 168 | self.verbose = verbose 169 | self.n_jobs = n_jobs 170 | 171 | def _validate(self, X, blocks): 172 | """Validate hyper-parameters and input data.""" 173 | if self.blocking == "single": 174 | blocks = block_single(X) 175 | elif self.blocking == "precomputed": 176 | if blocks is not None and len(blocks) == len(X): 177 | blocks = column_or_1d(blocks).ravel() 178 | else: 179 | raise ValueError("Invalid value for blocks. When " 180 | "blocking='precomputed', blocks needs to be " 181 | "an array of size len(X).") 182 | elif callable(self.blocking): 183 | blocks = self.blocking(X) 184 | else: 185 | raise ValueError("Invalid value for blocking. Allowed values are " 186 | "'single', 'precomputed' or callable.") 187 | 188 | return X, blocks 189 | 190 | def _blocks(self, X, y, blocks): 191 | """Chop the training data into smaller chunks. 192 | 193 | A chunk is demarcated by the corresponding block. Each chunk contains 194 | only the training examples relevant to given block and a clusterer 195 | which will be used to fit the data. 196 | 197 | Returns 198 | ------- 199 | :returns: generator 200 | Quadruples in the form of ``(block, X, y, clusterer)`` where 201 | X and y are the training examples for given block and clusterer is 202 | an object with a ``fit`` method. 
203 | """ 204 | unique_blocks = np.unique(blocks) 205 | 206 | for b in unique_blocks: 207 | mask = (blocks == b) 208 | X_mask = X[mask, :] 209 | if y is not None: 210 | y_mask = y[mask] 211 | else: 212 | y_mask = None 213 | if self.affinity == "precomputed": 214 | X_mask = X_mask[:, mask] 215 | 216 | yield (b, X_mask, y_mask) 217 | 218 | def _fit(self, X, y, blocks): 219 | """Fit base clustering estimators on X.""" 220 | self.blocks_ = blocks 221 | if self.n_jobs == 1: 222 | LOGGER.info("fitting data with 1 job") 223 | blocks_computed = 0 224 | blocks_all = len(np.unique(blocks)) 225 | LOGGER.info( 226 | "%s blocks computed out of %s" % ( 227 | blocks_computed, blocks_all 228 | ) 229 | ) 230 | for block in self._blocks(X, y, blocks): 231 | if self.partial_fit_ and block[0] in self.clusterers_: 232 | data = (block, self.clusterers_[block[0]]) 233 | else: 234 | data = (block, None) 235 | 236 | b, clusterer = _single_fit(self.fit_, self.partial_fit_, 237 | self.base_estimator, self.verbose, 238 | data) 239 | 240 | if clusterer: 241 | self.clusterers_[b] = clusterer 242 | 243 | if blocks_computed < blocks_all: 244 | print("%s blocks computed out of %s" % (blocks_computed, 245 | blocks_all)) 246 | LOGGER.info( 247 | "%s blocks computed out of %s" % ( 248 | blocks_computed, blocks_all 249 | ) 250 | ) 251 | blocks_computed += 1 252 | else: 253 | LOGGER.info( 254 | "fitting data with {0} parallel jobs".format( 255 | self.n_jobs 256 | ) 257 | ) 258 | try: 259 | from multiprocessing import SimpleQueue 260 | except ImportError: 261 | from multiprocessing.queues import SimpleQueue 262 | 263 | # Here the blocks will be passed to subprocesses 264 | data_queue = SimpleQueue() 265 | # Here the results will be passed back 266 | result_queue = SimpleQueue() 267 | 268 | for x in range(self.n_jobs): 269 | import multiprocessing as mp 270 | processes = [] 271 | 272 | processes.append(mp.Process(target=_parallel_fit, args=( 273 | self.fit_, self.partial_fit_, 274 | self.base_estimator, self.verbose, 275 | data_queue, result_queue))) 276 | processes[-1].start() 277 | 278 | # First n_jobs blocks are sent into the queue without waiting 279 | # for the results. This variable is a counter that takes care of 280 | # this. 281 | presend = 0 282 | blocks_computed = 0 283 | blocks_all = len(np.unique(blocks)) 284 | 285 | for block in self._blocks(X, y, blocks): 286 | if presend >= self.n_jobs: 287 | b, clusterer = result_queue.get() 288 | blocks_computed += 1 289 | if clusterer: 290 | self.clusterers_[b] = clusterer 291 | else: 292 | presend += 1 293 | if self.partial_fit_: 294 | if block[0] in self.clusterers_: 295 | data_queue.put(('middle', block, self.clusterers_[b])) 296 | continue 297 | 298 | data_queue.put(('middle', block, None)) 299 | 300 | # Get the last results and tell the subprocesses to finish 301 | for x in range(self.n_jobs): 302 | if blocks_computed < blocks_all: 303 | print("%s blocks computed out of %s" % (blocks_computed, 304 | blocks_all)) 305 | LOGGER.info( 306 | "%s blocks computed out of %s" % ( 307 | blocks_computed, blocks_all 308 | ) 309 | ) 310 | b, clusterer = result_queue.get() 311 | blocks_computed += 1 312 | if clusterer: 313 | self.clusterers_[b] = clusterer 314 | 315 | data_queue.put(('end', None, None)) 316 | 317 | time.sleep(1) 318 | 319 | return self 320 | 321 | def fit(self, X, y=None, blocks=None): 322 | """Fit individual base clustering estimators for each block. 
323 | 324 | Parameters 325 | ---------- 326 | :param X: {array-like, sparse matrix}, shape (n_samples, n_features) 327 | or (n_samples, n_samples) 328 | Input data, as an array of samples or as a distance matrix if 329 | affinity == 'precomputed'. 330 | 331 | :param y: array-like, shape (n_samples, ) 332 | Input labels, in case of (semi-)supervised clustering. 333 | Labels equal to -1 stand for unknown labels. 334 | 335 | :param blocks: array-like, shape (n_samples, ) 336 | Block labels, if `blocking == 'precomputed'`. 337 | 338 | Returns 339 | ------- 340 | :returns: self 341 | """ 342 | # Validate parameters 343 | X, blocks = self._validate(X, blocks) 344 | 345 | # Reset attributes 346 | self.clusterers_ = {} 347 | self.fit_, self.partial_fit_ = True, False 348 | 349 | return self._fit(X, y, blocks) 350 | 351 | def partial_fit(self, X, y=None, blocks=None): 352 | """Resume fitting of base clustering estimators, for each block. 353 | 354 | This calls `partial_fit` whenever supported by the base estimator. 355 | Otherwise, this calls `fit`, on given blocks only. 356 | 357 | Parameters 358 | ---------- 359 | :param X: {array-like, sparse matrix}, shape (n_samples, n_features) 360 | or (n_samples, n_samples) 361 | Input data, as an array of samples or as a distance matrix if 362 | affinity == 'precomputed'. 363 | 364 | :param y: array-like, shape (n_samples, ) 365 | Input labels, in case of (semi-)supervised clustering. 366 | Labels equal to -1 stand for unknown labels. 367 | 368 | :param blocks: array-like, shape (n_samples, ) 369 | Block labels, if `blocking == 'precomputed'`. 370 | 371 | Returns 372 | ------- 373 | :returns: self 374 | """ 375 | # Validate parameters 376 | X, blocks = self._validate(X, blocks) 377 | 378 | # Set attributes if first call 379 | if not hasattr(self, "clusterers_"): 380 | self.clusterers_ = {} 381 | 382 | self.fit_, self.partial_fit_ = False, True 383 | 384 | return self._fit(X, y, blocks) 385 | 386 | def predict(self, X, blocks=None): 387 | """Predict data. 388 | 389 | Parameters 390 | ---------- 391 | :param X: {array-like, sparse matrix}, shape (n_samples, n_features) 392 | Input data. 393 | 394 | :param blocks: array-like, shape (n_samples, ) 395 | Block labels, if `blocking == 'precomputed'`. 396 | 397 | Returns 398 | ------- 399 | :returns: array-like, shape (n_samples) 400 | The labels. 401 | """ 402 | # Validate parameters 403 | X, blocks = self._validate(X, blocks) 404 | 405 | # Predict 406 | labels = -np.ones(len(X), dtype=np.int) 407 | offset = 0 408 | 409 | for b in np.unique(blocks): 410 | # Predict on the block, if known 411 | if b in self.clusterers_: 412 | mask = (blocks == b) 413 | clusterer = self.clusterers_[b] 414 | 415 | pred = np.array(clusterer.predict(X[mask])) 416 | pred[(pred != -1)] += offset 417 | labels[mask] = pred 418 | offset += np.max(clusterer.labels_) + 1 419 | 420 | return labels 421 | 422 | @property 423 | def labels_(self): 424 | """Compute the labels assigned to the input data. 425 | 426 | Note that labels are computed on-the-fly. 
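A minimal usage sketch with toy data (the feature values and block keys
below are made up for illustration; they are not taken from the package's
examples)::

    import numpy as np

    from beard.clustering import BlockClustering
    from beard.clustering import ScipyHierarchicalClustering

    X = np.array([[0.0], [0.1], [5.0], [5.2]])
    blocks = np.array(["a", "a", "b", "b"])

    clusterer = BlockClustering(
        blocking="precomputed",
        base_estimator=ScipyHierarchicalClustering(threshold=0.5))
    clusterer.fit(X, blocks=blocks)
    clusterer.labels_  # e.g. array([0, 0, 1, 1])

Each block is clustered independently and the per-block labels are then
offset so that labels are unique across blocks.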
427 | """ 428 | labels = -np.ones(len(self.blocks_), dtype=np.int) 429 | offset = 0 430 | 431 | for b in self.clusterers_: 432 | mask = (self.blocks_ == b) 433 | clusterer = self.clusterers_[b] 434 | 435 | pred = np.array(clusterer.labels_) 436 | pred[(pred != -1)] += offset 437 | labels[mask] = pred 438 | offset += np.max(clusterer.labels_) + 1 439 | 440 | return labels 441 | -------------------------------------------------------------------------------- /beard/clustering/blocking_funcs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """The algorithms for blocking. 11 | 12 | .. codeauthor:: Mateusz Susik 13 | 14 | """ 15 | 16 | import numpy as np 17 | import six 18 | 19 | from beard.utils import normalize_name 20 | from beard.utils.names import phonetic_tokenize_name 21 | from beard.utils.names import given_name_initial 22 | 23 | 24 | class _Block: 25 | """Representation of a block. 26 | 27 | Block stores information about different variation of names and the 28 | quantities of their appearances on the papers. 29 | 30 | Example of a block _content: 31 | 32 | .. code:: python 33 | 34 | { 35 | ('JNS',): { 36 | ('P',): 2, ('P', 'PL'): 3, ('P', 'JH'): 2 37 | }, 38 | ('JNS', 'SM0'): { 39 | ('PAL', 'JH'): 5, ('JH',): 3, ('SMN',): 2 40 | }, 41 | ('RCN', 'JNS'): { 42 | ('A',): 34 43 | } 44 | } 45 | 46 | From the example above, one can see that the block stores information 47 | about 5 signatures of 'JNS' 'SM0', 'PAL' 'JH'. Those strings are results 48 | of the phonetic algorithm. Such signature might correspond, for 49 | example, to Jones-Smith, Paul John. 50 | """ 51 | 52 | def __init__(self, surnames, given_names): 53 | """Create a block. Add given names from the first signature. 54 | 55 | Parameters 56 | ---------- 57 | :param surnames: tuple 58 | Strings representing surnames on a signature. 59 | :param given_names: tuple 60 | Strings representing given names on a signature. 61 | """ 62 | self._content = {surnames: {given_names: 1}} 63 | 64 | self._name = surnames[-1] 65 | 66 | def add_signature(self, surnames, given_names): 67 | """Add a signature to the block. 68 | 69 | Parameters 70 | ---------- 71 | :param surnames: tuple 72 | Strings representing surnames on a signature. 73 | :param given_names: tuple 74 | Strings representing given_names on a signature. 75 | """ 76 | if surnames in self._content: 77 | if given_names in self._content[surnames]: 78 | self._content[surnames][given_names] += 1 79 | else: 80 | self._content[surnames][given_names] = 1 81 | else: 82 | self._content[surnames] = {given_names: 1} 83 | 84 | def compare_tokens_from_last(self, first_surnames, last_surname): 85 | """Check if a part of the surname matches with given names in block. 86 | 87 | For example, ``Sanchez-Gomez, Juan`` can appear on a signature as 88 | ``Gomez, Juan Sanchez``. This function checks if there is a match 89 | between surnames like ``Sanchez`` and the given names in the block. 90 | In this case, a signature like ``Gomez, J. Sanchez`` will create a 91 | match, while ``Gomez, Juan S.`` won't. 92 | 93 | Full names have to match. Only the signatures with single surname 94 | are used for matching. 
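For illustration, with made-up phonetic tokens (not necessarily the exact
Double Metaphone output for these names), a block created from a
``Gomez, Juan Sanchez`` signature matches a query on the ``Sanchez`` part::

    block = _Block(('GMS',), ('JN', 'SNXS'))
    block.compare_tokens_from_last(('SNXS',), ('GMS',))  # True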
95 | 96 | Parameters 97 | ---------- 98 | :param first_surnames: tuple 99 | Tokens which represent few first surnames. In form of a tuple of 100 | strings. 101 | :param last_surname: tuple 102 | Tokens, usually one, representing last surname(s) of the author. 103 | 104 | Raises 105 | ------ 106 | :raises: KeyError 107 | When the last name is not included in the cluster 108 | 109 | Returns 110 | ------- 111 | :returns: boolean 112 | Information whether cluster contains this author if some of the 113 | first last names are treated as the last given names. 114 | """ 115 | if last_surname in self._content: 116 | for given_names in six.iterkeys(self._content[last_surname]): 117 | given_names_left = len(given_names) 118 | for reversed_index, name in \ 119 | enumerate(reversed(first_surnames)): 120 | if given_names_left == 0: 121 | return True 122 | elif given_names[-(reversed_index + 1)] != name: 123 | break 124 | given_names_left -= 1 125 | if reversed_index == len(first_surnames) - 1: 126 | return True 127 | return False 128 | self._raise_keyerror(last_surname) 129 | 130 | def contains(self, surnames): 131 | """Check if there is at least one signature with given surnames. 132 | 133 | Parameters 134 | ---------- 135 | :param surnames: tuple 136 | Strings representing surnames on a signature. 137 | 138 | Returns 139 | ------- 140 | :returns: boolean 141 | True if there is at least one sinature with given surnames. 142 | """ 143 | return surnames in self._content 144 | 145 | def _raise_keyerror(self, key): 146 | raise KeyError("The cluster doesn't contain a key %s" % key) 147 | 148 | 149 | def _split_blocks(blocks, X, threshold): 150 | splitted_blocks = [] 151 | id_to_size = {} 152 | 153 | for block in blocks: 154 | if block._name in id_to_size: 155 | id_to_size[block._name] += 1 156 | else: 157 | id_to_size[block._name] = 1 158 | 159 | for index, block in enumerate(blocks): 160 | if id_to_size[block._name] > threshold: 161 | 162 | splitted_blocks.append(block._name + 163 | given_name_initial(X[index 164 | ][0]['author_name'])) 165 | else: 166 | splitted_blocks.append(block._name) 167 | 168 | return splitted_blocks 169 | 170 | 171 | def block_phonetic(X, threshold=1000, phonetic_algorithm="double_metaphone"): 172 | """Block the signatures. 173 | 174 | This blocking algorithm takes into consideration the cases, where 175 | author has more than one surname. Such a signature can be assigned 176 | to a block for the first author surname or the last one. 177 | 178 | The names are preprocessed by ``phonetic_tokenize_name`` function. As a 179 | result, here the algorithm operates on ``Double Metaphone`` tokens which 180 | are previously normalized. 181 | 182 | The algorithm has two phases. In the first phase, all the signatures with 183 | one surname are clustered together. Every different surname token creates 184 | a new block. In the second phase, the signatures 185 | with multiple surnames are compared with the blocks for the first and 186 | last surname. 187 | 188 | If the first surnames of author were already used as the last given names 189 | on some of the signatures, the new signature will be assigned to the block 190 | of the last surname. 191 | 192 | Otherwise, the signature will be assigned to the block of 193 | the first surname. 194 | 195 | To prevent creation of too big clusters, the ``threshold`` parameter can 196 | be set. The algorithm will split every block which size is bigger than 197 | ``threshold`` into smaller ones using given names initials as the 198 | condition. 
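A minimal sketch of the expected call, using toy signatures (the returned
block ids are phonetic tokens and depend on the chosen algorithm)::

    import numpy as np

    from beard.clustering import block_phonetic

    X = np.array([[{'author_name': 'Smith, John'}],
                  [{'author_name': 'Smith, J.'}],
                  [{'author_name': 'Doe, Jane'}]], dtype=object)
    block_phonetic(X)
    # one block id string per signature, in the same order as X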
199 | 200 | Parameters 201 | ---------- 202 | :param X: numpy array 203 | Array of one element arrays of dictionaries. Each dictionary 204 | represents a signature. The algorithm needs ``author_name`` field in 205 | the dictionaries in order to work. 206 | :param threshold: integer 207 | Size above which the blocks will be split into smaller ones. 208 | :param phonetic algorithm: string 209 | Which phonetic algorithm will be used. Options: 210 | - "double_metaphone" 211 | - "nysiis" (only for Python 2) 212 | - "soundex" (only for Python 2) 213 | 214 | Returns 215 | ------- 216 | :returns: numpy array 217 | Array with ids of the blocks. The ids are strings. The order of the 218 | array is the same as in the ``X`` input parameter. 219 | """ 220 | # Stores all clusters. It is the only way to access them. 221 | # Every cluster can be accessed by the token that was used to create it. 222 | # It is the last token from the surnames tokens passed to the constructor. 223 | id_to_block = {} 224 | 225 | # List of tuples. Used as the in-between state of the algorithm between 226 | # the first and the second states. The tuple contain the block name 227 | # if the signature has been already blocked or None otherwise, and the 228 | # tokens. 229 | ordered_tokens = [] 230 | 231 | # First phase. 232 | # Create blocks for signatures with single surname 233 | 234 | for signature_array in X[:, 0]: 235 | tokens = phonetic_tokenize_name(signature_array['author_name'], 236 | phonetic_algorithm=phonetic_algorithm) 237 | surname_tokens = tokens[0] 238 | if len(surname_tokens) == 1: 239 | # Single surname case 240 | surname = surname_tokens[0] 241 | if surname not in id_to_block: 242 | id_to_block[surname] = _Block(*tokens) 243 | else: 244 | id_to_block[surname].add_signature(*tokens) 245 | ordered_tokens.append((surname, tokens)) 246 | else: 247 | # Multiple surnames 248 | ordered_tokens.append((None, tokens)) 249 | 250 | # Second phase. 251 | # Assign every signature with multiple surnames to the block of the 252 | # first surname or the block of the last surname. 253 | 254 | blocks = [] 255 | 256 | for token_tuple in ordered_tokens: 257 | 258 | if token_tuple[0] is not None: 259 | 260 | # There is already a block 261 | blocks.append(id_to_block[token_tuple[0]]) 262 | 263 | else: 264 | 265 | # Case of multiple surnames 266 | tokens = token_tuple[1] 267 | surnames, given_names = tokens 268 | 269 | # Check if this combination of surnames was already included 270 | try: 271 | # First surname 272 | 273 | cluster = id_to_block[surnames[0]] 274 | if cluster.contains(surnames): 275 | cluster.add_signature(*tokens) 276 | blocks.append(cluster) 277 | continue 278 | except KeyError: 279 | # No such block 280 | pass 281 | 282 | try: 283 | # Last surname 284 | 285 | cluster = id_to_block[surnames[-1]] 286 | if cluster.contains(surnames): 287 | cluster.add_signature(*tokens) 288 | blocks.append(cluster) 289 | continue 290 | 291 | # # No match, compute heuristically the match over initials 292 | 293 | # Firstly, check if some of the surnames were used as the 294 | # last given names on some of the signatures. 295 | index = len(surnames) - 1 296 | match_found = False 297 | 298 | while index > 0: 299 | token_prefix = surnames[:index] 300 | if cluster.compare_tokens_from_last(token_prefix, 301 | (surnames[-1],)): 302 | cluster.add_signature(*tokens) 303 | match_found = True 304 | break 305 | index -= 1 306 | 307 | if match_found: 308 | # There was a full name match, so it must be the same 309 | # author. 
310 | blocks.append(cluster) 311 | continue 312 | 313 | except KeyError: 314 | # No such block 315 | pass 316 | 317 | try: 318 | # No match with last surname. Match with the first one. 319 | cluster = id_to_block[surnames[0]] 320 | cluster.add_signature(*tokens) 321 | blocks.append(cluster) 322 | 323 | continue 324 | 325 | except KeyError: 326 | # No such block 327 | pass 328 | 329 | # No block for the first surname and no good match for the 330 | # last surname. 331 | if surnames[-1] not in id_to_block: 332 | # Create new block. 333 | id_to_block[surnames[-1]] = _Block(*tokens) 334 | blocks.append(id_to_block[surnames[-1]]) 335 | 336 | return np.array(_split_blocks(blocks, X, threshold)) 337 | 338 | 339 | def block_single(X): 340 | """Block the signatures into only one block. 341 | 342 | Parameters 343 | ---------- 344 | :param X: numpy array 345 | Array of singletons of dictionaries. 346 | 347 | Returns 348 | ------- 349 | :returns: numpy array 350 | Array with ids of the blocks. As there is only one block, every element 351 | equals zero. 352 | """ 353 | return np.zeros(len(X), dtype=np.int) 354 | 355 | 356 | def block_last_name_first_initial(X): 357 | """Blocking function using last name and first initial as key. 358 | 359 | The names are normalized before assigning to a block. 360 | 361 | Parameters 362 | ---------- 363 | :param X: numpy array 364 | Array of singletons of dictionaries. 365 | 366 | Returns 367 | ------- 368 | :returns: numpy array 369 | Array with ids of the blocks. The order of the 370 | array is the same as in the ``X`` input parameter. 371 | """ 372 | def last_name_first_initial(name): 373 | names = normalize_name(name).split(" ", 1) 374 | 375 | try: 376 | name = "%s %s" % (names[0], names[1].strip()[0]) 377 | except IndexError: 378 | name = names[0] 379 | 380 | return name 381 | 382 | blocks = [] 383 | 384 | for signature in X[:, 0]: 385 | blocks.append(last_name_first_initial(signature["author_name"])) 386 | 387 | return np.array(blocks) 388 | -------------------------------------------------------------------------------- /beard/clustering/wrappers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Scikit-Learn compatible wrappers of clustering algorithms. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Hussein Al-Natsheh 14 | 15 | """ 16 | import numpy as np 17 | 18 | import scipy.cluster.hierarchy as hac 19 | 20 | from sklearn.base import BaseEstimator 21 | from sklearn.base import ClusterMixin 22 | 23 | 24 | class ScipyHierarchicalClustering(BaseEstimator, ClusterMixin): 25 | """Wrapper for Scipy's hierarchical clustering implementation. 26 | 27 | Attributes 28 | ---------- 29 | labels_ : ndarray, shape (n_samples,) 30 | Array of labels assigned to the input data. 31 | 32 | linkage_ : ndarray 33 | The linkage matrix. 34 | """ 35 | 36 | def __init__(self, method="single", affinity="euclidean", 37 | threshold=None, n_clusters=None, criterion="distance", 38 | depth=2, R=None, monocrit=None, unsupervised_scoring=None, 39 | supervised_scoring=None, scoring_data=None, 40 | best_threshold_precedence=True): 41 | """Initialize. 42 | 43 | Parameters 44 | ---------- 45 | :param method: string 46 | The linkage algorithm to use. 
47 | See scipy.cluster.hierarchy.linkage for further details. 48 | 49 | :param affinity: string or callable 50 | The distance metric to use. 51 | - "precomputed": assume that X is a distance matrix; 52 | - callable: a function returning a distance matrix. 53 | - Otherwise, any value supported by 54 | scipy.cluster.hierarchy.linkage. 55 | 56 | :param n_clusters: int 57 | The number of flat clusters to form. 58 | 59 | :param threshold: float or None 60 | The thresold to apply when forming flat clusters, if 61 | n_clusters=None. 62 | See scipy.cluster.hierarchy.fcluster for further details. 63 | 64 | :param criterion: string 65 | The criterion to use in forming flat clusters. 66 | See scipy.cluster.hierarchy.fcluster for further details. 67 | 68 | :param depth: int 69 | The maximum depth to perform the inconsistency calculation. 70 | See scipy.cluster.hierarchy.fcluster for further details. 71 | 72 | :param R: array-like or None 73 | The inconsistency matrix to use for the 'inconsistent' criterion. 74 | See scipy.cluster.hierarchy.fcluster for further details. 75 | 76 | :param monocrit: array-like or None 77 | The statistics upon which non-singleton i is thresholded. 78 | See scipy.cluster.hierarchy.fcluster for further details. 79 | 80 | :param scoring_data: string or None 81 | The type of input data to pass to the scoring function: 82 | - "raw": for passing the original X array; 83 | - "affinity": for passing an affinity matrix Xa; 84 | - None: for not passing anything but the labels. 85 | 86 | :param supervised_scoring: callable or None 87 | The scoring function to maximize in order to estimate the best 88 | threshold. Labels must be provided in y for this scoring function. 89 | There are 3 possible cases based on the value of `scoring_data`: 90 | - scoring_data == "raw": 91 | supervised_scoring(X_raw, label_true, label_pred); 92 | - scoring_data == "affinity": 93 | supervised_scoring(X_affinity, label_true, label_pred); 94 | - scoring_data is None: 95 | supervised_scoring(label_true, label_pred). 96 | 97 | :param unsupervised_scoring: callable or None 98 | The scoring function to maximize in order to estimate the best 99 | threshold. Labels must not be provided in y for this scoring 100 | function.There are 3 possible cases based on the value of 101 | `scoring_data`: 102 | - scoring_data == "raw": 103 | unsupervised_scoring(X_raw, label_pred); 104 | - scoring_data == "affinity": 105 | unsupervised_scoring(X_affinity, label_pred); 106 | - scoring_data is None: 107 | unsupervised_scoring(label_pred). 108 | 109 | """ 110 | self.method = method 111 | self.affinity = affinity 112 | self.threshold = threshold 113 | self.n_clusters = n_clusters 114 | self.criterion = criterion 115 | self.depth = depth 116 | self.R = R 117 | self.monocrit = monocrit 118 | self.unsupervised_scoring = unsupervised_scoring 119 | self.supervised_scoring = supervised_scoring 120 | self.scoring_data = scoring_data 121 | self.best_threshold_precedence = best_threshold_precedence 122 | 123 | def fit(self, X, y=None): 124 | """Perform hierarchical clustering on input data. 125 | 126 | Parameters 127 | ---------- 128 | :param X: array-like, shape (n_samples, n_features) or 129 | (n_samples, n_samples) 130 | Input data, as an array of samples or as a distance matrix if 131 | affinity == 'precomputed'. 132 | 133 | :param y: array-like, shape (n_samples, ) 134 | Input labels, in case of (semi-)supervised clustering. 135 | Labels equal to -1 stand for unknown labels. 
136 | 137 | Returns 138 | ------- 139 | :returns: self 140 | """ 141 | X = np.array(X) 142 | X_raw = X 143 | n_samples = X.shape[0] 144 | 145 | # Build linkage matrix 146 | if self.affinity == "precomputed" or callable(self.affinity): 147 | if callable(self.affinity): 148 | X = self.affinity(X) 149 | X_affinity = X 150 | if X.ndim == 2: 151 | i, j = np.triu_indices(X.shape[0], k=1) 152 | X = X[i, j] 153 | self.linkage_ = hac.linkage(X, method=self.method) 154 | else: 155 | X_affinity = None 156 | self.linkage_ = hac.linkage(X, 157 | method=self.method, 158 | metric=self.affinity) 159 | 160 | if self.scoring_data == "affinity" and X_affinity is None: 161 | raise ValueError("The scoring function expects an affinity matrix," 162 | " which cannot be computed from the combination" 163 | " of parameters you provided.") 164 | 165 | # Estimate threshold in case of semi-supervised or unsupervised 166 | # As default value we use the highest so we obtain only 1 cluster. 167 | best_threshold = (self.linkage_[-1, 2] if self.threshold is None 168 | else self.threshold) 169 | 170 | n_clusters = self.n_clusters 171 | supervised_scoring = self.supervised_scoring 172 | unsupervised_scoring = self.unsupervised_scoring 173 | ground_truth = (y is not None) and np.any(np.array(y) != -1) 174 | scoring = supervised_scoring is not None or \ 175 | unsupervised_scoring is not None 176 | 177 | if n_clusters is None and scoring: 178 | best_score = -np.inf 179 | thresholds = np.concatenate(([0], 180 | self.linkage_[:, 2], 181 | [self.linkage_[-1, 2]])) 182 | 183 | for i in range(len(thresholds) - 1): 184 | t1, t2 = thresholds[i:i + 2] 185 | threshold = (t1 + t2) / 2.0 186 | labels = hac.fcluster(self.linkage_, threshold, 187 | criterion=self.criterion, 188 | depth=self.depth, R=self.R, 189 | monocrit=self.monocrit) 190 | 191 | if ground_truth and supervised_scoring is not None: 192 | train = (y != -1) 193 | 194 | if self.scoring_data == "raw": 195 | score = supervised_scoring(X_raw, y[train], 196 | labels[train]) 197 | 198 | elif self.scoring_data == "affinity": 199 | score = supervised_scoring(X_affinity, y[train], 200 | labels[train]) 201 | 202 | else: 203 | score = supervised_scoring(y[train], 204 | labels[train]) 205 | 206 | elif unsupervised_scoring is not None: 207 | if self.scoring_data == "raw": 208 | score = unsupervised_scoring(X_raw, labels) 209 | 210 | elif self.scoring_data == "affinity": 211 | score = unsupervised_scoring(X_affinity, labels) 212 | 213 | else: 214 | score = unsupervised_scoring(labels) 215 | 216 | else: 217 | break 218 | 219 | if score >= best_score: 220 | best_score = score 221 | best_threshold = threshold 222 | 223 | self.best_threshold_ = best_threshold 224 | self.n_samples_ = n_samples 225 | 226 | return self 227 | 228 | @property 229 | def labels_(self): 230 | """Compute the labels assigned to the input data. 231 | 232 | Note that labels are computed on-the-fly from the linkage matrix, 233 | based on the value of self.threshold or self.n_clusters. 
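A minimal sketch with a toy precomputed distance matrix::

    import numpy as np

    from beard.clustering import ScipyHierarchicalClustering

    distance = np.array([[0.0, 0.1, 0.9],
                         [0.1, 0.0, 0.8],
                         [0.9, 0.8, 0.0]])
    clusterer = ScipyHierarchicalClustering(affinity="precomputed",
                                            method="average",
                                            n_clusters=2)
    clusterer.fit(distance)
    clusterer.labels_  # e.g. array([0, 0, 1])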
234 | """ 235 | n_clusters = self.n_clusters 236 | 237 | if n_clusters is not None: 238 | if n_clusters < 1 or n_clusters > self.n_samples_: 239 | raise ValueError("n_clusters must be within [1; n_samples].") 240 | 241 | else: 242 | thresholds = np.concatenate(([0], 243 | self.linkage_[:, 2], 244 | [self.linkage_[-1, 2]])) 245 | 246 | for i in range(len(thresholds) - 1): 247 | t1, t2 = thresholds[i:i + 2] 248 | threshold = (t1 + t2) / 2.0 249 | labels = hac.fcluster(self.linkage_, threshold, 250 | criterion=self.criterion, 251 | depth=self.depth, R=self.R, 252 | monocrit=self.monocrit) 253 | 254 | if len(np.unique(labels)) <= n_clusters: 255 | _, labels = np.unique(labels, return_inverse=True) 256 | return labels 257 | 258 | else: 259 | threshold = self.threshold 260 | 261 | if self.best_threshold_precedence: 262 | threshold = self.best_threshold_ 263 | 264 | labels = hac.fcluster(self.linkage_, threshold, 265 | criterion=self.criterion, depth=self.depth, 266 | R=self.R, monocrit=self.monocrit) 267 | 268 | _, labels = np.unique(labels, return_inverse=True) 269 | 270 | return labels 271 | -------------------------------------------------------------------------------- /beard/ext/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """External libraries and source files used by Beard.""" 11 | -------------------------------------------------------------------------------- /beard/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Scoring metrics.""" 11 | 12 | from .clustering import b3_precision_recall_fscore 13 | from .clustering import b3_precision_score 14 | from .clustering import b3_recall_score 15 | from .clustering import b3_f_score 16 | from .clustering import paired_precision_recall_fscore 17 | from .clustering import paired_precision_score 18 | from .clustering import paired_recall_score 19 | from .clustering import paired_f_score 20 | from .clustering import silhouette_score 21 | from .text import jaro 22 | from .text import jaro_winkler 23 | from .text import levenshtein 24 | 25 | __all__ = ("b3_precision_recall_fscore", 26 | "b3_precision_score", 27 | "b3_recall_score", 28 | "b3_f_score", 29 | "paired_precision_recall_fscore", 30 | "paired_precision_score", 31 | "paired_recall_score", 32 | "paired_f_score", 33 | "silhouette_score", 34 | "jaro", 35 | "jaro_winkler", 36 | "levenshtein") 37 | -------------------------------------------------------------------------------- /beard/metrics/clustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Clustering evaluation metrics. 11 | 12 | .. 
codeauthor:: Evangelos Tzemis 13 | .. codeauthor:: Gilles Louppe 14 | .. codeauthor:: Hussein Al-Natsheh 15 | 16 | """ 17 | from __future__ import division 18 | 19 | import numpy as np 20 | from operator import mul 21 | from itertools import groupby 22 | 23 | from sklearn.metrics import silhouette_score as sklearn_silhouette_score 24 | from sklearn.metrics.cluster.supervised import check_clusterings 25 | 26 | 27 | def silhouette_score(X, labels, metric="precomputed"): 28 | """Compute the silhouette score. 29 | 30 | The silhouette coefficent is only defined if number of clusters if 31 | 1 < n_clusters < n_samples. 32 | 33 | Parameters: 34 | ----------- 35 | :param X : array [n_samples_a, n_samples_a] if metric == "precomputed", 36 | or [n_samples_a, n_features] otherwise 37 | Array of pairwise distances between samples, or a feature array. 38 | :param labels : array, shape = [n_samples] 39 | Predicted labels for each sample. 40 | :param metric : string, or callable 41 | The metric to use when calculating distance between instances in a 42 | feature array. If metric is a string, it must be one of the options 43 | allowed by `sklearn.metrics.pairwise.pairwise_distances`. If X is the 44 | distance array itself, use metric="precomputed". 45 | 46 | Returns: 47 | -------- 48 | :return floate: mean silhouette coefficient for all samples or 49 | -1.0 if n_clusters <= 1 or n_clusters >= n_samples. 50 | """ 51 | n_samples = X.shape[0] 52 | n_clusters = len(np.unique(labels)) 53 | 54 | if 1 < n_clusters < n_samples: 55 | return sklearn_silhouette_score(X, labels, metric) 56 | else: 57 | return -1.0 58 | 59 | 60 | def b3_precision_recall_fscore(labels_true, labels_pred): 61 | """Compute the B^3 variant of precision, recall and F-score. 62 | 63 | Parameters 64 | ---------- 65 | :param labels_true: 1d array containing the ground truth cluster labels. 66 | :param labels_pred: 1d array containing the predicted cluster labels. 67 | 68 | Returns 69 | ------- 70 | :return float precision: calculated precision 71 | :return float recall: calculated recall 72 | :return float f_score: calculated f_score 73 | 74 | Reference 75 | --------- 76 | Amigo, Enrique, et al. "A comparison of extrinsic clustering evaluation 77 | metrics based on formal constraints." Information retrieval 12.4 78 | (2009): 461-486. 
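Example
-------
A small worked example (toy labels, values rounded)::

    labels_true = [0, 0, 1, 1]
    labels_pred = [0, 0, 0, 1]
    b3_precision_recall_fscore(labels_true, labels_pred)
    # (0.667, 0.75, 0.706)

Samples 0 and 1 contribute full precision and recall; sample 2 sits in a
predicted cluster that mixes both true clusters, so it is penalised on both
sides; sample 3 is fully precise but only half-recalled.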
79 | """ 80 | # Check that labels_* are 1d arrays and have the same size 81 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 82 | 83 | # Check that input given is not the empty set 84 | if labels_true.shape == (0, ): 85 | raise ValueError( 86 | "input labels must not be empty.") 87 | 88 | # Compute P/R/F scores 89 | n_samples = len(labels_true) 90 | true_clusters = {} # true cluster_id => set of sample indices 91 | pred_clusters = {} # pred cluster_id => set of sample indices 92 | 93 | for i in range(n_samples): 94 | true_cluster_id = labels_true[i] 95 | pred_cluster_id = labels_pred[i] 96 | 97 | if true_cluster_id not in true_clusters: 98 | true_clusters[true_cluster_id] = set() 99 | if pred_cluster_id not in pred_clusters: 100 | pred_clusters[pred_cluster_id] = set() 101 | 102 | true_clusters[true_cluster_id].add(i) 103 | pred_clusters[pred_cluster_id].add(i) 104 | 105 | for cluster_id, cluster in true_clusters.items(): 106 | true_clusters[cluster_id] = frozenset(cluster) 107 | for cluster_id, cluster in pred_clusters.items(): 108 | pred_clusters[cluster_id] = frozenset(cluster) 109 | 110 | precision = 0.0 111 | recall = 0.0 112 | 113 | intersections = {} 114 | 115 | for i in range(n_samples): 116 | pred_cluster_i = pred_clusters[labels_pred[i]] 117 | true_cluster_i = true_clusters[labels_true[i]] 118 | 119 | if (pred_cluster_i, true_cluster_i) in intersections: 120 | intersection = intersections[(pred_cluster_i, true_cluster_i)] 121 | else: 122 | intersection = pred_cluster_i.intersection(true_cluster_i) 123 | intersections[(pred_cluster_i, true_cluster_i)] = intersection 124 | 125 | precision += len(intersection) / len(pred_cluster_i) 126 | recall += len(intersection) / len(true_cluster_i) 127 | 128 | precision /= n_samples 129 | recall /= n_samples 130 | 131 | f_score = 2 * precision * recall / (precision + recall) 132 | 133 | return precision, recall, f_score 134 | 135 | 136 | def b3_precision_score(labels_true, labels_pred): 137 | """Compute the B^3 variant of precision. 138 | 139 | Parameters 140 | ---------- 141 | :param labels_true: 1d array containing the ground truth cluster labels. 142 | :param labels_pred: 1d array containing the predicted cluster labels. 143 | 144 | Returns 145 | ------- 146 | :return float precision: calculated precision 147 | """ 148 | p, _, _ = b3_precision_recall_fscore(labels_true, labels_pred) 149 | return p 150 | 151 | 152 | def b3_recall_score(labels_true, labels_pred): 153 | """Compute the B^3 variant of recall. 154 | 155 | Parameters 156 | ---------- 157 | :param labels_true: 1d array containing the ground truth cluster labels. 158 | :param labels_pred: 1d array containing the predicted cluster labels. 159 | 160 | Returns 161 | ------- 162 | :return float recall: calculated recall 163 | """ 164 | _, r, _ = b3_precision_recall_fscore(labels_true, labels_pred) 165 | return r 166 | 167 | 168 | def b3_f_score(labels_true, labels_pred): 169 | """Compute the B^3 variant of F-score. 170 | 171 | Parameters 172 | ---------- 173 | :param labels_true: 1d array containing the ground truth cluster labels. 174 | :param labels_pred: 1d array containing the predicted cluster labels. 175 | 176 | Returns 177 | ------- 178 | :return float f_score: calculated F-score 179 | """ 180 | _, _, f = b3_precision_recall_fscore(labels_true, labels_pred) 181 | return f 182 | 183 | 184 | def paired_precision_recall_fscore(labels_true, labels_pred): 185 | """Compute the pairwise variant of precision, recall and F-score. 
186 | 187 | Precision is the ability not to label as positive a sample 188 | that is negative. The best value is 1 and the worst is 0. 189 | 190 | Recall is the ability to successfully find all the positive samples. 191 | The best value is 1 and the worst is 0. 192 | 193 | F-score (Harmonic mean) can be thought as a weighted harmonic mean of 194 | the precision and recall, where an F-score reaches its best value at 1 195 | and worst at 0. 196 | 197 | Parameters 198 | ---------- 199 | :param labels_true: 1d array containing the ground truth cluster labels. 200 | :param labels_pred: 1d array containing the predicted cluster labels. 201 | 202 | Returns 203 | ------- 204 | :return float precision: calculated precision 205 | :return float recall: calculated recall 206 | :return float f_score: calculated f_score 207 | 208 | Reference 209 | --------- 210 | Levin, Michael et al., "Citation-based bootstrapping for large-scale 211 | author disambiguation", Journal of the American Society for Information 212 | Science and Technology 63.5 (2012): 1030-1047. 213 | """ 214 | # Check that labels_* are 1d arrays and have the same size 215 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 216 | 217 | # Check that input given is not the empty set 218 | if labels_true.shape == (0, ): 219 | raise ValueError( 220 | "input labels must not be empty.") 221 | 222 | # Assigns each label to its own cluster 223 | default_clustering = range(len(labels_pred)) 224 | 225 | # Calculate precision 226 | numerator = _general_merge_distance(labels_true, labels_pred, 227 | fm=_zero, fs=mul) 228 | denominator = _general_merge_distance(default_clustering, 229 | labels_pred, 230 | fm=_zero, fs=mul) 231 | try: 232 | precision = 1.0 - numerator / denominator 233 | except ZeroDivisionError: 234 | precision = 1.0 235 | 236 | # Calculate recall 237 | numerator = _general_merge_distance(labels_true, labels_pred, 238 | fm=mul, fs=_zero) 239 | denominator = _general_merge_distance(labels_true, 240 | default_clustering, 241 | fm=mul, fs=_zero) 242 | try: 243 | recall = 1.0 - numerator / denominator 244 | except ZeroDivisionError: 245 | recall = 1.0 246 | 247 | # Calculate f_score 248 | 249 | # If both are zero (minimum score) then f_score is also zero 250 | if precision + recall == 0.0: 251 | f_score = 0.0 252 | else: 253 | f_score = 2.0 * precision * recall / (precision + recall) 254 | 255 | return precision, recall, f_score 256 | 257 | 258 | def paired_precision_score(labels_true, labels_pred): 259 | """Compute the pairwise variant of precision. 260 | 261 | Precision is the ability not to label as positive a sample 262 | that is negative. The best value is 1 and the worst is 0. 263 | 264 | Parameters 265 | ---------- 266 | :param labels_true: 1d array containing the ground truth cluster labels. 267 | :param labels_pred: 1d array containing the predicted cluster labels. 268 | 269 | Returns 270 | ------- 271 | :return float precision: calculated precision 272 | """ 273 | p, _, _ = paired_precision_recall_fscore(labels_true, labels_pred) 274 | return p 275 | 276 | 277 | def paired_recall_score(labels_true, labels_pred): 278 | """Compute the pairwise variant of recall. 279 | 280 | Recall is the ability to succesfully find all the positive samples. 281 | The best value is 1 and the worst is 0. 282 | 283 | Parameters 284 | ---------- 285 | :param labels_true: 1d array containing the ground truth labels. 286 | :param labels_pred: 1d array containing the predicted labels. 
287 | 288 | Returns 289 | ------- 290 | :return float recall: calculated recall 291 | """ 292 | _, r, _ = paired_precision_recall_fscore(labels_true, labels_pred) 293 | return r 294 | 295 | 296 | def paired_f_score(labels_true, labels_pred): 297 | """Compute the pairwise variant of F-score. 298 | 299 | F score can be thought as a weighted harmonic mean of the precision 300 | and recall, where an F score reaches its best value at 1 301 | and worst at 0. 302 | 303 | Parameters 304 | ---------- 305 | :param labels_true: 1d array containing the ground truth cluster labels. 306 | :param labels_pred: 1d array containing the predicted cluster labels. 307 | 308 | Returns 309 | ------- 310 | :return float f_score: calculated harmonic mean (f_score) 311 | 312 | """ 313 | _, _, f = paired_precision_recall_fscore(labels_true, labels_pred) 314 | return f 315 | 316 | 317 | def _zero(x, y): 318 | return 0.0 319 | 320 | 321 | def _cluster_samples(labels): 322 | """Group input to sets that belong to the same cluster. 323 | 324 | Parameters 325 | ---------- 326 | :param labels: array with the cluster labels 327 | 328 | Returns 329 | ------- 330 | :return: dictionary with keys the cluster ids and values a tuple containing 331 | the ids of elements tha belong to this cluster. 332 | """ 333 | groupped_samples = groupby(np.argsort(labels), lambda i: labels[i]) 334 | 335 | return {k: tuple(values) for k, values in groupped_samples} 336 | 337 | 338 | def _general_merge_distance(y_true, y_pred, 339 | fs=lambda x, y: 1.0, fm=lambda x, y: 1.0): 340 | """Slice algorithm for computing generalized merge distance. 341 | 342 | Slice is a linear time algorithm. 343 | 344 | Merge Distance is the minimum number of splits and merges 345 | to get from R-flat to y_true. 346 | 347 | Parameters 348 | ---------- 349 | :param y_true: array with the ground truth cluster labels. 350 | :param y_pred: array with the predicted cluster labels. 351 | :param fs: Optional. Function defining the cost of split. 352 | :param fm: Optional. Function defining the cost of merge. 353 | 354 | Returns 355 | ------- 356 | :return float: Cost of getting from y_pred to y_true. 357 | 358 | Reference 359 | --------- 360 | Menestrina, David Michael., "Matching and unifying records in a 361 | distributed system", Department of Computer Science Thesis, Ph.D. 362 | dissertation, Stanford University (2010). 363 | """ 364 | r = _cluster_samples(y_pred) 365 | s = _cluster_samples(y_true) 366 | r_sizes = {k: len(v) for k, v in r.items()} 367 | 368 | cost = 0.0 369 | for si in s.values(): 370 | # determine which clusters in r contain the records of si 371 | p_map = {} 372 | for element in si: 373 | cl = y_pred[element] 374 | if cl not in p_map: 375 | p_map[cl] = 0 376 | p_map[cl] += 1 377 | 378 | # Compute cost to generate si 379 | si_cost = 0.0 380 | total_recs = 0 381 | for i, count in p_map.items(): 382 | # add the cost to split ri 383 | if r_sizes[i] > count: 384 | si_cost += fs(count, r_sizes[i] - count) 385 | r_sizes[i] -= count 386 | if total_recs != 0: 387 | # Cost to merge into si 388 | si_cost += fm(count, total_recs) 389 | total_recs += count 390 | cost += si_cost 391 | 392 | return cost 393 | -------------------------------------------------------------------------------- /beard/metrics/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 
5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Text metrics. 11 | 12 | .. codeauthor:: Petros Ioannidis 13 | .. codeauthor:: Evangelos Tzemis 14 | 15 | """ 16 | 17 | from __future__ import division 18 | import numpy as np 19 | import re 20 | 21 | 22 | def _find_all(s, pattern): 23 | """Find all occurences of the given pattern. 24 | 25 | Parameters 26 | ---------- 27 | :param s: string 28 | String to be searched 29 | 30 | :param letter: string 31 | Substring we are searching for 32 | 33 | Returns 34 | ------- 35 | :returns: generator 36 | A generator that holds the indexes of the patterns 37 | """ 38 | for match in re.finditer(pattern, s): 39 | yield match.start() 40 | 41 | 42 | def _jaro_matching(s1, s2): 43 | """Return the number of matching letters and transpositions. 44 | 45 | Parameters 46 | ---------- 47 | :param s1: string 48 | First string 49 | 50 | :param s2: string 51 | Second string 52 | 53 | Returns 54 | ------- 55 | :returns: (int, int) 56 | The number of matching letters and transpositions 57 | """ 58 | H = min(len(s1), len(s2)) // 2 59 | 60 | letters_cache = {} 61 | matches = 0 62 | transpositions = 0 63 | s1_matching_letters = [] 64 | s2_matching_letters = [] 65 | s1_matched_positions = [] 66 | s2_matched_positions = [] 67 | 68 | for letter in s1: 69 | if letter not in letters_cache: 70 | letters_cache[letter] = (tuple(_find_all(s1, letter)), 71 | tuple(_find_all(s2, letter))) 72 | 73 | for letter, (s1_positions, s2_positions) in letters_cache.items(): 74 | for i in s1_positions: 75 | for j in s2_positions: 76 | if i - H <= j <= i + H: 77 | if j not in s2_matched_positions: 78 | matches += 1 79 | s2_matched_positions.append(j) 80 | s1_matching_letters.append((i, letter)) 81 | break 82 | 83 | for letter, (s1_positions, s2_positions) in letters_cache.items(): 84 | for j in s2_positions: 85 | for i in s1_positions: 86 | if j - H <= i <= j + H: 87 | if i not in s1_matched_positions: 88 | s1_matched_positions.append(i) 89 | s2_matching_letters.append((j, letter)) 90 | break 91 | 92 | s1_matching_letters.sort() 93 | s2_matching_letters.sort() 94 | transpositions = len(tuple(filter(lambda x: x[0][1] != x[1][1], 95 | zip(s1_matching_letters, 96 | s2_matching_letters)))) 97 | 98 | return matches, transpositions 99 | 100 | 101 | def jaro(s1, s2): 102 | """Return the Jaro similarity of the strings s1 and s2. 103 | 104 | Parameters 105 | ---------- 106 | :param s1: string 107 | First string 108 | 109 | :param s2: string 110 | Second string 111 | 112 | Returns 113 | ------- 114 | :returns: float 115 | Similarity of s1 and s2 116 | 117 | Reference 118 | --------- 119 | Jaro, M. A., "Advances in record-linkage methodology as applied to 120 | matching the 1985 census of Tampa, Florida", Journal of the American 121 | Statistical Association, 84:414-420, 1989. 122 | """ 123 | if len(s1) == 0 or len(s2) == 0: 124 | return 0 125 | 126 | n_matches, n_transpositions = _jaro_matching(s1, s2) 127 | 128 | if n_matches == 0: 129 | return 0 130 | 131 | return 1 / 3 * (n_matches / len(s1) + 132 | n_matches / len(s2) + 133 | (n_matches - n_transpositions / 2) / n_matches) 134 | 135 | 136 | def jaro_winkler(s1, s2, p=0.1): 137 | """Return the Jaro-Winkler similarity of the strings s1 and s2. 
138 | 139 | Parameters 140 | ---------- 141 | :param s1: string 142 | First string 143 | 144 | :param s2: string 145 | Second string 146 | 147 | Returns 148 | ------- 149 | :returns: float 150 | Similarity of s1 and s2 151 | 152 | Reference 153 | --------- 154 | Winkler, W. E., "The state of record linkage and current research 155 | problems", Statistical Research Division, US Census Bureau. 1999. 156 | """ 157 | jaro_distance = jaro(s1, s2) 158 | 159 | common_prefix = 0 160 | for s1_letter, s2_letter in zip(s1, s2): 161 | if s1_letter == s2_letter and common_prefix < 4: 162 | common_prefix += 1 163 | else: 164 | break 165 | 166 | return jaro_distance + p * common_prefix * (1 - jaro_distance) 167 | 168 | 169 | def levenshtein(a, b): 170 | """Calculate the levenshtein distance between strings a and b. 171 | 172 | Case sensitiveness is activated, meaning that uppercase letters 173 | are treated differently than their corresponding lowercase ones. 174 | 175 | Parameters 176 | ---------- 177 | :param a: string 178 | String to be compared 179 | 180 | :param b: string 181 | String to be compared 182 | 183 | Returns 184 | ------- 185 | :returns int: 186 | The calculated levenshtein distance. 187 | """ 188 | len_a, len_b = len(a), len(b) 189 | 190 | if len_a < len_b: 191 | return levenshtein(b, a) 192 | if len_b == 0: 193 | return len_a 194 | 195 | # We use tuple() to force strings to be used as sequences. 196 | a = np.array(tuple(a)) 197 | b = np.array(tuple(b)) 198 | 199 | # Instead of calculating the whole matrix, we only keep the last 2 rows. 200 | previous_row = np.arange(len_b + 1) 201 | for character in a: 202 | # Insertion 203 | current_row = previous_row + 1 204 | # Substitution or matching 205 | current_row[1:] = np.minimum( 206 | current_row[1:], 207 | np.add(previous_row[:-1], b != character)) 208 | # Deletion 209 | current_row[1:] = np.minimum( 210 | current_row[1:], 211 | current_row[:-1] + 1) 212 | previous_row = current_row 213 | 214 | return current_row[-1] 215 | -------------------------------------------------------------------------------- /beard/similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Similarity learning algorithms.""" 11 | 12 | from .pairs import AbsoluteDifference 13 | from .pairs import CosineSimilarity 14 | from .pairs import EstimatorTransformer 15 | from .pairs import ElementMultiplication 16 | from .pairs import JaccardSimilarity 17 | from .pairs import PairTransformer 18 | from .pairs import StringDistance 19 | from .pairs import Thresholder 20 | 21 | __all__ = ("AbsoluteDifference", 22 | "CosineSimilarity", 23 | "EstimatorTransformer", 24 | "ElementMultiplication", 25 | "JaccardSimilarity", 26 | "PairTransformer", 27 | "StringDistance", 28 | "Thresholder") 29 | -------------------------------------------------------------------------------- /beard/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 
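# --- A minimal usage sketch for the string metrics defined above in
# beard/metrics/text.py. Illustrative only: the helper name and the example
# strings are invented; it assumes beard is installed.
def _text_metrics_usage_example():
    from beard.metrics.text import jaro, jaro_winkler, levenshtein

    # Jaro and Jaro-Winkler return similarities in [0, 1] (1.0 for identical
    # strings); Jaro-Winkler additionally rewards a shared prefix.
    assert jaro("smith", "smith") == 1.0
    assert jaro_winkler("smith", "smyth") >= jaro("smith", "smyth")
    # Levenshtein returns an edit distance and, as documented above, treats
    # uppercase and lowercase letters as different characters.
    assert levenshtein("smith", "smyth") == 1
    assert levenshtein("Smith", "smith") == 1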
9 | 10 | """Helper functions.""" 11 | 12 | from .misc import memoize 13 | from .names import phonetic_tokenize_name 14 | from .names import given_name_initial 15 | from .names import given_name 16 | from .names import name_initials 17 | from .names import normalize_name 18 | from .strings import asciify 19 | from .transformers import FuncTransformer 20 | from .transformers import Shaper 21 | 22 | __all__ = ("memoize", 23 | "phonetic_tokenize_name", 24 | "given_name_initial", 25 | "given_name", 26 | "normalize_name", 27 | "name_initials", 28 | "asciify", 29 | "FuncTransformer", 30 | "Shaper") 31 | -------------------------------------------------------------------------------- /beard/utils/misc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Miscellaneous helpers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | 14 | """ 15 | 16 | from functools import wraps 17 | 18 | 19 | def memoize(func): 20 | """Memoization function.""" 21 | cache = {} 22 | 23 | @wraps(func) 24 | def wrap(*args, **kwargs): 25 | 26 | frozen = frozenset(kwargs.items()) 27 | if (args, frozen) not in cache: 28 | cache[(args, frozen)] = func(*args, **kwargs) 29 | return cache[(args, frozen)] 30 | 31 | return wrap 32 | -------------------------------------------------------------------------------- /beard/utils/names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Helper functions for handling personal names. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | import functools 18 | import re 19 | import sys 20 | 21 | import fuzzy 22 | 23 | from .misc import memoize 24 | from .strings import asciify 25 | 26 | RE_NORMALIZE_WHOLE_NAME = re.compile("[^a-zA-Z,\s]+") 27 | RE_NORMALIZE_OTHER_NAMES = re.compile("(,\s(i{1,3}|iv|v|vi|jr))|[\.'\-,\s]+") 28 | RE_APOSTROPHES = re.compile('\'+') 29 | RE_REMOVE_NON_CHARACTERS = re.compile('[^a-zA-Z\',\s]+') 30 | DROPPED_AFFIXES = {'a', 'ab', 'am', 'ap', 'abu', 'al', 'auf', 'aus', 'bar', 31 | 'bath', 'bat', 'ben', 'bet', 'bin', 'bint', 'd', 'da', 32 | 'dall', 'dalla', 'das', 'de', 'degli', 'del', 'dell', 33 | 'della', 'dem', 'den', 'der', 'di', 'do', 'dos', 'ds', 'du', 34 | 'e', 'el', 'i', 'ibn', 'im', 'jr', 'l', 'la', 'las', 'le', 35 | 'los', 'm', 'mac', 'mc', 'mhic', 'mic', 'o', 'ter', 'und', 36 | 'v', 'van', 'vom', 'von', 'zu', 'zum', 'zur'} 37 | 38 | 39 | @memoize 40 | def normalize_name(name, drop_common_affixes=True): 41 | """Normalize a personal name. 42 | 43 | Parameters 44 | ---------- 45 | :param name: string 46 | Name, formatted as "Last Name, Other Names". 47 | 48 | :param drop_common_affixes: boolean 49 | If the affixes like ``della`` should be dropeed. 50 | 51 | Returns 52 | ------- 53 | :return: string 54 | Normalized name, formatted as "lastnames first names" where last names 55 | are joined. 
56 | """ 57 | name = asciify(name).lower() 58 | name = RE_NORMALIZE_WHOLE_NAME.sub(' ', name) 59 | names = name.split(",", 1) 60 | if not names: 61 | return "" 62 | if len(names) == 1: 63 | # There was no comma in the name 64 | all_names = names[0].split(" ") 65 | if len(all_names) > 1: 66 | # The last string should be the surname 67 | names = [all_names[-1], " ".join(all_names[:-1])] 68 | else: 69 | names = [all_names[0], ""] 70 | 71 | if drop_common_affixes: 72 | last_names = names[0].split(" ") 73 | without_affixes = list(filter(lambda x: x not in DROPPED_AFFIXES, 74 | last_names)) 75 | if len(without_affixes) > 0: 76 | names[0] = "".join(without_affixes) 77 | else: 78 | names[0] = re.sub('\s', '', names[0]) 79 | 80 | name = "%s, %s" % (names[0], names[1]) 81 | name = RE_NORMALIZE_OTHER_NAMES.sub(" ", name) 82 | name = name.strip() 83 | 84 | return name 85 | 86 | 87 | @memoize 88 | def name_initials(name): 89 | """Compute the set of initials of a given name.""" 90 | return set([w[0] for w in name.split()]) 91 | 92 | 93 | @memoize 94 | def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"): 95 | """Create Double Metaphone tokens from the string. 96 | 97 | Parameters 98 | ---------- 99 | :param name: string 100 | Name of the author. Usually it should be in the format: 101 | surnames, first names. 102 | 103 | :param phonetic algorithm: string 104 | Which phonetic algorithm will be used. Options: 105 | - "double_metaphone" 106 | - "nysiis" 107 | - "soundex" 108 | 109 | Returns 110 | ------- 111 | :return: tuple 112 | The first element is a tuple with the tokens for surnames, the second 113 | is a tuple with the tokens for first names. The tuple always contains 114 | exactly two elements. Only the first results of the double metaphone 115 | algorithm are included in tuples. 116 | """ 117 | if phonetic_algorithm == "soundex": 118 | error = ( 119 | "The version of the 'fuzzy' package in use has a buggy soundex" 120 | " implementation (see https://github.com/yougov/fuzzy/issues/14 )," 121 | " downgrade the package to 1.1 (compatible with Python 2 only) if" 122 | " you want to use the soundex phonetic encoding." 123 | ) 124 | try: 125 | if fuzzy.Soundex(4)("fuzzy") != "F200": 126 | raise ValueError(error) 127 | except UnicodeDecodeError: 128 | raise ValueError(error) 129 | 130 | dm = fuzzy.DMetaphone() 131 | soundex = fuzzy.Soundex(5) 132 | phonetic_algorithms = { 133 | "double_metaphone": lambda y: (dm(y)[0] or b'').decode(), 134 | "nysiis": lambda y: fuzzy.nysiis(y), 135 | "soundex": lambda y: soundex(y) 136 | } 137 | 138 | tokens = tokenize_name(name) 139 | # Use double metaphone 140 | tokens = tuple(map(lambda x: tuple(map(lambda y: phonetic_algorithms[ 141 | phonetic_algorithm](y), x)), 142 | tokens)) 143 | 144 | return tokens 145 | 146 | 147 | @memoize 148 | def tokenize_name(name, handle_soft_sign=True, drop_common_affixes=True): 149 | """Normalize the name and create tokens from it. 150 | 151 | Parameters 152 | ---------- 153 | :param name: string 154 | Name of the author. Usually it should be in the format: 155 | surnames, first names. 156 | :param handle_soft_sign: boolean 157 | Should the case of cyrillic soft sign be handled. 158 | :param drop_common_affixes: boolean 159 | Should the common affixes like ``von`` be dropped. 160 | 161 | Returns 162 | ------- 163 | :return: tuple 164 | The first element is a tuple with surnames, the second 165 | is a tuple first names. The tuple always contains 166 | exactly two elements. 
167 | """ 168 | name = asciify(name) 169 | 170 | # Get rid of non character. Leave apostrophes as they are handled in a 171 | # different way. 172 | name = RE_REMOVE_NON_CHARACTERS.sub(' ', name) 173 | 174 | if handle_soft_sign: 175 | # Handle the "miagkii znak" in russian names. 176 | matches = re.findall(r"^([^',]*)'([a-z].*)", name) 177 | if matches: 178 | name = matches[0][0] + matches[0][1] 179 | 180 | # Remove apostrophes 181 | name = RE_APOSTROPHES.sub(' ', name) 182 | 183 | # Extract surname and name 184 | tokens = name.split(',') 185 | # If there are no first names, the default value is an empty string. 186 | tokens = [tokens[0], functools.reduce(lambda x, y: x+y, tokens[1:], '')] 187 | 188 | # Remove whitespaces and split both surnames and first-names 189 | tokens = list(map(lambda x: ' '.join(x.split()).lower().split(' '), 190 | tokens)) 191 | 192 | # Special case where there is no first name, i.e. there was no comma in 193 | # the signature. 194 | if tokens[1] == [''] and len(tokens[0]) > 1: 195 | # Probably the first string is the first name 196 | tokens = [tokens[0][1:], [tokens[0][0]]] 197 | elif tokens[1] == ['']: 198 | tokens = [[tokens[0][0]], [u'']] 199 | 200 | if drop_common_affixes: 201 | # Remove common prefixes 202 | without_affixes = list(filter(lambda x: x not in DROPPED_AFFIXES, 203 | tokens[0])) 204 | if len(without_affixes) > 0: 205 | tokens[0] = without_affixes 206 | 207 | return tokens 208 | 209 | RE_CHARACTERS = re.compile('\w') 210 | 211 | 212 | @memoize 213 | def given_name_initial(name, index=0): 214 | """Get the initial from the first given name if available. 215 | 216 | Parameters 217 | ---------- 218 | :param name: string 219 | Name of the author. Usually it should be in the format: 220 | surnames, first names. 221 | :param index: integer 222 | Which given name's initial should be returned. 0 for first, 1 for 223 | second, etc. 224 | 225 | Returns 226 | ------- 227 | :return: string 228 | The given name initial. Asciified one character, lowercase if 229 | available, empty string otherwise. 230 | """ 231 | try: 232 | asciified = asciify(name.split(",")[1]).lower().strip() 233 | names = asciified.split(" ") 234 | return RE_CHARACTERS.findall(names[index])[0] 235 | except IndexError: 236 | if index > 0: 237 | return "" 238 | split_name = name.split(" ") 239 | if len(split_name) > 1: 240 | # For example "John Smith", without comma. The first string should 241 | # indicate the first given name. 242 | asciified = asciify(split_name[0]).lower().strip() 243 | try: 244 | return RE_CHARACTERS.findall(asciified)[0] 245 | except IndexError: 246 | pass 247 | return "" 248 | 249 | 250 | @memoize 251 | def given_name(full_name, index): 252 | """Get a specific given name from full name. 253 | 254 | Parameters 255 | ---------- 256 | :param full_name: string 257 | Name of the author. Usually it should be in the format: 258 | surnames, first names. 259 | :param index: integer 260 | Which given name should be returned. 0 for the first, 1 for the second, 261 | etc. 262 | 263 | Returns 264 | ------- 265 | :return: string 266 | Given name or empty string if it is not available. 
267 | """ 268 | try: 269 | given_names = full_name.split(',')[1].strip() 270 | try: 271 | return given_names.split(' ')[index] 272 | except IndexError: 273 | return "" 274 | except IndexError: 275 | names = full_name.split(' ') 276 | try: 277 | return names[index] 278 | except IndexError: 279 | return "" 280 | -------------------------------------------------------------------------------- /beard/utils/strings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Helper functions for strings. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | import sys 18 | import unicodedata 19 | 20 | from unidecode import unidecode 21 | 22 | from .misc import memoize 23 | 24 | IS_PYTHON_3 = sys.version_info[0] == 3 25 | 26 | 27 | @memoize 28 | def asciify(string): 29 | """Transliterate a string to ASCII.""" 30 | if not IS_PYTHON_3 and not isinstance(string, unicode): 31 | string = unicode(string, "utf8", errors="ignore") 32 | 33 | string = unidecode(unicodedata.normalize("NFKD", string)) 34 | string = string.encode("ascii", "ignore") 35 | string = string.decode("utf8") 36 | 37 | return string 38 | -------------------------------------------------------------------------------- /beard/utils/transformers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Generic transformers for data manipulation. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | 14 | """ 15 | import numpy as np 16 | 17 | from sklearn.base import BaseEstimator 18 | from sklearn.base import TransformerMixin 19 | 20 | 21 | class FuncTransformer(BaseEstimator, TransformerMixin): 22 | """Apply a given function element-wise.""" 23 | 24 | def __init__(self, func, dtype=None): 25 | """Initialize. 26 | 27 | Parameters 28 | ---------- 29 | :param func: callable 30 | The function to apply on each element. 31 | 32 | :param dtype: numpy dtype 33 | The type of the values returned by `func`. 34 | If None, then use X.dtype as dtype. 35 | """ 36 | self.func = func 37 | self.dtype = dtype 38 | 39 | def fit(self, X, y=None): 40 | """(Do nothing). 41 | 42 | Parameters 43 | ---------- 44 | :param X: array-like, shape (n_samples, n_features) 45 | Input data. 46 | 47 | Returns 48 | ------- 49 | :returns: self 50 | """ 51 | return self 52 | 53 | def transform(self, X): 54 | """Apply `func` on all elements of X. 55 | 56 | Parameters 57 | ---------- 58 | :param X: array-like, shape (n_samples, n_features) 59 | Input data. 60 | 61 | Returns 62 | ------- 63 | :returns Xt: array-like, shape (n_samples, n_features) 64 | The transformed data. 65 | """ 66 | dtype = self.dtype 67 | if dtype is None: 68 | dtype = X.dtype 69 | 70 | vfunc = np.vectorize(self.func, otypes=[dtype]) 71 | return vfunc(X) 72 | 73 | 74 | class Shaper(BaseEstimator, TransformerMixin): 75 | """Reshape arrays.""" 76 | 77 | def __init__(self, newshape, order="C"): 78 | """Initialize. 
79 | 80 | Parameters 81 | ---------- 82 | :param newshape: int or tuple 83 | The new shape of the array. 84 | See numpy.reshape for further details. 85 | 86 | :param order: {'C', 'F', 'A'} 87 | The index order. 88 | See numpy.reshape for further details. 89 | """ 90 | self.newshape = newshape 91 | self.order = order 92 | 93 | def fit(self, X, y=None): 94 | """(Do nothing). 95 | 96 | Parameters 97 | ---------- 98 | :param X: array-like, shape (n_samples, n_features) 99 | Input data. 100 | 101 | Returns 102 | ------- 103 | :returns: self 104 | """ 105 | return self 106 | 107 | def transform(self, X): 108 | """Reshape X. 109 | 110 | Parameters 111 | ---------- 112 | :param X: array-like, shape (n_samples, n_features) 113 | Input data. 114 | 115 | Returns 116 | ------- 117 | :returns Xt: array-like, shape (self.newshape) 118 | The transformed data. 119 | """ 120 | return X.reshape(self.newshape, order=self.order) 121 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/beard.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/beard.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/beard" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/beard" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
178 | -------------------------------------------------------------------------------- /doc/_build/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/doc/_build/.keep -------------------------------------------------------------------------------- /doc/_static/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/doc/_static/.keep -------------------------------------------------------------------------------- /doc/_templates/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/doc/_templates/.keep -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # beard documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Oct 29 10:00:05 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.doctest', 34 | 'sphinx.ext.intersphinx', 35 | 'sphinx.ext.todo', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.ifconfig', 39 | 'sphinx.ext.viewcode', 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix of source filenames. 46 | source_suffix = '.rst' 47 | 48 | # The encoding of source files. 49 | #source_encoding = 'utf-8-sig' 50 | 51 | # The master toctree document. 52 | master_doc = 'index' 53 | 54 | # General information about the project. 55 | project = u'beard' 56 | copyright = u'2014, Invenio collaboration' 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = '0.0' 64 | # The full version, including alpha/beta/rc tags. 65 | release = '0.0' 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 
69 | #language = None 70 | 71 | # There are two options for replacing |today|: either, you set today to some 72 | # non-false value, then it is used: 73 | #today = '' 74 | # Else, today_fmt is used as the format for a strftime call. 75 | #today_fmt = '%B %d, %Y' 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | exclude_patterns = ['_build'] 80 | 81 | # The reST default role (used for this markup: `text`) to use for all 82 | # documents. 83 | #default_role = None 84 | 85 | # If true, '()' will be appended to :func: etc. cross-reference text. 86 | #add_function_parentheses = True 87 | 88 | # If true, the current module name will be prepended to all description 89 | # unit titles (such as .. function::). 90 | #add_module_names = True 91 | 92 | # If true, sectionauthor and moduleauthor directives will be shown in the 93 | # output. They are ignored by default. 94 | #show_authors = False 95 | 96 | # The name of the Pygments (syntax highlighting) style to use. 97 | pygments_style = 'sphinx' 98 | 99 | # A list of ignored prefixes for module index sorting. 100 | #modindex_common_prefix = [] 101 | 102 | # If true, keep warnings as "system message" paragraphs in the built documents. 103 | #keep_warnings = False 104 | 105 | 106 | # -- Options for HTML output ---------------------------------------------- 107 | 108 | # The theme to use for HTML and HTML Help pages. See the documentation for 109 | # a list of builtin themes. 110 | html_theme = 'alabaster' 111 | 112 | # Theme options are theme-specific and customize the look and feel of a theme 113 | # further. For a list of options available for each theme, see the 114 | # documentation. 115 | #html_theme_options = {} 116 | 117 | # Add any paths that contain custom themes here, relative to this directory. 118 | #html_theme_path = [] 119 | 120 | # The name for this set of Sphinx documents. If None, it defaults to 121 | # " v documentation". 122 | #html_title = None 123 | 124 | # A shorter title for the navigation bar. Default is the same as html_title. 125 | #html_short_title = None 126 | 127 | # The name of an image file (relative to this directory) to place at the top 128 | # of the sidebar. 129 | #html_logo = None 130 | 131 | # The name of an image file (within the static path) to use as favicon of the 132 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 133 | # pixels large. 134 | #html_favicon = None 135 | 136 | # Add any paths that contain custom static files (such as style sheets) here, 137 | # relative to this directory. They are copied after the builtin static files, 138 | # so a file named "default.css" will overwrite the builtin "default.css". 139 | html_static_path = ['_static'] 140 | 141 | # Add any extra paths that contain custom files (such as robots.txt or 142 | # .htaccess) here, relative to this directory. These files are copied 143 | # directly to the root of the documentation. 144 | #html_extra_path = [] 145 | 146 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 147 | # using the given strftime format. 148 | #html_last_updated_fmt = '%b %d, %Y' 149 | 150 | # If true, SmartyPants will be used to convert quotes and dashes to 151 | # typographically correct entities. 152 | #html_use_smartypants = True 153 | 154 | # Custom sidebar templates, maps document names to template names. 
155 | #html_sidebars = {} 156 | 157 | # Additional templates that should be rendered to pages, maps page names to 158 | # template names. 159 | #html_additional_pages = {} 160 | 161 | # If false, no module index is generated. 162 | #html_domain_indices = True 163 | 164 | # If false, no index is generated. 165 | #html_use_index = True 166 | 167 | # If true, the index is split into individual pages for each letter. 168 | #html_split_index = False 169 | 170 | # If true, links to the reST sources are added to the pages. 171 | #html_show_sourcelink = True 172 | 173 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 174 | #html_show_sphinx = True 175 | 176 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 177 | #html_show_copyright = True 178 | 179 | # If true, an OpenSearch description file will be output, and all pages will 180 | # contain a tag referring to it. The value of this option must be the 181 | # base URL from which the finished HTML is served. 182 | #html_use_opensearch = '' 183 | 184 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 185 | #html_file_suffix = None 186 | 187 | # Output file base name for HTML help builder. 188 | htmlhelp_basename = 'bearddoc' 189 | 190 | 191 | # -- Options for LaTeX output --------------------------------------------- 192 | 193 | latex_elements = { 194 | # The paper size ('letterpaper' or 'a4paper'). 195 | #'papersize': 'letterpaper', 196 | 197 | # The font size ('10pt', '11pt' or '12pt'). 198 | #'pointsize': '10pt', 199 | 200 | # Additional stuff for the LaTeX preamble. 201 | #'preamble': '', 202 | } 203 | 204 | # Grouping the document tree into LaTeX files. List of tuples 205 | # (source start file, target name, title, 206 | # author, documentclass [howto, manual, or own class]). 207 | latex_documents = [ 208 | ('index', 'beard.tex', u'beard Documentation', 209 | u'Invenio collaboration', 'manual'), 210 | ] 211 | 212 | # The name of an image file (relative to this directory) to place at the top of 213 | # the title page. 214 | #latex_logo = None 215 | 216 | # For "manual" documents, if this is true, then toplevel headings are parts, 217 | # not chapters. 218 | #latex_use_parts = False 219 | 220 | # If true, show page references after internal links. 221 | #latex_show_pagerefs = False 222 | 223 | # If true, show URL addresses after external links. 224 | #latex_show_urls = False 225 | 226 | # Documents to append as an appendix to all manuals. 227 | #latex_appendices = [] 228 | 229 | # If false, no module index is generated. 230 | #latex_domain_indices = True 231 | 232 | 233 | # -- Options for manual page output --------------------------------------- 234 | 235 | # One entry per manual page. List of tuples 236 | # (source start file, name, description, authors, manual section). 237 | man_pages = [ 238 | ('index', 'beard', u'beard Documentation', 239 | [u'Invenio collaboration'], 1) 240 | ] 241 | 242 | # If true, show URL addresses after external links. 243 | #man_show_urls = False 244 | 245 | 246 | # -- Options for Texinfo output ------------------------------------------- 247 | 248 | # Grouping the document tree into Texinfo files. 
List of tuples 249 | # (source start file, target name, title, author, 250 | # dir menu entry, description, category) 251 | texinfo_documents = [ 252 | ('index', 'beard', u'beard Documentation', 253 | u'Invenio collaboration', 'beard', 'One line description of project.', 254 | 'Miscellaneous'), 255 | ] 256 | 257 | # Documents to append as an appendix to all manuals. 258 | #texinfo_appendices = [] 259 | 260 | # If false, no module index is generated. 261 | #texinfo_domain_indices = True 262 | 263 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 264 | #texinfo_show_urls = 'footnote' 265 | 266 | # If true, do not generate a @detailmenu in the "Top" node's menu. 267 | #texinfo_no_detailmenu = False 268 | 269 | 270 | # Example configuration for intersphinx: refer to the Python standard library. 271 | intersphinx_mapping = {'http://docs.python.org/': None} 272 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. beard documentation master file, created by 2 | sphinx-quickstart on Wed Oct 29 10:00:05 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to beard's documentation! 7 | ================================= 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. 
doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\beard.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\beard.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | General purpose and introductory examples of Beard. 6 | -------------------------------------------------------------------------------- /examples/applications/author-disambiguation/README.rst: -------------------------------------------------------------------------------- 1 | This example shows how to build a full author disambiguation pipeline. 
2 | The pipeline is made of several scripts: 3 | 4 | - ``sampling.py``: Build a training set of labeled pairs from a set of 5 | signatures, to be further used as input for ``distance.py``.:: 6 | 7 | python sampling.py \ 8 | --input_signatures input/signatures.json \ 9 | --input_clusters input/clusters.json \ 10 | --balanced 1 \ 11 | --sample_size 1000000 \ 12 | --output_pairs pairs/1M_nysiis_balanced.json \ 13 | --use_blocking 1 \ 14 | --blocking_function block_phonetic \ 15 | --blocking_threshold 1 \ 16 | --blocking_phonetic_alg nysiis \ 17 | --verbose 1 18 | 19 | - ``distance.py``: for inferring with supervised learning a distance or 20 | linkage function between signatures. An estimator is learned from 21 | labeled paired data and models whether two signatures belong to the same 22 | person.:: 23 | 24 | python distance.py \ 25 | --distance_pairs 1M_nysiis_balanced.json \ 26 | --distance_model linkage.dat \ 27 | --input_signatures input/signatures.json \ 28 | --input_records input/records.json \ 29 | --input_ethnicity_estimator ethnicity_estimator.pickle \ 30 | --verbose 3 31 | 32 | - ``clustering.py``: Semi-supervised block clustering, for grouping together 33 | signatures from the same author. Signatures are blocked and then clustered 34 | using hierarchical clustering together with the linkage function learned at 35 | the previous step. For each block, the best cut-off threshold is chosen so 36 | as to maximize some scoring metric on the provided labeled data.:: 37 | 38 | python clustering.py \ 39 | --distance_model linkage.dat \ 40 | --input_signatures input/signatures.json \ 41 | --input_records input/records.json \ 42 | --output_clusters predicted_clusters.json \ 43 | --blocking_function block_phonetic \ 44 | --blocking_threshold 0 \ 45 | --blocking_phonetic_alg nysiis \ 46 | --clustering_threshold 0.709 \ 47 | --verbose 3 \ 48 | --n_jobs 16 49 | 50 | If partial clusters are known, these should be specified using the 51 | ``input_clusters`` option. 52 | -------------------------------------------------------------------------------- /examples/applications/author-disambiguation/clustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Author disambiguation -- Clustering. 11 | 12 | See README.rst for further details. 13 | 14 | .. codeauthor:: Gilles Louppe 15 | .. codeauthor:: Mateusz Susik 16 | 17 | """ 18 | 19 | import argparse 20 | import pickle 21 | import json 22 | import numpy as np 23 | 24 | from functools import partial 25 | 26 | try: 27 | from sklearn.cross_validation import train_test_split 28 | except ImportError: 29 | from sklearn.model_selection import train_test_split 30 | 31 | # These imports are used during unpickling. 
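# Although none of these helpers are called directly in this script, the
# distance model unpickled below was presumably built (in distance.py) from
# transformers that reference them; pickle stores functions by module and
# attribute name, so the ``utils`` module and these names must be importable
# in this process for pickle.load to succeed.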
32 | from utils import get_author_full_name 33 | from utils import get_author_other_names 34 | from utils import get_author_initials 35 | from utils import get_surname 36 | from utils import get_first_initial 37 | from utils import get_second_initial 38 | from utils import get_author_affiliation 39 | from utils import get_title 40 | from utils import get_journal 41 | from utils import get_abstract 42 | from utils import get_coauthors_from_range 43 | from utils import get_keywords 44 | from utils import get_collaborations 45 | from utils import get_references 46 | from utils import get_topics 47 | from utils import get_year 48 | from utils import group_by_signature 49 | from utils import load_signatures 50 | 51 | from beard.clustering import BlockClustering 52 | from beard.clustering import block_last_name_first_initial 53 | from beard.clustering import block_phonetic 54 | from beard.clustering import ScipyHierarchicalClustering 55 | from beard.metrics import b3_f_score 56 | from beard.metrics import b3_precision_recall_fscore 57 | from beard.metrics import paired_precision_recall_fscore 58 | 59 | 60 | def _affinity(X, step=10000): 61 | """Custom affinity function, using a pre-learned distance estimator.""" 62 | # Assumes that 'distance_estimator' lives in global, making things fast 63 | global distance_estimator 64 | 65 | all_i, all_j = np.triu_indices(len(X), k=1) 66 | n_pairs = len(all_i) 67 | distances = np.zeros(n_pairs, dtype=np.float64) 68 | 69 | for start in range(0, n_pairs, step): 70 | end = min(n_pairs, start+step) 71 | Xt = np.empty((end-start, 2), dtype=np.object) 72 | 73 | for k, (i, j) in enumerate(zip(all_i[start:end], 74 | all_j[start:end])): 75 | Xt[k, 0], Xt[k, 1] = X[i, 0], X[j, 0] 76 | 77 | Xt = distance_estimator.predict_proba(Xt)[:, 1] 78 | distances[start:end] = Xt[:] 79 | 80 | return distances 81 | 82 | 83 | def clustering(input_signatures, input_records, distance_model, 84 | input_clusters=None, output_clusters=None, 85 | verbose=1, n_jobs=-1, clustering_method="average", 86 | train_signatures_file=None, clustering_threshold=None, 87 | results_file=None, blocking_function="block_phonetic", 88 | blocking_threshold=1, blocking_phonetic_alg="nysiis"): 89 | """Cluster signatures using a pretrained distance model. 90 | 91 | Parameters 92 | ---------- 93 | :param input_signatures: string 94 | Path to the file with signatures. The content should be a JSON array 95 | of dictionaries holding metadata about signatures. 96 | 97 | [{"signature_id": 0, 98 | "author_name": "Doe, John", 99 | "publication_id": 10, ...}, { ... }, ...] 100 | 101 | :param input_records: string 102 | Path to the file with records. The content should be a JSON array of 103 | dictionaries holding metadata about records 104 | 105 | [{"publication_id": 0, 106 | "title": "Author disambiguation using Beard", ... }, { ... }, ...] 107 | 108 | :param distance_model: string 109 | Path to the file with the distance model. The file should be a pickle 110 | created using the ``distance.py`` script. 111 | 112 | :param input_clusters: string 113 | Path to the file with knownn clusters. The file should be a dictionary, 114 | where keys are cluster labels and values are the `signature_id` of the 115 | signatures grouped in the clusters. Signatures assigned to the cluster 116 | with label "-1" are not clustered. 117 | 118 | {"0": [0, 1, 3], "1": [2, 5], ...} 119 | 120 | :param output_clusters: string 121 | Path to the file with output cluster. 
The file will be filled with 122 | clusters, using the same format as ``input_clusters``. 123 | 124 | :param verbose: int 125 | If not zero, function will output scores on stdout. 126 | 127 | :param n_jobs: int 128 | Parameter passed to joblib. Number of threads to be used. 129 | 130 | :param clustering_method: string 131 | Parameter passed to ``ScipyHierarchicalClustering``. Used only if 132 | ``clustering_test_size`` is specified. 133 | 134 | :param train_signatures_file: str 135 | Path to the file with train set signatures. Format the same as in 136 | ``input_signatures``. 137 | 138 | :param clustering_threshold: float 139 | Threshold passed to ``ScipyHierarchicalClustering``. 140 | 141 | :param results_file: str 142 | Path to the file where the results will be output. It will give 143 | additional information about pairwise variant of scores. 144 | 145 | :param blocking_function: string 146 | must be a defined blocking function. Defined functions are: 147 | - "block_last_name_first_initial" 148 | - "block_phonetic" 149 | 150 | :param blocking_threshold: int or None 151 | It determines the maximum allowed size of blocking on the last name 152 | It can only be: 153 | - None; if the blocking function is block_last_name_first_initial 154 | - int; if the blocking function is block_phonetic 155 | please check the documentation of phonetic blocking in 156 | beard.clustering.blocking_funcs.py 157 | 158 | :param blocking_phonetic_alg: string or None 159 | If not None, determines which phonetic algorithm is used. Options: 160 | - "double_metaphone" 161 | - "nysiis" (only for Python 2) 162 | - "soundex" (only for Python 2) 163 | """ 164 | # Assumes that 'distance_estimator' lives in global, making things fast 165 | global distance_estimator 166 | distance_estimator = pickle.load(open(distance_model, "rb")) 167 | 168 | try: 169 | distance_estimator.steps[-1][1].set_params(n_jobs=1) 170 | except: 171 | pass 172 | 173 | signatures, records = load_signatures(input_signatures, 174 | input_records) 175 | 176 | indices = {} 177 | X = np.empty((len(signatures), 1), dtype=np.object) 178 | for i, signature in enumerate(sorted(signatures.values(), 179 | key=lambda s: s["signature_id"])): 180 | X[i, 0] = signature 181 | indices[signature["signature_id"]] = i 182 | 183 | if blocking_function == "block_last_name_first_initial": 184 | block_function = block_last_name_first_initial 185 | else: 186 | block_function = partial(block_phonetic, 187 | threshold=blocking_threshold, 188 | phonetic_algorithm=blocking_phonetic_alg) 189 | 190 | # Semi-supervised block clustering 191 | if input_clusters: 192 | true_clusters = json.load(open(input_clusters, "r")) 193 | y_true = -np.ones(len(X), dtype=np.int) 194 | 195 | for label, signature_ids in true_clusters.items(): 196 | for signature_id in signature_ids: 197 | y_true[indices[signature_id]] = label 198 | 199 | y = -np.ones(len(X), dtype=np.int) 200 | 201 | if train_signatures_file: 202 | train_signatures = json.load(open(train_signatures_file, "r")) 203 | train_ids = [x['signature_id'] for x in train_signatures] 204 | del train_signatures 205 | y[train_ids] = y_true[train_ids] 206 | test_ids = list(set([x['signature_id'] for _, x in 207 | signatures.iteritems()]) - set(train_ids)) 208 | else: 209 | y = y_true 210 | 211 | else: 212 | y = None 213 | 214 | clusterer = BlockClustering( 215 | blocking=block_function, 216 | base_estimator=ScipyHierarchicalClustering( 217 | affinity=_affinity, 218 | threshold=clustering_threshold, 219 | method=clustering_method, 220 | 
supervised_scoring=b3_f_score), 221 | verbose=verbose, 222 | n_jobs=n_jobs).fit(X, y) 223 | 224 | labels = clusterer.labels_ 225 | 226 | # Save predicted clusters 227 | if output_clusters: 228 | clusters = {} 229 | 230 | for label in np.unique(labels): 231 | mask = (labels == label) 232 | clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]] 233 | 234 | json.dump(clusters, open(output_clusters, "w")) 235 | 236 | # Statistics 237 | if verbose and input_clusters: 238 | print("Number of blocks =", len(clusterer.clusterers_)) 239 | print("True number of clusters", len(np.unique(y_true))) 240 | print("Number of computed clusters", len(np.unique(labels))) 241 | 242 | b3_overall = b3_precision_recall_fscore(y_true, labels) 243 | print("B^3 F-score (overall) =", b3_overall[2]) 244 | 245 | if train_signatures_file: 246 | b3_train = b3_precision_recall_fscore( 247 | y_true[train_ids], 248 | labels[train_ids] 249 | ) 250 | b3_test = b3_precision_recall_fscore( 251 | y_true[test_ids], 252 | labels[test_ids] 253 | ) 254 | print("B^3 F-score (train) =", b3_train[2]) 255 | print("B^3 F-score (test) =", b3_test[2]) 256 | if results_file: 257 | paired_overall = paired_precision_recall_fscore(y_true, labels) 258 | paired_train = paired_precision_recall_fscore( 259 | y_true[train_ids], 260 | labels[train_ids] 261 | ) 262 | paired_test = paired_precision_recall_fscore( 263 | y_true[test_ids], 264 | labels[test_ids] 265 | ) 266 | 267 | json.dump({ 268 | "description": ["precision", "recall", "f_score"], 269 | "b3": {"overall": list(b3_overall), 270 | "train": list(b3_train), 271 | "test": list(b3_test) 272 | }, 273 | "paired": {"overall": list(paired_overall), 274 | "train": list(paired_train), 275 | "test": list(paired_test) 276 | } 277 | }, open(results_file, 'w')) 278 | 279 | if __name__ == "__main__": 280 | parser = argparse.ArgumentParser() 281 | parser.add_argument("--distance_model", required=True, type=str) 282 | parser.add_argument("--input_signatures", required=True, type=str) 283 | parser.add_argument("--input_records", required=True, type=str) 284 | parser.add_argument("--input_clusters", default=None, type=str) 285 | parser.add_argument("--output_clusters", required=True, type=str) 286 | parser.add_argument("--clustering_method", default="average", type=str) 287 | parser.add_argument("--clustering_threshold", default=None, type=float) 288 | parser.add_argument("--train_signatures", default=None, type=str) 289 | parser.add_argument("--results_file", default=None, type=str) 290 | parser.add_argument("--blocking_function", default="block_phonetic", 291 | type=str) 292 | parser.add_argument("--blocking_threshold", default=1, type=int) 293 | parser.add_argument("--blocking_phonetic_alg", default="nysiis", type=str) 294 | parser.add_argument("--verbose", default=1, type=int) 295 | parser.add_argument("--n_jobs", default=1, type=int) 296 | args = parser.parse_args() 297 | 298 | clustering(args.input_signatures, args.input_records, args.distance_model, 299 | args.input_clusters, args.output_clusters, 300 | args.verbose, args.n_jobs, args.clustering_method, 301 | args.train_signatures, args.clustering_threshold, 302 | args.results_file, args.blocking_function, 303 | args.blocking_threshold, args.blocking_phonetic_alg) 304 | -------------------------------------------------------------------------------- /examples/applications/author-disambiguation/ethnicity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part 
of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | """Author disambiguation -- Build an estimator for guessing an author's
11 | ethnic group from their name.
12 | 
13 | .. codeauthor:: Gilles Louppe
14 | .. codeauthor:: Hussein Al-Natsheh
15 | 
16 | """
17 | 
18 | import argparse
19 | import numpy as np
20 | import pandas as pd
21 | import pickle
22 | 
23 | from sklearn.feature_extraction.text import TfidfVectorizer
24 | from sklearn.pipeline import Pipeline
25 | from sklearn.svm import LinearSVC
26 | 
27 | from beard.utils import normalize_name
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     parser = argparse.ArgumentParser()
32 |     parser.add_argument("--input_datafile", required=True, type=str)
33 |     parser.add_argument("--output_ethnicity_estimator",
34 |                         default="ethnicity_estimator.pickle", type=str)
35 |     parser.add_argument("--C", default=4.0, type=float)
36 |     args = parser.parse_args()
37 | 
38 |     # Load data
39 |     data = pd.read_csv(args.input_datafile)
40 |     y = data.RACE.values
41 |     X = ["%s, %s" % (last, first) for last, first in zip(data.NAMELAST.values,
42 |                                                          data.NAMEFRST.values)]
43 |     X = [normalize_name(name) for name in X]
44 | 
45 |     # Train an estimator
46 |     estimator = Pipeline([
47 |         ("transformer", TfidfVectorizer(analyzer="char_wb",
48 |                                         ngram_range=(1, 5),
49 |                                         min_df=0.00005,
50 |                                         dtype=np.float32,
51 |                                         decode_error="replace")),
52 |         ("classifier", LinearSVC(C=args.C))])
53 |     estimator.fit(X, y)
54 | 
55 |     pickle.dump(estimator,
56 |                 open(args.output_ethnicity_estimator, "wb"),  # pickle needs binary mode
57 |                 protocol=pickle.HIGHEST_PROTOCOL)
58 | 
--------------------------------------------------------------------------------
/examples/applications/author-disambiguation/sampling.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is part of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | r"""Script for generating the training set.
11 | 
12 | It samples pairs of signatures labeled with 1 if they are of different authors
13 | or 0 if they are of the same author.
14 | 
15 | Examples of command line use:
16 | 
17 | Sampling without blocking
18 | 
19 | python sampling.py --input_clusters big/clusters.json \
20 |     --input_signatures train.json --output_pairs pairs.json --use_blocking 0
21 | 
22 | Sampling with blocking, without balancing
23 | 
24 | python sampling.py --input_clusters big/clusters.json \
25 |     --input_signatures train.json --output_pairs pairs.json --balanced 0
26 | 
27 | Sampling with blocking, with balancing and a smaller sample size
28 | 
29 | python sampling.py --input_clusters big/clusters.json --sample_size 500000 \
30 |     --input_signatures train.json --output_pairs pairs.json --balanced 1
31 | 
32 | 
33 | .. codeauthor:: Hussein Al-Natsheh
34 | ..
codeauthor:: Mateusz Susik 35 | """ 36 | 37 | from __future__ import print_function 38 | 39 | import argparse 40 | import json 41 | import math 42 | import numpy as np 43 | import random 44 | import six 45 | 46 | from beard.clustering import block_phonetic 47 | from beard.clustering import block_last_name_first_initial 48 | 49 | import sys 50 | 51 | # for Python 3 52 | if sys.version_info[0]==3: 53 | from functools import reduce 54 | 55 | def _noblocking_sampling(sample_size, train_signatures, clusters_reversed): 56 | pairs = [] 57 | # Pairs dict will prevent duplicates 58 | pairs_dict = {} 59 | category_size = sample_size // 2 60 | negative = 0 61 | while negative < category_size: 62 | s1 = random.choice(train_signatures)['signature_id'] 63 | s2 = random.choice(train_signatures)['signature_id'] 64 | if s1 == s2: 65 | continue 66 | elif s1 > s2: 67 | s1, s2 = s2, s1 68 | s1_cluster = clusters_reversed[s1] 69 | s2_cluster = clusters_reversed[s2] 70 | if s1_cluster != s2_cluster: 71 | if negative < category_size: 72 | if s1 in pairs_dict: 73 | if s2 in pairs_dict[s1]: 74 | continue 75 | pairs_dict[s1].append(s2) 76 | else: 77 | pairs_dict[s1] = [s2] 78 | pairs.append((s1, s2, 1)) 79 | negative += 1 80 | 81 | print("successfully sampled pairs from different authors") 82 | 83 | positive_pairs = [] 84 | for i in range(100): 85 | print("sampling positive examples: %s out of 100 folds" % (i+1)) 86 | some_signatures = random.sample(train_signatures, 87 | len(train_signatures)//20) 88 | for i, s1 in enumerate(some_signatures): 89 | for s2 in some_signatures[i+1:]: 90 | s1_id = s1['signature_id'] 91 | s2_id = s2['signature_id'] 92 | s1_cluster = clusters_reversed[s1_id] 93 | s2_cluster = clusters_reversed[s2_id] 94 | if s1_cluster == s2_cluster: 95 | positive_pairs.append((s1_id, s2_id, 0)) 96 | 97 | sampled = random.sample(positive_pairs, category_size//100) 98 | pairs += sampled 99 | for s1, s2, _ in sampled: 100 | if s1 > s2: 101 | s2, s1 = s1, s2 102 | if s1 in pairs_dict: 103 | if s2 in pairs_dict[s1]: 104 | continue 105 | pairs_dict[s1].append(s2) 106 | else: 107 | pairs_dict[s1] = [s2] 108 | 109 | print("successfully sampled pairs belonging to the same author") 110 | return pairs 111 | 112 | 113 | def pair_sampling(blocking_function, 114 | blocking_threshold, 115 | blocking_phonetic_alg, 116 | clusters_filename, 117 | train_filename, 118 | balanced=1, verbose=1, 119 | sample_size=1000000, 120 | use_blocking=1): 121 | """Sampling pairs from the ground-truth data. 122 | 123 | This function builds a pair dataset from claimed signatures. 124 | It gives the ability to specify the 125 | blocking function and whether the sampling would be balanced or not. 126 | 127 | Parameters 128 | ---------- 129 | :param blocking_function: string 130 | must be a defined blocking function. Defined functions are: 131 | - "block_last_name_first_initial" 132 | - "block_phonetic" 133 | 134 | :param blocking_threshold: int or None 135 | It determines the maximum allowed size of blocking on the last name 136 | It can only be: 137 | - None; if the blocking function is block_last_name_first_initial 138 | - int; if the blocking function is block_phonetic 139 | please check the documentation of phonetic blocking in 140 | beard.clustering.blocking_funcs.py 141 | 142 | :param blocking_phonetic_alg: string or None 143 | If not None, determines which phonetic algorithm is used. 
Options: 144 | - "double_metaphone" 145 | - "nysiis" (only for Python 2) 146 | - "soundex" (only for Python 2) 147 | 148 | :param clusters_filename: string 149 | Path to the input clusters (ground-truth) file 150 | 151 | :param train_filename: string 152 | Path to train set file 153 | 154 | :param balanced: boolean 155 | determines if the sampling would be balanced. 156 | The balance is defined as the same number of pairs with the same name 157 | on signature and pairs with different names. The balance is preserved 158 | both in the pairs belonging to one authors and in the pairs belonging 159 | to different authors. Note that if there are not enough pairs to 160 | satisfy the balance condition, some of the pairs will be replicated. 161 | 162 | :param verbose: boolean 163 | determines if some processing statistics would be shown 164 | 165 | :param sample_size: integer 166 | The desired sample size 167 | 168 | :param use_blocking: boolean 169 | determines if the signatures should be blocked before sampling 170 | 171 | Returns 172 | ------- 173 | :returns: list 174 | list of signature pairs 175 | """ 176 | # Load ground-truth 177 | true_clusters = json.load(open(clusters_filename, "r")) 178 | clusters_reversed = {v: k for k, va in six.iteritems(true_clusters) 179 | for v in va} 180 | 181 | train_signatures = json.load(open(train_filename, "r")) 182 | 183 | if not use_blocking: 184 | return _noblocking_sampling(sample_size, train_signatures, 185 | clusters_reversed) 186 | 187 | train_signatures_ids = [] 188 | for item in train_signatures: 189 | train_signatures_ids.append([item]) 190 | 191 | train_signatures_ids = np.array(train_signatures_ids) 192 | 193 | if blocking_function == "block_last_name_first_initial": 194 | blocking = block_last_name_first_initial(train_signatures_ids) 195 | elif blocking_function == "block_phonetic" and blocking_threshold: 196 | blocking = block_phonetic(train_signatures_ids, 197 | blocking_threshold, 198 | blocking_phonetic_alg) 199 | else: 200 | raise ValueError("No such blocking strategy.") 201 | 202 | category_size = sample_size // 4 203 | 204 | blocking_dict = {} 205 | 206 | for index, b in enumerate(blocking): 207 | if b in blocking_dict: 208 | blocking_dict[b].append(index) 209 | else: 210 | blocking_dict[b] = [index] 211 | 212 | # 'd' stands for different, 's' stands for same, 'a' stands for author 213 | # 'n' stands for name 214 | dasn = [] 215 | sasn = [] 216 | sadn = [] 217 | dadn = [] 218 | 219 | for _, sig_s in six.iteritems(blocking_dict): 220 | 221 | for i, s1 in enumerate(sig_s): 222 | for s2 in sig_s[i+1:]: 223 | s1_id = train_signatures[s1]['signature_id'] 224 | s2_id = train_signatures[s2]['signature_id'] 225 | s1_name = train_signatures[s1]['author_name'] 226 | s2_name = train_signatures[s2]['author_name'] 227 | s1_cluster = clusters_reversed[s1_id] 228 | s2_cluster = clusters_reversed[s2_id] 229 | 230 | if s1_cluster == s2_cluster: 231 | # Same author 232 | if s1_name == s2_name: 233 | sasn.append((s1_id, s2_id, 0)) 234 | else: 235 | sadn.append((s1_id, s2_id, 0)) 236 | else: 237 | # Different authors 238 | if s1_name == s2_name: 239 | dasn.append((s1_id, s2_id, 1)) 240 | else: 241 | dadn.append((s1_id, s2_id, 1)) 242 | 243 | if balanced: 244 | if verbose: 245 | print("len of dasn:", len(dasn)) 246 | print("len of sadn:", len(sadn)) 247 | print("len of sasn:", len(sasn)) 248 | print("len of dadn:", len(dadn)) 249 | 250 | all_pairs = map(lambda x: int(math.ceil( 251 | category_size/float(len(x)))) * x, 252 | [dasn, sasn, sadn, dadn]) 253 | 254 
| if sys.version_info[0]==3:
255 |             all_pairs = list(all_pairs)
256 | 
257 |         pairs = reduce(lambda x, y: x + random.sample(y, category_size),
258 |                        all_pairs, [])
259 | 
260 |     else:
261 |         positive = sasn + sadn
262 |         negative = dasn + dadn
263 |         pairs = random.sample(positive,
264 |                               sample_size // 2) + random.sample(negative,
265 |                                                                 sample_size // 2)
266 | 
267 |     return pairs
268 | 
269 | if __name__ == "__main__":
270 |     # Parse command line arguments
271 |     parser = argparse.ArgumentParser()
272 |     parser.add_argument("--input_signatures", required=True, type=str)
273 |     parser.add_argument("--input_clusters", default="clusters.json", type=str)
274 |     parser.add_argument("--balanced", default=1, type=int)
275 |     parser.add_argument("--sample_size", default=1000000, type=int)
276 |     parser.add_argument("--output_pairs", default="pairs.json", type=str)
277 |     parser.add_argument("--use_blocking", default=1, type=int)
278 |     parser.add_argument("--blocking_function", default="block_phonetic",
279 |                         type=str)
280 |     parser.add_argument("--blocking_threshold", default=1, type=int)
281 |     parser.add_argument("--blocking_phonetic_alg", default="nysiis", type=str)
282 |     parser.add_argument("--verbose", default=1, type=int)
283 | 
284 |     args = parser.parse_args()
285 | 
286 |     pairs = pair_sampling(
287 |         train_filename=args.input_signatures,
288 |         clusters_filename=args.input_clusters,
289 |         balanced=args.balanced,
290 |         sample_size=args.sample_size,
291 |         use_blocking=args.use_blocking,
292 |         blocking_function=args.blocking_function,
293 |         blocking_threshold=args.blocking_threshold,
294 |         blocking_phonetic_alg=args.blocking_phonetic_alg,
295 |         verbose=args.verbose
296 |     )
297 | 
298 |     if args.verbose:
299 |         print("number of pairs", len(pairs))
300 | 
301 |     json.dump(pairs, open(args.output_pairs, "w"))
302 | 
303 |     print("The sampled pairs file was successfully created")
304 | 
--------------------------------------------------------------------------------
/examples/applications/author-disambiguation/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is part of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | """Helpers for author disambiguation.
11 | 
12 | .. codeauthor:: Gilles Louppe
13 | .. codeauthor:: Mateusz Susik
14 | 
15 | """
16 | 
17 | import json
18 | 
19 | from beard.utils import given_name
20 | from beard.utils import name_initials
21 | from beard.utils import normalize_name
22 | from beard.utils import given_name_initial
23 | 
24 | 
25 | def load_signatures(signatures_filename, records_filename):
26 |     """Load signatures from JSON files.
27 | 
28 |     Parameters
29 |     ----------
30 |     :param signatures_filename: string
31 |         Path to the signatures file. The file should be in json format.
32 | 
33 |     :param records_filename: string
34 |         Path to the records file. The file should be in json format.
35 | 
36 |     Returns
37 |     -------
38 |     :returns: tuple
39 |         Signatures and records, as dictionaries keyed by signature_id and publication_id.
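        Each signature is additionally given a "publication" key holding its
        full record. For example, using the sample data shipped in
        examples/data (file names assumed to follow that directory's naming):

            signatures, records = load_signatures("wang_signatures.json",
                                                   "wang_records.json")
            publication = signatures[some_signature_id]["publication"]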
40 | """ 41 | signatures = json.load(open(signatures_filename, "r")) 42 | records = json.load(open(records_filename, "r")) 43 | 44 | if isinstance(signatures, list): 45 | signatures = {s["signature_id"]: s for s in signatures} 46 | 47 | if isinstance(records, list): 48 | records = {r["publication_id"]: r for r in records} 49 | 50 | for signature_id, signature in signatures.items(): 51 | signature["publication"] = records[signature["publication_id"]] 52 | 53 | return signatures, records 54 | 55 | 56 | def get_author_full_name(s): 57 | """Get author full name from the signature. 58 | 59 | Parameters 60 | ---------- 61 | :param s: dict 62 | Signature 63 | 64 | Returns 65 | ------- 66 | :returns: string 67 | Normalized author name 68 | """ 69 | v = s["author_name"] 70 | v = normalize_name(v) if v else "" 71 | return v 72 | 73 | 74 | def get_first_given_name(s): 75 | """Get author first given name from the signature. 76 | 77 | Parameters 78 | ---------- 79 | :param s: dict 80 | Signature 81 | 82 | Returns 83 | ------- 84 | :returns: string 85 | Author's first given name 86 | """ 87 | v = given_name(s["author_name"], 0) 88 | return v 89 | 90 | 91 | def get_second_given_name(s): 92 | """Get author second given name from the signature. 93 | 94 | Parameters 95 | ---------- 96 | :param s: dict 97 | Signature 98 | 99 | Returns 100 | ------- 101 | :returns: string 102 | Author's second given name 103 | """ 104 | v = given_name(s["author_name"], 1) 105 | return v 106 | 107 | 108 | def get_surname(s): 109 | return s['author_name'].split(" ")[0].split(",")[0] 110 | 111 | 112 | def get_first_initial(s): 113 | v = given_name_initial(s["author_name"], 0) 114 | try: 115 | return v 116 | except IndexError: 117 | return "" 118 | 119 | 120 | def get_second_initial(s): 121 | """Get author second given name's initial from the signature. 122 | 123 | Parameters 124 | ---------- 125 | :param s: dict 126 | Signature 127 | 128 | Returns 129 | ------- 130 | :returns: string 131 | Second given name's initial. Empty string in case it's not available. 132 | """ 133 | v = given_name_initial(s["author_name"], 1) 134 | try: 135 | return v 136 | except IndexError: 137 | return "" 138 | 139 | 140 | def get_author_other_names(s): 141 | """Get author other names from the signature. 142 | 143 | Parameters 144 | ---------- 145 | :param s: dict 146 | Signature 147 | 148 | Returns 149 | ------- 150 | :returns: string 151 | Normalized other author names 152 | """ 153 | v = s["author_name"] 154 | v = v.split(",", 1) 155 | v = normalize_name(v[1]) if len(v) == 2 else "" 156 | return v 157 | 158 | 159 | def get_author_initials(s): 160 | """Get author initials from the signature. 161 | 162 | Parameters 163 | ---------- 164 | :param s: dict 165 | Signature 166 | 167 | Returns 168 | ------- 169 | :returns: string 170 | Initials, not separated 171 | """ 172 | v = s["author_name"] 173 | v = v if v else "" 174 | v = "".join(name_initials(v)) 175 | return v 176 | 177 | 178 | def get_author_affiliation(s): 179 | """Get author affiliation from the signature. 180 | 181 | Parameters 182 | ---------- 183 | :param s: dict 184 | Signature 185 | 186 | Returns 187 | ------- 188 | :returns: string 189 | Normalized affiliation name 190 | """ 191 | v = s["author_affiliation"] 192 | v = normalize_name(v) if v else "" 193 | return v 194 | 195 | 196 | def get_title(s): 197 | """Get publication's title from the signature. 
198 | 199 | Parameters 200 | ---------- 201 | :param s: dict 202 | Signature 203 | 204 | Returns 205 | ------- 206 | :returns: string 207 | Title of the publication 208 | """ 209 | v = s["publication"]["title"] 210 | v = v if v else "" 211 | return v 212 | 213 | 214 | def get_journal(s): 215 | """Get journal's name from the signature. 216 | 217 | Parameters 218 | ---------- 219 | :param s: dict 220 | Signature 221 | 222 | Returns 223 | ------- 224 | :returns: string 225 | Journal's name 226 | """ 227 | v = s["publication"]["journal"] 228 | v = v if v else "" 229 | return v 230 | 231 | 232 | def get_abstract(s): 233 | """Get author full name from the signature. 234 | 235 | Parameters 236 | ---------- 237 | :param s: dict 238 | Signature 239 | 240 | Returns 241 | ------- 242 | :returns: string 243 | Normalized author name 244 | """ 245 | v = s["publication"]["abstract"] 246 | v = v if v else "" 247 | return v 248 | 249 | 250 | def get_coauthors(s): 251 | """Get coauthors from the signature. 252 | 253 | Parameters 254 | ---------- 255 | :param s: dict 256 | Signature 257 | 258 | Returns 259 | ------- 260 | :returns: string 261 | Coauthors ids separated by a space 262 | """ 263 | v = s["publication"]["authors"] 264 | v = " ".join(v) 265 | return v 266 | 267 | 268 | def get_coauthors_from_range(s, range=10): 269 | """Get coauthors from the signature. 270 | 271 | Only the signatures from the range-neighbourhood of the given signature 272 | will be selected. Signatures on the paper are ordered (although they don't 273 | have to be sorted!), and the distance between signatures is defined 274 | as absolute difference of the indices. 275 | 276 | The function was introduced due to the high memory usage of 277 | a simple version. 278 | 279 | Parameters 280 | ---------- 281 | :param s: dict 282 | Signature 283 | :param range: integer 284 | The maximum distance for the signatures between the author and his 285 | coauthor. 286 | 287 | Returns 288 | ------- 289 | :returns: string 290 | Coauthors ids separated by a space 291 | """ 292 | v = s["publication"]["authors"] 293 | try: 294 | index = v.index(s["author_name"]) 295 | v = " ".join(v[max(0, index-range):min(len(v), index+range)]) 296 | return v 297 | except ValueError: 298 | v = " ".join(v) 299 | return v 300 | 301 | 302 | def get_keywords(s): 303 | """Get keywords from the signature. 304 | 305 | Parameters 306 | ---------- 307 | :param s: dict 308 | Signature 309 | 310 | Returns 311 | ------- 312 | :returns: string 313 | Keywords separated by a space 314 | """ 315 | v = s["publication"]["keywords"] 316 | v = " ".join(v) 317 | return v 318 | 319 | 320 | def get_topics(s): 321 | """Get topics from the signature. 322 | 323 | Parameters 324 | ---------- 325 | :param s: dict 326 | Signature 327 | 328 | Returns 329 | ------- 330 | :returns: string 331 | Topics separated by a space 332 | """ 333 | v = s["publication"]["topics"] 334 | v = " ".join(v) 335 | return v 336 | 337 | 338 | def get_collaborations(s): 339 | """Get collaborations from the signature. 340 | 341 | Parameters 342 | ---------- 343 | :param s: dict 344 | Signature 345 | 346 | Returns 347 | ------- 348 | :returns: string 349 | Collaboations separated by a space 350 | """ 351 | v = s["publication"]["collaborations"] 352 | v = " ".join(v) 353 | return v 354 | 355 | 356 | def get_references(s): 357 | """Get references from the signature. 
358 |     Parameters
359 |     ----------
360 |     :param s: dict
361 |         Signature
362 |     Returns
363 |     -------
364 |     :returns: string
365 |         Ids of references separated by a space
366 |     """
367 |     v = s["publication"]["references"]
368 |     v = " ".join(str(r) for r in v)
369 |     v = v if v else ""
370 |     return v
371 | 
372 | 
373 | def get_year(s):
374 |     """Get year from the signature.
375 | 
376 |     Parameters
377 |     ----------
378 |     :param s: dict
379 |         Signature
380 | 
381 |     Returns
382 |     -------
383 |     :returns: int
384 |         Year of publication if present on the signature, -1 otherwise
385 |     """
386 |     v = s["publication"]["year"]
387 |     v = int(v) if v else -1
388 |     return v
389 | 
390 | 
391 | def group_by_signature(r):
392 |     """Grouping function for ``PairTransformer``.
393 | 
394 |     Parameters
395 |     ----------
396 |     :param r: iterable
397 |         Signature in a singleton.
398 | 
399 |     Returns
400 |     -------
401 |     :returns: string
402 |         Signature id
403 |     """
404 |     return r[0]["signature_id"]
405 | 
--------------------------------------------------------------------------------
/examples/author_disambiguation.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is part of Beard.
4 | # Copyright (C) 2015 CERN.
5 | #
6 | # Beard is a free software; you can redistribute it and/or modify it
7 | # under the terms of the Revised BSD License; see LICENSE file for
8 | # more details.
9 | 
10 | """Simplified author disambiguation example.
11 | 
12 | This example shows how to use block clustering for the author
13 | disambiguation problem. The goal is to cluster together all (author name,
14 | affiliation) tuples that correspond to the same actual person.
15 | 
16 | .. codeauthor:: Gilles Louppe
17 | 
18 | """
19 | 
20 | from __future__ import print_function
21 | 
22 | import numpy as np
23 | 
24 | from beard.clustering import BlockClustering
25 | from beard.clustering import block_last_name_first_initial
26 | from beard.clustering import ScipyHierarchicalClustering
27 | from beard.metrics import paired_f_score
28 | from beard.utils import normalize_name
29 | from beard.utils import name_initials
30 | 
31 | 
32 | def affinity(X):
33 |     """Compute pairwise distances between (author, affiliation) tuples.
34 | 
35 |     Note that this function is a heuristic. It should ideally be replaced
36 |     by a more robust distance function, e.g. using a model learned over
37 |     pairs of tuples.
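    A learned alternative is sketched in
    examples/applications/author-disambiguation/: there, distance.py fits a
    pairwise distance model and clustering.py plugs its predict_proba output
    in as the affinity (see the _affinity helper in that script).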
38 | """ 39 | distances = np.zeros((len(X), len(X)), dtype=np.float) 40 | 41 | for i, j in zip(*np.triu_indices(len(X), k=1)): 42 | name_i = normalize_name(X[i, 0]) 43 | aff_i = X[i, 1] 44 | initials_i = name_initials(name_i) 45 | name_j = normalize_name(X[j, 0]) 46 | aff_j = X[j, 1] 47 | initials_j = name_initials(name_j) 48 | 49 | # Names and affiliations match 50 | if (name_i == name_j and aff_i == aff_j): 51 | distances[i, j] = 0.0 52 | 53 | # Compatible initials and affiliations match 54 | elif (len(initials_i | initials_j) == max(len(initials_i), 55 | len(initials_j)) and 56 | aff_i == aff_j and aff_i != ""): 57 | distances[i, j] = 0.0 58 | 59 | # Initials are not compatible 60 | elif (len(initials_i | initials_j) != max(len(initials_i), 61 | len(initials_j))): 62 | distances[i, j] = 1.0 63 | 64 | # We dont know 65 | else: 66 | distances[i, j] = 0.5 67 | 68 | distances += distances.T 69 | return distances 70 | 71 | if __name__ == "__main__": 72 | # Load data 73 | data = np.load("data/author-disambiguation.npz") 74 | X = data["X"] 75 | truth = data["y"] 76 | 77 | # Block clustering with fixed threshold 78 | block_clusterer = BlockClustering( 79 | blocking=block_last_name_first_initial, 80 | base_estimator=ScipyHierarchicalClustering( 81 | threshold=0.5, 82 | affinity=affinity, 83 | method="complete"), 84 | verbose=3, 85 | n_jobs=-1) 86 | block_clusterer.fit(X) 87 | labels = block_clusterer.labels_ 88 | 89 | # Print clusters 90 | for cluster in np.unique(labels): 91 | entries = set() 92 | 93 | for name, affiliation in X[labels == cluster]: 94 | entries.add((name, affiliation)) 95 | 96 | print("Cluster #%d = %s" % (cluster, entries)) 97 | print() 98 | 99 | # Statistics 100 | print("Number of blocks =", len(block_clusterer.clusterers_)) 101 | print("True number of clusters", len(np.unique(truth))) 102 | print("Number of computed clusters", len(np.unique(labels))) 103 | print("Paired F-score =", paired_f_score(truth, labels)) 104 | -------------------------------------------------------------------------------- /examples/data/README.rst: -------------------------------------------------------------------------------- 1 | This directory contains disambiguation input data from INSPIRE. 2 | All signatures of people whose names contain *wang* are extracted. 3 | Please note that ids of records/signatures in ``wang_records.json`` file 4 | under authors/references/citations keys do NOT represent positions 5 | of corresponding entities in the files in this directory. 6 | Still, the disambiguation runs finely on these files. 7 | -------------------------------------------------------------------------------- /examples/data/author-disambiguation.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/examples/data/author-disambiguation.npz -------------------------------------------------------------------------------- /miniconda.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/beard/0ad199d6c89ce331be29c3c9112b27b7d87011f8/miniconda.sh -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # This file is part of Beard. 2 | # Copyright (C) 2014 CERN. 
3 | # 4 | # Beard is a free software; you can redistribute it and/or modify it 5 | # under the terms of the Revised BSD License; see LICENSE file for 6 | # more details. 7 | 8 | [pytest] 9 | addopts = --pep8 --ignore=doc --ignore=setup.py --ignore=examples --ignore=beard/ext --doctest-modules --cov=beard --cov-report=term-missing --cov-config=.coveragerc 10 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This file is part of Beard. 3 | # Copyright (C) 2016 CERN. 4 | # 5 | # Beard is a free software; you can redistribute it and/or modify it 6 | # under the terms of the Revised BSD License; see LICENSE file for 7 | # more details. 8 | 9 | set -e 10 | 11 | check-manifest --ignore miniconda.sh 12 | python setup.py test 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Setup file for Beard. 11 | 12 | .. codeauthor:: Mateusz Susik 13 | .. codeauthor:: Jan Aage Lavik 14 | 15 | """ 16 | 17 | from setuptools import setup, find_packages 18 | from setuptools.command.test import test as TestCommand 19 | import os 20 | import re 21 | import sys 22 | 23 | 24 | class PyTest(TestCommand): 25 | 26 | """Handle ``python setup.py test``.""" 27 | 28 | user_options = [("pytest-args=", "a", "Arguments to pass to py.test")] 29 | 30 | def initialize_options(self): 31 | """Read options from ``pytest.ini`` config file.""" 32 | TestCommand.initialize_options(self) 33 | try: 34 | from ConfigParser import ConfigParser 35 | except ImportError: 36 | from configparser import ConfigParser 37 | config = ConfigParser() 38 | config.read("pytest.ini") 39 | self.pytest_args = config.get("pytest", "addopts").split(" ") 40 | 41 | def finalize_options(self): 42 | """Finalize options.""" 43 | TestCommand.finalize_options(self) 44 | self.test_args = [] 45 | self.test_suite = True 46 | 47 | def run_tests(self): 48 | """Run tests using pytest library.""" 49 | # import here, cause outside the eggs aren't loaded 50 | import pytest 51 | errno = pytest.main(self.pytest_args) 52 | sys.exit(errno) 53 | 54 | 55 | packages = find_packages(exclude=['doc', 'examples']) 56 | # Get the version string. Cannot be done with import! 
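# (Presumably because importing the package at build time would require its
# runtime dependencies -- numpy, scipy, scikit-learn, ... -- to already be
# installed, the version is parsed out of beard/__init__.py with a regular
# expression below instead.)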
57 | with open(os.path.join("beard", "__init__.py"), "rt") as f: 58 | _version = re.search( 59 | '__version__\s*=\s*"(?P.*)"\n', 60 | f.read() 61 | ).group("version") 62 | 63 | _classifiers = [ 64 | # classifiers for PyPI 65 | "Development Status :: 4 - Beta", 66 | "Environment :: Console", 67 | "Intended Audience :: Developers", 68 | "License :: OSI Approved :: BSD License", 69 | "Operating System :: OS Independent", 70 | "Programming Language :: Python", 71 | "Programming Language :: Python :: 2", 72 | "Programming Language :: Python :: 2.7", 73 | "Programming Language :: Python :: 3", 74 | "Programming Language :: Python :: 3.6", 75 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 76 | "Topic :: Scientific/Engineering :: Information Analysis" 77 | ] 78 | 79 | _keywords = [ 80 | "author disambiguation", 81 | "machine learning", 82 | "data mining" 83 | ] 84 | 85 | _install_requires = [ 86 | "setuptools-scm<4.0.0", 87 | # jellyfish 0.7 is Python 3 only 88 | "jellyfish<=0.7", 89 | "numpy>=1.9", 90 | "scipy>=0.14", 91 | "scikit-learn>=0.15.2", 92 | "six", 93 | "structlog", 94 | "unidecode", 95 | ] 96 | 97 | if sys.version[0] == '2': 98 | # use version 1.1 due to Soundex bug in 1.2 99 | _install_requires.append("fuzzy==1.1") 100 | else: 101 | # need to use version 1.2 with buggy Soundex for Python 3 compatibility 102 | _install_requires.append("fuzzy~=1.0,>=1.2") 103 | 104 | _tests_require = [ 105 | "coverage", 106 | "pytest>=2.6.1", 107 | "pytest-cache>=1.0", 108 | "pytest-cov>=1.8.0", 109 | "pytest-pep8>=1.0.6", 110 | ] 111 | 112 | _parameters = { 113 | "author": "CERN", 114 | "author_email": "admin@inspirehep.net", 115 | "classifiers": _classifiers, 116 | "cmdclass": {"test": PyTest}, 117 | "description": "Bibliographic Entity Automatic \ 118 | Recognition and Disambiguation", 119 | "install_requires": _install_requires, 120 | "keywords": _keywords, 121 | "license": "BSD", 122 | "long_description": open("README.rst").read(), 123 | "name": "beard", 124 | "packages": packages, 125 | "platforms": "any", 126 | "tests_require": _tests_require, 127 | "url": "https://github.com/inspirehep/beard", 128 | "version": _version, 129 | } 130 | 131 | setup(**_parameters) 132 | -------------------------------------------------------------------------------- /tests/clustering/test_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of _Block class. 11 | 12 | .. codeauthor:: Mateusz Susik 13 | 14 | """ 15 | 16 | import pytest 17 | 18 | from beard.clustering.blocking_funcs import _Block 19 | 20 | 21 | @pytest.fixture 22 | def block(): 23 | """Create a block for mr Abc, D. 
Vasquez.""" 24 | return _Block(*(("ABC",), ("D", "VSQ"))) 25 | 26 | 27 | def test_add_signature(block): 28 | """Test adding signatures to the cluster.""" 29 | assert block._content[("ABC",)][("D", "VSQ")] == 1 30 | block.add_signature(*(("ABC",), ("D", "VSQ"))) 31 | assert block._content[("ABC",)][("D", "VSQ")] == 2 32 | block.add_signature(*(("ABC",), ("E",))) 33 | assert block._content[("ABC",)][("E",)] == 1 34 | block.add_signature(*(("ABD",), ("D", "VSQ",))) 35 | assert block._content[("ABD",)][("D", "VSQ")] == 1 36 | block.add_signature(*(("ABC", ""), ("D", "VSQ"))) 37 | # Check handling of multiple surnames 38 | block.add_signature(*(("ABD", "EFG"), ("D", "VSQ",))) 39 | assert block._content[("ABD", "EFG")][("D", "VSQ")] == 1 40 | assert block._content[("ABC",)][("D", "VSQ")] == 2 41 | 42 | 43 | def test_compare_tokens_from_last(block): 44 | """Test comparing tokens from the back.""" 45 | assert block.compare_tokens_from_last(("VSQ",), ("ABC",)) 46 | assert block.compare_tokens_from_last(("C", "D", "VSQ",), ("ABC",)) 47 | with pytest.raises(KeyError) as excinfo: 48 | block.compare_tokens_from_last(("VSQ",), ("DEF")) 49 | assert "cluster doesn't contain a key" in str(excinfo.value) 50 | assert not block.compare_tokens_from_last(("VSD",), ("ABC",)) 51 | assert not block.compare_tokens_from_last(("DGM", "VSQ"), ("ABC",)) 52 | 53 | 54 | def test_contains(block): 55 | """Test contains method.""" 56 | assert block.contains(("ABC",)) 57 | assert not block.contains(("DEF",)) 58 | -------------------------------------------------------------------------------- /tests/clustering/test_blocking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of blocking for clustering. 11 | 12 | .. 
codeauthor:: Gilles Louppe 13 | 14 | """ 15 | from __future__ import division 16 | 17 | import numpy as np 18 | from numpy.testing import assert_equal 19 | from numpy.testing import assert_array_equal 20 | 21 | from pytest import mark 22 | import pytest 23 | 24 | from sklearn.cluster import AgglomerativeClustering 25 | from sklearn.cluster import MiniBatchKMeans 26 | from sklearn.datasets import make_blobs 27 | from sklearn.metrics.pairwise import euclidean_distances 28 | from sklearn.utils import check_random_state 29 | 30 | from beard.clustering import BlockClustering 31 | from beard.clustering import ScipyHierarchicalClustering 32 | from beard.metrics import paired_f_score 33 | 34 | random_state = check_random_state(42) 35 | X, y = make_blobs(centers=4, shuffle=False, random_state=random_state) 36 | 37 | 38 | def _distance(X_ids): 39 | return euclidean_distances(X[X_ids.ravel()]) 40 | 41 | 42 | @mark.parametrize('n_jobs', (1, 2)) 43 | def test_fit(n_jobs): 44 | """Test fit.""" 45 | # Single block 46 | clusterer = BlockClustering( 47 | blocking="single", 48 | base_estimator=AgglomerativeClustering(n_clusters=4, 49 | linkage="complete"), 50 | n_jobs=n_jobs) 51 | clusterer.fit(X) 52 | 53 | assert_equal(len(clusterer.clusterers_), 1) 54 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 55 | 56 | # Precomputed blocks 57 | clusterer = BlockClustering( 58 | blocking="precomputed", 59 | base_estimator=AgglomerativeClustering(n_clusters=2, 60 | linkage="complete"), 61 | n_jobs=n_jobs) 62 | clusterer.fit(X, blocks=(y <= 1)) 63 | 64 | assert_equal(len(clusterer.clusterers_), 2) 65 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 66 | 67 | # Precomputed affinity 68 | clusterer = BlockClustering( 69 | affinity="precomputed", 70 | blocking="precomputed", 71 | base_estimator=ScipyHierarchicalClustering(affinity="precomputed", 72 | n_clusters=2, 73 | method="complete"), 74 | n_jobs=n_jobs) 75 | X_affinity = euclidean_distances(X) 76 | clusterer.fit(X_affinity, blocks=(y <= 1)) 77 | 78 | assert_equal(len(clusterer.clusterers_), 2) 79 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 80 | 81 | # Custom blocking function 82 | X_ids = np.arange(len(X)).reshape((-1, 1)) 83 | 84 | def _blocking(X_ids): 85 | return y[X_ids.ravel()] <= 1 # block labels into {0,1} and {2,3} 86 | 87 | clusterer = BlockClustering( 88 | blocking=_blocking, 89 | base_estimator=AgglomerativeClustering(n_clusters=2, 90 | linkage="complete", 91 | affinity=_distance)) 92 | clusterer.fit(X_ids) 93 | 94 | assert_equal(len(clusterer.clusterers_), 2) 95 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 96 | 97 | 98 | def test_partial_fit(): 99 | """Test partial_fit.""" 100 | blocks = (y <= 1) 101 | 102 | clusterer1 = BlockClustering(blocking="precomputed", 103 | base_estimator=MiniBatchKMeans(n_clusters=2)) 104 | clusterer1.partial_fit(X[y <= 1], blocks=blocks[y <= 1]) 105 | assert_equal(len(clusterer1.clusterers_), 1) 106 | clusterer1.partial_fit(X[y > 1], blocks=blocks[y > 1]) 107 | assert_equal(len(clusterer1.clusterers_), 2) 108 | 109 | clusterer2 = BlockClustering(blocking="precomputed", 110 | base_estimator=MiniBatchKMeans(n_clusters=2)) 111 | clusterer2.fit(X, blocks=blocks) 112 | 113 | c1 = clusterer1.predict(X, blocks=blocks) 114 | c2 = clusterer2.labels_ 115 | 116 | assert_equal(paired_f_score(c1, c2), 1.0) 117 | 118 | 119 | def test_onthefly_labels(): 120 | """Test assigning labels on the fly.""" 121 | clusterer = BlockClustering( 122 | 
base_estimator=ScipyHierarchicalClustering(n_clusters=1, 123 | method="complete")) 124 | clusterer.fit(X) 125 | assert_array_equal([100], np.bincount(clusterer.labels_)) 126 | clusterer.clusterers_[0].set_params(n_clusters=4) 127 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 128 | 129 | 130 | def test_predict(): 131 | """Test predict.""" 132 | clusterer = BlockClustering(blocking="precomputed", 133 | base_estimator=MiniBatchKMeans(n_clusters=2)) 134 | clusterer.fit(X, blocks=(y <= 1)) 135 | pred = clusterer.predict(X, blocks=(y <= 1)) 136 | assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) 137 | 138 | pred = clusterer.predict(X, blocks=10 * np.ones(len(X))) 139 | assert_array_equal(-np.ones(len(X)), pred) 140 | 141 | 142 | @mark.parametrize('n_jobs', (1, 2)) 143 | def test_single_signature(n_jobs): 144 | """Test clustering of a single signature.""" 145 | import numbers 146 | clusterer = BlockClustering(base_estimator=MiniBatchKMeans(n_clusters=2)) 147 | clusterer.fit(np.array([X[0]])) 148 | assert isinstance(clusterer.predict(X[0])[0], numbers.Integral) 149 | 150 | 151 | def test_validation(): 152 | """Test the validation of hyper-parameters and input data.""" 153 | with pytest.raises(ValueError): 154 | clusterer = BlockClustering( 155 | blocking="foobar", 156 | base_estimator=MiniBatchKMeans(n_clusters=2)) 157 | clusterer.fit(X) 158 | 159 | with pytest.raises(ValueError): 160 | clusterer = BlockClustering( 161 | blocking="precomputed", 162 | base_estimator=MiniBatchKMeans(n_clusters=2)) 163 | clusterer.fit(X) 164 | 165 | with pytest.raises(ValueError): 166 | clusterer = BlockClustering( 167 | blocking="precomputed", 168 | base_estimator=MiniBatchKMeans(n_clusters=2)) 169 | clusterer.fit(X, blocks=(y <= 1)) 170 | clusterer.predict(X) 171 | -------------------------------------------------------------------------------- /tests/clustering/test_blocking_funcs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of the blocking algorithm. 11 | 12 | .. 
codeauthor:: Mateusz Susik 13 | 14 | """ 15 | 16 | import numpy as np 17 | 18 | from beard.clustering.blocking_funcs import block_phonetic 19 | from beard.clustering.blocking_funcs import block_last_name_first_initial 20 | 21 | 22 | def run_blocking(names, expected_results, threshold=100): 23 | """Run dm_blocking and assert that the results are correct.""" 24 | sigs = np.array([[{'author_name': sig}] for sig in names]) 25 | for index, value in enumerate(block_phonetic(sigs, threshold)): 26 | assert value == expected_results[index] 27 | 28 | 29 | def test_single_signature(): 30 | """Cluster one signature.""" 31 | run_blocking(['Smith, Joe'], ['SM0']) 32 | 33 | 34 | def test_first_surname_included(): 35 | """Check first surname full match.""" 36 | run_blocking(['Smith-Jones, Joe', 'Smith, Joe', 37 | 'Jones, Paul', 'Smith-Jones, Paul'], 38 | ['SM0', 'SM0', 'JNS', 'SM0']) 39 | 40 | 41 | def test_last_surname_included(): 42 | """Check last surname full match.""" 43 | run_blocking(['Jones-Smith, Joe', 'Smith, Joe', 'Jones-Smith, Paul'], 44 | ['SM0', 'SM0', 'SM0']) 45 | 46 | 47 | def test_no_suitable_block_for_multiple_surnames(): 48 | """Check if a block is created for surnames that don't match.""" 49 | run_blocking(['Jones-Smith, Joe'], ['SM0']) 50 | 51 | 52 | def test_precluster_split(): 53 | """Check if huge blocks are split.""" 54 | run_blocking(['Smith, Joe', 'Smith, Paul'], ['SM0j', 'SM0p'], 55 | threshold=1) 56 | 57 | 58 | def test_compare_tokens_from_last_usage(): 59 | """Check if the surnames are compared to the first_names.""" 60 | run_blocking(['Jones, Joe', 'Smith, Joe Jones', 'Jones, Joe', 61 | 'Jones-Smith, Joe'], ['JNS', 'SM0', 'JNS', 'SM0']) 62 | 63 | 64 | def test_block_last_name_first_initial(): 65 | """Block using LNFI strategy.""" 66 | names = ['Smith, Jonh', 'Smith, James', 'Smith, Peter', 'Smit, John'] 67 | sigs = np.array([[{'author_name': sig}] for sig in names]) 68 | lnfi_blocking = block_last_name_first_initial(sigs) 69 | assert lnfi_blocking.tolist() == ['smith j', 'smith j', 70 | 'smith p', 'smit j'] 71 | -------------------------------------------------------------------------------- /tests/clustering/test_wrappers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Test of clustering wrappers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. 
codeauthor:: Hussein Al-Natsheh 14 | 15 | """ 16 | from __future__ import division 17 | 18 | from functools import partial 19 | import numpy as np 20 | from numpy.testing import assert_equal 21 | from numpy.testing import assert_array_equal 22 | import pytest 23 | 24 | from sklearn.datasets import make_blobs 25 | from sklearn.metrics.pairwise import euclidean_distances 26 | from sklearn.utils import check_random_state 27 | 28 | from beard.metrics import b3_f_score 29 | from beard.metrics import silhouette_score 30 | from beard.clustering import ScipyHierarchicalClustering 31 | 32 | 33 | def generate_data(supervised=False, affinity=False): 34 | rng = check_random_state(42) 35 | X, y = make_blobs(centers=4, cluster_std=0.01, 36 | shuffle=False, random_state=rng) 37 | 38 | if affinity: 39 | d = euclidean_distances(X) 40 | d = (d + d.T) / 2.0 41 | d /= d.max() 42 | X = d 43 | 44 | if supervised: 45 | mask = rng.randint(2, size=len(y)).astype(np.bool) 46 | y[mask] = -1 47 | 48 | else: 49 | y[:] = -1 50 | 51 | return X, y 52 | 53 | 54 | def test_shc_semi_supervised_scoring_data_raw(): 55 | """Test semi-supervised learning for SHC when scoring_data='raw'.""" 56 | X, y = generate_data(supervised=True, affinity=False) 57 | 58 | def _scoring(X_raw, labels_true, labels_pred): 59 | assert X_raw.shape == X.shape 60 | score = b3_f_score(labels_true, labels_pred) 61 | return score 62 | 63 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring, 64 | scoring_data="raw") 65 | clusterer.fit(X, y) 66 | labels = clusterer.labels_ 67 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 68 | 69 | 70 | def test_shc_semi_supervised_scoring_data_affinity(): 71 | """Test semi-supervised learning for SHC when scoring_data='affinity'.""" 72 | # Passing feature matrix 73 | X1, y1 = generate_data(supervised=True, affinity=False) 74 | 75 | def _scoring1(X_affinity, labels_true, labels_pred): 76 | assert X_affinity.shape[0] == X_affinity.shape[1] 77 | assert X_affinity.shape != X1.shape 78 | score = b3_f_score(labels_true, labels_pred) 79 | return score 80 | 81 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring1, 82 | scoring_data="affinity", 83 | affinity=euclidean_distances) 84 | clusterer.fit(X1, y1) 85 | labels = clusterer.labels_ 86 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 87 | 88 | # Passing affinity matrix 89 | X2, y2 = generate_data(supervised=True, affinity=True) 90 | 91 | def _scoring2(X_affinity, labels_true, labels_pred): 92 | assert X_affinity.shape[0] == X_affinity.shape[1] 93 | assert X_affinity.shape == X2.shape 94 | score = b3_f_score(labels_true, labels_pred) 95 | return score 96 | 97 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring2, 98 | scoring_data="affinity", 99 | affinity="precomputed") 100 | clusterer.fit(X2, y2) 101 | labels = clusterer.labels_ 102 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 103 | 104 | 105 | def test_shc_semi_supervised_scoring_data_none(): 106 | """Test semi-supervised learning for SHC when scoring_data is None.""" 107 | X, y = generate_data(supervised=True, affinity=False) 108 | 109 | def _scoring(labels_true, labels_pred): 110 | score = b3_f_score(labels_true, labels_pred) 111 | return score 112 | 113 | # We should find all 4 clusters 114 | clusterer = ScipyHierarchicalClustering(supervised_scoring=_scoring) 115 | clusterer.fit(X, y) 116 | labels = clusterer.labels_ 117 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 118 | 119 | 120 | def 
test_shc_unsupervised_scoring_data_raw(): 121 | """Test unsupervised clustering for SHC when scoring_data='raw'.""" 122 | X, _ = generate_data(supervised=False, affinity=False) 123 | _scoring = partial(silhouette_score, metric="euclidean") 124 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 125 | unsupervised_scoring=_scoring, 126 | scoring_data="raw") 127 | labels = clusterer.fit_predict(X) 128 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 129 | 130 | 131 | def test_shc_unsupervised_scoring_data_affinity(): 132 | """Test unsupervised clustering for SHC when scoring_data='affinity'.""" 133 | # Passing feature matrix 134 | X, _ = generate_data(supervised=False, affinity=False) 135 | _scoring = partial(silhouette_score, metric="precomputed") 136 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 137 | unsupervised_scoring=_scoring, 138 | scoring_data="affinity") 139 | labels = clusterer.fit_predict(X) 140 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 141 | 142 | # Passing affinity matrix 143 | X, _ = generate_data(supervised=False, affinity=True) 144 | _scoring = partial(silhouette_score, metric="precomputed") 145 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 146 | unsupervised_scoring=_scoring, 147 | scoring_data="affinity") 148 | labels = clusterer.fit_predict(X) 149 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 150 | 151 | 152 | def test_shc_unsupervised_scoring_data_None(): 153 | """Test unsupervised clustering for SHC when scoring_data is None.""" 154 | X, _ = generate_data(supervised=False, affinity=False) 155 | 156 | def _scoring(labels_pred): 157 | return -np.inf 158 | 159 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 160 | unsupervised_scoring=_scoring) 161 | labels = clusterer.fit_predict(X) 162 | assert_array_equal([100], np.bincount(labels)) 163 | 164 | 165 | def test_shc_default_euclidean(): 166 | """Test default parameters of SHC, using euclidean distance.""" 167 | X, _ = generate_data(supervised=False, affinity=False) 168 | clusterer = ScipyHierarchicalClustering(n_clusters=4) 169 | labels = clusterer.fit_predict(X) 170 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 171 | 172 | 173 | def test_shc_custom_affinity(): 174 | """Test custom affinity function in SHC.""" 175 | X, _ = generate_data(supervised=False, affinity=False) 176 | clusterer = ScipyHierarchicalClustering(affinity=euclidean_distances, 177 | n_clusters=4) 178 | labels = clusterer.fit_predict(X) 179 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 180 | 181 | 182 | def test_shc_precomputed_distance(): 183 | """Test using precomputed distances in SHC.""" 184 | X, _ = generate_data(supervised=False, affinity=True) 185 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 186 | n_clusters=4) 187 | labels = clusterer.fit_predict(X) 188 | assert_array_equal([25, 25, 25, 25], np.bincount(labels)) 189 | 190 | 191 | def test_shc_n_clusters(): 192 | """Test changing number of clusters in SHC.""" 193 | X, _ = generate_data(supervised=False, affinity=True) 194 | 195 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 196 | n_clusters=4) 197 | 198 | labels = clusterer.fit_predict(X) 199 | assert_equal(len(np.unique(labels)), 4) 200 | clusterer.set_params(n_clusters=10) 201 | labels = clusterer.labels_ 202 | assert_equal(len(np.unique(labels)), 10) 203 | 204 | 205 | def test_shc_threshold(): 206 | """Test changing threshold in SHC.""" 207 | X, _ = 
generate_data(supervised=False, affinity=True) 208 | 209 | # n_clusters has precedence over threshold 210 | clusterer = ScipyHierarchicalClustering(affinity="precomputed", 211 | n_clusters=2) 212 | labels1 = clusterer.fit_predict(X) 213 | clusterer.set_params(threshold=clusterer.linkage_[-4, 2]) 214 | labels2 = clusterer.labels_ 215 | assert_array_equal(labels1, labels2) 216 | assert_equal(len(np.unique(labels1)), 2) 217 | 218 | # change threshold 219 | clusterer.set_params(best_threshold_precedence=False) 220 | clusterer.set_params(n_clusters=None, 221 | threshold=clusterer.linkage_[-5, 2]) 222 | labels = clusterer.labels_ 223 | assert_equal(len(np.unique(labels)), 5) 224 | clusterer.set_params(threshold=clusterer.linkage_[-4, 2]) 225 | labels = clusterer.labels_ 226 | assert_equal(len(np.unique(labels)), 4) 227 | 228 | 229 | def test_shc_validation(): 230 | """Test the validation of hyper-parameters and input data in SHC""" 231 | X, _ = generate_data(supervised=False, affinity=False) 232 | 233 | with pytest.raises(ValueError): 234 | clusterer = ScipyHierarchicalClustering(n_clusters=len(X) + 1) 235 | labels = clusterer.fit_predict(X) 236 | 237 | with pytest.raises(ValueError): 238 | clusterer = ScipyHierarchicalClustering(n_clusters=-1) 239 | labels = clusterer.fit_predict(X) 240 | 241 | with pytest.raises(ValueError): 242 | clusterer = ScipyHierarchicalClustering(scoring_data="affinity") 243 | labels = clusterer.fit_predict(X) 244 | -------------------------------------------------------------------------------- /tests/metrics/test_clustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2014, 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Test clustering evaluation metrics. 11 | 12 | .. codeauthor:: Evangelos Tzemis 13 | .. 
codeauthor:: Gilles Louppe 14 | 15 | """ 16 | from __future__ import division 17 | 18 | import numpy as np 19 | from numpy.testing import assert_equal 20 | from numpy.testing import assert_almost_equal 21 | import pytest 22 | 23 | from beard.metrics.clustering import b3_precision_recall_fscore 24 | from beard.metrics.clustering import b3_precision_score 25 | from beard.metrics.clustering import b3_recall_score 26 | from beard.metrics.clustering import b3_f_score 27 | from beard.metrics.clustering import paired_precision_recall_fscore 28 | from beard.metrics.clustering import paired_precision_score 29 | from beard.metrics.clustering import paired_recall_score 30 | from beard.metrics.clustering import paired_f_score 31 | from beard.metrics.clustering import _cluster_samples 32 | from beard.metrics.clustering import _general_merge_distance 33 | 34 | 35 | def test_b3_precision_recall_fscore(): 36 | """Test the results of b3_precision_recall_fscore.""" 37 | # test for the border case where score maximum 38 | y = [1, 2, 1, 3, 2, 4, 5, 4] 39 | assert_equal(b3_precision_recall_fscore(y, y), (1, 1, 1)) 40 | 41 | # test for border case when predicting singletons 42 | y_true = [1, 1, 2, 2] 43 | y_pred = [1, 2, 3, 4] 44 | assert_equal(b3_precision_recall_fscore(y_true, y_pred), (1, 0.5, 2 / 3)) 45 | 46 | 47 | def test_b3_precision_score(): 48 | """Test the returned results of b3_precision_score.""" 49 | y_true = [1, 1, 2, 2, 3, 4, 5] 50 | y_pred = [1, 2, 2, 2, 3, 4, 5] 51 | assert_almost_equal(b3_precision_score(y_true, y_pred), 17 / 21) 52 | 53 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 54 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 55 | assert_equal(b3_precision_score(y_true, y_pred), 13 / 16) 56 | 57 | # test for the trivial maximum case 58 | assert_equal(b3_precision_score(y_true, y_true), 1) 59 | 60 | 61 | def test_b3_recall_score(): 62 | """Test the returned results of b3_recall_score.""" 63 | y_true = [1, 1, 2, 2, 3, 4, 5] 64 | y_pred = [1, 2, 2, 2, 3, 4, 5] 65 | assert_almost_equal(b3_recall_score(y_true, y_pred), 6 / 7) 66 | 67 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 68 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 69 | assert_equal(b3_recall_score(y_true, y_pred), 7 / 8) 70 | 71 | # test for the trivial maximum case 72 | assert_equal(b3_recall_score(y_true, y_true), 1) 73 | 74 | 75 | def test_b3_f_score(): 76 | """Test the returned results of b3_f_score.""" 77 | y_true = [1, 1, 2, 2, 3, 4, 5] 78 | y_pred = [1, 2, 2, 2, 3, 4, 5] 79 | desired_output = 2 * (17 / 21) * (6 / 7) / (17 / 21 + 6 / 7) 80 | assert_almost_equal(b3_f_score(y_true, y_pred), desired_output) 81 | 82 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 83 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 84 | desired_output = 2 * (13 / 16) * (7 / 8) / (13 / 16 + 7 / 8) 85 | assert_almost_equal(b3_f_score(y_true, y_pred), desired_output) 86 | 87 | # test for the trivial maximum case 88 | assert_equal(b3_f_score(y_true, y_true), 1) 89 | 90 | 91 | def test_b3_label_invariability(): 92 | """Test that paired P/R/F values are label invariant.""" 93 | y = [1, 2, 1, 3, 2, 4, 5, 4] 94 | y_prime_invariant = [3, 6, 6, 5, 6, 2, 4, 2] 95 | y_prime = [2, 3, 3, 4, 3, 5, 1, 5] 96 | assert_equal(b3_precision_recall_fscore(y, y_prime), 97 | b3_precision_recall_fscore(y, y_prime_invariant)) 98 | 99 | 100 | def test_b3_raise_error(): 101 | """Test the raise of the ValueError exception for paired P/R/F.""" 102 | y = np.array([1, 2, 1, 3, 2, 4, 5, 4]) 103 | 104 | # test raise when not 1d shape 105 | y = y.reshape(2, 4) 106 | with pytest.raises(ValueError): 107 | b3_precision_recall_fscore(y, y) 108 | 109 
| # test raise when different size of elements 110 | y = y.reshape(8, 1) 111 | with pytest.raises(ValueError): 112 | b3_precision_recall_fscore(y[1:], y[2:]) 113 | 114 | # test error raise when labels_true is empty 115 | with pytest.raises(ValueError): 116 | b3_precision_recall_fscore(y, []) 117 | 118 | # test error raise when labels_pred is empty 119 | with pytest.raises(ValueError): 120 | b3_precision_recall_fscore([], y) 121 | 122 | # test error raise when both inputs are empty 123 | with pytest.raises(ValueError): 124 | b3_precision_recall_fscore([], []) 125 | 126 | 127 | def test_paired_precision_recall_fscore(): 128 | """Test the results of paired_precision_recall_fscore.""" 129 | # test for border case where score is maximum 130 | y = [1, 2, 1, 3, 2, 4, 5, 4] 131 | assert_equal(paired_precision_recall_fscore(y, y), (1, 1, 1)) 132 | 133 | # test for border case where score is minimum 134 | y_true = [1, 2, 1, 3, 2, 4, 5, 4] 135 | y_pred = [1, 1, 2, 2, 3, 3, 4, 4] 136 | assert_equal(paired_precision_recall_fscore(y_true, y_pred), (0, 0, 0)) 137 | 138 | 139 | def test_paired_precision_score(): 140 | """Test the returned results of paired_precision_score.""" 141 | y_true = [1, 1, 2, 2, 3, 4, 5] 142 | y_pred = [1, 2, 2, 2, 3, 4, 5] 143 | assert_almost_equal(paired_precision_score(y_true, y_pred), 1 / 3) 144 | 145 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 146 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 147 | assert_equal(paired_precision_score(y_true, y_pred), 4 / 7) 148 | 149 | # test for the trivial maximum case 150 | assert_equal(paired_precision_score(y_true, y_true), 1) 151 | 152 | 153 | def test_paired_recall_score(): 154 | """Test the returned results of paired_recall_score.""" 155 | y_true = [1, 1, 2, 2, 3, 4, 5] 156 | y_pred = [1, 2, 2, 2, 3, 4, 5] 157 | assert_almost_equal(paired_recall_score(y_true, y_pred), 0.5) 158 | 159 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 160 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 161 | assert_equal(paired_recall_score(y_true, y_pred), 4 / 5) 162 | 163 | # test for the trivial maximum case 164 | assert_equal(paired_recall_score(y_true, y_true), 1) 165 | 166 | 167 | def test_paired_f_score(): 168 | """Test the returned results of paired_f_score.""" 169 | y_true = [1, 1, 2, 2, 3, 4, 5] 170 | y_pred = [1, 2, 2, 2, 3, 4, 5] 171 | desired_output = 2 * (1 / 3) * 0.5 / (1 / 3 + 0.5) 172 | assert_almost_equal(paired_f_score(y_true, y_pred), desired_output) 173 | 174 | y_true = [1, 1, 1, 4, 5, 5, 0, 4] 175 | y_pred = [1, 1, 1, 1, 5, 5, 6, 7] 176 | desired_output = 2 * (4 / 7) * (4 / 5) / (4 / 7 + 4 / 5) 177 | assert_almost_equal(paired_f_score(y_true, y_pred), desired_output) 178 | 179 | # test for the trivial maximum case 180 | assert_equal(paired_f_score(y_true, y_true), 1) 181 | 182 | 183 | def test_paired_label_invariability(): 184 | """Test that paired P/R/F values are label invariant.""" 185 | y = [1, 2, 1, 3, 2, 4, 5, 4] 186 | y_prime_invariant = [3, 6, 6, 5, 6, 2, 4, 2] 187 | y_prime = [2, 3, 3, 4, 3, 5, 1, 5] 188 | assert_equal(paired_precision_recall_fscore(y, y_prime), 189 | paired_precision_recall_fscore(y, y_prime_invariant)) 190 | 191 | 192 | def test_paired_raise_error(): 193 | """Test the raise of the ValueError exception for paired P/R/F.""" 194 | y = np.array([1, 2, 1, 3, 2, 4, 5, 4]) 195 | 196 | # test raise when not 1d shape 197 | y = y.reshape(2, 4) 198 | with pytest.raises(ValueError): 199 | paired_precision_recall_fscore(y, y) 200 | 201 | # test raise when different size of elements 202 | y = y.reshape(8, 1) 203 | with pytest.raises(ValueError): 204 | 
paired_precision_recall_fscore(y[1:], y[2:]) 205 | 206 | # test error raise when labels_true is empty 207 | with pytest.raises(ValueError): 208 | paired_precision_recall_fscore(y, []) 209 | 210 | # test error raise when labels_pred is empty 211 | with pytest.raises(ValueError): 212 | paired_precision_recall_fscore([], y) 213 | 214 | # test error raise when both inputs are empty 215 | with pytest.raises(ValueError): 216 | paired_precision_recall_fscore([], []) 217 | 218 | 219 | def test_cluster_samples(): 220 | """Test that samples are correctly separated into appropriate groups.""" 221 | y = [1, 2, 1, 3, 2, 4, 5, 4] 222 | cls_true = {1: [0, 2], 2: [1, 4], 3: [3], 4: [5, 7], 5: [6]} 223 | 224 | assert_equal(cls_true, _cluster_samples(y)) 225 | 226 | 227 | def test_general_merge_distance(): 228 | """Test general merge distance function.""" 229 | y_true = np.array([1, 2, 1, 2, 1, 2]) 230 | y_pred = [1, 1, 1, 2, 2, 2] 231 | 232 | # test for trivial case 233 | assert_equal(_general_merge_distance(y_true, y_true), 0) 234 | 235 | # test that fm and fs have an effect on the result 236 | zero_res = _general_merge_distance(y_true, y_pred, 237 | fm=lambda x, y: 0, 238 | fs=lambda x, y: 0) 239 | assert_equal(zero_res, 0) 240 | 241 | # test for default functions 242 | assert_equal(_general_merge_distance(y_true, y_pred), 4) 243 | -------------------------------------------------------------------------------- /tests/metrics/test_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Test text metrics. 11 | 12 | .. codeauthor:: Petros Ioannidis 13 | .. 
codeauthor:: Evangelos Tzemis 14 | 15 | """ 16 | from __future__ import generators 17 | 18 | from numpy.testing import assert_almost_equal 19 | import pytest 20 | from pytest import mark 21 | 22 | from beard.metrics.text import _find_all 23 | from beard.metrics.text import _jaro_matching 24 | from beard.metrics.text import jaro 25 | from beard.metrics.text import jaro_winkler 26 | from beard.metrics.text import levenshtein 27 | 28 | 29 | @mark.parametrize('s, letter, occur', 30 | (('MARTHA', 'A', (1, 5)), 31 | ('DWAYNE', 'D', (0, )), 32 | ('A', 'A', (0, )), 33 | ('AABAA', 'AA', (0, 3)), 34 | ('ABCD', 'D', (3, )))) 35 | def test_find_all_normal_string(s, letter, occur): 36 | """Test find_all behaviour for average cases.""" 37 | assert tuple(_find_all(s, letter)) == occur 38 | 39 | 40 | @mark.parametrize('s, letter', 41 | (('MARTHA', 'Z'), 42 | ('', 'A'))) 43 | def test_find_all_none_string(s, letter): 44 | """Test find_all behaviour for empty cases.""" 45 | with pytest.raises(StopIteration): 46 | assert next(_find_all(s, letter)) 47 | 48 | 49 | @mark.parametrize('s, letter', 50 | ((set(), 'A'), 51 | (dict(), 'A'), 52 | (int(), 'A'), 53 | (float(), 'A'), 54 | (list(), 'A'))) 55 | def test_find_all_abnormal_string(s, letter): 56 | """Test find_all behaviour when called with wrong objects.""" 57 | with pytest.raises(TypeError): 58 | next(_find_all(s, letter)) 59 | 60 | 61 | @mark.parametrize('s1, s2, match', 62 | (('MARTHA', 'MARHTA', (6, 2)), 63 | ('DWAYNE', 'DUANE', (4, 0)), 64 | ('DUANE', 'DWAYNE', (4, 0)), 65 | ('MARHTA', 'MARTHA', (6, 2)))) 66 | def test_jaro_matching(s1, s2, match): 67 | """Test jaro_matching behaviour.""" 68 | assert _jaro_matching(s1, s2) == match 69 | 70 | 71 | @mark.parametrize('s1, s2, match', 72 | (('MARTHA', 'MARHTA', 0.944), 73 | ('DWAYNE', 'DUANE', 0.822), 74 | ('ABCDEFG', 'ABCDEFG', 1.0), 75 | ('', 'ABCDEFG', 0.0), 76 | ('ABCDEFG', 'HIGKLMN', 0.0), 77 | ('apple', 'apple', 1.0))) 78 | def test_jaro(s1, s2, match): 79 | """Test jaro_similarity_metric behaviour.""" 80 | assert_almost_equal(jaro(s1, s2), match, 3) 81 | 82 | 83 | @mark.parametrize('s1, s2, match', 84 | (('MARTHA', 'MARHTA', 0.961), 85 | ('DWAYNE', 'DUANE', 0.84), 86 | ('ABCDEFG', 'ABCDEFG', 1.0), 87 | ('', 'ABCDEFG', 0.0), 88 | ('ABCDEFG', 'HIGKLMN', 0.0))) 89 | def test_jaro_winkler(s1, s2, match): 90 | """Test jaro_winkler_metric behaviour.""" 91 | assert_almost_equal(jaro_winkler(s1, s2), match, 3) 92 | 93 | 94 | @mark.parametrize('string_a, string_b, distance', 95 | (('back', 'book', 2), 96 | ('weight', 'height', 1), 97 | ('Adam', 'Adams', 1), 98 | ('YES', 'yes', 3), 99 | ('weight', 'muchweigh', 5), 100 | ('grand father', '', len('grand father')), 101 | ('', 'grand father', len('grand father')), 102 | (' ', ' ', 0), 103 | ('', '', 0))) 104 | def test_levenshtein(string_a, string_b, distance): 105 | """Test levenshtein_metric behaviour.""" 106 | assert levenshtein(string_a, string_b) == distance 107 | -------------------------------------------------------------------------------- /tests/similarity/test_pairs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of transformers for paired data. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. 
codeauthor:: Hussein AL-NATSHEH 14 | 15 | """ 16 | 17 | import jellyfish 18 | import numpy as np 19 | from numpy.testing import assert_array_almost_equal 20 | import scipy.sparse as sp 21 | 22 | from sklearn.preprocessing import MinMaxScaler 23 | from sklearn.preprocessing import OneHotEncoder 24 | from sklearn.preprocessing import StandardScaler 25 | try: 26 | from sklearn.cross_validation import train_test_split 27 | except ImportError: 28 | from sklearn.model_selection import train_test_split 29 | from sklearn.datasets import load_iris 30 | from sklearn.svm import LinearSVC 31 | 32 | from beard.similarity import AbsoluteDifference 33 | from beard.similarity import CosineSimilarity 34 | from beard.similarity import ElementMultiplication 35 | from beard.similarity import EstimatorTransformer 36 | from beard.similarity import JaccardSimilarity 37 | from beard.similarity import PairTransformer 38 | from beard.similarity import StringDistance 39 | from beard.utils import FuncTransformer 40 | 41 | 42 | def test_pair_transformer(): 43 | """Test for PairTransformer.""" 44 | X = np.array([[0, 1], [2, 0], [2, 5]], dtype=np.float) 45 | tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1)) 46 | Xt = tf.fit_transform(X) 47 | assert_array_almost_equal(Xt, X + 1) 48 | 49 | X = np.array([[0, 1], [2, 0], [2, 5], 50 | [0, 1], [2, 0], [2, 5]], dtype=np.float) 51 | tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1), 52 | groupby=lambda r: r[0]) 53 | Xt = tf.fit_transform(X) 54 | assert_array_almost_equal(Xt, X + 1) 55 | 56 | X = np.array([[0, 1], [2, 3], [4, 5]], dtype=np.float) 57 | Xt = PairTransformer(element_transformer=MinMaxScaler()).fit_transform(X) 58 | assert_array_almost_equal(Xt, [[0, 0.2], [0.4, 0.6], [0.8, 1.0]]) 59 | 60 | X = np.array([[0, 1], [2, 3]], dtype=np.float) 61 | tf = PairTransformer(element_transformer=OneHotEncoder(sparse=True)) 62 | Xt = tf.fit_transform(X) 63 | assert sp.issparse(Xt) 64 | assert_array_almost_equal(Xt.todense(), [[1, 0, 0, 0, 0, 1, 0, 0], 65 | [0, 0, 1, 0, 0, 0, 0, 1]]) 66 | 67 | X = sp.csr_matrix(np.array([[0, 1], [2, 3]], dtype=np.float)) 68 | tf = PairTransformer(element_transformer=StandardScaler(with_mean=False)) 69 | Xt = tf.fit_transform(X) 70 | assert sp.issparse(Xt) 71 | assert_array_almost_equal(Xt.todense(), [[0, 0.89442719], 72 | [1.78885438, 2.68328157]]) 73 | 74 | 75 | def test_cosine_similarity(): 76 | """Test for CosineSimilarity.""" 77 | X = np.array([[1, 0, 0, 0, 0, 0], 78 | [1, 0, 1, 1, 0, 0], 79 | [1, 0, 0, 1, 0, 0], 80 | [0, 0, 0, 0, 0, 0], 81 | [1, 1, 1, 1, 1, 1]]) 82 | 83 | Xt = CosineSimilarity().fit_transform(X) 84 | assert_array_almost_equal(Xt, [[0.], [2 ** -0.5], [1.], [0.], [1.]]) 85 | 86 | Xt = CosineSimilarity().fit_transform(sp.csr_matrix(X)) 87 | assert_array_almost_equal(Xt, [[0.], [2 ** -0.5], [1.], [0.], [1.]]) 88 | 89 | 90 | def test_absolute_difference(): 91 | """Test for AbsoluteDifference.""" 92 | X = np.array([[0, 0, 0, 0], 93 | [0, 1, 1, 0], 94 | [1, 1, 1, 1], 95 | [1, 0, 0, 1]]) 96 | 97 | Xt = AbsoluteDifference().fit_transform(X) 98 | assert_array_almost_equal(Xt, [[0, 0], [1, 1], [0, 0], [1, 1]]) 99 | 100 | Xt = AbsoluteDifference().fit_transform(sp.csr_matrix(X)) 101 | assert_array_almost_equal(Xt, [[0, 0], [1, 1], [0, 0], [1, 1]]) 102 | 103 | 104 | def test_CharacterEquality(): 105 | """Test for CharacterEquality.""" 106 | X = np.array([['q', 'q'], 107 | ['q', 'a'], 108 | ['q', ''], 109 | ['', ''], 110 | ['', 'q']]) 111 | Xt = 
StringDistance(similarity_function='character_equality').transform(X) 112 | assert_array_almost_equal(Xt, [[1.], [0.], [0.], [0.5], [0.]]) 113 | 114 | 115 | def test_StringDistance(): 116 | """Test for StringDistance.""" 117 | X = np.array([[u'this', u'that'], 118 | [u'that', u't'], 119 | [u't', u't'], 120 | [u't', u'this']]) 121 | Xt = StringDistance().transform(X) 122 | assert_array_almost_equal(Xt, [[jellyfish.jaro_winkler(u'this', u'that')], 123 | [-1.], [-1.], [-1.]]) 124 | 125 | 126 | def test_JaccardSimilarity(): 127 | """Test for JaccardSimilarity.""" 128 | X = np.array([[0, 0, 0, 0, 0, 0, 0, 0], 129 | [0, 0, 1, 1, 0, 1, 0, 1], 130 | [0, 1, 0, 1, 0, 0, 1, 0], 131 | [1, 0, 1, 1, 1, 1, 0, 7], 132 | [0, 3, 0, 1, 0, 9, 0, 1]]) 133 | 134 | Xt = JaccardSimilarity().fit_transform(X) 135 | assert_array_almost_equal(Xt, [[0.], [0.33333333], [0.], [0.5], [1.]]) 136 | 137 | Xt = JaccardSimilarity().fit_transform(sp.csr_matrix(X)) 138 | assert_array_almost_equal(Xt, [[0.], [0.33333333], [0.], [0.5], [1.]]) 139 | 140 | X = np.array([[0, 0, 0, 0, 0, 0, 0, 0], 141 | [0, 0, 0, 0, 0, 0, 0, 0], 142 | [0, 0, 0, 0, 0, 0, 0, 0], 143 | [0, 0, 0, 0, 0, 0, 0, 0]]) 144 | 145 | Xt = JaccardSimilarity().fit_transform(X) 146 | assert_array_almost_equal(Xt, [[0.], [0.], [0.], [0.]]) 147 | 148 | Xt = JaccardSimilarity().fit_transform(sp.csr_matrix(X)) 149 | assert_array_almost_equal(Xt, [[0.], [0.], [0.], [0.]]) 150 | 151 | 152 | def test_EstimatorTransformer(): 153 | """Test for EstimatorTransformer.""" 154 | data = load_iris() 155 | train, test = train_test_split(np.arange(len(data.data)), 156 | test_size=0.08, random_state=42) 157 | X_train = data.data[train] 158 | y_train = data.target[train] 159 | X_test = data.data[test] 160 | 161 | clf = LinearSVC().fit(X_train, y_train) 162 | 163 | y_predict = clf.decision_function(X_test) 164 | 165 | Xt = EstimatorTransformer(clf).fit_transform(X_test) 166 | assert_array_almost_equal(Xt, y_predict) 167 | 168 | 169 | def test_ElementMultiplication(): 170 | """Test for ElementMultiplication.""" 171 | X = np.array([[1.0, 1.0, 1.0, 2.0], 172 | [0.5, 1.0, 1.0, 0.5], 173 | [2.5, 0.2, 10.0, 2.0]]) 174 | 175 | y = np.array([[1.0, 2.0], 176 | [0.5, 0.5], 177 | [25.0, 0.4]]) 178 | 179 | Xt = ElementMultiplication().fit_transform(X) 180 | assert_array_almost_equal(Xt, y) 181 | -------------------------------------------------------------------------------- /tests/utils/test_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of personal names helpers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | .. 
codeauthor:: Mateusz Susik 14 | 15 | """ 16 | 17 | import pytest 18 | import sys 19 | 20 | import fuzzy 21 | 22 | from beard.ext.metaphone import dm 23 | 24 | from beard.utils.names import phonetic_tokenize_name 25 | from beard.utils.names import given_name_initial 26 | from beard.utils.names import given_name 27 | from beard.utils.names import name_initials 28 | from beard.utils.names import normalize_name 29 | 30 | 31 | def test_name_initials(): 32 | """Test extracting name initials.""" 33 | assert name_initials("Dupont, Jean-René") == set(['D', 'J']) 34 | 35 | 36 | def test_normalize_name(): 37 | """Test of normalize_name.""" 38 | assert normalize_name("Doe, John") == "doe john" 39 | assert normalize_name("Doe, J.") == "doe j" 40 | assert normalize_name("Doe, J") == "doe j" 41 | assert normalize_name("Doe-Foe, Willem") == "doefoe willem" 42 | assert normalize_name("Doe-Foe Willem") == "willem doe foe" 43 | assert normalize_name("Dupont, René") == "dupont rene" 44 | assert normalize_name("Dupont., René") == "dupont rene" 45 | assert normalize_name("Dupont, Jean-René") == "dupont jean rene" 46 | assert normalize_name("Dupont, René, III") == "dupont rene" 47 | assert normalize_name("Dupont, René, Jr.") == "dupont rene" 48 | assert normalize_name("Dupont, J.R.") == "dupont j r" 49 | assert normalize_name("Dupont, J.-R.") == "dupont j r" 50 | assert normalize_name("Dupont") == "dupont" 51 | assert normalize_name("Dupont J.R.") == "dupont j r" 52 | assert normalize_name("von und zu Hohenstein, F.") == "hohenstein f" 53 | assert normalize_name("von und zu Hohenstein, F.", 54 | drop_common_affixes=False) == "vonundzuhohenstein f" 55 | assert normalize_name("Jakub, Ibrahim ibn") == "jakub ibrahim ibn" 56 | assert normalize_name("o'Neill, Jack") == "neill jack" 57 | assert normalize_name("o'Neill, Jack", 58 | drop_common_affixes=False) == "oneill jack" 59 | assert normalize_name("Ben, Robert") == "ben robert" 60 | assert normalize_name("Robert, L. 
W") == "robert l w" 61 | assert normalize_name("Mueller aus Auer, Peter") == \ 62 | "muellerauer peter" 63 | assert normalize_name("Mueller aus Auer, Peter", 64 | drop_common_affixes=False) == \ 65 | "muellerausauer peter" 66 | 67 | 68 | def test_phonetic_tokenize_name_simple(): 69 | """Test of tokenize_name.""" 70 | assert phonetic_tokenize_name("Doe, John") == ((dm(u"Doe")[0],), 71 | (dm(u"John")[0],)) 72 | assert phonetic_tokenize_name("Doe, J.") == \ 73 | phonetic_tokenize_name(u"Doe, J") 74 | assert phonetic_tokenize_name("Doe-Foe, Willem") == ((dm(u"Doe")[0], 75 | dm(u"Foe")[0]), 76 | (dm(u"Willem")[0],)) 77 | assert phonetic_tokenize_name("Dupont, René") == \ 78 | phonetic_tokenize_name("Dupont., René") 79 | assert phonetic_tokenize_name("Dupont, Jean-René") == \ 80 | ((dm(u"Dupont")[0],), (dm(u"Jean")[0], dm(u"René")[0])) 81 | assert phonetic_tokenize_name("Dupont, René, III") == \ 82 | ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"III")[0])) 83 | assert phonetic_tokenize_name("Dupont, René, Jr.") == \ 84 | ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"Jr")[0])) 85 | assert phonetic_tokenize_name("Dupont, J.R.") == \ 86 | phonetic_tokenize_name("Dupont, J.-R.") 87 | assert phonetic_tokenize_name("Dupont") == ((dm(u"Dupont")[0],), ('',)) 88 | assert phonetic_tokenize_name("Jean Dupont") == \ 89 | phonetic_tokenize_name("Dupont, Jean") 90 | 91 | 92 | def test_phonetic_tokenize_name_nysiis(): 93 | assert phonetic_tokenize_name("Dupont, René", "nysiis") == ( 94 | ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))) 95 | 96 | 97 | @pytest.mark.xfail(reason="soundex is broken in fuzzy 1.2.*") 98 | def test_phonetic_tokenize_name_soundex(): 99 | """Test checking if custom phonetic algorithms from fuzzy packages work.""" 100 | soundex = fuzzy.Soundex(5) 101 | assert phonetic_tokenize_name("Dupont, René", "soundex") == ( 102 | # no direct support for unicode in soundex, thus "Rene" 103 | ((soundex(u"Dupont"),), (soundex(u"Rene"),))) 104 | 105 | 106 | def test_phonetic_normalize_name_tokenize_sign(): 107 | """Test correct handling of the cyrillic soft sign.""" 108 | assert phonetic_tokenize_name("Aref'ev, M.") == ((dm(u"Arefev")[0],), 109 | (dm(u"M")[0],)) 110 | # If the following letter is uppercase, split 111 | assert phonetic_tokenize_name("An'Sun, J.") == ((dm(u"An")[0], 112 | dm(u"Sun")[0]), 113 | (dm(u"J")[0],)) 114 | 115 | 116 | def test_phonetic_normalize_name_remove_tokenizefixes(): 117 | """Test correct removal of the common affixes.""" 118 | assert phonetic_tokenize_name("von und zu Hohenstein, F.") == \ 119 | phonetic_tokenize_name("Hohenstein, F.") 120 | # If the name consists of only the common prefixes, don't drop it, as 121 | # it might actually be the correct surname. 122 | assert phonetic_tokenize_name("Ben, Robert") == ((dm(u"Ben")[0],), 123 | (dm(u"Robert")[0],)) 124 | # Don't drop affixes among the first names. 125 | assert phonetic_tokenize_name("Robert, L. 
W.") == ((dm(u"Robert")[0],), 126 | (dm(u"L")[0], 127 | dm(u"W")[0])) 128 | 129 | 130 | def test_given_name_initial(): 131 | """Test the extraction of the first initial.""" 132 | assert given_name_initial("Doe, John") == 'j' 133 | assert given_name_initial("Doe-Foe, Willem") == 'w' 134 | assert given_name_initial("Doe=Foe, Willem John", 1) == 'j' 135 | assert given_name_initial("Dupont, Jean-René") == 'j' 136 | assert given_name_initial("Dupont, René, III") == 'r' 137 | assert given_name_initial("Dupont, René Pierre", 1) == 'p' 138 | assert given_name_initial("Dupont, René, III Pierre", 1) == '' 139 | assert given_name_initial("Mieszko") == '' 140 | assert given_name_initial("John Doe") == 'j' 141 | assert given_name_initial("Dupont, .J") == 'j' 142 | 143 | 144 | def test_given_name(): 145 | """Test given name extraction.""" 146 | assert given_name("Doe, John", 0) == 'John' 147 | assert given_name("Doe, John", 1) == '' 148 | assert given_name("Doe, John William", 0) == 'John' 149 | assert given_name("Doe, John William", 1) == 'William' 150 | assert given_name("Dupont, .J", 0) == ".J" 151 | assert given_name("John Doe", 0) == 'John' 152 | assert given_name("Mieszko", 0) == 'Mieszko' 153 | assert given_name("Dupont, René, III Pierre", 0) == 'René' 154 | assert given_name("Dupont, René, III Pierre", 1) == '' 155 | assert given_name("Dupont, René, III Pierre", 2) == '' 156 | -------------------------------------------------------------------------------- /tests/utils/test_strings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of string helpers. 11 | 12 | .. codeauthor:: Gilles Louppe 13 | 14 | """ 15 | 16 | from beard.utils.strings import asciify 17 | 18 | 19 | def test_asciify(): 20 | """Test of asciify.""" 21 | assert asciify("") == "" 22 | assert asciify("foo") == "foo" 23 | assert asciify("bèård") == "beard" 24 | assert asciify("schröder") == "schroder" 25 | -------------------------------------------------------------------------------- /tests/utils/test_transformers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of Beard. 4 | # Copyright (C) 2015 CERN. 5 | # 6 | # Beard is a free software; you can redistribute it and/or modify it 7 | # under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Tests of generic transformers. 11 | 12 | .. 
codeauthor:: Gilles Louppe 13 | 14 | """ 15 | 16 | import numpy as np 17 | from numpy.testing import assert_array_equal 18 | from numpy.testing import assert_equal 19 | 20 | from beard.utils.transformers import FuncTransformer 21 | from beard.utils.transformers import Shaper 22 | 23 | 24 | def test_func_transformer(): 25 | """Test for FuncTransformer.""" 26 | X = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int) 27 | 28 | def myfunc(v): 29 | return v + 1 30 | 31 | Xt = FuncTransformer(myfunc).fit_transform(X) 32 | assert_array_equal(Xt, X + 1) 33 | assert_equal(X.dtype, Xt.dtype) 34 | 35 | Xt = FuncTransformer(myfunc, dtype=np.float).fit_transform(X) 36 | assert_array_equal(Xt, X + 1) 37 | assert_equal(Xt.dtype, np.float) 38 | 39 | 40 | def test_shaper(): 41 | """Test for Shaper.""" 42 | X = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int) 43 | 44 | Xt = Shaper((-1, 1)).fit_transform(X) 45 | assert_array_equal(Xt, [[0], [1], [2], [3], [4], [5]]) 46 | assert_array_equal(Xt.shape, (6, 1)) 47 | 48 | Xt = Shaper((-1,)).fit_transform(X) 49 | assert_array_equal(Xt, [0, 1, 2, 3, 4, 5]) 50 | assert_array_equal(Xt.shape, (6,)) 51 | 52 | Xt = Shaper((-1, 1), order="F").fit_transform(X) 53 | assert_array_equal(Xt, [[0], [3], [1], [4], [2], [5]]) 54 | assert_array_equal(Xt.shape, (6, 1)) 55 | # assert np.isfortran(Xt) 56 | -------------------------------------------------------------------------------- /travis-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This file is part of Beard. 3 | # Copyright (C) 2014 CERN. 4 | # 5 | # Beard is a free software; you can redistribute it and/or modify it 6 | # under the terms of the Revised BSD License; see LICENSE file for 7 | # more details. 8 | 9 | # This script is freely inspired by the Scikit-Learn integration scripts. 10 | # https://github.com/scikit-learn/scikit-learn/blob/master/continuous_integration/install.sh 11 | # License: 3-clause BSD 12 | 13 | set -e 14 | 15 | # Fix the compilers to work around having the Python 3.4 build 16 | # look up g++44 unexpectedly. 17 | export CC=gcc 18 | export CXX=g++ 19 | 20 | # Deactivate the travis-provided virtual environment and set up a 21 | # conda-based environment instead 22 | # deactivate 23 | 24 | # Use the miniconda installer for faster download / install of conda 25 | # itself 26 | 27 | wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh \ 28 | -O miniconda.sh 29 | 30 | chmod +x miniconda.sh && ./miniconda.sh -b 31 | export PATH=/home/travis/miniconda2/bin:$PATH 32 | conda update --yes conda 33 | 34 | # Configure the conda environment and put it in the path using the 35 | # provided versions 36 | conda create -n testenv --yes python=$PYTHON_VERSION pip \ 37 | numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION scikit-learn=$SKLEARN_VERSION \ 38 | pytest pytest-pep8 pytest-cache sphinx 39 | source activate testenv 40 | 41 | python --version 42 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 43 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 44 | --------------------------------------------------------------------------------