├── .coveragerc ├── .github └── workflows │ ├── ci-workflow.yml │ ├── python-package.yml │ └── render-docs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── __init__.py ├── bench_comparing.py └── bench_indexing.py ├── docs ├── Makefile ├── about.rst ├── annotation.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── guides │ ├── classifiers.rst │ ├── data_deduplication.ipynb │ └── link_two_dataframes.ipynb ├── images │ ├── elas_1705.png │ ├── indexing_basic.png │ ├── indexing_plot.py │ └── recordlinkage-banner-transparent.svg ├── index.rst ├── installation.rst ├── make.bat ├── performance.rst ├── ref-classifiers.rst ├── ref-compare.rst ├── ref-datasets.rst ├── ref-evaluation.rst ├── ref-index.rst ├── ref-misc.rst └── ref-preprocessing.rst ├── examples ├── README.rst ├── dedup_deterministic.py ├── linking_deterministic.py ├── supervised_keras.py ├── supervised_learning_prob.py └── unsupervised_learning_prob.py ├── pyproject.toml ├── recordlinkage ├── __init__.py ├── _lib │ ├── numeric.c │ └── numeric.h ├── adapters.py ├── algorithms │ ├── __init__.py │ ├── c_numeric.pyx │ ├── compare.py │ ├── distance.py │ ├── indexing.py │ ├── nb_sklearn.py │ ├── numeric.py │ └── string.py ├── annotation.py ├── api.py ├── base.py ├── classifiers.py ├── compare.py ├── config.py ├── config_init.py ├── contrib │ ├── README.rst │ ├── __init__.py │ ├── compare │ │ ├── __init__.py │ │ └── random │ │ │ ├── README.rst │ │ │ ├── __init__.py │ │ │ ├── random.py │ │ │ └── test_random.py │ └── index │ │ ├── __init__.py │ │ └── neighbourhoodblock │ │ ├── README.rst │ │ ├── __init__.py │ │ ├── neighbourhoodblock.py │ │ └── test_neighbourhoodblock.py ├── datasets │ ├── __init__.py │ ├── external.py │ ├── febrl.py │ ├── febrl │ │ ├── dataset1.csv │ │ ├── dataset2.csv │ │ ├── dataset3.csv │ │ ├── dataset4a.csv │ │ └── dataset4b.csv │ └── generate.py ├── deprecated.py ├── index.py ├── measures.py ├── network.py ├── preprocessing │ ├── __init__.py │ ├── cleaning.py │ └── encoding.py ├── rl_logging.py ├── standardise │ └── __init__.py ├── types.py └── utils.py └── tests ├── test_annotator.py ├── test_classify.py ├── test_compare.py ├── test_datasets.py ├── test_generate.py ├── test_indexing.py ├── test_measures.py ├── test_misc.py ├── test_network.py └── test_preprocessing.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | exclude_lines = 6 | if self.debug: 7 | pragma: no cover 8 | raise NotImplementedError 9 | if __name__ == .__main__.: 10 | 11 | ignore_errors = False 12 | 13 | omit = 14 | tests/* 15 | docs/* 16 | recordlinkage/_version.py 17 | recordlinkage/types.py -------------------------------------------------------------------------------- /.github/workflows/ci-workflow.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.8", "3.9", "3.10", "3.11"] 13 | pandas-version: ["1.0", "2.0"] 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install pandas 21 | run: | 22 | pip install pandas~=${{ matrix.pandas-version }} 23 | - name: Package recordlinkage 24 | run: | 25 | pip install 
--upgrade pip 26 | pip install build 27 | python -m build 28 | - name: Install recordlinkage 29 | run: | 30 | pip install networkx>=2 31 | pip install ./dist/recordlinkage-*.whl 32 | - name: Test with pytest 33 | run: | 34 | pip install pytest 35 | # remove recordlinkage to prevent relative imports (use installed package) 36 | # this is like wrapping stuff in a src folder 37 | rm -r recordlinkage/ 38 | pytest 39 | lint: 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/checkout@v2 43 | - uses: actions/setup-python@v1 44 | - name: Install ruff 45 | run: | 46 | pip install ruff 47 | - name: Lint with ruff 48 | run: | 49 | ruff . 50 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.x' 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install build 23 | - name: Build package 24 | run: python -m build 25 | - name: Publish package 26 | uses: pypa/gh-action-pypi-publish@release/v1 27 | with: 28 | user: __token__ 29 | password: ${{ secrets.pypi_password }} 30 | -------------------------------------------------------------------------------- /.github/workflows/render-docs.yml: -------------------------------------------------------------------------------- 1 | name: Build HTML with Sphinx 2 | on: [push, pull_request] 3 | jobs: 4 | html-sphinx: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: Clone repo 8 | uses: actions/checkout@v2 9 | - name: Set up Python 10 | uses: actions/setup-python@v2 11 | with: 12 | python-version: '3.10' 13 | - name: Install recordlinkage and docs tools 14 | run: | 15 | sudo apt install pandoc 16 | python -m pip install .[docs] 17 | - name: Build HTML 18 | run: | 19 | python -m sphinx -W --keep-going --color docs/ _build/html/ 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | recordlinkage/datasets/krebsregister/* 3 | 4 | recordlinkage/_version.py 5 | 6 | 7 | .DS_Store 8 | */.DS_Store 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions and Cython .pyx compilations 16 | *.so 17 | algorithms/*.c 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | *.bat 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | .pytest_cache/* 55 | coverage.xml 56 | *,cover 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # dotenv 75 | .env 76 | 77 | # virtualenv 78 | venv/ 79 | ENV/ 80 | 81 | /tests/sandbox 82 | # ASV 83 | .asv/ 84 | 85 | # PyCharm IDE 86 | /sandbox 87 | /cover 88 | /coverage-report 89 | .idea/ 90 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | - repo: https://github.com/charliermarsh/ruff-pre-commit 10 | rev: v0.0.278 11 | hooks: 12 | - id: ruff 13 | - repo: https://github.com/psf/black 14 | rev: 23.7.0 15 | hooks: 16 | - id: black 17 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - method: pip 14 | path: . 15 | extra_requirements: 16 | - docs 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2018, Jonathan de Bruin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include recordlinkage/datasets/febrl *.csv 2 | recursive-include recordlinkage/datasets/krebsregister *.csv 3 | 4 | global-exclude test_*.py 5 | global-exclude *_test.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 | # RecordLinkage: powerful and modular Python record linkage toolkit 6 | 7 | [![Pypi Version](https://badge.fury.io/py/recordlinkage.svg)](https://pypi.python.org/pypi/recordlinkage/) 8 | [![Github Actions CI Status](https://github.com/J535D165/recordlinkage/workflows/tests/badge.svg?branch=master)](https://github.com/J535D165/recordlinkage/actions) 9 | [![Code Coverage](https://codecov.io/gh/J535D165/recordlinkage/branch/master/graph/badge.svg)](https://codecov.io/gh/J535D165/recordlinkage) 10 | [![Documentation Status](https://readthedocs.org/projects/recordlinkage/badge/?version=latest)](https://recordlinkage.readthedocs.io/en/latest/?badge=latest) 11 | [![Zenodo DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3559042.svg)](https://doi.org/10.5281/zenodo.3559042) 12 | 13 | **RecordLinkage** is a powerful and modular record linkage toolkit to 14 | link records in or between data sources. The toolkit provides most of 15 | the tools needed for record linkage and deduplication. The package 16 | contains indexing methods, functions to compare records and classifiers. 17 | The package is developed for research and the linking of small or medium 18 | sized files. 19 | 20 | This project is inspired by the [Freely Extensible Biomedical Record 21 | Linkage (FEBRL)](https://sourceforge.net/projects/febrl/) project, which 22 | is a great project. In contrast with FEBRL, the recordlinkage project 23 | uses [pandas](http://pandas.pydata.org/) and 24 | [numpy](http://www.numpy.org/) for data handling and computations. The 25 | use of *pandas*, a flexible and powerful data analysis and manipulation 26 | library for Python, makes the record linkage process much easier and 27 | faster. The extensive *pandas* library can be used to integrate your 28 | record linkage directly into existing data manipulation projects. 29 | 30 | One of the aims of this project is to make an easily extensible record 31 | linkage framework. It is easy to include your own indexing algorithms, 32 | comparison/similarity measures and classifiers. 33 | 34 | ## Basic linking example 35 | 36 | Import the `recordlinkage` module with all important tools for record 37 | linkage and import the data manipulation framework **pandas**. 38 | 39 | ``` python 40 | import recordlinkage 41 | import pandas 42 | ``` 43 | 44 | Load your data into pandas DataFrames. 45 | 46 | ``` python 47 | df_a = pandas.DataFrame(YOUR_FIRST_DATASET) 48 | df_b = pandas.DataFrame(YOUR_SECOND_DATASET) 49 | ``` 50 | 51 | Comparing all record can be computationally intensive. Therefore, we 52 | make set of candidate links with one of the built-in indexing techniques 53 | like **blocking**. In this example, only pairs of records that agree on 54 | the surname are returned. 55 | 56 | ``` python 57 | indexer = recordlinkage.Index() 58 | indexer.block('surname') 59 | candidate_links = indexer.index(df_a, df_b) 60 | ``` 61 | 62 | For each candidate link, compare the records with one of the comparison 63 | or similarity algorithms in the Compare class. 
64 | 65 | ``` python 66 | c = recordlinkage.Compare() 67 | 68 | c.string('name_a', 'name_b', method='jarowinkler', threshold=0.85) 69 | c.exact('sex', 'gender') 70 | c.date('dob', 'date_of_birth') 71 | c.string('str_name', 'streetname', method='damerau_levenshtein', threshold=0.7) 72 | c.exact('place', 'placename') 73 | c.numeric('income', 'income', method='gauss', offset=3, scale=3, missing_value=0.5) 74 | 75 | # The comparison vectors 76 | feature_vectors = c.compute(candidate_links, df_a, df_b) 77 | ``` 78 | 79 | Classify the candidate links into matching or distinct pairs based on 80 | their comparison result with one of the [classification 81 | algorithms](https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html). 82 | The following code classifies candidate pairs with a Logistic Regression 83 | classifier. This (supervised machine learning) algorithm requires 84 | training data. 85 | 86 | ``` python 87 | logrg = recordlinkage.LogisticRegressionClassifier() 88 | logrg.fit(TRAINING_COMPARISON_VECTORS, TRAINING_PAIRS) 89 | 90 | logrg.predict(feature_vectors) 91 | ``` 92 | 93 | The following code shows the classification of candidate pairs with the 94 | Expectation-Conditional Maximisation (ECM) algorithm. This variant of 95 | the Expectation-Maximisation algorithm doesn't require training data 96 | (unsupervised machine learning). 97 | 98 | ``` python 99 | ecm = recordlinkage.ECMClassifier() 100 | ecm.fit_predict(feature_vectors) 101 | ``` 102 | 103 | ## Main Features 104 | 105 | The main features of this Python record linkage toolkit are: 106 | 107 | - Clean and standardise data with easy to use tools 108 | - Make pairs of records with smart indexing methods such as 109 | **blocking** and **sorted neighbourhood indexing** 110 | - Compare records with a large number of comparison and similarity 111 | measures for different types of variables such as strings, numbers 112 | and dates. 113 | - Several classifications algorithms, both supervised and unsupervised 114 | algorithms. 115 | - Common record linkage evaluation tools 116 | - Several built-in datasets. 117 | 118 | ## Documentation 119 | 120 | The most recent documentation and API reference can be found at 121 | [recordlinkage.readthedocs.org](http://recordlinkage.readthedocs.org/en/latest/). 122 | The documentation provides some basic usage examples like 123 | [deduplication](http://recordlinkage.readthedocs.io/en/latest/guides/data_deduplication.html) 124 | and 125 | [linking](http://recordlinkage.readthedocs.io/en/latest/guides/link_two_dataframes.html) 126 | census data. More examples are coming soon. If you do have interesting 127 | examples to share, let us know. 128 | 129 | ## Installation 130 | 131 | The Python Record linkage Toolkit requires Python 3.8 or higher. Install the 132 | package easily with pip 133 | 134 | ``` sh 135 | pip install recordlinkage 136 | ``` 137 | 138 | The toolkit depends on popular packages like 139 | [Pandas](https://github.com/pydata/pandas), 140 | [Numpy](http://www.numpy.org), [Scipy](https://www.scipy.org/) and, 141 | [Scikit-learn](http://scikit-learn.org/). A complete list of 142 | dependencies can be found in the [installation 143 | manual](https://recordlinkage.readthedocs.io/en/latest/installation.html) 144 | as well as recommended and optional dependencies. 145 | 146 | ## License 147 | 148 | The license for this record linkage tool is BSD-3-Clause. 149 | 150 | ## Citation 151 | 152 | Please cite this package when being used in an academic context. 
Ensure 153 | that the DOI and version match the installed version. Citatation styles 154 | can be found on the publishers website 155 | [10.5281/zenodo.3559042](https://doi.org/10.5281/zenodo.3559042). 156 | 157 | ``` text 158 | @software{de_bruin_j_2019_3559043, 159 | author = {De Bruin, J}, 160 | title = {{Python Record Linkage Toolkit: A toolkit for 161 | record linkage and duplicate detection in Python}}, 162 | month = dec, 163 | year = 2019, 164 | publisher = {Zenodo}, 165 | version = {v0.14}, 166 | doi = {10.5281/zenodo.3559043}, 167 | url = {https://doi.org/10.5281/zenodo.3559043} 168 | } 169 | ``` 170 | 171 | ## Need help? 172 | 173 | Stuck on your record linkage code or problem? Any other questions? Don't 174 | hestitate to send me an email (). 175 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/bench_comparing.py: -------------------------------------------------------------------------------- 1 | import recordlinkage as rl 2 | from recordlinkage.datasets import load_febrl1 3 | from recordlinkage.datasets import load_febrl4 4 | 5 | 6 | class CompareRecordLinkage: 7 | timeout = 30 * 60 8 | 9 | def setup(self): 10 | # download data 11 | self.A, self.B = load_febrl4() 12 | 13 | # make pairs 14 | c_pairs = rl.FullIndex() 15 | pairs = c_pairs.index(self.A, self.B) 16 | 17 | # different sizes of pairs 18 | self.pairs_xsmall = pairs[0:5e3] 19 | self.pairs_small = pairs[0:5e4] 20 | self.pairs_medium = pairs[0:5e5] 21 | self.pairs_large = pairs[0:5e6] 22 | 23 | def time_global_xsmall(self): 24 | c_compare = rl.Compare(self.pairs_xsmall, self.A, self.B) 25 | c_compare.string("given_name", "given_name", method="jaro") 26 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 27 | c_compare.date("date_of_birth", "date_of_birth") 28 | c_compare.exact("suburb", "suburb") 29 | c_compare.exact("state", "state") 30 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 31 | 32 | def time_global_small(self): 33 | c_compare = rl.Compare(self.pairs_small, self.A, self.B) 34 | c_compare.string("given_name", "given_name", method="jaro") 35 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 36 | c_compare.date("date_of_birth", "date_of_birth") 37 | c_compare.exact("suburb", "suburb") 38 | c_compare.exact("state", "state") 39 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 40 | 41 | def time_global_medium(self): 42 | c_compare = rl.Compare(self.pairs_medium, self.A, self.B) 43 | c_compare.string("given_name", "given_name", method="jaro") 44 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 45 | c_compare.date("date_of_birth", "date_of_birth") 46 | c_compare.exact("suburb", "suburb") 47 | c_compare.exact("state", "state") 48 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 49 | 50 | def time_global_large(self): 51 | c_compare = rl.Compare(self.pairs_large, self.A, self.B) 52 | c_compare.string("given_name", "given_name", method="jaro") 53 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 54 | c_compare.date("date_of_birth", "date_of_birth") 55 | 
c_compare.exact("suburb", "suburb") 56 | c_compare.exact("state", "state") 57 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 58 | 59 | 60 | class CompareDeduplication: 61 | timeout = 30 * 60 62 | 63 | def setup(self): 64 | # download data 65 | self.A = load_febrl1() 66 | 67 | # make pairs 68 | c_pairs = rl.FullIndex() 69 | pairs = c_pairs.index(self.A) 70 | 71 | # different sizes of pairs 72 | self.pairs_xsmall = pairs[0:5e3] 73 | self.pairs_small = pairs[0:5e4] 74 | self.pairs_medium = pairs[0:5e5] 75 | self.pairs_large = pairs[0:5e6] 76 | 77 | def time_global_xsmall(self): 78 | c_compare = rl.Compare(self.pairs_xsmall, self.A) 79 | c_compare.string("given_name", "given_name", method="jaro") 80 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 81 | c_compare.date("date_of_birth", "date_of_birth") 82 | c_compare.exact("suburb", "suburb") 83 | c_compare.exact("state", "state") 84 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 85 | 86 | def time_global_small(self): 87 | c_compare = rl.Compare(self.pairs_small, self.A) 88 | c_compare.string("given_name", "given_name", method="jaro") 89 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 90 | c_compare.date("date_of_birth", "date_of_birth") 91 | c_compare.exact("suburb", "suburb") 92 | c_compare.exact("state", "state") 93 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 94 | 95 | def time_global_medium(self): 96 | c_compare = rl.Compare(self.pairs_medium, self.A) 97 | c_compare.string("given_name", "given_name", method="jaro") 98 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 99 | c_compare.date("date_of_birth", "date_of_birth") 100 | c_compare.exact("suburb", "suburb") 101 | c_compare.exact("state", "state") 102 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 103 | 104 | def time_global_large(self): 105 | c_compare = rl.Compare(self.pairs_large, self.A) 106 | c_compare.string("given_name", "given_name", method="jaro") 107 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 108 | c_compare.date("date_of_birth", "date_of_birth") 109 | c_compare.exact("suburb", "suburb") 110 | c_compare.exact("state", "state") 111 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 112 | 113 | 114 | class CompareAlgorithms: 115 | timeout = 30 * 60 116 | 117 | def setup(self): 118 | # download data 119 | self.A, self.B = load_febrl4() 120 | 121 | # Add numbers (age) 122 | self.A["postcode"] = self.A["postcode"].astype(float) 123 | self.B["postcode"] = self.B["postcode"].astype(float) 124 | 125 | # make pairs 126 | c_pairs = rl.FullIndex() 127 | self.pairs = c_pairs.index(self.A, self.B)[0:5e4] 128 | 129 | # ************* STRING ************* 130 | 131 | def time_string_jaro(self): 132 | c_compare = rl.Compare(self.pairs, self.A, self.B) 133 | c_compare.string("given_name", "given_name", method="jaro") 134 | 135 | def time_string_jarowinkler(self): 136 | c_compare = rl.Compare(self.pairs, self.A, self.B) 137 | c_compare.string("given_name", "given_name", method="jarowinkler") 138 | 139 | def time_string_qgram(self): 140 | c_compare = rl.Compare(self.pairs, self.A, self.B) 141 | c_compare.string("given_name", "given_name", method="qgram") 142 | 143 | def time_string_cosine(self): 144 | c_compare = rl.Compare(self.pairs, self.A, self.B) 145 | c_compare.string("given_name", 
"given_name", method="cosine") 146 | 147 | def time_string_levenshtein(self): 148 | c_compare = rl.Compare(self.pairs, self.A, self.B) 149 | c_compare.string("given_name", "given_name", method="levenshtein") 150 | 151 | # ************* Exact ************* 152 | 153 | def time_exact(self): 154 | c_compare = rl.Compare(self.pairs, self.A, self.B) 155 | c_compare.exact("state", "state") 156 | 157 | # ************* NUMERIC ************* 158 | 159 | def time_numeric_gauss(self): 160 | c_compare = rl.Compare(self.pairs, self.A, self.B) 161 | c_compare.numeric("age", "age", method="gauss", scale=2) 162 | -------------------------------------------------------------------------------- /benchmarks/bench_indexing.py: -------------------------------------------------------------------------------- 1 | import recordlinkage as rl 2 | from recordlinkage.datasets import load_febrl1 3 | from recordlinkage.datasets import load_febrl4 4 | 5 | 6 | class PairsRecordLinkage: 7 | timeout = 30 * 60 8 | 9 | def setup(self): 10 | # download data 11 | self.A, self.B = load_febrl4() 12 | 13 | def time_full_index(self): 14 | # setup class 15 | c_pairs = rl.FullIndex() 16 | 17 | # Make pairs 18 | c_pairs.index(self.A, self.B) 19 | 20 | def time_block_index(self): 21 | # setup class 22 | c_pairs = rl.BlockIndex("given_name") 23 | 24 | # Make pairs 25 | c_pairs.index(self.A, self.B) 26 | 27 | def time_sni_index(self): 28 | # setup class 29 | c_pairs = rl.SortedNeighbourhoodIndex(on="given_name", w=5) 30 | 31 | # Make pairs 32 | c_pairs.index(self.A, self.B) 33 | 34 | def time_random_index(self): 35 | # setup class 36 | c_pairs = rl.RandomIndex(2500) 37 | 38 | # Make pairs 39 | c_pairs.index(self.A, self.B) 40 | 41 | 42 | class PairsDeduplication: 43 | timeout = 30 * 60 44 | 45 | def setup(self): 46 | # download data 47 | self.A = load_febrl1() 48 | 49 | def time_full_index(self): 50 | # setup class 51 | c_pairs = rl.FullIndex() 52 | 53 | # Make pairs 54 | c_pairs.index(self.A) 55 | 56 | def time_block_index(self): 57 | # setup class 58 | c_pairs = rl.BlockIndex("given_name") 59 | 60 | # Make pairs 61 | c_pairs.index(self.A) 62 | 63 | def time_sni_index(self): 64 | # setup class 65 | c_pairs = rl.SortedNeighbourhoodIndex(on="given_name", w=5) 66 | 67 | # Make pairs 68 | c_pairs.index(self.A) 69 | 70 | def time_random_index(self): 71 | # setup class 72 | c_pairs = rl.RandomIndex(2500) 73 | 74 | # Make pairs 75 | c_pairs.index(self.A) 76 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | ***** 2 | About 3 | ***** 4 | 5 | Introduction 6 | ============ 7 | 8 | The **Python Record Linkage Toolkit** is a library to link records in or 9 | between data sources. The toolkit provides most of the tools needed for 10 | record linkage and deduplication. The package contains indexing methods, 11 | functions to compare records and classifiers. The package is developed 12 | for research and the linking of small or medium sized files. 13 | 14 | The project is inspired by the `Freely Extensible Biomedical Record Linkage 15 | (FEBRL) `__ project, which is a great 16 | project. In contrast with FEBRL, the recordlinkage project makes extensive use 17 | of data manipulation tools like `pandas `__ and 18 | `numpy `__. The use of *pandas*, a flexible and 19 | powerful data analysis and manipulation library for Python, makes the record 20 | linkage process much easier and faster. The extensive *pandas* library can be 21 | used to integrate your record linkage directly into existing data manipulation 22 | projects. 23 | 24 | One of the aims of this project is to make an extensible record linkage 25 | framework. It is easy to include your own indexing algorithms, 26 | comparison/similarity measures and classifiers. The main features of the 27 | Python Record Linkage Toolkit are: 28 | 29 | - Clean and standardise data with easy to use tools 30 | - Make pairs of records with smart indexing methods such as 31 | **blocking** and **sorted neighbourhood indexing** 32 | - Compare records with a large number of comparison and similarity measures 33 | for different types of variables such as strings, numbers and dates. 34 | - Several classifications algorithms, both supervised and unsupervised 35 | algorithms. 36 | - Common record linkage evaluation tools 37 | - Several built-in datasets. 38 | 39 | 40 | What is record linkage? 41 | ======================= 42 | 43 | The term record linkage is used to indicate the procedure of bringing together 44 | information from two or more records that are believed to belong to the same 45 | entity. Record linkage is used to link data from multiple data sources or to 46 | find duplicates in a single data source. In computer science, record linkage 47 | is also known as data matching or deduplication (in case of search duplicate 48 | records within a single file). 49 | 50 | In record linkage, the attributes of the entity (stored in a record) are used 51 | to link two or more records. Attributes can be unique entity identifiers (SSN, 52 | license plate number), but also attributes like (sur)name, date of birth and 53 | car model/colour. The record linkage procedure can be represented as a 54 | workflow [Christen, 2012]. The steps are: cleaning, indexing, comparing, 55 | classifying and evaluation. If needed, the classified record pairs flow back 56 | to improve the previous step. The Python Record Linkage Toolkit follows this 57 | workflow. 58 | 59 | .. seealso:: 60 | 61 | *Christen, Peter. 2012. Data matching: concepts and techniques for record 62 | linkage, entity resolution, and duplicate detection. Springer Science & 63 | Business Media.* 64 | 65 | *Fellegi, Ivan P and Alan B Sunter. 1969. 
“A theory for record linkage.” 66 | Journal of the American Statistical Association 64(328):1183–1210.* 67 | 68 | *Dunn, Halbert L. 1946. “Record linkage.” American Journal of Public 69 | Health and the Nations Health 36(12):1412–1416.* 70 | 71 | *Herzog, Thomas N, Fritz J Scheuren and William E Winkler. 2007. Data 72 | quality and record linkage techniques. Vol. 1 Springer.* 73 | 74 | How to link records? 75 | ==================== 76 | 77 | Import the ``recordlinkage`` module with all important tools for record 78 | linkage and import the data manipulation framework **pandas**. 79 | 80 | .. code:: python 81 | 82 | import recordlinkage 83 | import pandas 84 | 85 | Consider that you try to link two datasets with personal information 86 | like name, sex and date of birth. Load these datasets into a pandas 87 | ``DataFrame``. 88 | 89 | .. code:: python 90 | 91 | df_a = pandas.DataFrame(YOUR_FIRST_DATASET) 92 | df_b = pandas.DataFrame(YOUR_SECOND_DATASET) 93 | 94 | Comparing all record can be computationally intensive. Therefore, we 95 | make smart set of candidate links with one of the built-in indexing 96 | techniques like **blocking**. Only records pairs agreeing on the 97 | surname are included. 98 | 99 | .. code:: python 100 | 101 | indexer = recordlinkage.Index() 102 | indexer.block('surname') 103 | candidate_links = indexer.index(df_a, df_b) 104 | 105 | Each ``candidate_link`` needs to be compared on the comparable attributes. 106 | This can be done easily with the Compare class and the available comparison 107 | and similarity measures. 108 | 109 | .. code:: python 110 | 111 | compare = recordlinkage.Compare() 112 | 113 | compare.string('name', 'name', method='jarowinkler', threshold=0.85) 114 | compare.exact('sex', 'gender') 115 | compare.exact('dob', 'date_of_birth') 116 | compare.string('streetname', 'streetname', method='damerau_levenshtein', threshold=0.7) 117 | compare.exact('place', 'placename') 118 | compare.exact('haircolor', 'haircolor', missing_value=9) 119 | 120 | # The comparison vectors 121 | compare_vectors = compare.compute(candidate_links, df_a, df_b) 122 | 123 | This record linkage package contains several classification algorithms. 124 | Plenty of the algorithms need trainings data (supervised learning) while 125 | some others are unsupervised. An example of supervised learning: 126 | 127 | .. code:: python 128 | 129 | true_linkage = pandas.Series(YOUR_GOLDEN_DATA, index=pandas.MultiIndex(YOUR_MULTI_INDEX)) 130 | 131 | logrg = recordlinkage.LogisticRegressionClassifier() 132 | logrg.fit(compare_vectors[true_linkage.index], true_linkage) 133 | 134 | logrg.predict(compare_vectors) 135 | 136 | and an example of unsupervised learning (the well known ECM-algorithm): 137 | 138 | .. code:: python 139 | 140 | ecm = recordlinkage.BernoulliEMClassifier() 141 | ecm.fit_predict(compare_vectors) 142 | 143 | 144 | -------------------------------------------------------------------------------- /docs/annotation.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | Annotation 3 | ********** 4 | 5 | Manually labeled record pairs are useful in training and validation tasks. 6 | Training data is usually not available in record linkage applications because 7 | it is highly dataset and sample-specific. The Python Record Linkage Toolkit 8 | comes with a `browser-based user interface`_ for manually classifying record 9 | pairs. A hosted version of `RecordLinkage ANNOTATOR`_ can be found on Github. 10 | 11 | .. 
_`browser-based user interface`: https://github.com/J535D165/recordlinkage-annotator 12 | .. _`RecordLinkage ANNOTATOR`: https://j535d165.github.io/recordlinkage-annotator/ 13 | 14 | .. image:: https://github.com/J535D165/recordlinkage-annotator/blob/master/images/annotator_review.png?raw=true 15 | :alt: Review screen of RecordLinkage ANNOTATOR 16 | :target: https://j535d165.github.io/recordlinkage-annotator/ 17 | 18 | Generate annotation file 19 | ======================== 20 | 21 | The `RecordLinkage ANNOTATOR`_ software requires a structured annotation 22 | file. The required schema_ of the annotation file is open. The function 23 | :func:`recordlinkage.write_annotation_file` can be used to render and save an 24 | annotation file. The function can be used for both linking and deduplication 25 | purposes. 26 | 27 | .. _schema: https://github.com/J535D165/recordlinkage-annotator/tree/master/schema 28 | 29 | .. autofunction:: recordlinkage.write_annotation_file 30 | 31 | Linking 32 | ------- 33 | 34 | This is a simple example of the code to render an annotation 35 | file for linking records: 36 | 37 | .. code:: python 38 | 39 | import recordlinkage as rl 40 | from recordlinkage.index import Block 41 | from recordlinkage.datasets import load_febrl4 42 | 43 | df_a, df_b = load_febrl4() 44 | 45 | blocker = Block("surname", "surname") 46 | pairs = blocker.index(df_a, df_b) 47 | 48 | rl.write_annotation_file( 49 | "annotation_demo_linking.json", 50 | pairs[0:50], 51 | df_a, 52 | df_b, 53 | dataset_a_name="Febrl4 A", 54 | dataset_b_name="Febrl4 B" 55 | ) 56 | 57 | Deduplication 58 | ------------- 59 | 60 | This is a simple example of the code to render an annotation 61 | file for duplicate detection: 62 | 63 | .. code:: python 64 | 65 | import recordlinkage as rl 66 | from recordlinkage.index import Block 67 | from recordlinkage.datasets import load_febrl1 68 | 69 | df_a = load_febrl1() 70 | 71 | blocker = Block("surname", "surname") 72 | pairs = blocker.index(df_a) 73 | 74 | rl.write_annotation_file( 75 | "annotation_demo_dedup.json", 76 | pairs[0:50], 77 | df_a, 78 | dataset_a_name="Febrl1 A" 79 | ) 80 | 81 | 82 | Manual labeling 83 | =============== 84 | 85 | Go to `RecordLinkage ANNOTATOR`_ or start the server yourself. 86 | 87 | Choose the annotation file on the landing screen or use the drag and drop 88 | functionality. A new screen shows the first record pair to label. Start 89 | labeling data the manually. Use the button `Match` for record pairs belonging 90 | to the same entity. Use `Distinct` for record pairs belonging to different 91 | entities. After all records are labeled by hand, the result can be saved to a 92 | file. 93 | 94 | 95 | Export/read annotation file 96 | =========================== 97 | 98 | After labeling all record pairs, you can export the annotation file to a JSON 99 | file. Use the function :func:`recordlinkage.read_annotation_file` to read the 100 | results. 101 | 102 | .. code:: python 103 | 104 | import recordlinkage as rl 105 | 106 | result = rl.read_annotation_file('my_annotation.json') 107 | print(result.links) 108 | 109 | The function :func:`recordlinkage.read_annotation_file` reads the file and returns 110 | an :class:`recordlinkage.annotation.AnnotationResult` object. This object contains 111 | links and distinct attributes that return a :class:`pandas.MultiIndex` object. 112 | 113 | .. autofunction:: recordlinkage.read_annotation_file 114 | 115 | 116 | .. 
autoclass:: recordlinkage.annotation.AnnotationResult 117 | :members: 118 | :inherited-members: 119 | 120 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | Release notes 3 | ************* 4 | 5 | Version 0.15 6 | ============ 7 | 8 | - Remove deprecated recordlinkage classes (`#173`_) 9 | - Bump min Python version to 3.6, ideally 3.8+ (`#171`_) 10 | - Bump min pandas version to >=1 11 | - Resolve deprecation warnings for numpy and pandas 12 | - Happy lint, sort imports, format code with yapf 13 | - Remove unnecessary np.sort in SNI algorithm (`#141`_) 14 | - Fix bug for cosine and qgram string comparisons with threshold (`#135`_) 15 | - Fix several typos in docs (`#151`_)(`#152`_)(`#153`_)(`#154`_)(`#163`_)(`#164`_) 16 | - Fix random indexer (`#158`_) 17 | - Fix various deprecation warnings and broken docs build (`#170`_) 18 | - Fix broken docs build due to pandas depr warnings (`#169`_) 19 | - Fix broken build and removed warning messages (`#168`_) 20 | - Update narrative 21 | - Replace Travis by Github Actions (`#132`_) 22 | - Fix broken test NotFittedError 23 | - Fix bug in low memory random sampling and add more tests (`#130`_) 24 | - Add extras_require to setup.py for deps management 25 | - Add banner to README and update title 26 | - Add Binder and Colab buttons at tutorials (`#174`_) 27 | 28 | Special thanks to Tomasz Waleń @twalen and other contributors for their 29 | work on this release. 30 | 31 | .. _#173: https://github.com/J535D165/recordlinkage/pull/173 32 | .. _#171: https://github.com/J535D165/recordlinkage/pull/171 33 | .. _#141: https://github.com/J535D165/recordlinkage/pull/141 34 | .. _#135: https://github.com/J535D165/recordlinkage/pull/135 35 | .. _#151: https://github.com/J535D165/recordlinkage/pull/151 36 | .. _#152: https://github.com/J535D165/recordlinkage/pull/152 37 | .. _#153: https://github.com/J535D165/recordlinkage/pull/153 38 | .. _#154: https://github.com/J535D165/recordlinkage/pull/154 39 | .. _#163: https://github.com/J535D165/recordlinkage/pull/163 40 | .. _#164: https://github.com/J535D165/recordlinkage/pull/164 41 | .. _#158: https://github.com/J535D165/recordlinkage/pull/158 42 | .. _#170: https://github.com/J535D165/recordlinkage/pull/170 43 | .. _#169: https://github.com/J535D165/recordlinkage/pull/169 44 | .. _#168: https://github.com/J535D165/recordlinkage/pull/168 45 | .. _#132: https://github.com/J535D165/recordlinkage/pull/132 46 | .. _#130: https://github.com/J535D165/recordlinkage/pull/130 47 | .. _#174: https://github.com/J535D165/recordlinkage/pull/174 48 | 49 | Version 0.14 50 | ============ 51 | 52 | - Drop Python 2.7 and Python 3.4 support. (`#91`_) 53 | - Upgrade minimal pandas version to 0.23. 54 | - Simplify the use of all cpus in parallel mode. (`#102`_) 55 | - Store large example datasets in user home folder or use environment 56 | variable. Before, example datasets were stored in the package. (see 57 | issue `#42`_) (`#92`_) 58 | - Add support to write and read annotation files for recordlinkage ANNOTATOR. 59 | See the docs and https://github.com/J535D165/recordlinkage-annotator for 60 | more information. 61 | - Replace `.labels` by `.codes` for `pandas.MultiIndex` objects for newer 62 | versions of pandas (>0.24). (`#103`_) 63 | - Fix totals for pandas.MultiIndex input on confusion matrix and accuracy 64 | metrics. 
(see issue `#84`_) (`#109`_) 65 | - Initialize Compare with (a list of) features (Bug). (`#124`_) 66 | - Various updates in relation to deprecation warnings in third-party 67 | libraries such as sklearn, pandas and networkx. 68 | 69 | .. _#42: https://github.com/J535D165/recordlinkage/issues/42 70 | .. _#84: https://github.com/J535D165/recordlinkage/issues/84 71 | 72 | .. _#91: https://github.com/J535D165/recordlinkage/pull/91 73 | .. _#92: https://github.com/J535D165/recordlinkage/pull/92 74 | .. _#102: https://github.com/J535D165/recordlinkage/pull/102 75 | .. _#103: https://github.com/J535D165/recordlinkage/pull/103 76 | .. _#109: https://github.com/J535D165/recordlinkage/pull/109 77 | .. _#124: https://github.com/J535D165/recordlinkage/pull/124 78 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import datetime 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath("..")) 11 | 12 | # -- Project information ----------------------------------------------------- 13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 14 | 15 | project = "Python Record Linkage Toolkit" 16 | copyright = f"2016-{datetime.datetime.now().year}, Jonathan de Bruin" 17 | author = "Jonathan de Bruin" 18 | 19 | version = "0.15" 20 | release = "0.15" 21 | 22 | # -- General configuration --------------------------------------------------- 23 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 24 | 25 | extensions = [ 26 | "sphinx.ext.autodoc", 27 | "sphinx.ext.doctest", 28 | "sphinx.ext.napoleon", 29 | "sphinx.ext.intersphinx", 30 | "IPython.sphinxext.ipython_console_highlighting", 31 | "IPython.sphinxext.ipython_directive", 32 | "nbsphinx", 33 | ] 34 | 35 | templates_path = ['_templates'] 36 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 37 | 38 | autodoc_member_order = "bysource" 39 | 40 | intersphinx_mapping = { 41 | "python": ("https://docs.python.org/3/", None), 42 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 43 | "numpy": ("https://numpy.org/doc/stable/", None), 44 | "sklearn": ("https://scikit-learn.org/stable/", None), 45 | } 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 50 | 51 | html_theme = "sphinx_rtd_theme" 52 | html_static_path = ['_static'] 53 | 54 | master_doc = "index" 55 | pygments_style = "sphinx" 56 | 57 | todo_include_todos = False 58 | 59 | # -- Options for HTML output ---------------------------------------------- 60 | 61 | html_static_path = [] 62 | html_domain_indices = False 63 | 64 | # Output file base name for HTML help builder. 
65 | htmlhelp_basename = "RecordLinkageToolkitdoc" 66 | 67 | # -- Napoleon options --------------------------------------------------- 68 | 69 | napoleon_google_docstring = False 70 | napoleon_numpy_docstring = True 71 | napoleon_include_private_with_doc = False 72 | napoleon_include_special_with_doc = False 73 | napoleon_use_admonition_for_examples = False 74 | napoleon_use_admonition_for_notes = True 75 | napoleon_use_admonition_for_references = True 76 | napoleon_use_ivar = False 77 | napoleon_use_param = True 78 | napoleon_use_rtype = False 79 | 80 | # -- NBSphinx options ---------------------------------------------------- 81 | 82 | # nbsphinx_execute = 'never' 83 | 84 | # This is processed by Jinja2 and inserted before each notebook 85 | nbsphinx_prolog = r""" 86 | {% set docname = 'docs/' + env.doc2path(env.docname, base=None) %} 87 | 88 | .. note:: 89 | 90 | This page was generated from `{{ docname|e }} `_. 91 | Run an online interactive version of this page with |binder| or |colab|. 92 | 93 | .. |binder| image:: https://mybinder.org/badge_logo.svg 94 | :target: https://mybinder.org/v2/gh/J535D165/recordlinkage/v{{ env.config.release|e }}?filepath={{ docname|e }} 95 | 96 | .. |colab| image:: https://colab.research.google.com/assets/colab-badge.svg 97 | :target: https://githubtocolab.com/J535D165/recordlinkage/blob/v{{ env.config.release|e }}/{{ docname|e }} 98 | 99 | """ 100 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Contributing 3 | ************ 4 | 5 | Thanks for your interest in contributing to the Python Record Linkage Toolkit. 6 | There is a lot of work to do. See `Github `_ 7 | for the contributors to this package. 8 | 9 | The workflow for contributing is as follows: 10 | 11 | - clone https://github.com/J535D165/recordlinkage.git 12 | - Make a branch with your modifications/contributions 13 | - Write tests 14 | - Run all tests 15 | - Do a pull request 16 | 17 | Testing 18 | ======= 19 | 20 | Install `pytest`: 21 | 22 | .. code:: sh 23 | 24 | pip install pytest 25 | 26 | Run the following command to test the package 27 | 28 | .. code:: sh 29 | 30 | python -m pytest tests/ 31 | 32 | Performance 33 | =========== 34 | 35 | Performance is very important in record linkage. The performance is monitored 36 | for all serious modifications of the core API. The performance monitoring is 37 | performed with `Airspeed Velocity `_ 38 | (asv). 39 | 40 | Install Airspeed Velocity: 41 | 42 | .. code:: sh 43 | 44 | pip install asv 45 | 46 | Run the following command from the root of the repository to test the 47 | performance of the current version of the package: 48 | 49 | .. code:: sh 50 | 51 | asv run 52 | 53 | Run the following command to test all versions since tag v0.6.0 54 | 55 | .. 
code:: sh 56 | 57 | asv run --skip-existing-commits v0.6.0..master 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/images/elas_1705.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/docs/images/elas_1705.png -------------------------------------------------------------------------------- /docs/images/indexing_basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/docs/images/indexing_basic.png -------------------------------------------------------------------------------- /docs/images/indexing_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.colors as mlc 2 | import matplotlib.pyplot as mlp 3 | import numpy as np 4 | 5 | figure, axes = mlp.subplots(nrows=1, ncols=2, figsize=(8, 5)) 6 | 7 | # linking 8 | db_a = ["A1", "A2", "A3", "A4", "A5", "A6"] 9 | db_b = ["B1", "B2", "B3", "B4", "B5", "B6"] 10 | 11 | img = np.ones((len(db_a), len(db_b)), dtype=float) 12 | 13 | color_map = mlc.LinearSegmentedColormap.from_list( 14 | "ColorMap", [(0.984, 0.501, 0.447), (1.000, 1.000, 1.000)] 15 | ) 16 | axes[0].imshow(img, cmap=color_map, interpolation="none") 17 | 18 | axes[0].set_xlabel("Dataset A", fontsize=13) 19 | axes[0].set_xticks(np.arange(0, len(db_b), 1)) 20 | axes[0].set_xticks(np.arange(-0.5, len(db_b), 1), minor=True) 21 | axes[0].set_xticklabels(db_a) 22 | 23 | axes[0].set_ylabel("Dataset B", fontsize=13) 24 | axes[0].set_yticks(np.arange(0, len(db_a), 1)) 25 | axes[0].set_yticks(np.arange(-0.5, len(db_a), 1), minor=True) 26 | axes[0].set_yticklabels(db_b) 27 | 28 | axes[0].grid(which="minor", color="k") 29 | 30 | axes[0].set_title("Linking A and B", fontsize=15, fontweight="bold") 31 | 32 | # dedup 33 | db_a = ["A1", "A2", "A3", "A4", "A5", "A6"] 34 | db_b = ["A1", "A2", "A3", "A4", "A5", "A6"] 35 | 36 | img = np.ones((len(db_a), len(db_b)), dtype=float) 37 | img = np.triu(img, 1) 38 | 39 | color_map = mlc.LinearSegmentedColormap.from_list( 40 | "ColorMap", [(1.000, 1.000, 1.000), (0.984, 0.501, 0.447)] 41 | ) 42 | axes[1].imshow(img, cmap=color_map, interpolation="none") 43 | 44 | axes[1].set_xlabel("Dataset A", fontsize=13) 45 | axes[1].set_xticks(np.arange(0, len(db_b), 1)) 46 | axes[1].set_xticks(np.arange(-0.5, len(db_b), 1), minor=True) 47 | axes[1].set_xticklabels(db_a) 48 | 49 | axes[1].set_ylabel("Dataset A", fontsize=13) 50 | axes[1].set_yticks(np.arange(0, len(db_a), 1)) 51 | axes[1].set_yticks(np.arange(-0.5, len(db_a), 1), minor=True) 52 | axes[1].set_yticklabels(db_b) 53 | 54 | axes[1].grid(which="minor", color="k") 55 | 56 | axes[1].set_title("Duplicate detection A", fontsize=15, fontweight="bold") 57 | 58 | figure.tight_layout() 59 | 60 | mlp.savefig("indexing_basic.png", dpi=150) 61 | -------------------------------------------------------------------------------- /docs/images/recordlinkage-banner-transparent.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ******************************************* 2 | Python Record Linkage Toolkit Documentation 3 | 
******************************************* 4 | 5 | .. figure:: /images/recordlinkage-banner-transparent.svg 6 | :width: 100% 7 | 8 | All you need to start linking records. 9 | 10 | .. toctree:: 11 | :caption: First steps 12 | :maxdepth: 2 13 | 14 | about 15 | installation 16 | guides/link_two_dataframes.ipynb 17 | guides/data_deduplication.ipynb 18 | 19 | .. toctree:: 20 | :caption: Record linkage 21 | :maxdepth: 2 22 | 23 | ref-preprocessing 24 | ref-index 25 | ref-compare 26 | ref-classifiers 27 | ref-evaluation 28 | ref-datasets 29 | ref-misc 30 | 31 | .. toctree:: 32 | :caption: Miscellaneous 33 | :maxdepth: 2 34 | 35 | annotation 36 | guides/classifiers.rst 37 | performance.rst 38 | 39 | .. toctree:: 40 | :caption: Developers 41 | :maxdepth: 1 42 | 43 | contributing 44 | changelog 45 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Installation 3 | ************ 4 | 5 | Python version support 6 | ====================== 7 | 8 | The Python Record Linkage Toolkit supports the versions of Python that Pandas 9 | supports as well. You can find the supported Python versions in the Pandas 10 | documentation. 11 | 12 | Installation 13 | ============ 14 | 15 | The Python Record linkage Toolkit requires Python 3.6 or higher. Install the 16 | package easily with pip 17 | 18 | .. code:: sh 19 | 20 | pip install recordlinkage 21 | 22 | You can also clone the project on Github. 23 | 24 | To install all recommended and optional dependencies, run 25 | 26 | .. code:: sh 27 | 28 | pip install recordlinkage['all'] 29 | 30 | Dependencies 31 | ============ 32 | 33 | The Python Record Linkage Toolkit depends on the following packages: 34 | 35 | - `numpy `__ 36 | - `pandas `__ 37 | - `scipy `__ 38 | - `sklearn `__ 39 | - `jellyfish `__ 40 | - `joblib` 41 | 42 | Recommended dependencies 43 | ------------------------ 44 | 45 | - `numexpr `__ - accelerating certain numerical operations 46 | - `bottleneck `__ - accelerating certain types of nan evaluations 47 | 48 | Optional dependecies 49 | -------------------- 50 | 51 | - networkx - for network operations like connected components 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/performance.rst: -------------------------------------------------------------------------------- 1 | 2 | Performance 3 | =========== 4 | 5 | Performance plays an important role in record linkage. Record linkage problems 6 | scale quadratically with the size of the dataset(s). The number of record 7 | pairs can be enormous and so are the number of comparisons. The Python Record 8 | Linkage Toolkit can be used for large scale record linkage applications. 9 | Nevertheless, the toolkit is developed with experimenting in first place and 10 | performance on the second place. This page provides tips and tricks to improve 11 | the performance. 12 | 13 | Do you know more tricks? Let us know! 14 | 15 | Indexing 16 | -------- 17 | 18 | Block on multiple columns 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | Blocking is an effective way to increase the performance of your record 22 | linkage. If the performance of your implementation is still poor, decrease the 23 | number of pairs by blocking on multiple variables. This implies that the 24 | record pair is agrees on two or more variables. In the following example, the 25 | record pairs agree on the given name **and** surname. 26 | 27 | .. code:: python 28 | 29 | from recordlinkage.index import Block 30 | indexer = Block(left_on=['first_name', 'surname'], 31 | right_on=['name', 'surname']) 32 | pairs = indexer.index(dfA, dfB) 33 | 34 | You might exclude more links then desired. This can be solved by 35 | repeating the process with different blocking variables. 36 | 37 | .. code:: python 38 | 39 | indexer = recordlinkage.Index() 40 | indexer.block(left_on=['first_name', 'surname'], 41 | right_on=['name', 'surname']) 42 | indexer.block(left_on=['first_name', 'age'], 43 | right_on=['name', 'age']) 44 | pairs = indexer.index(dfA, dfB) 45 | 46 | .. note:: Sorted Neighbourhood indexing supports, besides the sorted 47 | neighbourhood, additional blocking on variables. 48 | 49 | Make record pairs 50 | ~~~~~~~~~~~~~~~~~ 51 | 52 | The structure of the Python Record Linkage Toolkit has a drawback for the 53 | performance. In the indexation step (the step in which record pairs are 54 | selected), only the index of both records is stored. The entire records 55 | are not stored. This results in less memory usage. The drawback is that the 56 | records need to be queried from the data. 57 | 58 | 59 | Comparing 60 | --------- 61 | 62 | Compare only discriminating variables 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | Not all variables may be worth comparing in a record linkage. Some variables 66 | do not discriminate the links of the non-links or do have only minor effects. 67 | These variables can be excluded. Only discriminating and informative should be 68 | included. 69 | 70 | Prevent string comparisons 71 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 72 | 73 | String similarity measures and phonetic encodings are computationally 74 | expensive. Phonetic encoding takes place on the original data, while string 75 | simililatiry measures are applied on the record pairs. 
82 | String comparing 83 | ~~~~~~~~~~~~~~~~ 84 | 85 | Comparing strings is computationally expensive. The Python Record Linkage 86 | Toolkit uses the package ``jellyfish`` for string comparisons. The package has 87 | two implementations, a Rust and a Python implementation. Make sure you have 88 | the Rust version installed (``import jellyfish.rustyfish`` should not 89 | raise an exception). 90 | 91 | There can be a large difference in the performance of different string 92 | comparison algorithms. The Jaro and Jaro-Winkler methods are faster than the 93 | Levenshtein distance and much faster than the Damerau-Levenshtein distance. 94 | 95 | Indexing with large files 96 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 97 | 98 | Sometimes, the input files are very large. In that case, it can be hard 99 | to make an index without running out of memory in the indexing step or 100 | in the comparing step. ``recordlinkage`` has a method to deal with large 101 | files. It is fast, although it is not primarily developed to be fast. SQL 102 | databases may outperform this method. It is especially developed for 103 | usability. The idea is to split the input files into small blocks, 104 | compute the record pairs for each block, and iterate over the 105 | blocks. Consider full indexing: 106 | 107 | .. code:: python 108 | 109 | import recordlinkage 110 | import numpy 111 | 112 | cl = recordlinkage.index.Full() 113 | 114 | for dfB_subset in numpy.array_split(dfB, 10): 115 | 116 | # a subset of record pairs 117 | pairs_subset = cl.index(dfA, dfB_subset) 118 | 119 | # Your analysis on pairs_subset here 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/ref-classifiers.rst: -------------------------------------------------------------------------------- 1 | ***************** 2 | 3. Classification 3 | ***************** 4 | 5 | Classifiers 6 | =========== 7 | 8 | Classification is the step in the record linkage process where record pairs are 9 | classified into matches, non-matches and possible matches [Christen2012]_. 10 | Classification algorithms can be supervised or unsupervised (with or without 11 | training data). 12 | 13 | 14 | .. seealso:: 15 | 16 | .. [Christen2012] Christen, Peter. 2012. Data matching: concepts and 17 | techniques for record linkage, entity resolution, and duplicate 18 | detection. Springer Science & Business Media. 19 | 20 | Supervised 21 | ---------- 22 | 23 | .. autoclass:: recordlinkage.LogisticRegressionClassifier 24 | :members: 25 | :inherited-members: 26 | 27 | .. autoclass:: recordlinkage.NaiveBayesClassifier 28 | :members: 29 | :inherited-members: 30 | 31 | .. autoclass:: recordlinkage.SVMClassifier 32 | :members: 33 | :inherited-members: 34 | 35 | Unsupervised 36 | ------------ 37 | 38 | .. autoclass:: recordlinkage.ECMClassifier 39 | :members: 40 | :inherited-members: 41 | 42 | .. autoclass:: recordlinkage.KMeansClassifier 43 | :members: 44 | :inherited-members: 45 | 46 | 47 | Adapters 48 | ======== 49 | 50 | Adapters can be used to wrap machine learning models from external packages 51 | like scikit-learn and Keras. 
For example, this makes it possible to classify 52 | record pairs with a neural network developed in Keras. 53 | 54 | .. autoclass:: recordlinkage.adapters.SKLearnAdapter 55 | 56 | 57 | .. code:: python 58 | 59 | # import the scikit-learn classifier 60 | from sklearn.ensemble import RandomForestClassifier 61 | 62 | # import BaseClassifier from recordlinkage.base 63 | from recordlinkage.base import BaseClassifier 64 | from recordlinkage.adapters import SKLearnAdapter 65 | from recordlinkage.datasets import binary_vectors 66 | 67 | class RandomForest(SKLearnAdapter, BaseClassifier): 68 | 69 | def __init__(self, *args, **kwargs): 70 | super(RandomForest, self).__init__() 71 | 72 | # set the kernel 73 | self.kernel = RandomForestClassifier(*args, **kwargs) 74 | 75 | 76 | # make a sample dataset 77 | features, links = binary_vectors(10000, 2000, return_links=True) 78 | 79 | # initialise the random forest 80 | cl = RandomForest(n_estimators=20) 81 | cl.fit(features, links) 82 | 83 | # predict the matches 84 | cl.predict(...) 85 | 86 | 87 | .. autoclass:: recordlinkage.adapters.KerasAdapter 88 | 89 | Example of a Keras model used for classification. 90 | 91 | .. code:: python 92 | 93 | from tensorflow.keras import Sequential, layers 94 | from recordlinkage.base import BaseClassifier 95 | from recordlinkage.adapters import KerasAdapter 96 | 97 | class NNClassifier(KerasAdapter, BaseClassifier): 98 | """Neural network classifier.""" 99 | def __init__(self): 100 | super(NNClassifier, self).__init__() 101 | 102 | model = Sequential() 103 | model.add(layers.Dense(16, input_dim=8, activation='relu')) 104 | model.add(layers.Dense(8, activation='relu')) 105 | model.add(layers.Dense(1, activation='sigmoid')) 106 | model.compile( 107 | optimizer='adam', 108 | loss='binary_crossentropy', 109 | metrics=['accuracy'] 110 | ) 111 | 112 | self.kernel = model 113 | 114 | # initialise the model 115 | cl = NNClassifier() 116 | # fit the model to the data 117 | cl.fit(X_train, links_true) 118 | # predict the class of the data 119 | cl.predict(X_pred) 120 | 121 | 122 | User-defined algorithms 123 | ======================= 124 | 125 | Classifiers can make use of the :class:`recordlinkage.base.BaseClassifier` for 126 | user-defined algorithms. Scikit-learn based models may want 127 | :class:`recordlinkage.adapters.SKLearnAdapter` as a subclass as well. 128 | 129 | .. autoclass:: recordlinkage.base.BaseClassifier 130 | :members: 131 | :inherited-members: 132 | 133 | Probabilistic models can use the Fellegi and Sunter base class. This class is 134 | used for the :class:`recordlinkage.ECMClassifier` and the 135 | :class:`recordlinkage.NaiveBayesClassifier`. 136 | 137 | .. autoclass:: recordlinkage.classifiers.FellegiSunter 138 | :members: 139 | :inherited-members: 140 | 141 | Examples 142 | ======== 143 | 144 | Unsupervised learning with the ECM algorithm. See the `example on Github <https://github.com/J535D165/recordlinkage/examples/unsupervised_learning.py>`_. 145 | 146 | 147 | Network 148 | ======= 149 | 150 | The Python Record Linkage Toolkit provides network/graph analysis tools for 151 | classification of record pairs into matches and distinct pairs. The toolkit 152 | provides the functionality for one-to-one linking and one-to-many linking. It 153 | is also possible to detect all connected components, which is useful in data 154 | deduplication. 155 | 156 | .. autoclass:: recordlinkage.OneToOneLinking 157 | :members: 158 | :inherited-members: 159 | 160 | .. 
autoclass:: recordlinkage.OneToManyLinking 161 | :members: 162 | :inherited-members: 163 | 164 | .. autoclass:: recordlinkage.ConnectedComponents 165 | :members: 166 | :inherited-members: 167 | -------------------------------------------------------------------------------- /docs/ref-datasets.rst: -------------------------------------------------------------------------------- 1 | ******** 2 | Datasets 3 | ******** 4 | 5 | The Python Record Linkage Toolkit contains several open public datasets. Four 6 | datasets were generated by the developers of Febrl. In the future, we are 7 | developing tools to generate your own datasets. 8 | 9 | .. autofunction:: recordlinkage.datasets.load_krebsregister 10 | 11 | .. autofunction:: recordlinkage.datasets.load_febrl1 12 | 13 | .. autofunction:: recordlinkage.datasets.load_febrl2 14 | 15 | .. autofunction:: recordlinkage.datasets.load_febrl3 16 | 17 | .. autofunction:: recordlinkage.datasets.load_febrl4 18 | 19 | .. autofunction:: recordlinkage.datasets.binary_vectors 20 | -------------------------------------------------------------------------------- /docs/ref-evaluation.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | 4. Evaluation 3 | ************* 4 | 5 | Evaluation of classifications plays an important role in record linkage. 6 | Express your classification quality in terms accuracy, recall and F-score 7 | based on ``true positives``, ``false positives``, ``true negatives`` and 8 | ``false negatives``. 9 | 10 | .. autofunction:: recordlinkage.reduction_ratio 11 | .. autofunction:: recordlinkage.true_positives 12 | .. autofunction:: recordlinkage.true_negatives 13 | .. autofunction:: recordlinkage.false_positives 14 | .. autofunction:: recordlinkage.false_negatives 15 | .. autofunction:: recordlinkage.confusion_matrix 16 | .. autofunction:: recordlinkage.precision 17 | .. autofunction:: recordlinkage.recall 18 | .. autofunction:: recordlinkage.accuracy 19 | .. autofunction:: recordlinkage.specificity 20 | .. autofunction:: recordlinkage.fscore 21 | .. autofunction:: recordlinkage.max_pairs 22 | .. autofunction:: recordlinkage.full_index_size -------------------------------------------------------------------------------- /docs/ref-index.rst: -------------------------------------------------------------------------------- 1 | *********** 2 | 1. Indexing 3 | *********** 4 | 5 | The indexing module is used to make pairs of records. These pairs are called 6 | candidate links or candidate matches. There are several indexing algorithms 7 | available such as blocking and sorted neighborhood indexing. See 8 | [christen2012]_ and [christen2008]_ for background information about 9 | indexation. 10 | 11 | .. [christen2012] Christen, P. (2012). Data matching: concepts and 12 | techniques for record linkage, entity resolution, and duplicate 13 | detection. Springer Science & Business Media. 14 | .. [christen2008] Christen, P. (2008). Febrl - A Freely Available Record 15 | Linkage System with a Graphical User Interface. 16 | 17 | The indexing module can be used for both linking and duplicate detection. In 18 | case of duplicate detection, only pairs in the upper triangular part of the 19 | matrix are returned. This means that the first record in each record pair is 20 | the largest identifier. For example, `("A2", "A1")`, `(5, 2)` and `("acb", 21 | "abc")`. The following image shows the record pairs for a complete set of 22 | record pairs. 23 | 24 | .. 
figure:: /images/indexing_basic.png 25 | :width: 100% 26 | 27 | :class:`recordlinkage.Index` object 28 | =================================== 29 | 30 | .. autoclass:: recordlinkage.Index 31 | 32 | .. automethod:: recordlinkage.Index.add 33 | .. automethod:: recordlinkage.Index.index 34 | .. automethod:: recordlinkage.Index.full 35 | .. automethod:: recordlinkage.Index.block 36 | .. automethod:: recordlinkage.Index.sortedneighbourhood 37 | .. automethod:: recordlinkage.Index.random 38 | 39 | 40 | 41 | Algorithms 42 | ========== 43 | 44 | The Python Record Linkage Toolkit contains basic and advanced indexing (or 45 | blocking) algorithms to make record pairs. The algorithms are Python classes. 46 | Popular algorithms in the toolkit are: 47 | 48 | - :class:`recordlinkage.index.Full`, 49 | - :class:`recordlinkage.index.Block`, 50 | - :class:`recordlinkage.index.SortedNeighbourhood` 51 | 52 | The algorithms are available in the submodule `recordlinkage.index`. Import 53 | the algorithms in the following way (use blocking algorithm as example): 54 | 55 | .. code:: python 56 | 57 | from recordlinkage.index import Block 58 | 59 | The full reference for the indexing algorithms in the toolkit is given below. 60 | 61 | .. automodule:: recordlinkage.index 62 | :members: 63 | :inherited-members: 64 | 65 | User-defined algorithms 66 | ======================= 67 | 68 | A user-defined algorithm can be defined based on 69 | :class:`recordlinkage.base.BaseIndexAlgorithm`. The :class:`recordlinkage.base.BaseIndexAlgorithm` class is an abstract base 70 | class that is used for indexing algorithms. The classes 71 | 72 | - :class:`recordlinkage.index.Full` 73 | - :class:`recordlinkage.index.Block` 74 | - :class:`recordlinkage.index.SortedNeighbourhood` 75 | - :class:`recordlinkage.index.Random` 76 | 77 | are inherited from this abstract base class. You can use BaseIndexAlgorithm to 78 | create a user-defined/custom algorithm. 79 | 80 | To create a custom algorithm, subclass the 81 | :class:`recordlinkage.base.BaseIndexAlgorithm`. In the subclass, overwrite the 82 | :meth:`recordlinkage.base.BaseIndexAlgorithm._link_index` method in case of 83 | linking two datasets. This method accepts two (tuples of) 84 | :class:`pandas.Series` objects as arguments. Based on these Series objects, 85 | you create record pairs. The record pairs need to be returned in a 2-level 86 | :class:`pandas.MultiIndex` object. The :attr:`pandas.MultiIndex.names` are the 87 | name of index of DataFrame A and name of the index of DataFrame B 88 | respectively. Overwrite the 89 | :meth:`recordlinkage.base.BaseIndexAlgorithm._dedup_index` method in case of 90 | finding link within a single dataset (deduplication). This method accepts a 91 | single (tuples of) :class:`pandas.Series` objects as arguments. 92 | 93 | The algorithm for linking data frames can be used for finding duplicates as 94 | well. In this situation, DataFrame B is a copy of DataFrame A. The Pairs class 95 | removes pairs like (record_i, record_i) and one of the following (record_i, 96 | record_j) (record_j, record_i) under the hood. As result of this, only unique 97 | combinations are returned. If you do have a specific algorithm for finding 98 | duplicates, then you can overwrite the _dedup_index method. This method 99 | accepts only one argument (DataFrame A) and the internal base class does not 100 | look for combinations like explained above. 101 | 102 | .. 
autoclass:: recordlinkage.base.BaseIndexAlgorithm 103 | :members: 104 | :private-members: 105 | 106 | Examples 107 | ======== 108 | 109 | .. code:: python 110 | 111 | import recordlinkage as rl 112 | from recordlinkage.datasets import load_febrl4 113 | from recordlinkage.index import Block 114 | 115 | df_a, df_b = load_febrl4() 116 | 117 | indexer = rl.Index() 118 | indexer.add(Block('given_name', 'given_name')) 119 | indexer.add(Block('surname', 'surname')) 120 | indexer.index(df_a, df_b) 121 | 122 | Equivalent code: 123 | 124 | .. code:: python 125 | 126 | import recordlinkage as rl 127 | from recordlinkage.datasets import load_febrl4 128 | 129 | df_a, df_b = load_febrl4() 130 | 131 | indexer = rl.Index() 132 | indexer.block('given_name', 'given_name') 133 | indexer.block('surname', 'surname') 134 | indexer.index(df_a, df_b) 135 | 136 | This example shows how to implement a custom indexing algorithm. The algorithm 137 | returns all record pairs of which the given names start with the letter 'W'. 138 | 139 | .. code:: python 140 | 141 | import pandas 142 | from recordlinkage.datasets import load_febrl4 143 | 144 | df_a, df_b = load_febrl4() 145 | 146 | from recordlinkage.base import BaseIndexAlgorithm 147 | 148 | class FirstLetterWIndex(BaseIndexAlgorithm): 149 | """Custom class for indexing""" 150 | 151 | def _link_index(self, df_a, df_b): 152 | """Make pairs with given names starting with the letter 'w'.""" 153 | 154 | # Select records with names starting with a w. 155 | name_a_w = df_a[df_a['given_name'].str.startswith('w') == True] 156 | name_b_w = df_b[df_b['given_name'].str.startswith('w') == True] 157 | 158 | # Make a product of the two numpy arrays 159 | return pandas.MultiIndex.from_product( 160 | [name_a_w.index.values, name_b_w.index.values], 161 | names=[df_a.index.name, df_b.index.name] 162 | ) 163 | 164 | indexer = FirstLetterWIndex() 165 | candidate_pairs = indexer.index(df_a, df_b) 166 | 167 | print('Returns a', type(candidate_pairs).__name__) 168 | print('Number of candidate record pairs starting with the letter w:', len(candidate_pairs)) 169 | 170 | The custom index class below does not restrict the first letter to 'w', but takes the first letter as an argument (named `letter`). This letter is set when the class is initialized. 171 | 172 | .. code:: python 173 | 174 | class FirstLetterIndex(BaseIndexAlgorithm): 175 | """Custom class for indexing""" 176 | 177 | def __init__(self, letter): 178 | super(FirstLetterIndex, self).__init__() 179 | 180 | # the letter to save 181 | self.letter = letter 182 | 183 | def _link_index(self, df_a, df_b): 184 | """Make record pairs that agree on the first letter of the given name.""" 185 | 186 | # Select records with names starting with a 'letter'. 187 | a_startswith_w = df_a[df_a['given_name'].str.startswith(self.letter) == True] 188 | b_startswith_w = df_b[df_b['given_name'].str.startswith(self.letter) == True] 189 | 190 | # Make a product of the two numpy arrays 191 | return pandas.MultiIndex.from_product( 192 | [a_startswith_w.index.values, b_startswith_w.index.values], 193 | names=[df_a.index.name, df_b.index.name] 194 | ) 195 | -------------------------------------------------------------------------------- /docs/ref-misc.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | Miscellaneous 3 | ************* 4 | 5 | .. autofunction:: recordlinkage.index_split 6 | 7 | .. autofunction:: recordlinkage.get_option 8 | .. autofunction:: recordlinkage.set_option 9 | .. 
autofunction:: recordlinkage.reset_option 10 | .. autofunction:: recordlinkage.describe_option -------------------------------------------------------------------------------- /docs/ref-preprocessing.rst: -------------------------------------------------------------------------------- 1 | **************** 2 | 0. Preprocessing 3 | **************** 4 | 5 | Preprocessing data, like cleaning and standardising, may increase your record 6 | linkage accuracy. The Python Record Linkage Toolkit contains several tools for 7 | data preprocessing. The preprocessing and standardising functions are 8 | available in the submodule `recordlinkage.preprocessing`. Import the 9 | algorithms in the following way: 10 | 11 | .. code:: python 12 | 13 | from recordlinkage.preprocessing import clean, phonetic 14 | 15 | Cleaning 16 | ======== 17 | 18 | The Python Record Linkage Toolkit has some cleaning function from which 19 | :func:`recordlinkage.preprocessing.clean` is the most generic function. Pandas 20 | itself is also very usefull for (string) data cleaning. See the pandas 21 | documentation on this topic: `Working with Text Data `_. 22 | 23 | .. autofunction:: recordlinkage.preprocessing.clean 24 | .. autofunction:: recordlinkage.preprocessing.phonenumbers 25 | .. autofunction:: recordlinkage.preprocessing.value_occurence 26 | 27 | Phonetic encoding 28 | ================= 29 | 30 | Phonetic algorithms are algorithms for indexing of words by their 31 | pronunciation. The most well-known algorithm is the `Soundex 32 | `_ algorithm. The Python Record Linkage 33 | Toolkit supports multiple algorithms through the 34 | :func:`recordlinkage.preprocessing.phonetic` function. 35 | 36 | .. note:: 37 | 38 | Use phonetic algorithms in advance of the indexing and comparing step. 39 | This results in most siutations in better performance. 40 | 41 | .. autofunction:: recordlinkage.preprocessing.phonetic 42 | .. autoattribute:: recordlinkage.preprocessing.phonetic_algorithms 43 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | Python Record Linkage Toolkit examples 2 | ====================================== 3 | 4 | This folder contains examples on record linkage with the Python Record Linkage 5 | Toolkit. The examples do have a BSD 3-Clause "New" or "Revised" License. 6 | Contributions are appreciated. 7 | 8 | Basic 9 | ----- 10 | 11 | `Deterministic deduplication`_ 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | Example of deterministic record linkage to find duplicated records in a 15 | dataset. In this example, the model isn't trained with train data. 16 | 17 | `Deterministic linkage`_ 18 | ~~~~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | Example of deterministic record linkage to find links between two datasets. In 21 | this example, the model isn't trained with train data. 22 | 23 | `Supervised Fellegi and Sunter with Naive Bayes classifier`_ 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | An implementation of the Fellegi and Sunter (1969) classification model in a 27 | supervised way. 28 | 29 | `Unsupervised Fellegi and Sunter with ECM classifier`_ 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | An implementation of the Fellegi and Sunter (1969) classification model in an 33 | unsupervised way. The training of model parameters is done with the 34 | Expectation-Conditional Maximisation algorithm. 
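In essence, the unsupervised example boils down to the following sketch (the ``features`` comparison vectors are assumed to be computed as in the other examples; see the linked script for the complete version):

.. code:: python

    import recordlinkage as rl

    # fit the ECM classifier without labelled training data
    cl = rl.ECMClassifier()
    cl.fit(features)

    # classify the candidate pairs into links and non-links
    links_pred = cl.predict(features)
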
35 | 36 | 37 | Advanced 38 | -------- 39 | 40 | `Record linkage with Neural Networks`_ 41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | This example shows how Neural Networks can be used to classify record pairs. 44 | The Neural Network is implemented in Keras. 45 | 46 | .. _`Deterministic deduplication`: /examples/dedup_deterministic.py 47 | .. _`Deterministic linkage`: /examples/linking_deterministic.py 48 | .. _`Record linkage with Neural Networks`: /examples/supervised_keras.py 49 | .. _`Supervised Fellegi and Sunter with Naive Bayes classifier`: /examples/supervised_learning_prob.py 50 | .. _`Unsupervised Fellegi and Sunter with ECM classifier`: /examples/unsupervised_learning_prob.py 51 | -------------------------------------------------------------------------------- /examples/dedup_deterministic.py: -------------------------------------------------------------------------------- 1 | """Example: Deterministic record linkage to find links in a single file. 2 | 3 | In determininistic record linkage, each compared attribute get a certain 4 | weight (coefficient). The higher the weight, the more dicriminating the 5 | variable is. A low weight indicate a less discriminating variable. For 6 | example, the given name has a higher weight than the hometown. 7 | 8 | This example uses FEBRL3 datasets. This dataset contain records about 9 | individuals. 10 | 11 | Deterministic RL parameters are: 12 | intercept = -11.0 13 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0] 14 | 15 | """ 16 | 17 | 18 | import recordlinkage as rl 19 | from recordlinkage.compare import Exact 20 | from recordlinkage.compare import String 21 | from recordlinkage.datasets import load_febrl3 22 | from recordlinkage.index import Block 23 | 24 | # set logging 25 | rl.logging.set_verbosity(rl.logging.INFO) 26 | 27 | # load dataset 28 | print("Loading data...") 29 | dfA, true_links = load_febrl3(return_links=True) 30 | print(len(dfA), "records in dataset A") 31 | print(len(true_links), "links in dataset A") 32 | 33 | # start indexing 34 | print("Build index...") 35 | indexer = rl.Index() 36 | indexer.add(Block("given_name")) 37 | indexer.add(Block("surname")) 38 | indexer.add(Block("soc_sec_id")) 39 | candidate_links = indexer.index(dfA) 40 | 41 | # start comparing 42 | print("Start comparing...") 43 | comparer = rl.Compare() 44 | comparer.add(Exact("given_name", "given_name", label="given_name")) 45 | comparer.add( 46 | String("surname", "surname", method="jarowinkler", threshold=0.85, label="surname") 47 | ) 48 | comparer.add(Exact("date_of_birth", "date_of_birth", label="date_of_birth")) 49 | comparer.add(Exact("suburb", "suburb", label="suburb")) 50 | comparer.add(Exact("state", "state", label="state")) 51 | comparer.add(String("address_1", "address_1", threshold=0.85, label="address_1")) 52 | comparer.add(String("address_2", "address_2", threshold=0.85, label="address_2")) 53 | features = comparer.compute(candidate_links, dfA) 54 | 55 | print("feature shape", features.shape) 56 | 57 | # use the Logistic Regression Classifier 58 | # this classifier is equivalent to the deterministic record linkage approach 59 | intercept = -9.5 60 | coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5] 61 | 62 | print("Deterministic classifier") 63 | print("intercept", intercept) 64 | print("coefficients", coefficients) 65 | 66 | logreg = rl.LogisticRegressionClassifier(coefficients=coefficients, intercept=intercept) 67 | links = logreg.predict(features) 68 | 69 | print(len(links), "links/matches") 70 | 71 | # return the confusion 
matrix 72 | conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links)) 73 | print("confusion matrix") 74 | print(conf_logreg) 75 | 76 | # compute the F-score for this classification 77 | fscore = rl.fscore(conf_logreg) 78 | print("fscore", fscore) 79 | recall = rl.recall(true_links, links) 80 | print("recall", recall) 81 | precision = rl.precision(true_links, links) 82 | print("precision", precision) 83 | -------------------------------------------------------------------------------- /examples/linking_deterministic.py: -------------------------------------------------------------------------------- 1 | """This example demonstrates deterministic record linkage to link two files. 2 | 3 | In determininistic record linkage, each compared attribute get a certain 4 | weight (coefficient). The higher the weight, the more dicriminating the 5 | variable is. A low weight indicate a less discriminating variable. For 6 | example, the given name has a higher weight than the hometown. 7 | 8 | This example uses FEBRL4 datasets. These datasets contain records about 9 | individuals. 10 | 11 | Deterministic RL parameters are: 12 | intercept = -11.0 13 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0] 14 | 15 | """ 16 | 17 | 18 | import recordlinkage as rl 19 | from recordlinkage.compare import Exact 20 | from recordlinkage.compare import String 21 | from recordlinkage.datasets import load_febrl4 22 | from recordlinkage.index import Block 23 | 24 | # set logging 25 | rl.logging.set_verbosity(rl.logging.INFO) 26 | 27 | # load datasets 28 | print("Loading data...") 29 | dfA, dfB, true_links = load_febrl4(return_links=True) 30 | print(len(dfA), "records in dataset A") 31 | print(len(dfB), "records in dataset B") 32 | print(len(true_links), "links between dataset A and B") 33 | 34 | # start indexing 35 | print("Build index...") 36 | indexer = rl.Index() 37 | indexer.add(Block("given_name")) 38 | indexer.add(Block("surname")) 39 | indexer.add(Block("soc_sec_id")) 40 | candidate_links = indexer.index(dfA, dfB) 41 | 42 | # start comparing 43 | print("Start comparing...") 44 | comparer = rl.Compare() 45 | comparer.add(Exact("given_name", "given_name", label="given_name")) 46 | comparer.add( 47 | String("surname", "surname", method="jarowinkler", threshold=0.85, label="surname") 48 | ) 49 | comparer.add(Exact("date_of_birth", "date_of_birth", label="date_of_birth")) 50 | comparer.add(Exact("suburb", "suburb", label="suburb")) 51 | comparer.add(Exact("state", "state", label="state")) 52 | comparer.add(String("address_1", "address_1", threshold=0.85, label="address_1")) 53 | comparer.add(String("address_2", "address_2", threshold=0.85, label="address_2")) 54 | features = comparer.compute(candidate_links, dfA, dfB) 55 | 56 | print("feature shape", features.shape) 57 | 58 | # use the Logistic Regression Classifier 59 | # this classifier is equivalent to the deterministic record linkage approach 60 | intercept = -11.0 61 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0] 62 | 63 | print("Deterministic classifier") 64 | print("intercept", intercept) 65 | print("coefficients", coefficients) 66 | 67 | logreg = rl.LogisticRegressionClassifier(coefficients=coefficients, intercept=intercept) 68 | links = logreg.predict(features) 69 | 70 | print(len(links), "links/matches") 71 | 72 | # return the confusion matrix 73 | conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links)) 74 | print("confusion matrix") 75 | print(conf_logreg) 76 | 77 | # compute the F-score for this classification 78 | 
fscore = rl.fscore(conf_logreg) 79 | print("fscore", fscore) 80 | recall = rl.recall(true_links, links) 81 | print("recall", recall) 82 | precision = rl.precision(true_links, links) 83 | print("precision", precision) 84 | -------------------------------------------------------------------------------- /examples/supervised_keras.py: -------------------------------------------------------------------------------- 1 | """Example: Supervised learning with Neural Networks.""" 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | try: 8 | from tensorflow.keras import layers 9 | except ModuleNotFoundError as err: 10 | raise ModuleNotFoundError("Please upgrade tensorflow.") from err 11 | 12 | import recordlinkage as rl 13 | from recordlinkage.adapters import KerasAdapter 14 | from recordlinkage.base import BaseClassifier 15 | from recordlinkage.datasets import binary_vectors 16 | 17 | # create a dataset with the following settings 18 | n_pairs = 50000 19 | n_matches = 7000 20 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92]) 21 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09]) 22 | 23 | # Create the dataset and return the true links. 24 | X_data, links_true = binary_vectors( 25 | n_pairs, # the number of candidate links 26 | n_matches, # the number of true links 27 | m=m_simulate, # the m probabilities 28 | u=u_simulate, # the u probabilities 29 | random_state=535, # set seed 30 | return_links=True, 31 | ) # return true links 32 | 33 | 34 | # Initialise the Keras. 35 | class NNClassifier(KerasAdapter, BaseClassifier): 36 | """Neural network classifier.""" 37 | 38 | def __init__(self, *args, **kwargs): 39 | super().__init__() 40 | 41 | model = tf.keras.Sequential() 42 | model.add(layers.Dense(16, input_dim=8, activation="relu")) 43 | model.add(layers.Dense(8, activation="relu")) 44 | model.add(layers.Dense(1, activation="sigmoid")) 45 | model.compile( 46 | optimizer=tf.train.AdamOptimizer(0.001), 47 | loss="binary_crossentropy", 48 | metrics=["accuracy"], 49 | ) 50 | 51 | self.kernel = model 52 | 53 | 54 | cl = NNClassifier() 55 | cl.fit(X_data, links_true) 56 | 57 | # evaluate the model 58 | links_pred = cl.predict(X_data) 59 | print("Predicted number of links:", len(links_pred)) 60 | 61 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data)) 62 | print("Confusion matrix:\n", cm) 63 | 64 | # compute the F-score for this classification 65 | fscore = rl.fscore(cm) 66 | print("fscore", fscore) 67 | recall = rl.recall(links_true, links_pred) 68 | print("recall", recall) 69 | precision = rl.precision(links_true, links_pred) 70 | print("precision", precision) 71 | 72 | # Predict the match probability for each pair in the dataset. 73 | probs = cl.prob(X_data) 74 | print(probs[0:10]) 75 | -------------------------------------------------------------------------------- /examples/supervised_learning_prob.py: -------------------------------------------------------------------------------- 1 | """Example: Supervised learning with the Naive Bayes algorithm. 2 | 3 | """ 4 | 5 | 6 | import numpy as np 7 | 8 | import recordlinkage as rl 9 | from recordlinkage.datasets import binary_vectors 10 | 11 | # create a dataset with the following settings 12 | n_pairs = 50000 13 | n_matches = 7000 14 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92]) 15 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09]) 16 | 17 | # Create the dataset and return the true links. 
18 | X_data, links_true = binary_vectors( 19 | n_pairs, # the number of candidate links 20 | n_matches, # the number of true links 21 | m=m_simulate, # the m probabilities 22 | u=u_simulate, # the u probabilities 23 | random_state=535, # set seed 24 | return_links=True, 25 | ) # return true links 26 | 27 | # Initialise the NaiveBayesClassifier. 28 | cl = rl.NaiveBayesClassifier() 29 | cl.fit(X_data, links_true) 30 | 31 | # Print the parameters that are trained (m, u and p). Note that the estimates 32 | # are very good. 33 | print("p probability P(Match):", cl.p) 34 | print("m probabilities P(x_i=1|Match):", cl.m_probs) 35 | print("u probabilities P(x_i=1|Non-Match):", cl.u_probs) 36 | print("log m probabilities P(x_i=1|Match):", cl.log_m_probs) 37 | print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs) 38 | print("log weights of features:", cl.log_weights) 39 | print("weights of features:", cl.weights) 40 | 41 | # evaluate the model 42 | links_pred = cl.predict(X_data) 43 | print("Predicted number of links:", len(links_pred)) 44 | 45 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data)) 46 | print("Confusion matrix:\n", cm) 47 | 48 | # compute the F-score for this classification 49 | fscore = rl.fscore(cm) 50 | print("fscore", fscore) 51 | recall = rl.recall(links_true, links_pred) 52 | print("recall", recall) 53 | precision = rl.precision(links_true, links_pred) 54 | print("precision", precision) 55 | 56 | # Predict the match probability for each pair in the dataset. 57 | probs = cl.prob(X_data) 58 | -------------------------------------------------------------------------------- /examples/unsupervised_learning_prob.py: -------------------------------------------------------------------------------- 1 | """Example: Unsupervised learning with the ECM algorithm. 2 | 3 | Train data is often hard to collect in record linkage or data matching 4 | problems. The Expectation-Conditional Maximisation (ECM) algorithm is the most 5 | well known algorithm for unsupervised data matching. The algorithm preforms 6 | relatively well compared to supervised methods. 7 | 8 | """ 9 | 10 | 11 | import numpy as np 12 | 13 | import recordlinkage as rl 14 | from recordlinkage.datasets import binary_vectors 15 | 16 | # create a dataset with the following settings 17 | n_pairs = 50000 18 | n_matches = 7000 19 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92]) 20 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09]) 21 | 22 | # Create the dataset and return the true links. 23 | X_data, links_true = binary_vectors( 24 | n_pairs, # the number of candidate links 25 | n_matches, # the number of true links 26 | m=m_simulate, # the m probabilities 27 | u=u_simulate, # the u probabilities 28 | random_state=535, # set seed 29 | return_links=True, 30 | ) # return true links 31 | 32 | # Initialise the Expectation-Conditional Maximisation classifier. 33 | cl = rl.ECMClassifier() 34 | cl.fit(X_data) 35 | 36 | # Print the parameters that are trained (m, u and p). Note that the estimates 37 | # are very good. 
38 | print("p probability P(Match):", cl.p) 39 | print("m probabilities P(x_i=1|Match):", cl.m_probs) 40 | print("u probabilities P(x_i=1|Non-Match):", cl.u_probs) 41 | print("log m probabilities P(x_i=1|Match):", cl.log_m_probs) 42 | print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs) 43 | print("log weights of features:", cl.log_weights) 44 | print("weights of features:", cl.weights) 45 | 46 | # evaluate the model 47 | links_pred = cl.predict(X_data) 48 | print("Predicted number of links:", len(links_pred)) 49 | 50 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data)) 51 | print("Confusion matrix:\n", cm) 52 | 53 | # compute the F-score for this classification 54 | fscore = rl.fscore(cm) 55 | print("fscore", fscore) 56 | recall = rl.recall(links_true, links_pred) 57 | print("recall", recall) 58 | precision = rl.precision(links_true, links_pred) 59 | print("precision", precision) 60 | 61 | # Predict the match probability for each pair in the dataset. 62 | probs = cl.prob(X_data) 63 | print(probs) 64 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "recordlinkage" 3 | description = "A record linkage toolkit for linking and deduplication" 4 | authors = [ 5 | { name = "Jonathan de Bruin", email = "jonathandebruinos@gmail.com" } 6 | ] 7 | readme = "README.md" 8 | classifiers = [ 9 | "Development Status :: 4 - Beta", 10 | "License :: OSI Approved :: BSD License", 11 | "Programming Language :: Python :: 3.8", 12 | "Programming Language :: Python :: 3.9", 13 | "Programming Language :: Python :: 3.10", 14 | "Programming Language :: Python :: 3.11" 15 | ] 16 | license = {text = "BSD-3-Clause"} 17 | dependencies = [ 18 | "jellyfish>=1", 19 | "numpy>=1.13", 20 | "pandas>=1,<3", 21 | "scipy>=1", 22 | "scikit-learn>=1", 23 | "joblib" 24 | ] 25 | dynamic = ["version"] 26 | requires-python = ">=3.8" 27 | 28 | [project.urls] 29 | homepage = "https://recordlinkage.readthedocs.io/" 30 | repository = "https://github.com/J535D165/recordlinkage" 31 | 32 | [project.optional-dependencies] 33 | all = ["networkx>=2", "bottleneck", "numexpr"] 34 | lint = ["ruff"] 35 | docs = ["sphinx", "nbsphinx", "sphinx-rtd-theme", "ipykernel"] 36 | test = ["pytest"] 37 | 38 | [build-system] 39 | build-backend = 'setuptools.build_meta' 40 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 41 | 42 | [tool.setuptools] 43 | packages = ["recordlinkage"] 44 | 45 | [tool.setuptools.package-data] 46 | "*" = ["*.csv"] 47 | 48 | [tool.setuptools_scm] 49 | write_to = "recordlinkage/_version.py" 50 | 51 | [tool.ruff] 52 | select = ["E", "F", "UP", "I", "B"] 53 | ignore = ["B006"] 54 | exclude = ["docs/conf.py"] 55 | 56 | [tool.ruff.isort] 57 | force-single-line = true 58 | -------------------------------------------------------------------------------- /recordlinkage/__init__.py: -------------------------------------------------------------------------------- 1 | # register the configuration 2 | import recordlinkage.config_init # noqa 3 | from recordlinkage.classifiers import FellegiSunter 4 | from recordlinkage.classifiers import KMeansClassifier 5 | from recordlinkage.classifiers import LogisticRegressionClassifier 6 | from recordlinkage.classifiers import NaiveBayesClassifier 7 | from recordlinkage.classifiers import SVMClassifier 8 | from recordlinkage.classifiers import ECMClassifier 9 | from recordlinkage.measures import reduction_ratio 10 | 
from recordlinkage.measures import max_pairs 11 | from recordlinkage.measures import full_index_size 12 | from recordlinkage.measures import true_positives 13 | from recordlinkage.measures import true_negatives 14 | from recordlinkage.measures import false_positives 15 | from recordlinkage.measures import false_negatives 16 | from recordlinkage.measures import confusion_matrix 17 | from recordlinkage.measures import precision 18 | from recordlinkage.measures import recall 19 | from recordlinkage.measures import accuracy 20 | from recordlinkage.measures import specificity 21 | from recordlinkage.measures import fscore 22 | from recordlinkage.network import OneToOneLinking 23 | from recordlinkage.network import OneToManyLinking 24 | from recordlinkage.network import ConnectedComponents 25 | from recordlinkage import rl_logging as logging 26 | from recordlinkage.annotation import read_annotation_file 27 | from recordlinkage.annotation import write_annotation_file 28 | from recordlinkage.api import Compare 29 | from recordlinkage.api import Index 30 | from recordlinkage.config import describe_option 31 | from recordlinkage.config import get_option 32 | from recordlinkage.config import option_context 33 | from recordlinkage.config import options 34 | from recordlinkage.config import reset_option 35 | from recordlinkage.config import set_option 36 | from recordlinkage.utils import index_split 37 | from recordlinkage.utils import split_index 38 | 39 | try: 40 | from recordlinkage._version import __version__ 41 | from recordlinkage._version import __version_tuple__ 42 | except ImportError: 43 | __version__ = "0.0.0" 44 | __version_tuple__ = (0, 0, 0) 45 | 46 | 47 | __all__ = [ 48 | "logging", 49 | "read_annotation_file", 50 | "write_annotation_file", 51 | "Compare", 52 | "Index", 53 | "describe_option", 54 | "get_option", 55 | "option_context", 56 | "options", 57 | "reset_option", 58 | "set_option", 59 | "index_split", 60 | "split_index", 61 | "FellegiSunter", 62 | "KMeansClassifier", 63 | "LogisticRegressionClassifier", 64 | "NaiveBayesClassifier", 65 | "SVMClassifier", 66 | "ECMClassifier", 67 | "reduction_ratio", 68 | "max_pairs", 69 | "full_index_size", 70 | "true_positives", 71 | "true_negatives", 72 | "false_positives", 73 | "false_negatives", 74 | "confusion_matrix", 75 | "precision", 76 | "recall", 77 | "accuracy", 78 | "specificity", 79 | "fscore", 80 | "OneToOneLinking", 81 | "OneToManyLinking", 82 | "ConnectedComponents", 83 | ] 84 | -------------------------------------------------------------------------------- /recordlinkage/_lib/numeric.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define R 6371 6 | #define TO_RAD (3.1415926536 / 180) 7 | 8 | double euclidean_dist(double x, double y) 9 | { 10 | return fabs(y - x); 11 | } 12 | 13 | double haversine_dist(double th1, double ph1, double th2, double ph2) 14 | { 15 | double dx, dy, dz; 16 | ph1 -= ph2; 17 | ph1 *= TO_RAD, th1 *= TO_RAD, th2 *= TO_RAD; 18 | 19 | dz = sin(th1) - sin(th2); 20 | dx = cos(ph1) * cos(th1) - cos(th2); 21 | dy = sin(ph1) * cos(th1); 22 | return asin(sqrt(dx * dx + dy * dy + dz * dz) / 2) * 2 * R; 23 | } 24 | 25 | double step_sim(double d, double offset, double origin) 26 | { 27 | if (fabs(d - origin) <= offset) 28 | { 29 | return 1.0; 30 | } else 31 | { 32 | return 0.0; 33 | } 34 | } 35 | 36 | double linear_sim(double d, double scale, double offset, double origin) 37 | { 38 | 39 | double d_norm; 40 | 41 | // normalise the 
distance measure 42 | d_norm = fabs(d - origin); 43 | 44 | if (d_norm <= offset) 45 | { 46 | return 1.0; 47 | } 48 | else if (d_norm >= offset + 2 * scale) 49 | { 50 | return 0.0; 51 | } 52 | else 53 | { 54 | return 1.0 - (d_norm - offset) / (2 * scale); 55 | } 56 | } 57 | 58 | 59 | double squared_sim(double d, double scale, double offset, double origin) 60 | { 61 | 62 | double d_norm; 63 | 64 | // normalise the distance measure 65 | d_norm = fabs(d - origin); 66 | 67 | if (d_norm <= offset) 68 | { 69 | return 1.0; 70 | } 71 | else if (d_norm >= offset + sqrt(2.0) * scale) 72 | { 73 | return 0.0; 74 | } 75 | else 76 | { 77 | return 1.0 - 0.5 * exp(2.0 * log((d_norm - offset)/scale)); 78 | } 79 | } 80 | 81 | 82 | double exp_sim(double d, double scale, double offset, double origin) 83 | { 84 | 85 | double d_norm; 86 | 87 | // normalise the distance measure 88 | d_norm = fabs(d - origin); 89 | 90 | if (d_norm <= offset) 91 | { 92 | return 1.0; 93 | } 94 | else 95 | { 96 | return pow(2.0, - (d_norm-offset) / scale); 97 | } 98 | } 99 | 100 | 101 | double gauss_sim(double d, double scale, double offset, double origin) 102 | { 103 | 104 | double d_norm; 105 | 106 | // normalise the distance measure 107 | d_norm = fabs(d - origin); 108 | 109 | if (d_norm <= offset) 110 | { 111 | return 1.0; 112 | } 113 | else 114 | { 115 | return pow(2.0, - pow((d_norm-offset) / scale, 2.0)); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /recordlinkage/_lib/numeric.h: -------------------------------------------------------------------------------- 1 | 2 | // numeric distance functions 3 | double euclidean_dist(double x, double y); 4 | double haversine_dist(double th1, double ph1, double th2, double ph2); 5 | 6 | // numeric similarity functions 7 | double step_sim(double d, double offset, double origin); 8 | double linear_sim(double d, double scale, double offset, double origin); 9 | double squared_sim(double d, double scale, double offset, double origin); 10 | double exp_sim(double d, double scale, double offset, double origin); 11 | double gauss_sim(double d, double scale, double offset, double origin); 12 | -------------------------------------------------------------------------------- /recordlinkage/adapters.py: -------------------------------------------------------------------------------- 1 | """Module to wrap external machine learning models.""" 2 | 3 | __all__ = ["SKLearnAdapter", "KerasAdapter"] 4 | 5 | 6 | class SKLearnAdapter: 7 | """SciKit-learn adapter for record pair classification. 8 | 9 | SciKit-learn adapter for record pair classification with SciKit-learn 10 | models. 11 | """ 12 | 13 | @property 14 | def classifier(self): 15 | # raise warning 16 | return self.kernel 17 | 18 | @classifier.setter 19 | def classifier(self, classifier): 20 | self.kernel = classifier 21 | 22 | def _predict(self, features): 23 | """Predict matches and non-matches. 24 | 25 | Parameters 26 | ---------- 27 | features : numpy.ndarray 28 | The data to predict the class of. 29 | 30 | Returns 31 | ------- 32 | numpy.ndarray 33 | The predicted classes. 34 | """ 35 | 36 | from sklearn.exceptions import NotFittedError 37 | 38 | try: 39 | prediction = self.kernel.predict(features) 40 | except NotFittedError as err: 41 | raise NotFittedError( 42 | "{} is not fitted yet. 
Call 'fit' with appropriate " 43 | "arguments before using this method.".format(type(self).__name__) 44 | ) from err 45 | 46 | return prediction 47 | 48 | def _fit(self, features, y=None): 49 | if y is None: # unsupervised 50 | self.kernel.fit(features) 51 | else: 52 | self.kernel.fit(features, y) 53 | 54 | def _prob_match(self, features): 55 | """Compute match probabilities. 56 | 57 | Parameters 58 | ---------- 59 | features : numpy.ndarray 60 | The data to train the model on. 61 | 62 | Returns 63 | ------- 64 | numpy.ndarray 65 | The match probabilties. 66 | """ 67 | 68 | # compute the probabilities 69 | probs = self.kernel.predict_proba(features) 70 | 71 | # get the position of match probabilities 72 | classes = list(self.kernel.classes_) 73 | match_class_position = classes.index(1) 74 | 75 | return probs[:, match_class_position] 76 | 77 | 78 | class KerasAdapter: 79 | """Keras adapter for record pair classification. 80 | 81 | Keras adapter for record pair classification with Keras models. 82 | """ 83 | 84 | @property 85 | def classifier(self): 86 | # raise warning 87 | return self.kernel 88 | 89 | @classifier.setter 90 | def classifier(self, classifier): 91 | self.kernel = classifier 92 | 93 | def _predict(self, features): 94 | """Predict matches and non-matches. 95 | 96 | Parameters 97 | ---------- 98 | features : numpy.ndarray 99 | The data to predict the class of. 100 | 101 | Returns 102 | ------- 103 | numpy.ndarray 104 | The predicted classes. 105 | """ 106 | 107 | from sklearn.exceptions import NotFittedError 108 | 109 | try: 110 | prediction = self.kernel.predict_classes(features)[:, 0] 111 | except NotFittedError as err: 112 | raise NotFittedError( 113 | "{} is not fitted yet. Call 'fit' with appropriate " 114 | "arguments before using this method.".format(type(self).__name__) 115 | ) from err 116 | 117 | return prediction 118 | 119 | def _fit(self, features, y=None): 120 | self.kernel.fit(features, y) 121 | 122 | def _prob_match(self, features): 123 | """Compute match probabilities. 124 | 125 | Parameters 126 | ---------- 127 | features : numpy.ndarray 128 | The data to train the model on. 129 | 130 | Returns 131 | ------- 132 | numpy.ndarray 133 | The match probabilties. 
134 | """ 135 | 136 | # compute the probabilities 137 | probs = self.kernel.predict_proba(features)[:, 0] 138 | 139 | return probs 140 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/recordlinkage/algorithms/__init__.py -------------------------------------------------------------------------------- /recordlinkage/algorithms/c_numeric.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | 3 | import numpy as np 4 | 5 | cimport numpy as np 6 | 7 | 8 | cdef extern from "../_lib/numeric.h": 9 | 10 | # numeric distance functions 11 | double euclidean_dist(double x, double y) 12 | double haversine_dist(double th1, double ph1, double th2, double ph2) 13 | 14 | # numeric similarity functions 15 | double step_sim(double d, double offset, double origin) 16 | double linear_sim(double d, double scale, double offset, double origin) 17 | double squared_sim(double d, double scale, double offset, double origin) 18 | double exp_sim(double d, double scale, double offset, double origin) 19 | double gauss_sim(double d, double scale, double offset, double origin) 20 | 21 | 22 | @cython.boundscheck(False) # turn off bounds-checking for entire function 23 | @cython.wraparound(False) # turn off negative index wrapping for entire function 24 | def euclidean_distance(np.ndarray[np.float64_t, ndim=1] x, np.ndarray[np.float64_t, ndim=1] y): 25 | 26 | cdef int n_rows = x.shape[0] 27 | 28 | cdef np.ndarray result = np.zeros(n_rows, dtype=np.float64) 29 | 30 | for k in range(n_rows): 31 | result[k] = euclidean_dist(x[k], y[k]) 32 | 33 | return result 34 | 35 | 36 | @cython.boundscheck(False) # turn off bounds-checking for entire function 37 | @cython.wraparound(False) # turn off negative index wrapping for entire function 38 | def haversine_distance(np.ndarray[np.float64_t, ndim=1] th1, np.ndarray[np.float64_t, ndim=1] ph1, np.ndarray[np.float64_t, ndim=1] th2, np.ndarray[np.float64_t, ndim=1] ph2): 39 | 40 | cdef int n_rows = th1.shape[0] 41 | 42 | cdef np.ndarray result = np.zeros(n_rows, dtype=np.float64) 43 | 44 | for k in range(n_rows): 45 | result[k] = haversine_dist(th1[k], ph1[k], th2[k], ph2[k]) 46 | 47 | return result 48 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/compare.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas 3 | 4 | 5 | def _compare_exact(s1, s2, agree_value=1, disagree_value=0, missing_value=0): 6 | # dtypes can be hard if the passed parameters (agreement, disagreement, 7 | # missing_value) are of different types. 8 | # http://chris.friedline.net/2015-12-15-rutgers/lessons/python2/03-data-types-and-format.html 9 | 10 | # Convert to pandas.Series if (numpy) arrays are passed. 
11 | if not isinstance(s1, pandas.Series): 12 | s1 = pandas.Series(s1, index=s1.index) 13 | 14 | if not isinstance(s2, pandas.Series): 15 | s2 = pandas.Series(s2, index=s2.index) 16 | 17 | # Values or agree/disagree 18 | if agree_value == "value": 19 | compare = s1.copy() 20 | compare[s1 != s2] = disagree_value 21 | 22 | else: 23 | compare = pandas.Series(disagree_value, index=s1.index) 24 | compare[s1 == s2] = agree_value 25 | 26 | # Only when disagree value is not identical with the missing value 27 | if disagree_value != missing_value: 28 | compare[(s1.isnull() | s2.isnull())] = missing_value 29 | 30 | return compare 31 | 32 | 33 | def _compare_dates( 34 | s1, s2, swap_month_day=0.5, swap_months="default", errors="coerce", *args, **kwargs 35 | ): 36 | # validate datatypes 37 | if str(s1.dtype) != "datetime64[ns]": 38 | raise ValueError("Left column is not of type datetime64[ns]") 39 | 40 | if str(s2.dtype) != "datetime64[ns]": 41 | raise ValueError("Right column is not of type datetime64[ns]") 42 | 43 | c = (s1 == s2).astype(np.int64) # start with int64 (will become float64) 44 | 45 | # The case is which there is a swap_month_day value given. 46 | if swap_month_day and swap_month_day != 0: 47 | c[ 48 | (s1.dt.year == s2.dt.year) 49 | & (s1.dt.month == s2.dt.day) 50 | & (s1.dt.day == s2.dt.month) 51 | & (c != 1) 52 | ] = swap_month_day 53 | 54 | if swap_months and swap_months != 0: 55 | if swap_months == "default": 56 | swap_months = [(6, 7, 0.5), (7, 6, 0.5), (9, 10, 0.5), (10, 9, 0.5)] 57 | else: 58 | try: 59 | if not all([len(x) == 3 for x in swap_months]): 60 | raise Exception 61 | except Exception as err: 62 | raise ValueError( 63 | "swap_months must be a list of (first month, \ 64 | second month, value) tuples or lists. " 65 | ) from err 66 | 67 | for month1, month2, value in swap_months: 68 | # if isinstance(value, float): 69 | # c = c.astype(np.float64) 70 | # elif isinstance(value, int): 71 | # c = c.astype(np.int64) 72 | # else: 73 | # c = c.astype(object) 74 | 75 | c[ 76 | (s1.dt.year == s2.dt.year) 77 | & (s1.dt.month == month1) 78 | & (s2.dt.month == month2) 79 | & (s1.dt.day == s2.dt.day) 80 | & (c != 1) 81 | ] = value 82 | 83 | c = pandas.Series(c) 84 | c[s1.isnull() | s2.isnull()] = np.nan 85 | 86 | return c 87 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas 3 | 4 | 5 | # Numerical distance algorithms 6 | def _1d_distance(s1, s2): 7 | return pandas.eval("s2-s1") 8 | 9 | 10 | def _haversine_distance(lat1, lng1, lat2, lng2): 11 | # degrees to radians conversion 12 | to_rad = np.deg2rad(1) # noqa 13 | 14 | # numeric expression to use with numexpr package 15 | expr = ( 16 | "2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+" 17 | "cos(lat1*to_rad)*cos(lat2*to_rad)*" 18 | "(sin((lng2*to_rad-lng1*to_rad)/2))**2))" 19 | ) 20 | 21 | return pandas.eval(expr) 22 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/indexing.py: -------------------------------------------------------------------------------- 1 | """Algorithms for indexing.""" 2 | 3 | import numpy as np 4 | 5 | from recordlinkage.measures import full_index_size 6 | 7 | 8 | def _map_tril_1d_on_2d(indices, dims): 9 | """Map 1d indices on lower triangular matrix in 2d.""" 10 | 11 | N = (dims * dims - dims) / 2 12 | 13 | m = np.ceil(np.sqrt(2 * N)) 14 | c = m - 
np.round(np.sqrt(2 * (N - indices))) - 1 15 | r = np.mod(indices + (c + 1) * (c + 2) / 2 - 1, m) + 1 16 | 17 | return np.array([r, c], dtype=np.int64) 18 | 19 | 20 | def random_pairs_with_replacement(n, shape, random_state=None): 21 | """make random record pairs""" 22 | 23 | if not isinstance(random_state, np.random.RandomState): 24 | random_state = np.random.RandomState(random_state) 25 | 26 | n_max = full_index_size(shape) 27 | 28 | if n_max <= 0: 29 | raise ValueError("n_max must be larger than 0") 30 | 31 | # make random pairs 32 | indices = random_state.randint(0, n_max, n, dtype=np.int64) 33 | 34 | if len(shape) == 1: 35 | return _map_tril_1d_on_2d(indices, shape[0]) 36 | else: 37 | return np.array(np.unravel_index(indices, shape)) 38 | 39 | 40 | def random_pairs_without_replacement(n, shape, random_state=None): 41 | """Return record pairs for dense sample. 42 | 43 | Sample random record pairs without replacement bounded by the 44 | maximum number of record pairs (based on shape). This algorithm is 45 | efficient and fast for relative small samples. 46 | """ 47 | 48 | n_max = full_index_size(shape) 49 | 50 | if not isinstance(random_state, np.random.RandomState): 51 | random_state = np.random.RandomState(random_state) 52 | 53 | if not isinstance(n, int) or n <= 0 or n > n_max: 54 | raise ValueError("n must be a integer satisfying 0 n_max: 80 | raise ValueError("n must be a integer satisfying 0= 0.0 61 | assert cv["random"].max() <= 1.0 62 | 63 | 64 | class TestRandomDiscrete: 65 | def test_random_desc_standalone(self): 66 | arr1 = [1, 2, 3, 4, 5] 67 | arr2 = [1, 2, 3, 4, 5] 68 | pairs = pd.MultiIndex.from_product([arr1, arr2]) 69 | 70 | c = RandomDiscrete() 71 | r = c.compute(pairs) 72 | 73 | assert r.shape[0] == len(arr1) * len(arr2) 74 | 75 | def test_random_desc(self): 76 | df_a = pd.DataFrame({"v": list("abcde")}) 77 | df_b = pd.DataFrame({"v": list("abcde")}) 78 | 79 | pairs = Full().index(df_a, df_b) 80 | 81 | c = recordlinkage.Compare() 82 | c.exact("v", "v") 83 | c.add(RandomDiscrete(label="random")) 84 | cv = c.compute(pairs, df_a, df_b) 85 | 86 | assert isinstance(cv, pd.DataFrame) 87 | 88 | assert cv["random"].notnull().all() 89 | assert cv["random"].isin([0, 1]).all() 90 | -------------------------------------------------------------------------------- /recordlinkage/contrib/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Jonathan de Bruin 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 2. Redistributions in binary form must reproduce the above copyright notice, 9 | # this list of conditions and the following disclaimer in the documentation 10 | # and/or other materials provided with the distribution. 11 | # 3. Neither the name of the copyright holder nor the names of its 12 | # contributors may be used to endorse or promote products derived from this 13 | # software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | 27 | 28 | from recordlinkage.contrib.index.neighbourhoodblock.neighbourhoodblock import ( 29 | NeighbourhoodBlock, 30 | ) 31 | 32 | __all__ = ["NeighbourhoodBlock"] 33 | -------------------------------------------------------------------------------- /recordlinkage/contrib/index/neighbourhoodblock/README.rst: -------------------------------------------------------------------------------- 1 | Neighbourhood blocking 2 | ====================== 3 | 4 | Example 5 | ------- 6 | 7 | In the following example, the record pairs are made for two historical 8 | datasets with census data. The datasets are named ``census_data_1980`` 9 | and ``census_data_1990``. The index includes record pairs with matches 10 | in (at least) any 3 out of the 5 nominated fields. Proximity matching is 11 | allowed in the first two fields, and up to one wildcard match of a 12 | missing value is also allowed. 13 | 14 | .. code:: python 15 | 16 | from recordlinkage.contrib.index import NeighbourhoodBlock 17 | 18 | keys = ['first_name', 'surname', 'date_of_birth', 'address', 'ssid'] 19 | windows = [9, 3, 1, 1, 1] 20 | 21 | indexer = NeighbourhoodBlock( 22 | keys, windows=windows, max_nulls=1, max_non_matches=2) 23 | indexer.index(census_data_1980, census_data_1990) 24 | 25 | Authors 26 | ------- 27 | 28 | - Daniel Elias -------------------------------------------------------------------------------- /recordlinkage/contrib/index/neighbourhoodblock/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 2. Redistributions in binary form must reproduce the above copyright notice, 9 | # this list of conditions and the following disclaimer in the documentation 10 | # and/or other materials provided with the distribution. 11 | # 3. Neither the name of the copyright holder nor the names of its 12 | # contributors may be used to endorse or promote products derived from this 13 | # software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /recordlinkage/contrib/index/neighbourhoodblock/test_neighbourhoodblock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from operator import eq 4 | from operator import gt 5 | 6 | import numpy as np 7 | import pytest 8 | 9 | from recordlinkage.contrib.index import NeighbourhoodBlock 10 | from recordlinkage.index import Block 11 | from recordlinkage.index import Full 12 | from recordlinkage.index import SortedNeighbourhood 13 | from tests.test_indexing import TestData 14 | 15 | 16 | class TestNeighbourhoodBlock(TestData): 17 | """General unittest for the NeighbourhoodBlocking indexing class.""" 18 | 19 | @classmethod 20 | def setup_class(cls): 21 | TestData.setup_class() 22 | 23 | def incomplete_df_copy(df, nan_proportion=0.1): 24 | "copy of DataFrame with some cells set to NaN" 25 | nan_count = int(round(len(df) * nan_proportion)) 26 | 27 | def with_nulls(vals): 28 | vals = vals.copy() 29 | vals.iloc[ 30 | np.random.choice(len(df), size=nan_count, replace=False) 31 | ] = np.nan 32 | return vals 33 | 34 | return df.copy() if nan_count <= 0 else df.apply(with_nulls) 35 | 36 | np.random.seed(0) 37 | cls.incomplete_a = incomplete_df_copy(cls.a) 38 | cls.incomplete_b = incomplete_df_copy(cls.b) 39 | 40 | def assert_index_comparisons(self, pairwise_comparison, indexers, *args, **kwargs): 41 | indexes = [ndxr.index(*args, **kwargs) for ndxr in indexers] 42 | for index1, index2 in zip(indexes, indexes[1:]): 43 | pairs1, pairs2 = map(set, [index1, index2]) 44 | assert ( 45 | (len(pairs1) == len(index1)) 46 | and (len(pairs2) == len(index2)) 47 | and pairwise_comparison(pairs1, pairs2) 48 | ) 49 | 50 | def test_dedup_vs_full(self): 51 | indexers = [ 52 | NeighbourhoodBlock(max_non_matches=len(self.a.columns)), 53 | Full(), 54 | ] 55 | self.assert_index_comparisons(eq, indexers, self.a) 56 | 57 | def test_link_vs_full(self): 58 | indexers = [ 59 | NeighbourhoodBlock(max_non_matches=len(self.a.columns)), 60 | Full(), 61 | ] 62 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 63 | 64 | def test_dedup_single_blocking_key_vs_block(self): 65 | indexers = [ 66 | NeighbourhoodBlock("var_block10", max_nulls=1), 67 | NeighbourhoodBlock( 68 | left_on="var_block10", right_on="var_block10", max_nulls=1 69 | ), 70 | Block("var_block10"), 71 | ] 72 | self.assert_index_comparisons(eq, indexers, self.a) 73 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 74 | 75 | def test_link_single_blocking_key_vs_block(self): 76 | indexers = [ 77 | NeighbourhoodBlock("var_arange", max_nulls=1), 78 | NeighbourhoodBlock( 79 | left_on="var_arange", right_on="var_arange", max_nulls=1 80 | ), 81 | Block("var_arange"), 82 | ] 83 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 84 | self.assert_index_comparisons( 85 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 86 | ) 87 
| 88 | def test_dedup_multiple_blocking_keys_vs_block(self): 89 | indexers = [ 90 | NeighbourhoodBlock(["var_single", "var_block10"], max_nulls=1), 91 | NeighbourhoodBlock( 92 | left_on=["var_single", "var_block10"], 93 | right_on=["var_single", "var_block10"], 94 | max_nulls=1, 95 | ), 96 | Block(["var_single", "var_block10"]), 97 | ] 98 | self.assert_index_comparisons(eq, indexers, self.a) 99 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 100 | 101 | def test_link_multiple_blocking_keys_vs_block(self): 102 | indexers = [ 103 | NeighbourhoodBlock(["var_arange", "var_block10"], max_nulls=1), 104 | NeighbourhoodBlock( 105 | left_on=["var_arange", "var_block10"], 106 | right_on=["var_arange", "var_block10"], 107 | max_nulls=1, 108 | ), 109 | Block(["var_arange", "var_block10"]), 110 | ] 111 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 112 | self.assert_index_comparisons( 113 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 114 | ) 115 | 116 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 117 | def test_dedup_single_sorting_key_vs_sortedneighbourhood(self, window): 118 | indexers = [ 119 | NeighbourhoodBlock("var_arange", max_nulls=1, windows=window), 120 | NeighbourhoodBlock( 121 | left_on="var_arange", right_on="var_arange", max_nulls=1, windows=window 122 | ), 123 | SortedNeighbourhood("var_arange", window=window), 124 | ] 125 | self.assert_index_comparisons(eq, indexers, self.a) 126 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 127 | 128 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 129 | def test_link_single_sorting_key_vs_sortedneighbourhood(self, window): 130 | indexers = [ 131 | NeighbourhoodBlock("var_arange", max_nulls=1, windows=window), 132 | NeighbourhoodBlock( 133 | left_on="var_arange", right_on="var_arange", max_nulls=1, windows=window 134 | ), 135 | SortedNeighbourhood("var_arange", window=window), 136 | ] 137 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 138 | self.assert_index_comparisons( 139 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 140 | ) 141 | 142 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 143 | def test_dedup_with_blocking_vs_sortedneighbourhood(self, window): 144 | indexers = [ 145 | NeighbourhoodBlock( 146 | ["var_arange", "var_block10"], max_nulls=1, windows=[window, 1] 147 | ), 148 | NeighbourhoodBlock( 149 | left_on=["var_arange", "var_block10"], 150 | right_on=["var_arange", "var_block10"], 151 | max_nulls=1, 152 | windows=[window, 1], 153 | ), 154 | SortedNeighbourhood("var_arange", block_on="var_block10", window=window), 155 | ] 156 | self.assert_index_comparisons(eq, indexers, self.a) 157 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 158 | 159 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 160 | def test_link_with_blocking_vs_sortedneighbourhood(self, window): 161 | indexers = [ 162 | NeighbourhoodBlock( 163 | ["var_arange", "var_block10"], max_nulls=1, windows=[window, 1] 164 | ), 165 | NeighbourhoodBlock( 166 | left_on=["var_arange", "var_block10"], 167 | right_on=["var_arange", "var_block10"], 168 | max_nulls=1, 169 | windows=[window, 1], 170 | ), 171 | SortedNeighbourhood("var_arange", block_on="var_block10", window=window), 172 | ] 173 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 174 | self.assert_index_comparisons( 175 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 176 | ) 177 | -------------------------------------------------------------------------------- 
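The tests above assert that, on complete data, ``NeighbourhoodBlock`` with a single blocking key yields the same candidate pairs as ``Block``. A minimal sketch of that equivalence on toy data (the ``postcode`` column and its values are invented for illustration; the comparison is on sets because the pair order may differ):

.. code:: python

    import pandas as pd

    from recordlinkage.contrib.index import NeighbourhoodBlock
    from recordlinkage.index import Block

    df_a = pd.DataFrame({"postcode": ["1011", "1011", "2022", "3033"]})
    df_b = pd.DataFrame({"postcode": ["1011", "2022", "2022", "9099"]})

    # exact blocking on one key; max_nulls=1 mirrors the tests above
    nb_pairs = NeighbourhoodBlock("postcode", max_nulls=1).index(df_a, df_b)
    block_pairs = Block("postcode").index(df_a, df_b)

    # same candidate record pairs as plain blocking on complete data
    assert set(nb_pairs) == set(block_pairs)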
/recordlinkage/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from recordlinkage.datasets.external import clear_data_home 2 | from recordlinkage.datasets.external import get_data_home 3 | from recordlinkage.datasets.external import load_krebsregister 4 | from recordlinkage.datasets.febrl import load_febrl1 5 | from recordlinkage.datasets.febrl import load_febrl2 6 | from recordlinkage.datasets.febrl import load_febrl3 7 | from recordlinkage.datasets.febrl import load_febrl4 8 | from recordlinkage.datasets.generate import binary_vectors 9 | 10 | __all__ = [ 11 | "clear_data_home", 12 | "get_data_home", 13 | "load_krebsregister", 14 | "load_febrl1", 15 | "load_febrl2", 16 | "load_febrl3", 17 | "load_febrl4", 18 | "binary_vectors", 19 | ] 20 | -------------------------------------------------------------------------------- /recordlinkage/datasets/external.py: -------------------------------------------------------------------------------- 1 | # The function get_data_home() and clear_data_home() are based on 2 | # SciKit-Learn https://git.io/fjT70. See the 3-clause BSD license. 3 | 4 | import shutil 5 | import zipfile 6 | from io import BytesIO 7 | from os import environ 8 | from pathlib import Path 9 | from urllib.request import urlopen 10 | 11 | import pandas 12 | 13 | 14 | def get_data_home(data_home=None): 15 | """Return the path of the Record Linkage data folder. 16 | 17 | This folder is used by some large dataset loaders to avoid 18 | downloading the data several times. By default the data dir 19 | is set to a folder named 'rl_data' in the user 20 | home folder. 21 | Alternatively, it can be set by the 'RL_DATA' environment 22 | variable or programmatically by giving an explicit folder 23 | path. The '~' symbol is expanded to the user home folder. 24 | 25 | If the folder does not already exist, it is automatically 26 | created. 27 | 28 | Parameters 29 | ---------- 30 | data_home : str | None 31 | The path to recordlinkage data folder. 32 | """ 33 | if data_home is None: 34 | data_home = environ.get("RL_DATA", Path("~", "rl_data")) 35 | data_home = Path(data_home).expanduser() 36 | 37 | if not data_home.exists(): 38 | data_home.mkdir(parents=True, exist_ok=True) 39 | 40 | return data_home 41 | 42 | 43 | def clear_data_home(data_home=None): 44 | """Delete all the content of the data home cache. 45 | 46 | Parameters 47 | ---------- 48 | data_home : str | None 49 | The path to recordlinkage data folder. 50 | """ 51 | data_home = get_data_home(data_home) 52 | shutil.rmtree(str(data_home)) 53 | 54 | 55 | def load_krebsregister(block=None, missing_values=None, shuffle=True): 56 | """Load the Krebsregister dataset. 57 | 58 | This dataset of comparison patterns was obtained in a 59 | epidemiological cancer study in Germany. The comparison patterns 60 | were created by the Institute for Medical Biostatistics, 61 | Epidemiology and Informatics (IMBEI) and the University Medical 62 | Center of Johannes Gutenberg University (Mainz, Germany). The 63 | dataset is available for research online. 64 | 65 | "The records represent individual data including first and 66 | family name, sex, date of birth and postal code, which were 67 | collected through iterative insertions in the course of 68 | several years. The comparison patterns in this data set are 69 | based on a sample of 100.000 records dating from 2005 to 2008. 
70 | Data pairs were classified as "match" or "non-match" during 71 | an extensive manual review where several documentarists were 72 | involved. The resulting classification formed the basis for 73 | assessing the quality of the registry's own record linkage 74 | procedure. 75 | 76 | In order to limit the amount of patterns a blocking procedure 77 | was applied, which selects only record pairs that meet 78 | specific agreement conditions. The results of the following 79 | six blocking iterations were merged together: 80 | 81 | - Phonetic equality of first name and family name, equality of 82 | date of birth. 83 | - Phonetic equality of first name, equality of day of birth. 84 | - Phonetic equality of first name, equality of month of birth. 85 | - Phonetic equality of first name, equality of year of birth. 86 | - Equality of complete date of birth. 87 | - Phonetic equality of family name, equality of sex. 88 | 89 | This procedure resulted in 5.749.132 record pairs, of which 90 | 20.931 are matches. The data set is split into 10 blocks of 91 | (approximately) equal size and ratio of matches to 92 | non-matches." 93 | 94 | Parameters 95 | ---------- 96 | block : int, list 97 | An integer or a list with integers between 1 and 10. The 98 | blocks are the blocks explained in the description. Default 99 | all 1 to 10. 100 | missing_values : object, int, float 101 | The value of the missing values. Default NaN. 102 | shuffle : bool 103 | Shuffle the record pairs. Default True. 104 | 105 | Returns 106 | ------- 107 | (pandas.DataFrame, pandas.MultiIndex) 108 | A pandas.DataFrame with comparison vectors and a 109 | pandas.MultiIndex with the indices of the matches. 110 | 111 | """ 112 | 113 | if block is None: 114 | block = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 115 | 116 | # If the data is not found, download it. 117 | for i in range(1, 11): 118 | filepath = Path(get_data_home(), "krebsregister", f"block_{i}.zip") 119 | 120 | if not filepath.is_file(): 121 | _download_krebsregister() 122 | break 123 | 124 | if isinstance(block, (list, tuple)): 125 | data = pandas.concat([_krebsregister_block(bl) for bl in block]) 126 | else: 127 | data = _krebsregister_block(block) 128 | 129 | if shuffle: 130 | data = data.sample(frac=1, random_state=535) 131 | 132 | match_index = data.index[data["is_match"]] 133 | del data["is_match"] 134 | 135 | if pandas.notnull(missing_values): 136 | data.fillna(missing_values, inplace=True) 137 | 138 | return data, match_index 139 | 140 | 141 | def _download_krebsregister(): 142 | zip_file_url = ( 143 | "http://archive.ics.uci.edu/ml/" "machine-learning-databases/00210/donation.zip" 144 | ) 145 | 146 | folder = Path(get_data_home(), "krebsregister") 147 | 148 | try: 149 | print(f"Downloading data to {folder}.") 150 | r = urlopen(zip_file_url).read() 151 | 152 | # unzip the content and put it in the krebsregister folder 153 | z = zipfile.ZipFile(BytesIO(r)) 154 | z.extractall(str(folder)) 155 | 156 | print("Data download succesfull.") 157 | 158 | except Exception as e: 159 | print("Issue with downloading the data:", e) 160 | 161 | 162 | def _krebsregister_block(block): 163 | if block not in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: 164 | raise ValueError( 165 | "Argument 'block' has to be integer in " 166 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] or list of integers." 
167 | ) 168 | 169 | fp_i = Path(get_data_home(), "krebsregister", f"block_{block}.zip") 170 | 171 | data_block = pandas.read_csv( 172 | fp_i, index_col=["id_1", "id_2"], na_values="?", compression="zip" 173 | ) 174 | 175 | data_block.columns = [ 176 | "cmp_firstname1", 177 | "cmp_firstname2", 178 | "cmp_lastname1", 179 | "cmp_lastname2", 180 | "cmp_sex", 181 | "cmp_birthday", 182 | "cmp_birthmonth", 183 | "cmp_birthyear", 184 | "cmp_zipcode", 185 | "is_match", 186 | ] 187 | data_block.index.names = ["id1", "id2"] 188 | 189 | return data_block 190 | -------------------------------------------------------------------------------- /recordlinkage/datasets/febrl.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy 4 | import pandas 5 | 6 | 7 | def _febrl_load_data(filename): 8 | # Internal function for loading febrl data 9 | 10 | filepath = Path(Path(__file__).parent, "febrl", filename) 11 | 12 | febrl_data = pandas.read_csv( 13 | filepath, 14 | index_col="rec_id", 15 | sep=",", 16 | engine="c", 17 | skipinitialspace=True, 18 | encoding="utf-8", 19 | dtype={ 20 | "street_number": object, 21 | "date_of_birth": object, 22 | "soc_sec_id": object, 23 | "postcode": object, 24 | }, 25 | ) 26 | 27 | return febrl_data 28 | 29 | 30 | def _febrl_links(df): 31 | """Get the links of a FEBRL dataset.""" 32 | 33 | index = df.index.to_series() 34 | keys = index.str.extract(r"rec-(\d+)", expand=True)[0] 35 | 36 | index_int = numpy.arange(len(df)) 37 | 38 | df_helper = pandas.DataFrame({"key": keys, "index": index_int}) 39 | 40 | # merge the two frame and make MultiIndex. 41 | pairs_df = df_helper.merge(df_helper, on="key")[["index_x", "index_y"]] 42 | pairs_df = pairs_df[pairs_df["index_x"] > pairs_df["index_y"]] 43 | 44 | return pandas.MultiIndex( 45 | levels=[df.index.values, df.index.values], 46 | codes=[pairs_df["index_x"].values, pairs_df["index_y"].values], 47 | names=[None, None], 48 | verify_integrity=False, 49 | ) 50 | 51 | 52 | def load_febrl1(return_links=False): 53 | """Load the FEBRL 1 dataset. 54 | 55 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 56 | distributed with a dataset generator and four datasets generated 57 | with the generator. This function returns the first Febrl dataset 58 | as a :class:`pandas.DataFrame`. 59 | 60 | *"This data set contains 1000 records (500 original and 61 | 500 duplicates, with exactly one duplicate per original 62 | record."* 63 | 64 | Parameters 65 | ---------- 66 | return_links: bool 67 | When True, the function returns also the true links. 68 | 69 | Returns 70 | ------- 71 | pandas.DataFrame 72 | A :class:`pandas.DataFrame` with Febrl dataset1.csv. When 73 | return_links is True, the function returns also the true 74 | links. The true links are all links in the lower triangular 75 | part of the matrix. 76 | 77 | """ 78 | 79 | df = _febrl_load_data("dataset1.csv") 80 | 81 | if return_links: 82 | links = _febrl_links(df) 83 | return df, links 84 | else: 85 | return df 86 | 87 | 88 | def load_febrl2(return_links=False): 89 | """Load the FEBRL 2 dataset. 90 | 91 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 92 | distributed with a dataset generator and four datasets generated 93 | with the generator. This function returns the second Febrl dataset 94 | as a :class:`pandas.DataFrame`. 
95 | 96 | *"This data set contains 5000 records (4000 originals and 97 | 1000 duplicates), with a maximum of 5 duplicates based on 98 | one original record (and a poisson distribution of 99 | duplicate records). Distribution of duplicates: 100 | 19 originals records have 5 duplicate records 101 | 47 originals records have 4 duplicate records 102 | 107 originals records have 3 duplicate records 103 | 141 originals records have 2 duplicate records 104 | 114 originals records have 1 duplicate record 105 | 572 originals records have no duplicate record"* 106 | 107 | Parameters 108 | ---------- 109 | return_links: bool 110 | When True, the function returns also the true links. 111 | 112 | Returns 113 | ------- 114 | pandas.DataFrame 115 | A :class:`pandas.DataFrame` with Febrl dataset2.csv. When 116 | return_links is True, the function returns also the true 117 | links. The true links are all links in the lower triangular 118 | part of the matrix. 119 | 120 | """ 121 | 122 | df = _febrl_load_data("dataset2.csv") 123 | 124 | if return_links: 125 | links = _febrl_links(df) 126 | return df, links 127 | else: 128 | return df 129 | 130 | 131 | def load_febrl3(return_links=False): 132 | """Load the FEBRL 3 dataset. 133 | 134 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 135 | distributed with a dataset generator and four datasets generated 136 | with the generator. This function returns the third Febrl dataset 137 | as a :class:`pandas.DataFrame`. 138 | 139 | *"This data set contains 5000 records (2000 originals and 140 | 3000 duplicates), with a maximum of 5 duplicates based on 141 | one original record (and a Zipf distribution of duplicate 142 | records). Distribution of duplicates: 143 | 168 originals records have 5 duplicate records 144 | 161 originals records have 4 duplicate records 145 | 212 originals records have 3 duplicate records 146 | 256 originals records have 2 duplicate records 147 | 368 originals records have 1 duplicate record 148 | 1835 originals records have no duplicate record"* 149 | 150 | Parameters 151 | ---------- 152 | return_links: bool 153 | When True, the function returns also the true links. 154 | 155 | Returns 156 | ------- 157 | pandas.DataFrame 158 | A :class:`pandas.DataFrame` with Febrl dataset3.csv. When 159 | return_links is True, the function returns also the true 160 | links. The true links are all links in the lower triangular 161 | part of the matrix. 162 | 163 | """ 164 | 165 | df = _febrl_load_data("dataset3.csv") 166 | 167 | if return_links: 168 | links = _febrl_links(df) 169 | return df, links 170 | else: 171 | return df 172 | 173 | 174 | def load_febrl4(return_links=False): 175 | """Load the FEBRL 4 datasets. 176 | 177 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 178 | distributed with a dataset generator and four datasets generated 179 | with the generator. This function returns the fourth Febrl dataset 180 | as a :class:`pandas.DataFrame`. 181 | 182 | *"Generated as one data set with 10000 records (5000 183 | originals and 5000 duplicates, with one duplicate per 184 | original), the originals have been split from the 185 | duplicates, into dataset4a.csv (containing the 5000 186 | original records) and dataset4b.csv (containing the 187 | 5000 duplicate records) These two data sets can be 188 | used for testing linkage procedures."* 189 | 190 | Parameters 191 | ---------- 192 | return_links: bool 193 | When True, the function returns also the true links. 
194 | 195 | Returns 196 | ------- 197 | (pandas.DataFrame, pandas.DataFrame) 198 | A :class:`pandas.DataFrame` with Febrl dataset4a.csv and a pandas 199 | dataframe with Febrl dataset4b.csv. When return_links is True, 200 | the function returns also the true links. 201 | 202 | """ 203 | 204 | df_a = _febrl_load_data("dataset4a.csv") 205 | df_b = _febrl_load_data("dataset4b.csv") 206 | 207 | if return_links: 208 | links = pandas.MultiIndex.from_arrays( 209 | [ 210 | [f"rec-{i}-org" for i in range(0, 5000)], 211 | [f"rec-{i}-dup-0" for i in range(0, 5000)], 212 | ] 213 | ) 214 | return df_a, df_b, links 215 | else: 216 | return df_a, df_b 217 | -------------------------------------------------------------------------------- /recordlinkage/datasets/generate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def binary_vectors( 6 | n, 7 | n_match, 8 | m=[0.9] * 8, 9 | u=[0.1] * 8, 10 | random_state=None, 11 | return_links=False, 12 | dtype=np.int8, 13 | ): 14 | """Generate random binary comparison vectors. 15 | 16 | This function is used to generate random comparison vectors. The 17 | result of each comparison is a binary value (0 or 1). 18 | 19 | Parameters 20 | ---------- 21 | n : int 22 | The total number of comparison vectors. 23 | n_match : int 24 | The number of matching record pairs. 25 | m : list, default [0.9] * 8, optional 26 | A list of m probabilities of each partially identifying 27 | variable. The m probability is the probability that an 28 | identifier in matching record pairs agrees. 29 | u : list, default [0.9] * 8, optional 30 | A list of u probabilities of each partially identifying 31 | variable. The u probability is the probability that an 32 | identifier in non-matching record pairs agrees. 33 | random_state : int or numpy.random.RandomState, optional 34 | Seed for the random number generator with an integer or numpy 35 | RandomState object. 36 | return_links: bool 37 | When True, the function returns also the true links. 38 | dtype: numpy.dtype 39 | The dtype of each column in the returned DataFrame. 40 | 41 | Returns 42 | ------- 43 | pandas.DataFrame 44 | A dataframe with comparison vectors. 
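A minimal usage sketch (values chosen arbitrarily; with the default `m` and `u` the result has 8 comparison columns, and `return_links=True` also returns the index of the simulated matches):

    >>> from recordlinkage.datasets import binary_vectors
    >>> X, links = binary_vectors(1000, 100, random_state=42,
    ...                           return_links=True)
    >>> X.shape
    (1000, 8)
    >>> len(links)
    100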
45 | 46 | 47 | """ 48 | 49 | if len(m) != len(u): 50 | raise ValueError("the length of 'm' is not equal the length of 'u'") 51 | 52 | if n_match >= n or n_match < 0: 53 | raise ValueError("the number of matches is bounded by [0, n]") 54 | 55 | # set the random seed 56 | np.random.seed(random_state) 57 | 58 | matches = [] 59 | nonmatches = [] 60 | 61 | sample_set = np.array([0, 1], dtype=dtype) 62 | 63 | for i, _ in enumerate(m): 64 | p_mi = [1 - m[i], m[i]] 65 | p_ui = [1 - u[i], u[i]] 66 | 67 | comp_mi = np.random.choice(sample_set, (n_match, 1), p=p_mi) 68 | comp_ui = np.random.choice(sample_set, (n - n_match, 1), p=p_ui) 69 | 70 | nonmatches.append(comp_ui) 71 | matches.append(comp_mi) 72 | 73 | match_block = np.concatenate(matches, axis=1) 74 | nonmatch_block = np.concatenate(nonmatches, axis=1) 75 | 76 | data_np = np.concatenate((match_block, nonmatch_block), axis=0) 77 | index_np = np.random.randint(1001, 1001 + n * 2, (n, 2)) 78 | 79 | data_col_names = ["c_%s" % (i + 1) for i in range(len(m))] 80 | data_mi = pd.MultiIndex.from_arrays([index_np[:, 0], index_np[:, 1]]) 81 | data_df = pd.DataFrame(data_np, index=data_mi, columns=data_col_names) 82 | 83 | features = data_df.sample(frac=1, random_state=random_state) 84 | 85 | if return_links: 86 | links = data_mi[:n_match] 87 | return features, links 88 | else: 89 | return features 90 | -------------------------------------------------------------------------------- /recordlinkage/deprecated.py: -------------------------------------------------------------------------------- 1 | """Home of all deprecated functions and classes.""" 2 | -------------------------------------------------------------------------------- /recordlinkage/network.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from recordlinkage.types import is_pandas_2d_multiindex 4 | from recordlinkage.types import is_pandas_multiindex 5 | 6 | 7 | class OneToOneLinking: 8 | """[EXPERIMENTAL] One-to-one linking 9 | 10 | A record from dataset A can match at most one record from dataset 11 | B. For example, (a1, a2) are records from A and (b1, b2) are records 12 | from B. A linkage of (a1, b1), (a1, b2), (a2, b1), (a2, b2) is not 13 | one-to-one connected. One of the results of one-to-one linking can 14 | be (a1, b1), (a2, b2). 15 | 16 | Parameters 17 | ---------- 18 | method : str 19 | The method to solve the problem. Only 'greedy' is supported at 20 | the moment. 21 | 22 | Note 23 | ---- 24 | 25 | This class is experimental and might change in future versions. 
26 | 27 | """ 28 | 29 | def __init__(self, method="greedy"): 30 | super().__init__() 31 | 32 | self.method = method 33 | 34 | @classmethod 35 | def _bool_duplicated(cls, links, level): 36 | return links.get_level_values(level).duplicated() 37 | 38 | def _compute_greedy(self, links): 39 | result = [] 40 | set_a = set() 41 | set_b = set() 42 | 43 | for index_a, index_b in links: 44 | if index_a not in set_a and index_b not in set_b: 45 | result.append((index_a, index_b)) 46 | set_a.add(index_a) 47 | set_b.add(index_b) 48 | 49 | return pd.MultiIndex.from_tuples(result) 50 | 51 | def _compute(self, links): 52 | if not is_pandas_2d_multiindex(links): 53 | if not is_pandas_multiindex(links): 54 | raise TypeError("expected pandas.MultiIndex") 55 | elif not is_pandas_2d_multiindex(links): 56 | raise ValueError( 57 | "pandas.MultiIndex has incorrect number of " 58 | "levels (expected 2 levels)" 59 | ) 60 | 61 | if self.method == "greedy": 62 | return self._compute_greedy(links) 63 | else: 64 | raise ValueError(f"unknown matching method {self.method}") 65 | 66 | def compute(self, links): 67 | """Compute the one-to-one linking. 68 | 69 | Parameters 70 | ---------- 71 | links : pandas.MultiIndex 72 | The pairs to apply linking to. 73 | 74 | Returns 75 | ------- 76 | pandas.MultiIndex 77 | A one-to-one matched MultiIndex of record pairs. 78 | 79 | """ 80 | 81 | return self._compute(links) 82 | 83 | 84 | class OneToManyLinking(OneToOneLinking): 85 | """[EXPERIMENTAL] One-to-many linking 86 | 87 | A record from dataset A can link multiple records from dataset B, 88 | but a record from B can link to only one record of dataset A. Use 89 | the `level` argument to switch A and B. 90 | 91 | Parameters 92 | ---------- 93 | level : int 94 | The level of the MultiIndex to have the one relations. The 95 | options are 0 or 1 (incication the level of the MultiIndex). 96 | Default 0. 97 | method : str 98 | The method to solve the problem. Only 'greedy' is supported at 99 | the moment. 100 | 101 | Example 102 | ------- 103 | 104 | Consider a MultiIndex with record pairs constructed from datasets A 105 | and B. To link a record from B to at most one record of B, use the 106 | following syntax: 107 | 108 | > one_to_many = OneToManyLinking(0) 109 | > one_to_many.compute(links) 110 | 111 | To link a record from B to at most one record 112 | of B, use: 113 | 114 | > one_to_many = OneToManyLinking(1) 115 | > one_to_many.compute(links) 116 | 117 | Note 118 | ---- 119 | 120 | This class is experimental and might change in future versions. 121 | 122 | """ 123 | 124 | def __init__(self, level=0, method="greedy"): 125 | super().__init__(method=method) 126 | 127 | self.level = level 128 | 129 | def _compute_greedy(self, links): 130 | source_dupl_bool = self._bool_duplicated(links, self.level) 131 | return links[~source_dupl_bool] 132 | 133 | def compute(self, links): 134 | """Compute the one-to-many matching. 135 | 136 | Parameters 137 | ---------- 138 | links : pandas.MultiIndex 139 | The pairs to apply linking to. 140 | 141 | Returns 142 | ------- 143 | pandas.MultiIndex 144 | A one-to-many matched MultiIndex of record pairs. 145 | 146 | """ 147 | 148 | return self._compute(links) 149 | 150 | 151 | class ConnectedComponents: 152 | """[EXPERIMENTAL] Connected record pairs 153 | 154 | This class identifies connected record pairs. Connected components 155 | are especially used in detecting duplicates in a single dataset. 156 | 157 | Note 158 | ---- 159 | 160 | This class is experimental and might change in future versions. 
161 | """ 162 | 163 | def __init__(self): 164 | super().__init__() 165 | 166 | def compute(self, links): 167 | """Return the connected components. 168 | 169 | Parameters 170 | ---------- 171 | links : pandas.MultiIndex 172 | The links to apply one-to-one matching on. 173 | 174 | Returns 175 | ------- 176 | list of pandas.MultiIndex 177 | A list with pandas.MultiIndex objects. Each MultiIndex 178 | object represents a set of connected record pairs. 179 | 180 | """ 181 | 182 | try: 183 | import networkx as nx 184 | except ImportError as err: 185 | raise Exception("'networkx' module is needed for this operation") from err 186 | 187 | graph_pairs = nx.Graph() 188 | graph_pairs.add_edges_from(links.values) 189 | connected_pairs = ( 190 | graph_pairs.subgraph(c).copy() for c in nx.connected_components(graph_pairs) 191 | ) 192 | 193 | links_result = [ 194 | pd.MultiIndex.from_tuples(subgraph.edges()) for subgraph in connected_pairs 195 | ] 196 | 197 | return links_result 198 | -------------------------------------------------------------------------------- /recordlinkage/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from recordlinkage.preprocessing.cleaning import clean 2 | from recordlinkage.preprocessing.cleaning import phonenumbers 3 | from recordlinkage.preprocessing.cleaning import value_occurence 4 | from recordlinkage.preprocessing.encoding import _list_phonetic_algorithms 5 | from recordlinkage.preprocessing.encoding import phonetic 6 | 7 | phonetic_algorithms = _list_phonetic_algorithms() 8 | """List of available phonetic algorithms.""" 9 | 10 | __all__ = [ 11 | "phonetic_algorithms", 12 | "clean", 13 | "phonetic", 14 | "value_occurence", 15 | "phonenumbers", 16 | ] 17 | -------------------------------------------------------------------------------- /recordlinkage/preprocessing/cleaning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from sklearn.feature_extraction.text import strip_accents_ascii 4 | from sklearn.feature_extraction.text import strip_accents_unicode 5 | 6 | 7 | def clean( 8 | s, 9 | lowercase=True, 10 | replace_by_none=r"[^ \-\_A-Za-z0-9]+", 11 | replace_by_whitespace=r"[\-\_]", 12 | strip_accents=None, 13 | remove_brackets=True, 14 | encoding="utf-8", 15 | decode_error="strict", 16 | ): 17 | """Clean string variables. 18 | 19 | Clean strings in the Series by removing unwanted tokens, 20 | whitespace and brackets. 21 | 22 | Parameters 23 | ---------- 24 | s : pandas.Series 25 | A Series to clean. 26 | lower : bool, optional 27 | Convert strings in the Series to lowercase. Default True. 28 | replace_by_none : str, optional 29 | The matches of this regular expression are replaced by ''. 30 | replace_by_whitespace : str, optional 31 | The matches of this regular expression are replaced by a 32 | whitespace. 33 | remove_brackets : bool, optional 34 | Remove all content between brackets and the bracket 35 | themselves. Default True. 36 | strip_accents : {'ascii', 'unicode', None}, optional 37 | Remove accents during the preprocessing step. 'ascii' is a 38 | fast method that only works on characters that have an direct 39 | ASCII mapping. 'unicode' is a slightly slower method that 40 | works on any characters. None (default) does nothing. 41 | encoding : str, optional 42 | If bytes are given, this encoding is used to decode. Default 43 | is 'utf-8'. 
44 | decode_error : {'strict', 'ignore', 'replace'}, optional 45 | Instruction on what to do if a byte Series is given that 46 | contains characters not of the given `encoding`. By default, 47 | it is 'strict', meaning that a UnicodeDecodeError will be 48 | raised. Other values are 'ignore' and 'replace'. 49 | 50 | Example 51 | ------- 52 | >>> import pandas 53 | >>> from recordlinkage.preprocessing import clean 54 | >>> 55 | >>> names = ['Mary-ann', 56 | 'Bob :)', 57 | 'Angel', 58 | 'Bob (alias Billy)', 59 | None] 60 | >>> s = pandas.Series(names) 61 | >>> print(clean(s)) 62 | 0 mary ann 63 | 1 bob 64 | 2 angel 65 | 3 bob 66 | 4 NaN 67 | dtype: object 68 | 69 | Returns 70 | ------- 71 | pandas.Series: 72 | A cleaned Series of strings. 73 | 74 | """ 75 | 76 | if s.shape[0] == 0: 77 | return s 78 | 79 | # Lower s if lower is True 80 | if lowercase is True: 81 | s = s.str.lower() 82 | 83 | # Accent stripping based on https://github.com/scikit-learn/ 84 | # scikit-learn/blob/412996f/sklearn/feature_extraction/text.py 85 | # BSD license 86 | if not strip_accents: 87 | pass 88 | elif callable(strip_accents): 89 | strip_accents_fn = strip_accents 90 | elif strip_accents == "ascii": 91 | strip_accents_fn = strip_accents_ascii 92 | elif strip_accents == "unicode": 93 | strip_accents_fn = strip_accents_unicode 94 | else: 95 | raise ValueError(f"Invalid value for 'strip_accents': {strip_accents}") 96 | 97 | # Remove accents etc 98 | if strip_accents: 99 | 100 | def strip_accents_fn_wrapper(x): 101 | if sys.version_info[0] >= 3: 102 | if isinstance(x, str): 103 | return strip_accents_fn(x) 104 | else: 105 | return x 106 | else: 107 | if isinstance(x, unicode): # noqa 108 | return strip_accents_fn(x) 109 | else: 110 | return x 111 | 112 | # encoding 113 | s = s.apply( 114 | lambda x: x.decode(encoding, decode_error) if type(x) == bytes else x 115 | ) 116 | s = s.map(lambda x: strip_accents_fn_wrapper(x)) 117 | 118 | # Remove all content between brackets 119 | if remove_brackets is True: 120 | s = s.str.replace(r"(\[.*?\]|\(.*?\)|\{.*?\})", "", regex=True) 121 | 122 | # Remove the special characters 123 | if replace_by_none: 124 | s = s.str.replace(replace_by_none, "", regex=True) 125 | 126 | if replace_by_whitespace: 127 | s = s.str.replace(replace_by_whitespace, " ", regex=True) 128 | 129 | # Remove multiple whitespaces 130 | s = s.str.replace(r"\s\s+", " ", regex=True) 131 | 132 | # Strip s 133 | s = s.str.lstrip().str.rstrip() 134 | 135 | return s 136 | 137 | 138 | def phonenumbers(s): 139 | """Clean phonenumbers by removing all non-numbers (except +). 140 | 141 | Parameters 142 | ---------- 143 | s: pandas.Series 144 | A Series to clean. 145 | 146 | Returns 147 | ------- 148 | pandas.Series 149 | A Series with cleaned phonenumbers. 150 | 151 | """ 152 | 153 | # Remove all special tokens 154 | s = s.astype(object).str.replace("[^0-9+]+", "", regex=True) 155 | 156 | return s 157 | 158 | 159 | def value_occurence(s): 160 | """Count the number of times each value occurs. 161 | 162 | This function returns the counts for each row, in contrast with 163 | `pandas.value_counts `_. 165 | 166 | Returns 167 | ------- 168 | pandas.Series 169 | A Series with value counts. 
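A small illustration (the counts are returned per row, aligned with the input):

    >>> import pandas as pd
    >>> from recordlinkage.preprocessing import value_occurence
    >>> value_occurence(pd.Series(["amsterdam", "rotterdam", "amsterdam"])).tolist()
    [2, 1, 2]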
170 | 171 | """ 172 | 173 | # https://github.com/pydata/pandas/issues/3729 174 | value_count = s.fillna("NAN") 175 | 176 | return value_count.groupby(by=value_count).transform("count") 177 | -------------------------------------------------------------------------------- /recordlinkage/preprocessing/encoding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import jellyfish 4 | import numpy as np 5 | import pandas 6 | 7 | _phonetic_algorithms = [ 8 | {"name": "Soundex", "callback": jellyfish.soundex, "argument_names": ["soundex"]}, 9 | { 10 | "name": "NYSIIS", 11 | "callback": jellyfish.nysiis, 12 | "argument_names": ["nysiis", "nyssis"], 13 | }, 14 | { 15 | "name": "Metaphone", 16 | "callback": jellyfish.metaphone, 17 | "argument_names": ["metaphone"], 18 | }, 19 | { 20 | "name": "Match Rating", 21 | "callback": jellyfish.match_rating_codex, 22 | "argument_names": [ 23 | "match_rating", 24 | "match rating", 25 | "matchrating", 26 | "match_rating_codex", 27 | "matchratingcodex", 28 | ], 29 | }, 30 | ] 31 | 32 | 33 | def _list_phonetic_algorithms(): 34 | """Return list of available phonetic algorithms.""" 35 | 36 | return [alg["argument_names"][0] for alg in _phonetic_algorithms] 37 | 38 | 39 | def phonetic(s, method, concat=True, encoding="utf-8", decode_error="strict"): 40 | """Convert names or strings into phonetic codes. 41 | 42 | The implemented algorithms are `soundex 43 | `_, `nysiis 44 | `_, `metaphone 46 | `_ or `match_rating 47 | `_. 48 | 49 | Parameters 50 | ---------- 51 | s : pandas.Series 52 | A pandas.Series with string values (often names) to encode. 53 | method: str 54 | The algorithm that is used to phonetically encode the values. 55 | The possible options are "soundex", "nysiis", "metaphone" or 56 | "match_rating". 57 | concat: bool, optional 58 | Remove whitespace before phonetic encoding. 59 | encoding: str, optional 60 | If bytes are given, this encoding is used to decode. Default 61 | is 'utf-8'. 62 | decode_error: {'strict', 'ignore', 'replace'}, optional 63 | Instruction on what to do if a byte Series is given that 64 | contains characters not of the given `encoding`. By default, 65 | it is 'strict', meaning that a UnicodeDecodeError will be 66 | raised. Other values are 'ignore' and 'replace'. 67 | 68 | Returns 69 | ------- 70 | pandas.Series 71 | A Series with phonetic encoded values. 72 | 73 | """ 74 | 75 | # encoding 76 | if sys.version_info[0] == 2: 77 | s = s.apply( 78 | lambda x: x.decode(encoding, decode_error) if type(x) == bytes else x 79 | ) 80 | 81 | if concat: 82 | s = s.str.replace(r"[\-\_\s]", "", regex=True) 83 | 84 | for alg in _phonetic_algorithms: 85 | if method in alg["argument_names"]: 86 | phonetic_callback = alg["callback"] 87 | break 88 | else: 89 | raise ValueError(f"The algorithm '{method}' is not known.") 90 | 91 | return s.str.upper().apply( 92 | lambda x: phonetic_callback(x) if pandas.notnull(x) else np.nan 93 | ) 94 | -------------------------------------------------------------------------------- /recordlinkage/rl_logging.py: -------------------------------------------------------------------------------- 1 | """Logging utilities.""" 2 | 3 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================ 17 | # 18 | # Modifications copyright Jonathan de Bruin 2017 19 | 20 | # pylint: disable=unused-import 21 | 22 | import logging as _logging 23 | import sys as _sys 24 | from logging import DEBUG # noqa 25 | from logging import ERROR # noqa 26 | from logging import FATAL # noqa 27 | from logging import INFO # noqa 28 | from logging import WARN # noqa 29 | 30 | # Determine whether we are in an interactive environment 31 | _interactive = False 32 | try: 33 | # This is only defined in interactive shells 34 | if _sys.ps1: 35 | _interactive = True 36 | except AttributeError: 37 | # Even now, we may be in an interactive shell with `python -i`. 38 | _interactive = _sys.flags.interactive 39 | 40 | # Scope the tensorflow logger to not conflict with users' loggers 41 | _logger = _logging.getLogger("recordlinkage") 42 | 43 | # If we are in an interactive environment (like jupyter), set loglevel to info 44 | # and pipe the output to stdout 45 | if _interactive: 46 | _logger.setLevel(WARN) 47 | _logging_target = _sys.stdout 48 | else: 49 | _logging_target = _sys.stderr 50 | 51 | # Add the output handler 52 | _handler = _logging.StreamHandler(_logging_target) 53 | _handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT, None)) 54 | _logger.addHandler(_handler) 55 | 56 | log = _logger.log 57 | debug = _logger.debug 58 | error = _logger.error 59 | fatal = _logger.fatal 60 | info = _logger.info 61 | warning = _logger.warning 62 | 63 | 64 | def get_verbosity(): 65 | """Return how much logging output will be produced.""" 66 | return _logger.getEffectiveLevel() 67 | 68 | 69 | def set_verbosity(verbosity): 70 | """Sets the threshold for what messages will be logged.""" 71 | _logger.setLevel(verbosity) 72 | -------------------------------------------------------------------------------- /recordlinkage/standardise/__init__.py: -------------------------------------------------------------------------------- 1 | # This module is renamed into preprocessing. Please use the preprocessing 2 | # module instead of this module. 
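# For example (illustrative), recordlinkage.standardise.clean(series) still
# works but simply forwards to recordlinkage.preprocessing.clean(series) and
# emits a DeprecationWarning via the thin wrapper functions defined below.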
3 | 4 | import warnings 5 | 6 | from recordlinkage.preprocessing import clean as _clean 7 | from recordlinkage.preprocessing import phonenumbers as _phonenumbers 8 | from recordlinkage.preprocessing import phonetic as _phonetic 9 | from recordlinkage.preprocessing import value_occurence as _value_occurence 10 | 11 | 12 | def _depr_warn(): 13 | warnings.warn( 14 | "module recordlinkage.standardise is deprecated, use " 15 | "recordlinkage.preprocessing instead", 16 | DeprecationWarning, 17 | stacklevel=2, 18 | ) 19 | 20 | 21 | def clean(*args, **kwargs): 22 | _depr_warn() 23 | 24 | return _clean(*args, **kwargs) 25 | 26 | 27 | def phonenumbers(*args, **kwargs): 28 | _depr_warn() 29 | 30 | return _phonenumbers(*args, **kwargs) 31 | 32 | 33 | def value_occurence(*args, **kwargs): 34 | _depr_warn() 35 | 36 | return _value_occurence(*args, **kwargs) 37 | 38 | 39 | def phonetic(*args, **kwargs): 40 | _depr_warn() 41 | 42 | return _phonetic(*args, **kwargs) 43 | -------------------------------------------------------------------------------- /recordlinkage/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | basic inference routines 3 | 4 | most functions taken from pandas (https://github.com/pandas-dev/pandas) 5 | License BSD 6 | 7 | """ 8 | 9 | import collections 10 | import re 11 | from numbers import Number 12 | 13 | import numpy 14 | import pandas 15 | 16 | string_and_binary_types = (str, bytes) 17 | 18 | 19 | def is_number(obj): 20 | return isinstance(obj, (Number, numpy.number)) 21 | 22 | 23 | def is_string_like(obj): 24 | return isinstance(obj, str) 25 | 26 | 27 | def _iterable_not_string(x): 28 | return isinstance(x, collections.Iterable) and not isinstance(x, str) 29 | 30 | 31 | def is_iterator(obj): 32 | return hasattr(obj, "__next__") 33 | 34 | 35 | def is_re(obj): 36 | return isinstance(obj, re._pattern_type) 37 | 38 | 39 | def is_re_compilable(obj): 40 | try: 41 | re.compile(obj) 42 | except TypeError: 43 | return False 44 | else: 45 | return True 46 | 47 | 48 | def is_list_like(arg): 49 | return hasattr(arg, "__iter__") and not isinstance(arg, string_and_binary_types) 50 | 51 | 52 | def is_dict_like(arg): 53 | return hasattr(arg, "__getitem__") and hasattr(arg, "keys") 54 | 55 | 56 | def is_named_tuple(arg): 57 | return isinstance(arg, tuple) and hasattr(arg, "_fields") 58 | 59 | 60 | def is_hashable(arg): 61 | """Return True if hash(arg) will succeed, False otherwise. 62 | 63 | Some types will pass a test against collections.Hashable but fail when they 64 | are actually hashed with hash(). 65 | 66 | Distinguish between these and other types by trying the call to hash() and 67 | seeing if they raise TypeError. 
68 | 69 | Examples 70 | -------- 71 | >>> a = ([],) 72 | >>> isinstance(a, collections.Hashable) 73 | True 74 | >>> is_hashable(a) 75 | False 76 | """ 77 | 78 | # unfortunately, we can't use isinstance(arg, collections.Hashable), which 79 | # can be faster than calling hash, because numpy scalars on Python 3 fail 80 | # this test 81 | 82 | # reconsider this decision once this numpy bug is fixed: 83 | # https://github.com/numpy/numpy/issues/5562 84 | 85 | try: 86 | hash(arg) 87 | except TypeError: 88 | return False 89 | else: 90 | return True 91 | 92 | 93 | def is_sequence(x): 94 | try: 95 | iter(x) 96 | len(x) # it has a length 97 | return not isinstance(x, string_and_binary_types) 98 | except (TypeError, AttributeError): 99 | return False 100 | 101 | 102 | def is_pandas_like(x): 103 | return isinstance(x, (pandas.Series, pandas.DataFrame)) 104 | 105 | 106 | def is_pandas_multiindex(x): 107 | return isinstance(x, (pandas.MultiIndex)) 108 | 109 | 110 | def is_pandas_2d_multiindex(x): 111 | return is_pandas_multiindex(x) and x.nlevels == 2 112 | 113 | 114 | def is_numpy_like(x): 115 | return isinstance(x, (numpy.ndarray)) 116 | -------------------------------------------------------------------------------- /recordlinkage/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from functools import wraps 3 | 4 | import numpy 5 | import pandas 6 | 7 | import recordlinkage.config as cf 8 | 9 | 10 | # Errors and Exception handlers 11 | class IndexError(Exception): 12 | """Error class for errors related to indexing.""" 13 | 14 | pass 15 | 16 | 17 | class LearningError(Exception): 18 | """Learning error""" 19 | 20 | 21 | class DeprecationHelper: 22 | """Deprecation helper for classes and functions. 23 | 24 | Based on https://stackoverflow.com/a/9008509/8727928 25 | """ 26 | 27 | def __init__(self, new_target, msg=None): 28 | self.new_target = new_target 29 | self.msg = msg 30 | 31 | def _warn(self): 32 | from warnings import warn 33 | 34 | if self.msg is None: 35 | msg = "This class will get deprecated." 36 | else: 37 | msg = self.msg 38 | 39 | warn(msg, DeprecationWarning, stacklevel=1) 40 | 41 | def __call__(self, *args, **kwargs): 42 | self._warn() 43 | return self.new_target(*args, **kwargs) 44 | 45 | def __getattr__(self, attr): 46 | self._warn() 47 | return getattr(self.new_target, attr) 48 | 49 | 50 | def return_type_deprecator(func): 51 | @wraps(func) 52 | def func_wrapper(*args, **kwargs): 53 | return_type = kwargs.pop("return_type", None) 54 | if return_type is not None: 55 | warnings.warn( 56 | "The argument 'return_type' is deprecated in the next " 57 | "version. Use recordlinkage.set_option('classification." 
58 | "return_type', '{}') instead.".format(return_type), 59 | DeprecationWarning, 60 | stacklevel=2, 61 | ) 62 | with cf.option_context("classification.return_type", return_type): 63 | return func(*args, **kwargs) 64 | else: 65 | return func(*args, **kwargs) 66 | 67 | return func_wrapper 68 | 69 | 70 | # Checks and conversions 71 | def is_label_dataframe(label, df): 72 | """check column label existance""" 73 | 74 | setdiff = set(label) - set(df.columns.tolist()) 75 | 76 | if len(setdiff) == 0: 77 | return True 78 | else: 79 | return False 80 | 81 | 82 | def get_length(x): 83 | """Return int or len(x)""" 84 | 85 | try: 86 | return int(x) 87 | except Exception: 88 | return len(x) 89 | 90 | 91 | def listify(x, none_value=[]): 92 | """Make a list of the argument if it is not a list.""" 93 | 94 | if isinstance(x, list): 95 | return x 96 | elif isinstance(x, tuple): 97 | return list(x) 98 | elif x is None: 99 | return none_value 100 | else: 101 | return [x] 102 | 103 | 104 | def unique(x): 105 | """Convert a list in a unique list.""" 106 | 107 | return list(set(x)) 108 | 109 | 110 | def merge_dicts(*dict_args): 111 | """ 112 | Given any number of dicts, shallow copy and merge into a new dict, 113 | precedence goes to key value pairs in latter dicts. 114 | """ 115 | result = {} 116 | for dictionary in dict_args: 117 | result.update(dictionary) 118 | return result 119 | 120 | 121 | def multi_index_to_frame(index): 122 | """ 123 | Replicates MultiIndex.to_frame, which was introduced in pandas 0.21, 124 | for the sake of backwards compatibility. 125 | """ 126 | return pandas.DataFrame(index.tolist(), index=index, columns=index.names) 127 | 128 | 129 | def index_split(index, chunks): 130 | """Function to split pandas.Index and pandas.MultiIndex objects. 131 | 132 | Split :class:`pandas.Index` and :class:`pandas.MultiIndex` objects 133 | into chunks. This function is based on :func:`numpy.array_split`. 134 | 135 | Parameters 136 | ---------- 137 | index : pandas.Index, pandas.MultiIndex 138 | A pandas.Index or pandas.MultiIndex to split into chunks. 139 | chunks : int 140 | The number of parts to split the index into. 141 | 142 | Returns 143 | ------- 144 | list 145 | A list with chunked pandas.Index or pandas.MultiIndex objects. 146 | 147 | """ 148 | 149 | Ntotal = index.shape[0] 150 | Nsections = int(chunks) 151 | if Nsections <= 0: 152 | raise ValueError("number sections must be larger than 0.") 153 | Neach_section, extras = divmod(Ntotal, Nsections) 154 | section_sizes = ( 155 | [0] + extras * [Neach_section + 1] + (Nsections - extras) * [Neach_section] 156 | ) 157 | div_points = numpy.array(section_sizes).cumsum() 158 | 159 | sub_ind = [] 160 | for i in range(Nsections): 161 | st = div_points[i] 162 | end = div_points[i + 1] 163 | sub_ind.append(index[st:end]) 164 | 165 | return sub_ind 166 | 167 | 168 | def split_index(*args, **kwargs): 169 | warnings.warn( 170 | "Function will be removed in the future. Use index_split.", 171 | DeprecationWarning, 172 | stacklevel=2, 173 | ) 174 | 175 | return index_split(*args, **kwargs) 176 | 177 | 178 | def frame_indexing(frame, multi_index, level_i, indexing_type="label"): 179 | """Index dataframe based on one level of MultiIndex. 180 | 181 | Arguments 182 | --------- 183 | frame : pandas.DataFrame 184 | The datafrme to select records from. 185 | multi_index : pandas.MultiIndex 186 | A pandas multiindex were one fo the levels is used to sample the 187 | dataframe with. 188 | level_i : int, str 189 | The level of the multiIndex to index on. 
190 | indexing_type : str 191 | The type of indexing. The value can be 'label' or 'position'. 192 | Default 'label'. 193 | 194 | """ 195 | 196 | if indexing_type == "label": 197 | data = frame.loc[multi_index.get_level_values(level_i)] 198 | data.index = multi_index 199 | elif indexing_type == "position": 200 | data = frame.iloc[multi_index.get_level_values(level_i)] 201 | data.index = multi_index 202 | else: 203 | raise ValueError("indexing_type needs to be 'label' or 'position'") 204 | 205 | return data 206 | 207 | 208 | def fillna(series_or_arr, missing_value=0.0): 209 | """Fill missing values in pandas objects and numpy arrays. 210 | 211 | Arguments 212 | --------- 213 | series_or_arr : pandas.Series, numpy.ndarray 214 | The numpy array or pandas series for which the missing values 215 | need to be replaced. 216 | missing_value : float, int, str 217 | The value to replace the missing value with. Default 0.0. 218 | 219 | Returns 220 | ------- 221 | pandas.Series, numpy.ndarray 222 | The numpy array or pandas series with the missing values 223 | filled. 224 | """ 225 | 226 | if pandas.notnull(missing_value): 227 | if isinstance(series_or_arr, (numpy.ndarray)): 228 | series_or_arr[numpy.isnan(series_or_arr)] = missing_value 229 | else: 230 | series_or_arr.fillna(missing_value, inplace=True) 231 | 232 | return series_or_arr 233 | -------------------------------------------------------------------------------- /tests/test_annotator.py: -------------------------------------------------------------------------------- 1 | import recordlinkage as rl 2 | from recordlinkage.datasets import load_febrl1 3 | from recordlinkage.datasets import load_febrl4 4 | from recordlinkage.index import Block 5 | 6 | 7 | def test_annotation_link(tmp_path): 8 | path = tmp_path / "febrl_annotation_link.json" 9 | 10 | # get febrl4 file 11 | df_a, df_b, matches = load_febrl4(return_links=True) 12 | 13 | # get record pairs 14 | indexer = Block("given_name", "given_name") 15 | pairs = indexer.index(df_a, df_b) 16 | 17 | # create annotation file 18 | # write an annotation file for the Febrl4 dataset. 19 | rl.write_annotation_file(path, pairs[0:10], df_a, df_b) 20 | 21 | # read the result 22 | result = rl.read_annotation_file(path) 23 | 24 | assert result.links is None 25 | assert result.distinct is None 26 | 27 | 28 | def test_annotation_dedup(tmp_path): 29 | path = tmp_path / "febrl_annotation_dedup.json" 30 | 31 | # get febrl4 file 32 | df_a, matches = load_febrl1(return_links=True) 33 | 34 | # get record pairs 35 | indexer = Block("given_name", "given_name") 36 | pairs = indexer.index(df_a) 37 | 38 | # create annotation file 39 | # write an annotation file for the Febrl4 dataset. 
40 | rl.write_annotation_file(path, pairs[0:10], df_a) 41 | 42 | # read the result 43 | result = rl.read_annotation_file(path) 44 | 45 | assert result.links is None 46 | assert result.distinct is None 47 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from os import environ 4 | from pathlib import Path 5 | 6 | import numpy 7 | import pandas 8 | import pytest 9 | 10 | from recordlinkage.datasets import binary_vectors 11 | from recordlinkage.datasets import clear_data_home 12 | from recordlinkage.datasets import get_data_home 13 | from recordlinkage.datasets import load_febrl1 14 | from recordlinkage.datasets import load_febrl2 15 | from recordlinkage.datasets import load_febrl3 16 | from recordlinkage.datasets import load_febrl4 17 | from recordlinkage.datasets import load_krebsregister 18 | 19 | FEBRL_DEDUP = [ 20 | # nlinks = 500 21 | (load_febrl1, 1000, 500), 22 | # nlinks=19*6*5/2+47*5*4/2+107*4*3/2+141*3*2/2+114 23 | (load_febrl2, 5000, 1934), 24 | # nlinks=168*6*5/2+161*5*4/2+212*4*3/2+256*3*2/2+368 25 | (load_febrl3, 5000, 6538), 26 | ] 27 | 28 | 29 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP) 30 | def test_febrl_dedup(dataset, nrows, nlinks): 31 | df = dataset() 32 | assert isinstance(df, pandas.DataFrame) 33 | assert len(df) == nrows 34 | 35 | 36 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP) 37 | def test_febrl_dedup_links(dataset, nrows, nlinks): 38 | df, links = dataset(return_links=True) 39 | assert isinstance(df, pandas.DataFrame) 40 | assert len(df) == nrows 41 | assert len(links) == nlinks 42 | assert isinstance(links, pandas.MultiIndex) 43 | 44 | 45 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP) 46 | def test_febrl_dedup_tril(dataset, nrows, nlinks): 47 | df, links = dataset(return_links=True) 48 | 49 | s_level_1 = pandas.Series(numpy.arange(len(df)), index=df.index) 50 | s_level_2 = pandas.Series(numpy.arange(len(df)), index=df.index) 51 | 52 | x1 = s_level_1.loc[links.get_level_values(0)] 53 | x2 = s_level_2.loc[links.get_level_values(1)] 54 | 55 | assert numpy.all(x1.values > x2.values) 56 | 57 | 58 | def test_febrl4(): 59 | dfa, dfb = load_febrl4() 60 | assert isinstance(dfa, pandas.DataFrame) 61 | assert isinstance(dfb, pandas.DataFrame) 62 | assert len(dfa) == 5000 63 | assert len(dfb) == 5000 64 | 65 | 66 | def test_febrl_links(): 67 | dfa, dfb, links = load_febrl4(return_links=True) 68 | assert isinstance(dfa, pandas.DataFrame) 69 | assert isinstance(dfb, pandas.DataFrame) 70 | assert len(dfa) == 5000 71 | assert len(dfb) == 5000 72 | assert isinstance(links, pandas.MultiIndex) 73 | 74 | 75 | @pytest.mark.skip(reason="Causes undeterministic problems") 76 | def test_krebs_dataset_download(): 77 | # remove downloaded datasets 78 | clear_data_home() 79 | 80 | krebs_data, krebs_matches = load_krebsregister() 81 | 82 | for i in range(1, 11): 83 | assert Path(get_data_home(), "krebsregister", f"block_{i}.zip").is_file() 84 | 85 | # count the number of recordss 86 | assert type(krebs_data), pandas.DataFrame 87 | assert type(krebs_matches), pandas.MultiIndex 88 | assert len(krebs_data) == 5749132 89 | assert len(krebs_matches) == 20931 90 | 91 | 92 | @pytest.mark.skip(reason="Causes undeterministic problems") 93 | def test_krebs_dataset_environ(tmpdir): 94 | path = Path(str(tmpdir)).expanduser() 95 | environ["RL_DATA"] = str(path) 96 | 97 | 
krebs_data, krebs_matches = load_krebsregister() 98 | 99 | for i in range(1, 11): 100 | assert Path(path, "krebsregister", f"block_{i}.zip").is_file() 101 | 102 | 103 | @pytest.mark.skip(reason="Causes non-deterministic problems") 104 | def test_krebs_dataset(): 105 | krebs_data_block1, krebs_matches_block1 = load_krebsregister(1) 106 | krebs_data_block10, krebs_matches_block10 = load_krebsregister(10) 107 | 108 | assert len(krebs_data_block1) > 0 109 | assert len(krebs_data_block10) > 0 110 | 111 | # load a non-existing block 112 | with pytest.raises(ValueError): 113 | load_krebsregister(11) 114 | 115 | # missing values 116 | krebs_block10, matches = load_krebsregister(10, missing_values=0) 117 | assert krebs_block10.isnull().sum().sum() == 0 118 | 119 | 120 | @pytest.mark.skip(reason="Causes non-deterministic problems") 121 | def test_krebs_missings(): 122 | # missing values 123 | krebs_block10, matches = load_krebsregister(10, missing_values=0) 124 | assert krebs_block10.isnull().sum().sum() == 0 125 | 126 | 127 | @pytest.mark.skip(reason="Causes non-deterministic problems") 128 | def test_krebs_shuffle(): 129 | # load the dataset without shuffling the records 130 | krebs_block10, matches = load_krebsregister(10, shuffle=False) 131 | 132 | 133 | def test_random_comparison_vectors(): 134 | # Test the generation of a random dataset 135 | 136 | n_record_pairs = 10000 137 | n_matches = 500 138 | 139 | df = binary_vectors( 140 | n_record_pairs, n_matches, m=[0.8] * 8, u=[0.2] * 8, random_state=535 141 | ) 142 | 143 | # Check the result is a DataFrame with MultiIndex 144 | assert isinstance(df, pandas.DataFrame) 145 | assert isinstance(df.index, pandas.MultiIndex) 146 | 147 | # Test the length of the dataframe 148 | assert len(df) == n_record_pairs 149 | 150 | 151 | def test_random_comparison_vectors_1value_col(): 152 | m = numpy.array([1, 0.81, 0.85, 0]) 153 | u = numpy.array([1, 0.23, 0.50, 0]) 154 | 155 | # Create the train dataset.
156 | X_train, y_train = binary_vectors( 157 | 1000, 500, m=m, u=u, random_state=535, return_links=True 158 | ) 159 | 160 | assert len(X_train.iloc[:, 0].unique()) == 1 161 | assert X_train.iloc[:, 0].unique()[0] == 1 162 | 163 | assert len(X_train.iloc[:, 3].unique()) == 1 164 | assert X_train.iloc[:, 3].unique()[0] == 0 165 | 166 | assert len(X_train.iloc[:, 1].unique()) == 2 167 | assert len(X_train.iloc[:, 2].unique()) == 2 168 | -------------------------------------------------------------------------------- /tests/test_generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/tests/test_generate.py -------------------------------------------------------------------------------- /tests/test_measures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import numpy 5 | import pandas 6 | 7 | import recordlinkage as rl 8 | 9 | FULL_INDEX = pandas.MultiIndex.from_product( 10 | [[1, 2, 3], [1, 2, 3]], names=["first", "second"] # 3x3 matrix 11 | ) 12 | LINKS_TRUE = pandas.MultiIndex.from_tuples( 13 | [(1, 1), (2, 2), (3, 3)], names=["first", "second"] # the diagonal 14 | ) 15 | LINKS_PRED = pandas.MultiIndex.from_tuples( 16 | [(1, 1), (2, 1), (3, 1), (1, 2)], names=["first", "second"] # L shape 17 | ) 18 | 19 | 20 | class TestMeasures: 21 | def test_confusion_matrix(self): 22 | result_len = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 23 | result_full_index = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, FULL_INDEX) 24 | expected = numpy.array([[1, 2], [3, 3]]) 25 | 26 | numpy.testing.assert_array_equal(result_len, expected) 27 | numpy.testing.assert_array_equal(result_full_index, expected) 28 | 29 | def test_tp_fp_tn_fn(self): 30 | tp = rl.true_positives(LINKS_TRUE, LINKS_PRED) 31 | assert tp == 1 32 | fp = rl.false_positives(LINKS_TRUE, LINKS_PRED) 33 | assert fp == 3 34 | tn = rl.true_negatives(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 35 | assert tn == 3 36 | fn = rl.false_negatives(LINKS_TRUE, LINKS_PRED) 37 | assert fn == 2 38 | 39 | def test_recall(self): 40 | # confusion matrix 41 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED) 42 | 43 | assert rl.recall(LINKS_TRUE, LINKS_PRED) == 1 / 3 44 | assert rl.recall(cm) == 1 / 3 45 | 46 | def test_precision(self): 47 | # confusion matrix 48 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 49 | 50 | assert rl.precision(LINKS_TRUE, LINKS_PRED) == 1 / 4 51 | assert rl.precision(cm) == 1 / 4 52 | 53 | def test_accuracy(self): 54 | # confusion matrix 55 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 56 | 57 | assert rl.accuracy(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 4 / 9 58 | assert rl.accuracy(cm) == 4 / 9 59 | assert rl.accuracy(LINKS_TRUE, LINKS_PRED, FULL_INDEX) == 4 / 9 60 | 61 | def test_specificity(self): 62 | # confusion matrix 63 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 64 | 65 | assert rl.specificity(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 1 / 2 66 | assert rl.specificity(cm) == 1 / 2 67 | assert rl.specificity(LINKS_TRUE, LINKS_PRED, FULL_INDEX) == 1 / 2 68 | 69 | def test_fscore(self): 70 | # confusion matrix 71 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 72 | prec = rl.precision(LINKS_TRUE, LINKS_PRED) 73 | rec = rl.recall(LINKS_TRUE, LINKS_PRED) 74 | expected = float(2 * prec * rec / (prec + rec)) 75 | 76 | assert 
rl.fscore(LINKS_TRUE, LINKS_PRED) == expected 77 | assert rl.fscore(cm) == expected 78 | 79 | def test_full_index_size(self): 80 | df_a = pandas.DataFrame(numpy.arange(10)) 81 | df_b = pandas.DataFrame(numpy.arange(10)) 82 | 83 | assert rl.full_index_size(df_a) == 45 84 | assert rl.full_index_size(len(df_a)) == 45 85 | assert rl.full_index_size((len(df_a),)) == 45 86 | assert rl.full_index_size([len(df_a)]) == 45 87 | 88 | assert rl.full_index_size(df_a, df_b) == 100 89 | assert rl.full_index_size(len(df_a), len(df_b)) == 100 90 | assert rl.full_index_size((len(df_a), len(df_b))) == 100 91 | assert rl.full_index_size([len(df_a), len(df_b)]) == 100 92 | 93 | def test_reduction_ratio(self): 94 | df_a = pandas.DataFrame(numpy.arange(10)) 95 | df_b = pandas.DataFrame(numpy.arange(10)) 96 | candidate_pairs_link = pandas.MultiIndex.from_product( 97 | [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]] 98 | ) 99 | candidate_pairs_dedup = pandas.MultiIndex.from_arrays( 100 | [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]] 101 | ) 102 | 103 | assert rl.reduction_ratio(candidate_pairs_dedup, df_a) == 8 / 9 104 | assert rl.reduction_ratio(candidate_pairs_dedup, (df_a)) == 8 / 9 105 | assert rl.reduction_ratio(candidate_pairs_dedup, (df_a,)) == 8 / 9 106 | 107 | assert rl.reduction_ratio(candidate_pairs_link, df_a, df_b) == 3 / 4 108 | assert rl.reduction_ratio(candidate_pairs_link, (df_a, df_b)) == 3 / 4 109 | assert rl.reduction_ratio(candidate_pairs_link, [df_a, df_b]) == 3 / 4 110 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | # testing utils from pandas 7 | import pandas.testing as pdt 8 | import pytest 9 | 10 | import recordlinkage as rl 11 | from recordlinkage import index_split 12 | 13 | 14 | def test_multiindex_split(): 15 | index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)]) 16 | result = index_split(index, 3) 17 | 18 | assert len(result) == 3 19 | 20 | for i, result_index_chunk in enumerate(result): 21 | expected_index_chunk = index[i * 10 : (i + 1) * 10] 22 | pdt.assert_index_equal(result_index_chunk, expected_index_chunk) 23 | 24 | assert len(result_index_chunk.levels) == 2 25 | assert len(result_index_chunk.codes) == 2 26 | 27 | 28 | def test_options(): 29 | # global set 30 | rl.options.indexing.pairs = "multiindex" 31 | assert rl.get_option("indexing.pairs") == "multiindex" 32 | 33 | 34 | def test_options_context(): 35 | with rl.option_context("indexing.pairs", "multiindex"): 36 | rl.options.indexing.pairs = "multiindex" 37 | assert rl.get_option("indexing.pairs") == "multiindex" 38 | 39 | 40 | def test_options_incorrect_values(): 41 | # incorrect value 42 | with pytest.raises(ValueError): 43 | rl.options.indexing.pairs = "non_existing" 44 | -------------------------------------------------------------------------------- /tests/test_network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | import pandas as pd 6 | 7 | # testing utils from pandas 8 | import pandas.testing as pdt 9 | import pytest 10 | 11 | try: 12 | import networkx # noqa 13 | except ImportError: 14 | pass 15 | 16 | from recordlinkage import ConnectedComponents 17 | from recordlinkage import OneToManyLinking 18 | from recordlinkage import OneToOneLinking 19 | 20 | 21 | def test_one_to_many_linking(): 22 | sample = 
pd.MultiIndex.from_tuples( 23 | [ 24 | (1, 1), 25 | (2, 2), 26 | (3, 3), 27 | (3, 4), 28 | (3, 5), 29 | (4, 4), 30 | (5, 5), 31 | (6, 5), 32 | (7, 7), 33 | (7, 7), 34 | (7, 8), 35 | ] 36 | ) 37 | one_to_many = OneToManyLinking() 38 | sample_one_to_many = one_to_many.compute(sample) 39 | 40 | expected = pd.MultiIndex.from_tuples( 41 | [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 7)] 42 | ) 43 | pdt.assert_index_equal(sample_one_to_many, expected) 44 | 45 | 46 | def test_one_to_one_linking(): 47 | sample = pd.MultiIndex.from_tuples( 48 | [ 49 | (1, 1), 50 | (2, 2), 51 | (3, 3), 52 | (3, 4), 53 | (3, 5), 54 | (4, 4), 55 | (5, 5), 56 | (6, 5), 57 | (7, 7), 58 | (7, 6), 59 | (7, 8), 60 | ] 61 | ) 62 | 63 | # test OneToOneLinking 64 | one_to_one = OneToOneLinking() 65 | sample_one_to_one = one_to_one.compute(sample) 66 | 67 | expected = pd.MultiIndex.from_tuples( 68 | [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (7, 7)] 69 | ) 70 | pdt.assert_index_equal(sample_one_to_one, expected) 71 | 72 | 73 | @pytest.mark.skipif( 74 | "networkx" not in sys.modules, reason="Requires the Networkx library" 75 | ) 76 | def test_connected_components(): 77 | sample = pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 4), (5, 6), (5, 7), (8, 9)]) 78 | 79 | # test ConnectedComponents 80 | connected = ConnectedComponents() 81 | sample_connected = connected.compute(sample) 82 | 83 | expected = [ 84 | pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), 85 | pd.MultiIndex.from_tuples([(5, 6), (5, 7)]), 86 | pd.MultiIndex.from_tuples([(8, 9)]), 87 | ] 88 | 89 | for i, _mi in enumerate(expected): 90 | pdt.assert_index_equal(sample_connected[i], expected[i]) 91 | --------------------------------------------------------------------------------
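
A minimal usage sketch of the post-processing API exercised in tests/test_network.py above. It relies only on the classes and the compute() method shown in those tests; the candidate pairs below are illustrative values, not taken from the test data, and the behaviour noted in the comments is the behaviour demonstrated by the expected results in the tests.

import pandas as pd

from recordlinkage import ConnectedComponents
from recordlinkage import OneToManyLinking
from recordlinkage import OneToOneLinking

# Candidate links as a pandas MultiIndex, e.g. the output of a classifier.
links = pd.MultiIndex.from_tuples([(1, 1), (2, 1), (2, 2), (3, 4)])

# Keep each record in at most one pair (first occurrence wins, as in the tests).
one_to_one = OneToOneLinking().compute(links)

# Deduplicate only on the first level: at most one pair per record of the first frame.
one_to_many = OneToManyLinking().compute(links)

# Group transitively linked pairs into clusters (requires the networkx library).
clusters = ConnectedComponents().compute(links)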