├── .coveragerc ├── .github └── workflows │ ├── ci-workflow.yml │ ├── python-package.yml │ └── render-docs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── __init__.py ├── bench_comparing.py └── bench_indexing.py ├── docs ├── Makefile ├── about.rst ├── annotation.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── guides │ ├── classifiers.rst │ ├── data_deduplication.ipynb │ └── link_two_dataframes.ipynb ├── images │ ├── elas_1705.png │ ├── indexing_basic.png │ ├── indexing_plot.py │ └── recordlinkage-banner-transparent.svg ├── index.rst ├── installation.rst ├── make.bat ├── performance.rst ├── ref-classifiers.rst ├── ref-compare.rst ├── ref-datasets.rst ├── ref-evaluation.rst ├── ref-index.rst ├── ref-misc.rst └── ref-preprocessing.rst ├── examples ├── README.rst ├── dedup_deterministic.py ├── linking_deterministic.py ├── supervised_keras.py ├── supervised_learning_prob.py └── unsupervised_learning_prob.py ├── pyproject.toml ├── recordlinkage ├── __init__.py ├── _lib │ ├── numeric.c │ └── numeric.h ├── adapters.py ├── algorithms │ ├── __init__.py │ ├── c_numeric.pyx │ ├── compare.py │ ├── distance.py │ ├── indexing.py │ ├── nb_sklearn.py │ ├── numeric.py │ └── string.py ├── annotation.py ├── api.py ├── base.py ├── classifiers.py ├── compare.py ├── config.py ├── config_init.py ├── contrib │ ├── README.rst │ ├── __init__.py │ ├── compare │ │ ├── __init__.py │ │ └── random │ │ │ ├── README.rst │ │ │ ├── __init__.py │ │ │ ├── random.py │ │ │ └── test_random.py │ └── index │ │ ├── __init__.py │ │ └── neighbourhoodblock │ │ ├── README.rst │ │ ├── __init__.py │ │ ├── neighbourhoodblock.py │ │ └── test_neighbourhoodblock.py ├── datasets │ ├── __init__.py │ ├── external.py │ ├── febrl.py │ ├── febrl │ │ ├── dataset1.csv │ │ ├── dataset2.csv │ │ ├── dataset3.csv │ │ ├── dataset4a.csv │ │ └── dataset4b.csv │ └── generate.py ├── deprecated.py ├── index.py ├── measures.py ├── network.py ├── preprocessing │ ├── __init__.py │ ├── cleaning.py │ └── encoding.py ├── rl_logging.py ├── standardise │ └── __init__.py ├── types.py └── utils.py └── tests ├── test_annotator.py ├── test_classify.py ├── test_compare.py ├── test_datasets.py ├── test_generate.py ├── test_indexing.py ├── test_measures.py ├── test_misc.py ├── test_network.py └── test_preprocessing.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | exclude_lines = 6 | if self.debug: 7 | pragma: no cover 8 | raise NotImplementedError 9 | if __name__ == .__main__.: 10 | 11 | ignore_errors = False 12 | 13 | omit = 14 | tests/* 15 | docs/* 16 | recordlinkage/_version.py 17 | recordlinkage/types.py -------------------------------------------------------------------------------- /.github/workflows/ci-workflow.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.8", "3.9", "3.10", "3.11"] 13 | pandas-version: ["1.0", "2.0"] 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install pandas 21 | run: | 22 | pip install pandas~=${{ matrix.pandas-version }} 23 | - name: Package recordlinkage 24 | run: | 25 | pip install 
--upgrade pip 26 | pip install build 27 | python -m build 28 | - name: Install recordlinkage 29 | run: | 30 | pip install networkx>=2 31 | pip install ./dist/recordlinkage-*.whl 32 | - name: Test with pytest 33 | run: | 34 | pip install pytest 35 | # remove recordlinkage to prevent relative imports (use installed package) 36 | # this is like wrapping stuff in a src folder 37 | rm -r recordlinkage/ 38 | pytest 39 | lint: 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/checkout@v2 43 | - uses: actions/setup-python@v1 44 | - name: Install ruff 45 | run: | 46 | pip install ruff 47 | - name: Lint with ruff 48 | run: | 49 | ruff . 50 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.x' 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install build 23 | - name: Build package 24 | run: python -m build 25 | - name: Publish package 26 | uses: pypa/gh-action-pypi-publish@release/v1 27 | with: 28 | user: __token__ 29 | password: ${{ secrets.pypi_password }} 30 | -------------------------------------------------------------------------------- /.github/workflows/render-docs.yml: -------------------------------------------------------------------------------- 1 | name: Build HTML with Sphinx 2 | on: [push, pull_request] 3 | jobs: 4 | html-sphinx: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: Clone repo 8 | uses: actions/checkout@v2 9 | - name: Set up Python 10 | uses: actions/setup-python@v2 11 | with: 12 | python-version: '3.10' 13 | - name: Install recordlinkage and docs tools 14 | run: | 15 | sudo apt install pandoc 16 | python -m pip install .[docs] 17 | - name: Build HTML 18 | run: | 19 | python -m sphinx -W --keep-going --color docs/ _build/html/ 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | recordlinkage/datasets/krebsregister/* 3 | 4 | recordlinkage/_version.py 5 | 6 | 7 | .DS_Store 8 | */.DS_Store 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions and Cython .pyx compilations 16 | *.so 17 | algorithms/*.c 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | *.bat 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | .pytest_cache/* 55 | coverage.xml 56 | *,cover 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # dotenv 75 | .env 76 | 77 | # virtualenv 78 | venv/ 79 | ENV/ 80 | 81 | /tests/sandbox 82 | # ASV 83 | .asv/ 84 | 85 | # PyCharm IDE 86 | /sandbox 87 | /cover 88 | /coverage-report 89 | .idea/ 90 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | - repo: https://github.com/charliermarsh/ruff-pre-commit 10 | rev: v0.0.278 11 | hooks: 12 | - id: ruff 13 | - repo: https://github.com/psf/black 14 | rev: 23.7.0 15 | hooks: 16 | - id: black 17 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - method: pip 14 | path: . 15 | extra_requirements: 16 | - docs 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2018, Jonathan de Bruin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include recordlinkage/datasets/febrl *.csv 2 | recursive-include recordlinkage/datasets/krebsregister *.csv 3 | 4 | global-exclude test_*.py 5 | global-exclude *_test.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 | # RecordLinkage: powerful and modular Python record linkage toolkit 6 | 7 | [![Pypi Version](https://badge.fury.io/py/recordlinkage.svg)](https://pypi.python.org/pypi/recordlinkage/) 8 | [![Github Actions CI Status](https://github.com/J535D165/recordlinkage/workflows/tests/badge.svg?branch=master)](https://github.com/J535D165/recordlinkage/actions) 9 | [![Code Coverage](https://codecov.io/gh/J535D165/recordlinkage/branch/master/graph/badge.svg)](https://codecov.io/gh/J535D165/recordlinkage) 10 | [![Documentation Status](https://readthedocs.org/projects/recordlinkage/badge/?version=latest)](https://recordlinkage.readthedocs.io/en/latest/?badge=latest) 11 | [![Zenodo DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3559042.svg)](https://doi.org/10.5281/zenodo.3559042) 12 | 13 | **RecordLinkage** is a powerful and modular record linkage toolkit to 14 | link records in or between data sources. The toolkit provides most of 15 | the tools needed for record linkage and deduplication. The package 16 | contains indexing methods, functions to compare records and classifiers. 17 | The package is developed for research and the linking of small or medium 18 | sized files. 19 | 20 | This project is inspired by the [Freely Extensible Biomedical Record 21 | Linkage (FEBRL)](https://sourceforge.net/projects/febrl/) project, which 22 | is a great project. In contrast with FEBRL, the recordlinkage project 23 | uses [pandas](http://pandas.pydata.org/) and 24 | [numpy](http://www.numpy.org/) for data handling and computations. The 25 | use of *pandas*, a flexible and powerful data analysis and manipulation 26 | library for Python, makes the record linkage process much easier and 27 | faster. The extensive *pandas* library can be used to integrate your 28 | record linkage directly into existing data manipulation projects. 29 | 30 | One of the aims of this project is to make an easily extensible record 31 | linkage framework. It is easy to include your own indexing algorithms, 32 | comparison/similarity measures and classifiers. 33 | 34 | ## Basic linking example 35 | 36 | Import the `recordlinkage` module with all important tools for record 37 | linkage and import the data manipulation framework **pandas**. 38 | 39 | ``` python 40 | import recordlinkage 41 | import pandas 42 | ``` 43 | 44 | Load your data into pandas DataFrames. 45 | 46 | ``` python 47 | df_a = pandas.DataFrame(YOUR_FIRST_DATASET) 48 | df_b = pandas.DataFrame(YOUR_SECOND_DATASET) 49 | ``` 50 | 51 | Comparing all record can be computationally intensive. Therefore, we 52 | make set of candidate links with one of the built-in indexing techniques 53 | like **blocking**. In this example, only pairs of records that agree on 54 | the surname are returned. 55 | 56 | ``` python 57 | indexer = recordlinkage.Index() 58 | indexer.block('surname') 59 | candidate_links = indexer.index(df_a, df_b) 60 | ``` 61 | 62 | For each candidate link, compare the records with one of the comparison 63 | or similarity algorithms in the Compare class. 
64 | 65 | ``` python 66 | c = recordlinkage.Compare() 67 | 68 | c.string('name_a', 'name_b', method='jarowinkler', threshold=0.85) 69 | c.exact('sex', 'gender') 70 | c.date('dob', 'date_of_birth') 71 | c.string('str_name', 'streetname', method='damerau_levenshtein', threshold=0.7) 72 | c.exact('place', 'placename') 73 | c.numeric('income', 'income', method='gauss', offset=3, scale=3, missing_value=0.5) 74 | 75 | # The comparison vectors 76 | feature_vectors = c.compute(candidate_links, df_a, df_b) 77 | ``` 78 | 79 | Classify the candidate links into matching or distinct pairs based on 80 | their comparison result with one of the [classification 81 | algorithms](https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html). 82 | The following code classifies candidate pairs with a Logistic Regression 83 | classifier. This (supervised machine learning) algorithm requires 84 | training data. 85 | 86 | ``` python 87 | logrg = recordlinkage.LogisticRegressionClassifier() 88 | logrg.fit(TRAINING_COMPARISON_VECTORS, TRAINING_PAIRS) 89 | 90 | logrg.predict(feature_vectors) 91 | ``` 92 | 93 | The following code shows the classification of candidate pairs with the 94 | Expectation-Conditional Maximisation (ECM) algorithm. This variant of 95 | the Expectation-Maximisation algorithm doesn't require training data 96 | (unsupervised machine learning). 97 | 98 | ``` python 99 | ecm = recordlinkage.ECMClassifier() 100 | ecm.fit_predict(feature_vectors) 101 | ``` 102 | 103 | ## Main Features 104 | 105 | The main features of this Python record linkage toolkit are: 106 | 107 | - Clean and standardise data with easy to use tools 108 | - Make pairs of records with smart indexing methods such as 109 | **blocking** and **sorted neighbourhood indexing** 110 | - Compare records with a large number of comparison and similarity 111 | measures for different types of variables such as strings, numbers 112 | and dates. 113 | - Several classifications algorithms, both supervised and unsupervised 114 | algorithms. 115 | - Common record linkage evaluation tools 116 | - Several built-in datasets. 117 | 118 | ## Documentation 119 | 120 | The most recent documentation and API reference can be found at 121 | [recordlinkage.readthedocs.org](http://recordlinkage.readthedocs.org/en/latest/). 122 | The documentation provides some basic usage examples like 123 | [deduplication](http://recordlinkage.readthedocs.io/en/latest/guides/data_deduplication.html) 124 | and 125 | [linking](http://recordlinkage.readthedocs.io/en/latest/guides/link_two_dataframes.html) 126 | census data. More examples are coming soon. If you do have interesting 127 | examples to share, let us know. 128 | 129 | ## Installation 130 | 131 | The Python Record linkage Toolkit requires Python 3.8 or higher. Install the 132 | package easily with pip 133 | 134 | ``` sh 135 | pip install recordlinkage 136 | ``` 137 | 138 | The toolkit depends on popular packages like 139 | [Pandas](https://github.com/pydata/pandas), 140 | [Numpy](http://www.numpy.org), [Scipy](https://www.scipy.org/) and, 141 | [Scikit-learn](http://scikit-learn.org/). A complete list of 142 | dependencies can be found in the [installation 143 | manual](https://recordlinkage.readthedocs.io/en/latest/installation.html) 144 | as well as recommended and optional dependencies. 145 | 146 | ## License 147 | 148 | The license for this record linkage tool is BSD-3-Clause. 149 | 150 | ## Citation 151 | 152 | Please cite this package when being used in an academic context. 
Ensure 153 | that the DOI and version match the installed version. Citatation styles 154 | can be found on the publishers website 155 | [10.5281/zenodo.3559042](https://doi.org/10.5281/zenodo.3559042). 156 | 157 | ``` text 158 | @software{de_bruin_j_2019_3559043, 159 | author = {De Bruin, J}, 160 | title = {{Python Record Linkage Toolkit: A toolkit for 161 | record linkage and duplicate detection in Python}}, 162 | month = dec, 163 | year = 2019, 164 | publisher = {Zenodo}, 165 | version = {v0.14}, 166 | doi = {10.5281/zenodo.3559043}, 167 | url = {https://doi.org/10.5281/zenodo.3559043} 168 | } 169 | ``` 170 | 171 | ## Need help? 172 | 173 | Stuck on your record linkage code or problem? Any other questions? Don't 174 | hestitate to send me an email (). 175 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/bench_comparing.py: -------------------------------------------------------------------------------- 1 | import recordlinkage as rl 2 | from recordlinkage.datasets import load_febrl1 3 | from recordlinkage.datasets import load_febrl4 4 | 5 | 6 | class CompareRecordLinkage: 7 | timeout = 30 * 60 8 | 9 | def setup(self): 10 | # download data 11 | self.A, self.B = load_febrl4() 12 | 13 | # make pairs 14 | c_pairs = rl.FullIndex() 15 | pairs = c_pairs.index(self.A, self.B) 16 | 17 | # different sizes of pairs 18 | self.pairs_xsmall = pairs[0:5e3] 19 | self.pairs_small = pairs[0:5e4] 20 | self.pairs_medium = pairs[0:5e5] 21 | self.pairs_large = pairs[0:5e6] 22 | 23 | def time_global_xsmall(self): 24 | c_compare = rl.Compare(self.pairs_xsmall, self.A, self.B) 25 | c_compare.string("given_name", "given_name", method="jaro") 26 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 27 | c_compare.date("date_of_birth", "date_of_birth") 28 | c_compare.exact("suburb", "suburb") 29 | c_compare.exact("state", "state") 30 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 31 | 32 | def time_global_small(self): 33 | c_compare = rl.Compare(self.pairs_small, self.A, self.B) 34 | c_compare.string("given_name", "given_name", method="jaro") 35 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 36 | c_compare.date("date_of_birth", "date_of_birth") 37 | c_compare.exact("suburb", "suburb") 38 | c_compare.exact("state", "state") 39 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 40 | 41 | def time_global_medium(self): 42 | c_compare = rl.Compare(self.pairs_medium, self.A, self.B) 43 | c_compare.string("given_name", "given_name", method="jaro") 44 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 45 | c_compare.date("date_of_birth", "date_of_birth") 46 | c_compare.exact("suburb", "suburb") 47 | c_compare.exact("state", "state") 48 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 49 | 50 | def time_global_large(self): 51 | c_compare = rl.Compare(self.pairs_large, self.A, self.B) 52 | c_compare.string("given_name", "given_name", method="jaro") 53 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 54 | c_compare.date("date_of_birth", "date_of_birth") 55 | 
c_compare.exact("suburb", "suburb") 56 | c_compare.exact("state", "state") 57 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 58 | 59 | 60 | class CompareDeduplication: 61 | timeout = 30 * 60 62 | 63 | def setup(self): 64 | # download data 65 | self.A = load_febrl1() 66 | 67 | # make pairs 68 | c_pairs = rl.FullIndex() 69 | pairs = c_pairs.index(self.A) 70 | 71 | # different sizes of pairs 72 | self.pairs_xsmall = pairs[0:5e3] 73 | self.pairs_small = pairs[0:5e4] 74 | self.pairs_medium = pairs[0:5e5] 75 | self.pairs_large = pairs[0:5e6] 76 | 77 | def time_global_xsmall(self): 78 | c_compare = rl.Compare(self.pairs_xsmall, self.A) 79 | c_compare.string("given_name", "given_name", method="jaro") 80 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 81 | c_compare.date("date_of_birth", "date_of_birth") 82 | c_compare.exact("suburb", "suburb") 83 | c_compare.exact("state", "state") 84 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 85 | 86 | def time_global_small(self): 87 | c_compare = rl.Compare(self.pairs_small, self.A) 88 | c_compare.string("given_name", "given_name", method="jaro") 89 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 90 | c_compare.date("date_of_birth", "date_of_birth") 91 | c_compare.exact("suburb", "suburb") 92 | c_compare.exact("state", "state") 93 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 94 | 95 | def time_global_medium(self): 96 | c_compare = rl.Compare(self.pairs_medium, self.A) 97 | c_compare.string("given_name", "given_name", method="jaro") 98 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 99 | c_compare.date("date_of_birth", "date_of_birth") 100 | c_compare.exact("suburb", "suburb") 101 | c_compare.exact("state", "state") 102 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 103 | 104 | def time_global_large(self): 105 | c_compare = rl.Compare(self.pairs_large, self.A) 106 | c_compare.string("given_name", "given_name", method="jaro") 107 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85) 108 | c_compare.date("date_of_birth", "date_of_birth") 109 | c_compare.exact("suburb", "suburb") 110 | c_compare.exact("state", "state") 111 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85) 112 | 113 | 114 | class CompareAlgorithms: 115 | timeout = 30 * 60 116 | 117 | def setup(self): 118 | # download data 119 | self.A, self.B = load_febrl4() 120 | 121 | # Add numbers (age) 122 | self.A["postcode"] = self.A["postcode"].astype(float) 123 | self.B["postcode"] = self.B["postcode"].astype(float) 124 | 125 | # make pairs 126 | c_pairs = rl.FullIndex() 127 | self.pairs = c_pairs.index(self.A, self.B)[0:5e4] 128 | 129 | # ************* STRING ************* 130 | 131 | def time_string_jaro(self): 132 | c_compare = rl.Compare(self.pairs, self.A, self.B) 133 | c_compare.string("given_name", "given_name", method="jaro") 134 | 135 | def time_string_jarowinkler(self): 136 | c_compare = rl.Compare(self.pairs, self.A, self.B) 137 | c_compare.string("given_name", "given_name", method="jarowinkler") 138 | 139 | def time_string_qgram(self): 140 | c_compare = rl.Compare(self.pairs, self.A, self.B) 141 | c_compare.string("given_name", "given_name", method="qgram") 142 | 143 | def time_string_cosine(self): 144 | c_compare = rl.Compare(self.pairs, self.A, self.B) 145 | c_compare.string("given_name", 
"given_name", method="cosine") 146 | 147 | def time_string_levenshtein(self): 148 | c_compare = rl.Compare(self.pairs, self.A, self.B) 149 | c_compare.string("given_name", "given_name", method="levenshtein") 150 | 151 | # ************* Exact ************* 152 | 153 | def time_exact(self): 154 | c_compare = rl.Compare(self.pairs, self.A, self.B) 155 | c_compare.exact("state", "state") 156 | 157 | # ************* NUMERIC ************* 158 | 159 | def time_numeric_gauss(self): 160 | c_compare = rl.Compare(self.pairs, self.A, self.B) 161 | c_compare.numeric("age", "age", method="gauss", scale=2) 162 | -------------------------------------------------------------------------------- /benchmarks/bench_indexing.py: -------------------------------------------------------------------------------- 1 | import recordlinkage as rl 2 | from recordlinkage.datasets import load_febrl1 3 | from recordlinkage.datasets import load_febrl4 4 | 5 | 6 | class PairsRecordLinkage: 7 | timeout = 30 * 60 8 | 9 | def setup(self): 10 | # download data 11 | self.A, self.B = load_febrl4() 12 | 13 | def time_full_index(self): 14 | # setup class 15 | c_pairs = rl.FullIndex() 16 | 17 | # Make pairs 18 | c_pairs.index(self.A, self.B) 19 | 20 | def time_block_index(self): 21 | # setup class 22 | c_pairs = rl.BlockIndex("given_name") 23 | 24 | # Make pairs 25 | c_pairs.index(self.A, self.B) 26 | 27 | def time_sni_index(self): 28 | # setup class 29 | c_pairs = rl.SortedNeighbourhoodIndex(on="given_name", w=5) 30 | 31 | # Make pairs 32 | c_pairs.index(self.A, self.B) 33 | 34 | def time_random_index(self): 35 | # setup class 36 | c_pairs = rl.RandomIndex(2500) 37 | 38 | # Make pairs 39 | c_pairs.index(self.A, self.B) 40 | 41 | 42 | class PairsDeduplication: 43 | timeout = 30 * 60 44 | 45 | def setup(self): 46 | # download data 47 | self.A = load_febrl1() 48 | 49 | def time_full_index(self): 50 | # setup class 51 | c_pairs = rl.FullIndex() 52 | 53 | # Make pairs 54 | c_pairs.index(self.A) 55 | 56 | def time_block_index(self): 57 | # setup class 58 | c_pairs = rl.BlockIndex("given_name") 59 | 60 | # Make pairs 61 | c_pairs.index(self.A) 62 | 63 | def time_sni_index(self): 64 | # setup class 65 | c_pairs = rl.SortedNeighbourhoodIndex(on="given_name", w=5) 66 | 67 | # Make pairs 68 | c_pairs.index(self.A) 69 | 70 | def time_random_index(self): 71 | # setup class 72 | c_pairs = rl.RandomIndex(2500) 73 | 74 | # Make pairs 75 | c_pairs.index(self.A) 76 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | ***** 2 | About 3 | ***** 4 | 5 | Introduction 6 | ============ 7 | 8 | The **Python Record Linkage Toolkit** is a library to link records in or 9 | between data sources. The toolkit provides most of the tools needed for 10 | record linkage and deduplication. The package contains indexing methods, 11 | functions to compare records and classifiers. The package is developed 12 | for research and the linking of small or medium sized files. 13 | 14 | The project is inspired by the `Freely Extensible Biomedical Record Linkage 15 | (FEBRL) `__ project, which is a great 16 | project. In contrast with FEBRL, the recordlinkage project makes extensive use 17 | of data manipulation tools like `pandas `__ and 18 | `numpy `__. The use of *pandas*, a flexible and 19 | powerful data analysis and manipulation library for Python, makes the record 20 | linkage process much easier and faster. The extensive *pandas* library can be 21 | used to integrate your record linkage directly into existing data manipulation 22 | projects. 23 | 24 | One of the aims of this project is to make an extensible record linkage 25 | framework. It is easy to include your own indexing algorithms, 26 | comparison/similarity measures and classifiers. The main features of the 27 | Python Record Linkage Toolkit are: 28 | 29 | - Clean and standardise data with easy to use tools 30 | - Make pairs of records with smart indexing methods such as 31 | **blocking** and **sorted neighbourhood indexing** 32 | - Compare records with a large number of comparison and similarity measures 33 | for different types of variables such as strings, numbers and dates. 34 | - Several classifications algorithms, both supervised and unsupervised 35 | algorithms. 36 | - Common record linkage evaluation tools 37 | - Several built-in datasets. 38 | 39 | 40 | What is record linkage? 41 | ======================= 42 | 43 | The term record linkage is used to indicate the procedure of bringing together 44 | information from two or more records that are believed to belong to the same 45 | entity. Record linkage is used to link data from multiple data sources or to 46 | find duplicates in a single data source. In computer science, record linkage 47 | is also known as data matching or deduplication (in case of search duplicate 48 | records within a single file). 49 | 50 | In record linkage, the attributes of the entity (stored in a record) are used 51 | to link two or more records. Attributes can be unique entity identifiers (SSN, 52 | license plate number), but also attributes like (sur)name, date of birth and 53 | car model/colour. The record linkage procedure can be represented as a 54 | workflow [Christen, 2012]. The steps are: cleaning, indexing, comparing, 55 | classifying and evaluation. If needed, the classified record pairs flow back 56 | to improve the previous step. The Python Record Linkage Toolkit follows this 57 | workflow. 58 | 59 | .. seealso:: 60 | 61 | *Christen, Peter. 2012. Data matching: concepts and techniques for record 62 | linkage, entity resolution, and duplicate detection. Springer Science & 63 | Business Media.* 64 | 65 | *Fellegi, Ivan P and Alan B Sunter. 1969. 
“A theory for record linkage.” 66 | Journal of the American Statistical Association 64(328):1183–1210.* 67 | 68 | *Dunn, Halbert L. 1946. “Record linkage.” American Journal of Public 69 | Health and the Nations Health 36(12):1412–1416.* 70 | 71 | *Herzog, Thomas N, Fritz J Scheuren and William E Winkler. 2007. Data 72 | quality and record linkage techniques. Vol. 1 Springer.* 73 | 74 | How to link records? 75 | ==================== 76 | 77 | Import the ``recordlinkage`` module with all important tools for record 78 | linkage and import the data manipulation framework **pandas**. 79 | 80 | .. code:: python 81 | 82 | import recordlinkage 83 | import pandas 84 | 85 | Consider that you try to link two datasets with personal information 86 | like name, sex and date of birth. Load these datasets into a pandas 87 | ``DataFrame``. 88 | 89 | .. code:: python 90 | 91 | df_a = pandas.DataFrame(YOUR_FIRST_DATASET) 92 | df_b = pandas.DataFrame(YOUR_SECOND_DATASET) 93 | 94 | Comparing all record can be computationally intensive. Therefore, we 95 | make smart set of candidate links with one of the built-in indexing 96 | techniques like **blocking**. Only records pairs agreeing on the 97 | surname are included. 98 | 99 | .. code:: python 100 | 101 | indexer = recordlinkage.Index() 102 | indexer.block('surname') 103 | candidate_links = indexer.index(df_a, df_b) 104 | 105 | Each ``candidate_link`` needs to be compared on the comparable attributes. 106 | This can be done easily with the Compare class and the available comparison 107 | and similarity measures. 108 | 109 | .. code:: python 110 | 111 | compare = recordlinkage.Compare() 112 | 113 | compare.string('name', 'name', method='jarowinkler', threshold=0.85) 114 | compare.exact('sex', 'gender') 115 | compare.exact('dob', 'date_of_birth') 116 | compare.string('streetname', 'streetname', method='damerau_levenshtein', threshold=0.7) 117 | compare.exact('place', 'placename') 118 | compare.exact('haircolor', 'haircolor', missing_value=9) 119 | 120 | # The comparison vectors 121 | compare_vectors = compare.compute(candidate_links, df_a, df_b) 122 | 123 | This record linkage package contains several classification algorithms. 124 | Plenty of the algorithms need trainings data (supervised learning) while 125 | some others are unsupervised. An example of supervised learning: 126 | 127 | .. code:: python 128 | 129 | true_linkage = pandas.Series(YOUR_GOLDEN_DATA, index=pandas.MultiIndex(YOUR_MULTI_INDEX)) 130 | 131 | logrg = recordlinkage.LogisticRegressionClassifier() 132 | logrg.fit(compare_vectors[true_linkage.index], true_linkage) 133 | 134 | logrg.predict(compare_vectors) 135 | 136 | and an example of unsupervised learning (the well known ECM-algorithm): 137 | 138 | .. code:: python 139 | 140 | ecm = recordlinkage.BernoulliEMClassifier() 141 | ecm.fit_predict(compare_vectors) 142 | 143 | 144 | -------------------------------------------------------------------------------- /docs/annotation.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | Annotation 3 | ********** 4 | 5 | Manually labeled record pairs are useful in training and validation tasks. 6 | Training data is usually not available in record linkage applications because 7 | it is highly dataset and sample-specific. The Python Record Linkage Toolkit 8 | comes with a `browser-based user interface`_ for manually classifying record 9 | pairs. A hosted version of `RecordLinkage ANNOTATOR`_ can be found on Github. 10 | 11 | .. 
_`browser-based user interface`: https://github.com/J535D165/recordlinkage-annotator 12 | .. _`RecordLinkage ANNOTATOR`: https://j535d165.github.io/recordlinkage-annotator/ 13 | 14 | .. image:: https://github.com/J535D165/recordlinkage-annotator/blob/master/images/annotator_review.png?raw=true 15 | :alt: Review screen of RecordLinkage ANNOTATOR 16 | :target: https://j535d165.github.io/recordlinkage-annotator/ 17 | 18 | Generate annotation file 19 | ======================== 20 | 21 | The `RecordLinkage ANNOTATOR`_ software requires a structured annotation 22 | file. The required schema_ of the annotation file is open. The function 23 | :func:`recordlinkage.write_annotation_file` can be used to render and save an 24 | annotation file. The function can be used for both linking and deduplication 25 | purposes. 26 | 27 | .. _schema: https://github.com/J535D165/recordlinkage-annotator/tree/master/schema 28 | 29 | .. autofunction:: recordlinkage.write_annotation_file 30 | 31 | Linking 32 | ------- 33 | 34 | This is a simple example of the code to render an annotation 35 | file for linking records: 36 | 37 | .. code:: python 38 | 39 | import recordlinkage as rl 40 | from recordlinkage.index import Block 41 | from recordlinkage.datasets import load_febrl4 42 | 43 | df_a, df_b = load_febrl4() 44 | 45 | blocker = Block("surname", "surname") 46 | pairs = blocker.index(df_a, df_b) 47 | 48 | rl.write_annotation_file( 49 | "annotation_demo_linking.json", 50 | pairs[0:50], 51 | df_a, 52 | df_b, 53 | dataset_a_name="Febrl4 A", 54 | dataset_b_name="Febrl4 B" 55 | ) 56 | 57 | Deduplication 58 | ------------- 59 | 60 | This is a simple example of the code to render an annotation 61 | file for duplicate detection: 62 | 63 | .. code:: python 64 | 65 | import recordlinkage as rl 66 | from recordlinkage.index import Block 67 | from recordlinkage.datasets import load_febrl1 68 | 69 | df_a = load_febrl1() 70 | 71 | blocker = Block("surname", "surname") 72 | pairs = blocker.index(df_a) 73 | 74 | rl.write_annotation_file( 75 | "annotation_demo_dedup.json", 76 | pairs[0:50], 77 | df_a, 78 | dataset_a_name="Febrl1 A" 79 | ) 80 | 81 | 82 | Manual labeling 83 | =============== 84 | 85 | Go to `RecordLinkage ANNOTATOR`_ or start the server yourself. 86 | 87 | Choose the annotation file on the landing screen or use the drag and drop 88 | functionality. A new screen shows the first record pair to label. Start 89 | labeling data the manually. Use the button `Match` for record pairs belonging 90 | to the same entity. Use `Distinct` for record pairs belonging to different 91 | entities. After all records are labeled by hand, the result can be saved to a 92 | file. 93 | 94 | 95 | Export/read annotation file 96 | =========================== 97 | 98 | After labeling all record pairs, you can export the annotation file to a JSON 99 | file. Use the function :func:`recordlinkage.read_annotation_file` to read the 100 | results. 101 | 102 | .. code:: python 103 | 104 | import recordlinkage as rl 105 | 106 | result = rl.read_annotation_file('my_annotation.json') 107 | print(result.links) 108 | 109 | The function :func:`recordlinkage.read_annotation_file` reads the file and returns 110 | an :class:`recordlinkage.annotation.AnnotationResult` object. This object contains 111 | links and distinct attributes that return a :class:`pandas.MultiIndex` object. 112 | 113 | .. autofunction:: recordlinkage.read_annotation_file 114 | 115 | 116 | .. 
autoclass:: recordlinkage.annotation.AnnotationResult 117 | :members: 118 | :inherited-members: 119 | 120 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | Release notes 3 | ************* 4 | 5 | Version 0.15 6 | ============ 7 | 8 | - Remove deprecated recordlinkage classes (`#173`_) 9 | - Bump min Python version to 3.6, ideally 3.8+ (`#171`_) 10 | - Bump min pandas version to >=1 11 | - Resolve deprecation warnings for numpy and pandas 12 | - Happy lint, sort imports, format code with yapf 13 | - Remove unnecessary np.sort in SNI algorithm (`#141`_) 14 | - Fix bug for cosine and qgram string comparisons with threshold (`#135`_) 15 | - Fix several typos in docs (`#151`_)(`#152`_)(`#153`_)(`#154`_)(`#163`_)(`#164`_) 16 | - Fix random indexer (`#158`_) 17 | - Fix various deprecation warnings and broken docs build (`#170`_) 18 | - Fix broken docs build due to pandas depr warnings (`#169`_) 19 | - Fix broken build and removed warning messages (`#168`_) 20 | - Update narrative 21 | - Replace Travis by Github Actions (`#132`_) 22 | - Fix broken test NotFittedError 23 | - Fix bug in low memory random sampling and add more tests (`#130`_) 24 | - Add extras_require to setup.py for deps management 25 | - Add banner to README and update title 26 | - Add Binder and Colab buttons at tutorials (`#174`_) 27 | 28 | Special thanks to Tomasz Waleń @twalen and other contributors for their 29 | work on this release. 30 | 31 | .. _#173: https://github.com/J535D165/recordlinkage/pull/173 32 | .. _#171: https://github.com/J535D165/recordlinkage/pull/171 33 | .. _#141: https://github.com/J535D165/recordlinkage/pull/141 34 | .. _#135: https://github.com/J535D165/recordlinkage/pull/135 35 | .. _#151: https://github.com/J535D165/recordlinkage/pull/151 36 | .. _#152: https://github.com/J535D165/recordlinkage/pull/152 37 | .. _#153: https://github.com/J535D165/recordlinkage/pull/153 38 | .. _#154: https://github.com/J535D165/recordlinkage/pull/154 39 | .. _#163: https://github.com/J535D165/recordlinkage/pull/163 40 | .. _#164: https://github.com/J535D165/recordlinkage/pull/164 41 | .. _#158: https://github.com/J535D165/recordlinkage/pull/158 42 | .. _#170: https://github.com/J535D165/recordlinkage/pull/170 43 | .. _#169: https://github.com/J535D165/recordlinkage/pull/169 44 | .. _#168: https://github.com/J535D165/recordlinkage/pull/168 45 | .. _#132: https://github.com/J535D165/recordlinkage/pull/132 46 | .. _#130: https://github.com/J535D165/recordlinkage/pull/130 47 | .. _#174: https://github.com/J535D165/recordlinkage/pull/174 48 | 49 | Version 0.14 50 | ============ 51 | 52 | - Drop Python 2.7 and Python 3.4 support. (`#91`_) 53 | - Upgrade minimal pandas version to 0.23. 54 | - Simplify the use of all cpus in parallel mode. (`#102`_) 55 | - Store large example datasets in user home folder or use environment 56 | variable. Before, example datasets were stored in the package. (see 57 | issue `#42`_) (`#92`_) 58 | - Add support to write and read annotation files for recordlinkage ANNOTATOR. 59 | See the docs and https://github.com/J535D165/recordlinkage-annotator for 60 | more information. 61 | - Replace `.labels` by `.codes` for `pandas.MultiIndex` objects for newer 62 | versions of pandas (>0.24). (`#103`_) 63 | - Fix totals for pandas.MultiIndex input on confusion matrix and accuracy 64 | metrics. 
(see issue `#84`_) (`#109`_) 65 | - Initialize Compare with (a list of) features (Bug). (`#124`_) 66 | - Various updates in relation to deprecation warnings in third-party 67 | libraries such as sklearn, pandas and networkx. 68 | 69 | .. _#42: https://github.com/J535D165/recordlinkage/issues/42 70 | .. _#84: https://github.com/J535D165/recordlinkage/issues/84 71 | 72 | .. _#91: https://github.com/J535D165/recordlinkage/pull/91 73 | .. _#92: https://github.com/J535D165/recordlinkage/pull/92 74 | .. _#102: https://github.com/J535D165/recordlinkage/pull/102 75 | .. _#103: https://github.com/J535D165/recordlinkage/pull/103 76 | .. _#109: https://github.com/J535D165/recordlinkage/pull/109 77 | .. _#124: https://github.com/J535D165/recordlinkage/pull/124 78 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import datetime 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath("..")) 11 | 12 | # -- Project information ----------------------------------------------------- 13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 14 | 15 | project = "Python Record Linkage Toolkit" 16 | copyright = f"2016-{datetime.datetime.now().year}, Jonathan de Bruin" 17 | author = "Jonathan de Bruin" 18 | 19 | version = "0.15" 20 | release = "0.15" 21 | 22 | # -- General configuration --------------------------------------------------- 23 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 24 | 25 | extensions = [ 26 | "sphinx.ext.autodoc", 27 | "sphinx.ext.doctest", 28 | "sphinx.ext.napoleon", 29 | "sphinx.ext.intersphinx", 30 | "IPython.sphinxext.ipython_console_highlighting", 31 | "IPython.sphinxext.ipython_directive", 32 | "nbsphinx", 33 | ] 34 | 35 | templates_path = ['_templates'] 36 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 37 | 38 | autodoc_member_order = "bysource" 39 | 40 | intersphinx_mapping = { 41 | "python": ("https://docs.python.org/3/", None), 42 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 43 | "numpy": ("https://numpy.org/doc/stable/", None), 44 | "sklearn": ("https://scikit-learn.org/stable/", None), 45 | } 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 50 | 51 | html_theme = "sphinx_rtd_theme" 52 | html_static_path = ['_static'] 53 | 54 | master_doc = "index" 55 | pygments_style = "sphinx" 56 | 57 | todo_include_todos = False 58 | 59 | # -- Options for HTML output ---------------------------------------------- 60 | 61 | html_static_path = [] 62 | html_domain_indices = False 63 | 64 | # Output file base name for HTML help builder. 
65 | htmlhelp_basename = "RecordLinkageToolkitdoc" 66 | 67 | # -- Napoleon options --------------------------------------------------- 68 | 69 | napoleon_google_docstring = False 70 | napoleon_numpy_docstring = True 71 | napoleon_include_private_with_doc = False 72 | napoleon_include_special_with_doc = False 73 | napoleon_use_admonition_for_examples = False 74 | napoleon_use_admonition_for_notes = True 75 | napoleon_use_admonition_for_references = True 76 | napoleon_use_ivar = False 77 | napoleon_use_param = True 78 | napoleon_use_rtype = False 79 | 80 | # -- NBSphinx options ---------------------------------------------------- 81 | 82 | # nbsphinx_execute = 'never' 83 | 84 | # This is processed by Jinja2 and inserted before each notebook 85 | nbsphinx_prolog = r""" 86 | {% set docname = 'docs/' + env.doc2path(env.docname, base=None) %} 87 | 88 | .. note:: 89 | 90 | This page was generated from `{{ docname|e }} `_. 91 | Run an online interactive version of this page with |binder| or |colab|. 92 | 93 | .. |binder| image:: https://mybinder.org/badge_logo.svg 94 | :target: https://mybinder.org/v2/gh/J535D165/recordlinkage/v{{ env.config.release|e }}?filepath={{ docname|e }} 95 | 96 | .. |colab| image:: https://colab.research.google.com/assets/colab-badge.svg 97 | :target: https://githubtocolab.com/J535D165/recordlinkage/blob/v{{ env.config.release|e }}/{{ docname|e }} 98 | 99 | """ 100 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Contributing 3 | ************ 4 | 5 | Thanks for your interest in contributing to the Python Record Linkage Toolkit. 6 | There is a lot of work to do. See `Github `_ 7 | for the contributors to this package. 8 | 9 | The workflow for contributing is as follows: 10 | 11 | - clone https://github.com/J535D165/recordlinkage.git 12 | - Make a branch with your modifications/contributions 13 | - Write tests 14 | - Run all tests 15 | - Do a pull request 16 | 17 | Testing 18 | ======= 19 | 20 | Install `pytest`: 21 | 22 | .. code:: sh 23 | 24 | pip install pytest 25 | 26 | Run the following command to test the package 27 | 28 | .. code:: sh 29 | 30 | python -m pytest tests/ 31 | 32 | Performance 33 | =========== 34 | 35 | Performance is very important in record linkage. The performance is monitored 36 | for all serious modifications of the core API. The performance monitoring is 37 | performed with `Airspeed Velocity `_ 38 | (asv). 39 | 40 | Install Airspeed Velocity: 41 | 42 | .. code:: sh 43 | 44 | pip install asv 45 | 46 | Run the following command from the root of the repository to test the 47 | performance of the current version of the package: 48 | 49 | .. code:: sh 50 | 51 | asv run 52 | 53 | Run the following command to test all versions since tag v0.6.0 54 | 55 | .. 
code:: sh 56 | 57 | asv run --skip-existing-commits v0.6.0..master 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/images/elas_1705.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/docs/images/elas_1705.png -------------------------------------------------------------------------------- /docs/images/indexing_basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/docs/images/indexing_basic.png -------------------------------------------------------------------------------- /docs/images/indexing_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.colors as mlc 2 | import matplotlib.pyplot as mlp 3 | import numpy as np 4 | 5 | figure, axes = mlp.subplots(nrows=1, ncols=2, figsize=(8, 5)) 6 | 7 | # linking 8 | db_a = ["A1", "A2", "A3", "A4", "A5", "A6"] 9 | db_b = ["B1", "B2", "B3", "B4", "B5", "B6"] 10 | 11 | img = np.ones((len(db_a), len(db_b)), dtype=float) 12 | 13 | color_map = mlc.LinearSegmentedColormap.from_list( 14 | "ColorMap", [(0.984, 0.501, 0.447), (1.000, 1.000, 1.000)] 15 | ) 16 | axes[0].imshow(img, cmap=color_map, interpolation="none") 17 | 18 | axes[0].set_xlabel("Dataset A", fontsize=13) 19 | axes[0].set_xticks(np.arange(0, len(db_b), 1)) 20 | axes[0].set_xticks(np.arange(-0.5, len(db_b), 1), minor=True) 21 | axes[0].set_xticklabels(db_a) 22 | 23 | axes[0].set_ylabel("Dataset B", fontsize=13) 24 | axes[0].set_yticks(np.arange(0, len(db_a), 1)) 25 | axes[0].set_yticks(np.arange(-0.5, len(db_a), 1), minor=True) 26 | axes[0].set_yticklabels(db_b) 27 | 28 | axes[0].grid(which="minor", color="k") 29 | 30 | axes[0].set_title("Linking A and B", fontsize=15, fontweight="bold") 31 | 32 | # dedup 33 | db_a = ["A1", "A2", "A3", "A4", "A5", "A6"] 34 | db_b = ["A1", "A2", "A3", "A4", "A5", "A6"] 35 | 36 | img = np.ones((len(db_a), len(db_b)), dtype=float) 37 | img = np.triu(img, 1) 38 | 39 | color_map = mlc.LinearSegmentedColormap.from_list( 40 | "ColorMap", [(1.000, 1.000, 1.000), (0.984, 0.501, 0.447)] 41 | ) 42 | axes[1].imshow(img, cmap=color_map, interpolation="none") 43 | 44 | axes[1].set_xlabel("Dataset A", fontsize=13) 45 | axes[1].set_xticks(np.arange(0, len(db_b), 1)) 46 | axes[1].set_xticks(np.arange(-0.5, len(db_b), 1), minor=True) 47 | axes[1].set_xticklabels(db_a) 48 | 49 | axes[1].set_ylabel("Dataset A", fontsize=13) 50 | axes[1].set_yticks(np.arange(0, len(db_a), 1)) 51 | axes[1].set_yticks(np.arange(-0.5, len(db_a), 1), minor=True) 52 | axes[1].set_yticklabels(db_b) 53 | 54 | axes[1].grid(which="minor", color="k") 55 | 56 | axes[1].set_title("Duplicate detection A", fontsize=15, fontweight="bold") 57 | 58 | figure.tight_layout() 59 | 60 | mlp.savefig("indexing_basic.png", dpi=150) 61 | -------------------------------------------------------------------------------- /docs/images/recordlinkage-banner-transparent.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ******************************************* 2 | Python Record Linkage Toolkit Documentation 3 | 
******************************************* 4 | 5 | .. figure:: /images/recordlinkage-banner-transparent.svg 6 | :width: 100% 7 | 8 | All you need to start linking records. 9 | 10 | .. toctree:: 11 | :caption: First steps 12 | :maxdepth: 2 13 | 14 | about 15 | installation 16 | guides/link_two_dataframes.ipynb 17 | guides/data_deduplication.ipynb 18 | 19 | .. toctree:: 20 | :caption: Record linkage 21 | :maxdepth: 2 22 | 23 | ref-preprocessing 24 | ref-index 25 | ref-compare 26 | ref-classifiers 27 | ref-evaluation 28 | ref-datasets 29 | ref-misc 30 | 31 | .. toctree:: 32 | :caption: Miscellaneous 33 | :maxdepth: 2 34 | 35 | annotation 36 | guides/classifiers.rst 37 | performance.rst 38 | 39 | .. toctree:: 40 | :caption: Developers 41 | :maxdepth: 1 42 | 43 | contributing 44 | changelog 45 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Installation 3 | ************ 4 | 5 | Python version support 6 | ====================== 7 | 8 | The Python Record Linkage Toolkit supports the versions of Python that Pandas 9 | supports as well. You can find the supported Python versions in the Pandas 10 | documentation. 11 | 12 | Installation 13 | ============ 14 | 15 | The Python Record linkage Toolkit requires Python 3.6 or higher. Install the 16 | package easily with pip 17 | 18 | .. code:: sh 19 | 20 | pip install recordlinkage 21 | 22 | You can also clone the project on Github. 23 | 24 | To install all recommended and optional dependencies, run 25 | 26 | .. code:: sh 27 | 28 | pip install recordlinkage['all'] 29 | 30 | Dependencies 31 | ============ 32 | 33 | The Python Record Linkage Toolkit depends on the following packages: 34 | 35 | - `numpy `__ 36 | - `pandas `__ 37 | - `scipy `__ 38 | - `sklearn `__ 39 | - `jellyfish `__ 40 | - `joblib` 41 | 42 | Recommended dependencies 43 | ------------------------ 44 | 45 | - `numexpr `__ - accelerating certain numerical operations 46 | - `bottleneck `__ - accelerating certain types of nan evaluations 47 | 48 | Optional dependecies 49 | -------------------- 50 | 51 | - networkx - for network operations like connected components 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/performance.rst: -------------------------------------------------------------------------------- 1 | 2 | Performance 3 | =========== 4 | 5 | Performance plays an important role in record linkage. Record linkage problems 6 | scale quadratically with the size of the dataset(s). The number of record 7 | pairs can be enormous and so are the number of comparisons. The Python Record 8 | Linkage Toolkit can be used for large scale record linkage applications. 9 | Nevertheless, the toolkit is developed with experimenting in first place and 10 | performance on the second place. This page provides tips and tricks to improve 11 | the performance. 12 | 13 | Do you know more tricks? Let us know! 14 | 15 | Indexing 16 | -------- 17 | 18 | Block on multiple columns 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | Blocking is an effective way to increase the performance of your record 22 | linkage. If the performance of your implementation is still poor, decrease the 23 | number of pairs by blocking on multiple variables. This implies that the 24 | record pair is agrees on two or more variables. In the following example, the 25 | record pairs agree on the given name **and** surname. 26 | 27 | .. code:: python 28 | 29 | from recordlinkage.index import Block 30 | indexer = Block(left_on=['first_name', 'surname'], 31 | right_on=['name', 'surname']) 32 | pairs = indexer.index(dfA, dfB) 33 | 34 | You might exclude more links then desired. This can be solved by 35 | repeating the process with different blocking variables. 36 | 37 | .. code:: python 38 | 39 | indexer = recordlinkage.Index() 40 | indexer.block(left_on=['first_name', 'surname'], 41 | right_on=['name', 'surname']) 42 | indexer.block(left_on=['first_name', 'age'], 43 | right_on=['name', 'age']) 44 | pairs = indexer.index(dfA, dfB) 45 | 46 | .. note:: Sorted Neighbourhood indexing supports, besides the sorted 47 | neighbourhood, additional blocking on variables. 48 | 49 | Make record pairs 50 | ~~~~~~~~~~~~~~~~~ 51 | 52 | The structure of the Python Record Linkage Toolkit has a drawback for the 53 | performance. In the indexation step (the step in which record pairs are 54 | selected), only the index of both records is stored. The entire records 55 | are not stored. This results in less memory usage. The drawback is that the 56 | records need to be queried from the data. 57 | 58 | 59 | Comparing 60 | --------- 61 | 62 | Compare only discriminating variables 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | Not all variables may be worth comparing in a record linkage. Some variables 66 | do not discriminate the links of the non-links or do have only minor effects. 67 | These variables can be excluded. Only discriminating and informative should be 68 | included. 69 | 70 | Prevent string comparisons 71 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 72 | 73 | String similarity measures and phonetic encodings are computationally 74 | expensive. Phonetic encoding takes place on the original data, while string 75 | simililatiry measures are applied on the record pairs. 
82 | String comparing 83 | ~~~~~~~~~~~~~~~~ 84 | 85 | Comparing strings is computationally expensive. The Python Record Linkage 86 | Toolkit uses the package ``jellyfish`` for string comparisons. The package has 87 | two implementations, a Rust and a Python implementation. Make sure you have 88 | the Rust version installed (``import jellyfish.rustyfish`` should not 89 | raise an exception). 90 | 91 | There can be a large difference in the performance of different string 92 | comparison algorithms. The Jaro and Jaro-Winkler methods are faster than the 93 | Levenshtein distance and much faster than the Damerau-Levenshtein distance. 94 | 95 | Indexing with large files 96 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 97 | 98 | Sometimes, the input files are very large. In that case, it can be hard 99 | to make an index without running out of memory in the indexing step or 100 | in the comparing step. ``recordlinkage`` has a method to deal with large 101 | files. It is fast, although it is not primarily developed to be fast. SQL 102 | databases may outperform this method. It is especially developed for 103 | usability. The idea is to split the input files into small blocks, 104 | compute the record pairs for each block, and iterate over the 105 | blocks. Consider full indexing: 106 | 107 | .. code:: python 108 | 109 | import recordlinkage 110 | import numpy 111 | 112 | cl = recordlinkage.index.Full() 113 | 114 | for dfB_subset in numpy.array_split(dfB, 10): 115 | 116 | # a subset of record pairs 117 | pairs_subset = cl.index(dfA, dfB_subset) 118 | 119 | # Your analysis on pairs_subset here 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/ref-classifiers.rst: -------------------------------------------------------------------------------- 1 | ***************** 2 | 3. Classification 3 | ***************** 4 | 5 | Classifiers 6 | =========== 7 | 8 | Classification is the step in the record linkage process where record pairs are 9 | classified into matches, non-matches and possible matches [Christen2012]_. 10 | Classification algorithms can be supervised or unsupervised (with or without 11 | training data). 12 | 13 | 14 | .. seealso:: 15 | 16 | .. [Christen2012] Christen, Peter. 2012. Data matching: concepts and 17 | techniques for record linkage, entity resolution, and duplicate 18 | detection. Springer Science & Business Media. 19 | 20 | Supervised 21 | ---------- 22 | 23 | .. autoclass:: recordlinkage.LogisticRegressionClassifier 24 | :members: 25 | :inherited-members: 26 | 27 | .. autoclass:: recordlinkage.NaiveBayesClassifier 28 | :members: 29 | :inherited-members: 30 | 31 | .. autoclass:: recordlinkage.SVMClassifier 32 | :members: 33 | :inherited-members: 34 | 35 | Unsupervised 36 | ------------ 37 | 38 | .. autoclass:: recordlinkage.ECMClassifier 39 | :members: 40 | :inherited-members: 41 | 42 | .. autoclass:: recordlinkage.KMeansClassifier 43 | :members: 44 | :inherited-members: 45 | 46 | 47 | Adapters 48 | ======== 49 | 50 | Adapters can be used to wrap machine learning models from external packages 51 | like scikit-learn and Keras. 
For example, this makes it possible to classify 52 | record pairs with a neural network developed in Keras. 53 | 54 | .. autoclass:: recordlinkage.adapters.SKLearnAdapter 55 | 56 | 57 | .. code:: python 58 | 59 | # import the scikit-learn classifier 60 | from sklearn.ensemble import RandomForestClassifier 61 | 62 | # import BaseClassifier from recordlinkage.base 63 | from recordlinkage.base import BaseClassifier 64 | from recordlinkage.adapters import SKLearnAdapter 65 | from recordlinkage.datasets import binary_vectors 66 | 67 | class RandomForest(SKLearnAdapter, BaseClassifier): 68 | 69 | def __init__(self, *args, **kwargs): 70 | super(RandomForest, self).__init__() 71 | 72 | # set the kernel 73 | self.kernel = RandomForestClassifier(*args, **kwargs) 74 | 75 | 76 | # make a sample dataset 77 | features, links = binary_vectors(10000, 2000, return_links=True) 78 | 79 | # initialise the random forest 80 | cl = RandomForest(n_estimators=20) 81 | cl.fit(features, links) 82 | 83 | # predict the matches 84 | cl.predict(...) 85 | 86 | 87 | .. autoclass:: recordlinkage.adapters.KerasAdapter 88 | 89 | Example of a Keras model used for classification. 90 | 91 | .. code:: python 92 | 93 | from tensorflow.keras import Sequential, layers 94 | from recordlinkage.base import BaseClassifier 95 | from recordlinkage.adapters import KerasAdapter 96 | 97 | class NNClassifier(KerasAdapter, BaseClassifier): 98 | """Neural network classifier.""" 99 | def __init__(self): 100 | super(NNClassifier, self).__init__() 101 | 102 | model = Sequential() 103 | model.add(layers.Dense(16, input_dim=8, activation='relu')) 104 | model.add(layers.Dense(8, activation='relu')) 105 | model.add(layers.Dense(1, activation='sigmoid')) 106 | model.compile( 107 | optimizer='adam', 108 | loss='binary_crossentropy', 109 | metrics=['accuracy'] 110 | ) 111 | 112 | self.kernel = model 113 | 114 | # initialise the model 115 | cl = NNClassifier() 116 | # fit the model to the data 117 | cl.fit(X_train, links_true) 118 | # predict the class of the data 119 | cl.predict(X_pred) 120 | 121 | 122 | User-defined algorithms 123 | ======================= 124 | 125 | Classifiers can make use of the :class:`recordlinkage.base.BaseClassifier` for 126 | user-defined algorithms. Scikit-learn based models may want 127 | :class:`recordlinkage.adapters.SKLearnAdapter` as a subclass as well. 128 | 129 | .. autoclass:: recordlinkage.base.BaseClassifier 130 | :members: 131 | :inherited-members: 132 | 133 | Probabilistic models can use the Fellegi and Sunter base class. This class is 134 | used for the :class:`recordlinkage.ECMClassifier` and the 135 | :class:`recordlinkage.NaiveBayesClassifier`. 136 | 137 | .. autoclass:: recordlinkage.classifiers.FellegiSunter 138 | :members: 139 | :inherited-members: 140 | 141 | Examples 142 | ======== 143 | 144 | Unsupervised learning with the ECM algorithm. See the `example on Github <https://github.com/J535D165/recordlinkage/examples/unsupervised_learning.py>`_. 145 | 146 | 147 | Network 148 | ======= 149 | 150 | The Python Record Linkage Toolkit provides network/graph analysis tools for 151 | classification of record pairs into matches and distinct pairs. The toolkit 152 | provides the functionality for one-to-one linking and one-to-many linking. It 153 | is also possible to detect all connected components, which is useful in data 154 | deduplication. 155 | 156 | .. autoclass:: recordlinkage.OneToOneLinking 157 | :members: 158 | :inherited-members: 159 | 160 | .. 
autoclass:: recordlinkage.OneToManyLinking 161 | :members: 162 | :inherited-members: 163 | 164 | .. autoclass:: recordlinkage.ConnectedComponents 165 | :members: 166 | :inherited-members: 167 | -------------------------------------------------------------------------------- /docs/ref-datasets.rst: -------------------------------------------------------------------------------- 1 | ******** 2 | Datasets 3 | ******** 4 | 5 | The Python Record Linkage Toolkit contains several open public datasets. Four 6 | datasets were generated by the developers of Febrl. In the future, we are 7 | developing tools to generate your own datasets. 8 | 9 | .. autofunction:: recordlinkage.datasets.load_krebsregister 10 | 11 | .. autofunction:: recordlinkage.datasets.load_febrl1 12 | 13 | .. autofunction:: recordlinkage.datasets.load_febrl2 14 | 15 | .. autofunction:: recordlinkage.datasets.load_febrl3 16 | 17 | .. autofunction:: recordlinkage.datasets.load_febrl4 18 | 19 | .. autofunction:: recordlinkage.datasets.binary_vectors 20 | -------------------------------------------------------------------------------- /docs/ref-evaluation.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | 4. Evaluation 3 | ************* 4 | 5 | Evaluation of classifications plays an important role in record linkage. 6 | Express your classification quality in terms accuracy, recall and F-score 7 | based on ``true positives``, ``false positives``, ``true negatives`` and 8 | ``false negatives``. 9 | 10 | .. autofunction:: recordlinkage.reduction_ratio 11 | .. autofunction:: recordlinkage.true_positives 12 | .. autofunction:: recordlinkage.true_negatives 13 | .. autofunction:: recordlinkage.false_positives 14 | .. autofunction:: recordlinkage.false_negatives 15 | .. autofunction:: recordlinkage.confusion_matrix 16 | .. autofunction:: recordlinkage.precision 17 | .. autofunction:: recordlinkage.recall 18 | .. autofunction:: recordlinkage.accuracy 19 | .. autofunction:: recordlinkage.specificity 20 | .. autofunction:: recordlinkage.fscore 21 | .. autofunction:: recordlinkage.max_pairs 22 | .. autofunction:: recordlinkage.full_index_size -------------------------------------------------------------------------------- /docs/ref-index.rst: -------------------------------------------------------------------------------- 1 | *********** 2 | 1. Indexing 3 | *********** 4 | 5 | The indexing module is used to make pairs of records. These pairs are called 6 | candidate links or candidate matches. There are several indexing algorithms 7 | available such as blocking and sorted neighborhood indexing. See 8 | [christen2012]_ and [christen2008]_ for background information about 9 | indexation. 10 | 11 | .. [christen2012] Christen, P. (2012). Data matching: concepts and 12 | techniques for record linkage, entity resolution, and duplicate 13 | detection. Springer Science & Business Media. 14 | .. [christen2008] Christen, P. (2008). Febrl - A Freely Available Record 15 | Linkage System with a Graphical User Interface. 16 | 17 | The indexing module can be used for both linking and duplicate detection. In 18 | case of duplicate detection, only pairs in the upper triangular part of the 19 | matrix are returned. This means that the first record in each record pair is 20 | the largest identifier. For example, `("A2", "A1")`, `(5, 2)` and `("acb", 21 | "abc")`. The following image shows the record pairs for a complete set of 22 | record pairs. 23 | 24 | .. 
figure:: /images/indexing_basic.png 25 | :width: 100% 26 | 27 | :class:`recordlinkage.Index` object 28 | =================================== 29 | 30 | .. autoclass:: recordlinkage.Index 31 | 32 | .. automethod:: recordlinkage.Index.add 33 | .. automethod:: recordlinkage.Index.index 34 | .. automethod:: recordlinkage.Index.full 35 | .. automethod:: recordlinkage.Index.block 36 | .. automethod:: recordlinkage.Index.sortedneighbourhood 37 | .. automethod:: recordlinkage.Index.random 38 | 39 | 40 | 41 | Algorithms 42 | ========== 43 | 44 | The Python Record Linkage Toolkit contains basic and advanced indexing (or 45 | blocking) algorithms to make record pairs. The algorithms are Python classes. 46 | Popular algorithms in the toolkit are: 47 | 48 | - :class:`recordlinkage.index.Full`, 49 | - :class:`recordlinkage.index.Block`, 50 | - :class:`recordlinkage.index.SortedNeighbourhood` 51 | 52 | The algorithms are available in the submodule `recordlinkage.index`. Import 53 | the algorithms in the following way (use blocking algorithm as example): 54 | 55 | .. code:: python 56 | 57 | from recordlinkage.index import Block 58 | 59 | The full reference for the indexing algorithms in the toolkit is given below. 60 | 61 | .. automodule:: recordlinkage.index 62 | :members: 63 | :inherited-members: 64 | 65 | User-defined algorithms 66 | ======================= 67 | 68 | A user-defined algorithm can be defined based on 69 | :class:`recordlinkage.base.BaseIndexAlgorithm`. The :class:`recordlinkage.base.BaseIndexAlgorithm` class is an abstract base 70 | class that is used for indexing algorithms. The classes 71 | 72 | - :class:`recordlinkage.index.Full` 73 | - :class:`recordlinkage.index.Block` 74 | - :class:`recordlinkage.index.SortedNeighbourhood` 75 | - :class:`recordlinkage.index.Random` 76 | 77 | are inherited from this abstract base class. You can use BaseIndexAlgorithm to 78 | create a user-defined/custom algorithm. 79 | 80 | To create a custom algorithm, subclass the 81 | :class:`recordlinkage.base.BaseIndexAlgorithm`. In the subclass, overwrite the 82 | :meth:`recordlinkage.base.BaseIndexAlgorithm._link_index` method in case of 83 | linking two datasets. This method accepts two (tuples of) 84 | :class:`pandas.Series` objects as arguments. Based on these Series objects, 85 | you create record pairs. The record pairs need to be returned in a 2-level 86 | :class:`pandas.MultiIndex` object. The :attr:`pandas.MultiIndex.names` are the 87 | name of index of DataFrame A and name of the index of DataFrame B 88 | respectively. Overwrite the 89 | :meth:`recordlinkage.base.BaseIndexAlgorithm._dedup_index` method in case of 90 | finding link within a single dataset (deduplication). This method accepts a 91 | single (tuples of) :class:`pandas.Series` objects as arguments. 92 | 93 | The algorithm for linking data frames can be used for finding duplicates as 94 | well. In this situation, DataFrame B is a copy of DataFrame A. The Pairs class 95 | removes pairs like (record_i, record_i) and one of the following (record_i, 96 | record_j) (record_j, record_i) under the hood. As result of this, only unique 97 | combinations are returned. If you do have a specific algorithm for finding 98 | duplicates, then you can overwrite the _dedup_index method. This method 99 | accepts only one argument (DataFrame A) and the internal base class does not 100 | look for combinations like explained above. 101 | 102 | .. 
autoclass:: recordlinkage.base.BaseIndexAlgorithm 103 | :members: 104 | :private-members: 105 | 106 | Examples 107 | ======== 108 | 109 | .. code:: python 110 | 111 | import recordlinkage as rl 112 | from recordlinkage.datasets import load_febrl4 113 | from recordlinkage.index import Block 114 | 115 | df_a, df_b = load_febrl4() 116 | 117 | indexer = rl.Index() 118 | indexer.add(Block('given_name', 'given_name')) 119 | indexer.add(Block('surname', 'surname')) 120 | indexer.index(df_a, df_b) 121 | 122 | Equivalent code: 123 | 124 | .. code:: python 125 | 126 | import recordlinkage as rl 127 | from recordlinkage.datasets import load_febrl4 128 | 129 | df_a, df_b = load_febrl4() 130 | 131 | indexer = rl.Index() 132 | indexer.block('given_name', 'given_name') 133 | indexer.block('surname', 'surname') 134 | indexer.index(df_a, df_b) 135 | 136 | This example shows how to implement a custom indexing algorithm. The algorithm 137 | returns all record pairs of which the given names start with the letter 'W'. 138 | 139 | .. code:: python 140 | 141 | import pandas 142 | from recordlinkage.datasets import load_febrl4 143 | 144 | df_a, df_b = load_febrl4() 145 | 146 | from recordlinkage.base import BaseIndexAlgorithm 147 | 148 | class FirstLetterWIndex(BaseIndexAlgorithm): 149 | """Custom class for indexing""" 150 | 151 | def _link_index(self, df_a, df_b): 152 | """Make pairs with given names starting with the letter 'w'.""" 153 | 154 | # Select records with names starting with a w. 155 | name_a_w = df_a[df_a['given_name'].str.startswith('w') == True] 156 | name_b_w = df_b[df_b['given_name'].str.startswith('w') == True] 157 | 158 | # Make a product of the two numpy arrays 159 | return pandas.MultiIndex.from_product( 160 | [name_a_w.index.values, name_b_w.index.values], 161 | names=[df_a.index.name, df_b.index.name] 162 | ) 163 | 164 | indexer = FirstLetterWIndex() 165 | candidate_pairs = indexer.index(df_a, df_b) 166 | 167 | print('Returns a', type(candidate_pairs).__name__) 168 | print('Number of candidate record pairs starting with the letter w:', len(candidate_pairs)) 169 | 170 | The custom index class below does not restrict the first letter to 'w', but takes the first letter as an argument (named `letter`). This letter is set when the class is initialized. 171 | 172 | .. code:: python 173 | 174 | class FirstLetterIndex(BaseIndexAlgorithm): 175 | """Custom class for indexing""" 176 | 177 | def __init__(self, letter): 178 | super(FirstLetterIndex, self).__init__() 179 | 180 | # the letter to save 181 | self.letter = letter 182 | 183 | def _link_index(self, df_a, df_b): 184 | """Make record pairs that agree on the first letter of the given name.""" 185 | 186 | # Select records with names starting with a 'letter'. 187 | a_startswith_w = df_a[df_a['given_name'].str.startswith(self.letter) == True] 188 | b_startswith_w = df_b[df_b['given_name'].str.startswith(self.letter) == True] 189 | 190 | # Make a product of the two numpy arrays 191 | return pandas.MultiIndex.from_product( 192 | [a_startswith_w.index.values, b_startswith_w.index.values], 193 | names=[df_a.index.name, df_b.index.name] 194 | ) 195 | -------------------------------------------------------------------------------- /docs/ref-misc.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | Miscellaneous 3 | ************* 4 | 5 | .. autofunction:: recordlinkage.index_split 6 | 7 | .. autofunction:: recordlinkage.get_option 8 | .. autofunction:: recordlinkage.set_option 9 | .. 
autofunction:: recordlinkage.reset_option 10 | .. autofunction:: recordlinkage.describe_option -------------------------------------------------------------------------------- /docs/ref-preprocessing.rst: -------------------------------------------------------------------------------- 1 | **************** 2 | 0. Preprocessing 3 | **************** 4 | 5 | Preprocessing data, like cleaning and standardising, may increase your record 6 | linkage accuracy. The Python Record Linkage Toolkit contains several tools for 7 | data preprocessing. The preprocessing and standardising functions are 8 | available in the submodule `recordlinkage.preprocessing`. Import the 9 | algorithms in the following way: 10 | 11 | .. code:: python 12 | 13 | from recordlinkage.preprocessing import clean, phonetic 14 | 15 | Cleaning 16 | ======== 17 | 18 | The Python Record Linkage Toolkit has some cleaning function from which 19 | :func:`recordlinkage.preprocessing.clean` is the most generic function. Pandas 20 | itself is also very usefull for (string) data cleaning. See the pandas 21 | documentation on this topic: `Working with Text Data `_. 22 | 23 | .. autofunction:: recordlinkage.preprocessing.clean 24 | .. autofunction:: recordlinkage.preprocessing.phonenumbers 25 | .. autofunction:: recordlinkage.preprocessing.value_occurence 26 | 27 | Phonetic encoding 28 | ================= 29 | 30 | Phonetic algorithms are algorithms for indexing of words by their 31 | pronunciation. The most well-known algorithm is the `Soundex 32 | `_ algorithm. The Python Record Linkage 33 | Toolkit supports multiple algorithms through the 34 | :func:`recordlinkage.preprocessing.phonetic` function. 35 | 36 | .. note:: 37 | 38 | Use phonetic algorithms in advance of the indexing and comparing step. 39 | This results in most siutations in better performance. 40 | 41 | .. autofunction:: recordlinkage.preprocessing.phonetic 42 | .. autoattribute:: recordlinkage.preprocessing.phonetic_algorithms 43 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | Python Record Linkage Toolkit examples 2 | ====================================== 3 | 4 | This folder contains examples on record linkage with the Python Record Linkage 5 | Toolkit. The examples do have a BSD 3-Clause "New" or "Revised" License. 6 | Contributions are appreciated. 7 | 8 | Basic 9 | ----- 10 | 11 | `Deterministic deduplication`_ 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | Example of deterministic record linkage to find duplicated records in a 15 | dataset. In this example, the model isn't trained with train data. 16 | 17 | `Deterministic linkage`_ 18 | ~~~~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | Example of deterministic record linkage to find links between two datasets. In 21 | this example, the model isn't trained with train data. 22 | 23 | `Supervised Fellegi and Sunter with Naive Bayes classifier`_ 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | An implementation of the Fellegi and Sunter (1969) classification model in a 27 | supervised way. 28 | 29 | `Unsupervised Fellegi and Sunter with ECM classifier`_ 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | An implementation of the Fellegi and Sunter (1969) classification model in an 33 | unsupervised way. The training of model parameters is done with the 34 | Expectation-Conditional Maximisation algorithm. 
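In essence, the unsupervised example boils down to the following sketch (the ``features`` comparison vectors are assumed to be computed as in the other examples; see the linked script for the complete version):

.. code:: python

    import recordlinkage as rl

    # fit the ECM classifier without labelled training data
    cl = rl.ECMClassifier()
    cl.fit(features)

    # classify the candidate pairs into links and non-links
    links_pred = cl.predict(features)
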
35 | 36 | 37 | Advanced 38 | -------- 39 | 40 | `Record linkage with Neural Networks`_ 41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | This example shows how Neural Networks can be used to classify record pairs. 44 | The Neural Network is implemented in Keras. 45 | 46 | .. _`Deterministic deduplication`: /examples/dedup_deterministic.py 47 | .. _`Deterministic linkage`: /examples/linking_deterministic.py 48 | .. _`Record linkage with Neural Networks`: /examples/supervised_keras.py 49 | .. _`Supervised Fellegi and Sunter with Naive Bayes classifier`: /examples/supervised_learning_prob.py 50 | .. _`Unsupervised Fellegi and Sunter with ECM classifier`: /examples/unsupervised_learning_prob.py 51 | -------------------------------------------------------------------------------- /examples/dedup_deterministic.py: -------------------------------------------------------------------------------- 1 | """Example: Deterministic record linkage to find links in a single file. 2 | 3 | In determininistic record linkage, each compared attribute get a certain 4 | weight (coefficient). The higher the weight, the more dicriminating the 5 | variable is. A low weight indicate a less discriminating variable. For 6 | example, the given name has a higher weight than the hometown. 7 | 8 | This example uses FEBRL3 datasets. This dataset contain records about 9 | individuals. 10 | 11 | Deterministic RL parameters are: 12 | intercept = -11.0 13 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0] 14 | 15 | """ 16 | 17 | 18 | import recordlinkage as rl 19 | from recordlinkage.compare import Exact 20 | from recordlinkage.compare import String 21 | from recordlinkage.datasets import load_febrl3 22 | from recordlinkage.index import Block 23 | 24 | # set logging 25 | rl.logging.set_verbosity(rl.logging.INFO) 26 | 27 | # load dataset 28 | print("Loading data...") 29 | dfA, true_links = load_febrl3(return_links=True) 30 | print(len(dfA), "records in dataset A") 31 | print(len(true_links), "links in dataset A") 32 | 33 | # start indexing 34 | print("Build index...") 35 | indexer = rl.Index() 36 | indexer.add(Block("given_name")) 37 | indexer.add(Block("surname")) 38 | indexer.add(Block("soc_sec_id")) 39 | candidate_links = indexer.index(dfA) 40 | 41 | # start comparing 42 | print("Start comparing...") 43 | comparer = rl.Compare() 44 | comparer.add(Exact("given_name", "given_name", label="given_name")) 45 | comparer.add( 46 | String("surname", "surname", method="jarowinkler", threshold=0.85, label="surname") 47 | ) 48 | comparer.add(Exact("date_of_birth", "date_of_birth", label="date_of_birth")) 49 | comparer.add(Exact("suburb", "suburb", label="suburb")) 50 | comparer.add(Exact("state", "state", label="state")) 51 | comparer.add(String("address_1", "address_1", threshold=0.85, label="address_1")) 52 | comparer.add(String("address_2", "address_2", threshold=0.85, label="address_2")) 53 | features = comparer.compute(candidate_links, dfA) 54 | 55 | print("feature shape", features.shape) 56 | 57 | # use the Logistic Regression Classifier 58 | # this classifier is equivalent to the deterministic record linkage approach 59 | intercept = -9.5 60 | coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5] 61 | 62 | print("Deterministic classifier") 63 | print("intercept", intercept) 64 | print("coefficients", coefficients) 65 | 66 | logreg = rl.LogisticRegressionClassifier(coefficients=coefficients, intercept=intercept) 67 | links = logreg.predict(features) 68 | 69 | print(len(links), "links/matches") 70 | 71 | # return the confusion 
matrix 72 | conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links)) 73 | print("confusion matrix") 74 | print(conf_logreg) 75 | 76 | # compute the F-score for this classification 77 | fscore = rl.fscore(conf_logreg) 78 | print("fscore", fscore) 79 | recall = rl.recall(true_links, links) 80 | print("recall", recall) 81 | precision = rl.precision(true_links, links) 82 | print("precision", precision) 83 | -------------------------------------------------------------------------------- /examples/linking_deterministic.py: -------------------------------------------------------------------------------- 1 | """This example demonstrates deterministic record linkage to link two files. 2 | 3 | In determininistic record linkage, each compared attribute get a certain 4 | weight (coefficient). The higher the weight, the more dicriminating the 5 | variable is. A low weight indicate a less discriminating variable. For 6 | example, the given name has a higher weight than the hometown. 7 | 8 | This example uses FEBRL4 datasets. These datasets contain records about 9 | individuals. 10 | 11 | Deterministic RL parameters are: 12 | intercept = -11.0 13 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0] 14 | 15 | """ 16 | 17 | 18 | import recordlinkage as rl 19 | from recordlinkage.compare import Exact 20 | from recordlinkage.compare import String 21 | from recordlinkage.datasets import load_febrl4 22 | from recordlinkage.index import Block 23 | 24 | # set logging 25 | rl.logging.set_verbosity(rl.logging.INFO) 26 | 27 | # load datasets 28 | print("Loading data...") 29 | dfA, dfB, true_links = load_febrl4(return_links=True) 30 | print(len(dfA), "records in dataset A") 31 | print(len(dfB), "records in dataset B") 32 | print(len(true_links), "links between dataset A and B") 33 | 34 | # start indexing 35 | print("Build index...") 36 | indexer = rl.Index() 37 | indexer.add(Block("given_name")) 38 | indexer.add(Block("surname")) 39 | indexer.add(Block("soc_sec_id")) 40 | candidate_links = indexer.index(dfA, dfB) 41 | 42 | # start comparing 43 | print("Start comparing...") 44 | comparer = rl.Compare() 45 | comparer.add(Exact("given_name", "given_name", label="given_name")) 46 | comparer.add( 47 | String("surname", "surname", method="jarowinkler", threshold=0.85, label="surname") 48 | ) 49 | comparer.add(Exact("date_of_birth", "date_of_birth", label="date_of_birth")) 50 | comparer.add(Exact("suburb", "suburb", label="suburb")) 51 | comparer.add(Exact("state", "state", label="state")) 52 | comparer.add(String("address_1", "address_1", threshold=0.85, label="address_1")) 53 | comparer.add(String("address_2", "address_2", threshold=0.85, label="address_2")) 54 | features = comparer.compute(candidate_links, dfA, dfB) 55 | 56 | print("feature shape", features.shape) 57 | 58 | # use the Logistic Regression Classifier 59 | # this classifier is equivalent to the deterministic record linkage approach 60 | intercept = -11.0 61 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0] 62 | 63 | print("Deterministic classifier") 64 | print("intercept", intercept) 65 | print("coefficients", coefficients) 66 | 67 | logreg = rl.LogisticRegressionClassifier(coefficients=coefficients, intercept=intercept) 68 | links = logreg.predict(features) 69 | 70 | print(len(links), "links/matches") 71 | 72 | # return the confusion matrix 73 | conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links)) 74 | print("confusion matrix") 75 | print(conf_logreg) 76 | 77 | # compute the F-score for this classification 78 | 
fscore = rl.fscore(conf_logreg) 79 | print("fscore", fscore) 80 | recall = rl.recall(true_links, links) 81 | print("recall", recall) 82 | precision = rl.precision(true_links, links) 83 | print("precision", precision) 84 | -------------------------------------------------------------------------------- /examples/supervised_keras.py: -------------------------------------------------------------------------------- 1 | """Example: Supervised learning with Neural Networks.""" 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | try: 8 | from tensorflow.keras import layers 9 | except ModuleNotFoundError as err: 10 | raise ModuleNotFoundError("Please upgrade tensorflow.") from err 11 | 12 | import recordlinkage as rl 13 | from recordlinkage.adapters import KerasAdapter 14 | from recordlinkage.base import BaseClassifier 15 | from recordlinkage.datasets import binary_vectors 16 | 17 | # create a dataset with the following settings 18 | n_pairs = 50000 19 | n_matches = 7000 20 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92]) 21 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09]) 22 | 23 | # Create the dataset and return the true links. 24 | X_data, links_true = binary_vectors( 25 | n_pairs, # the number of candidate links 26 | n_matches, # the number of true links 27 | m=m_simulate, # the m probabilities 28 | u=u_simulate, # the u probabilities 29 | random_state=535, # set seed 30 | return_links=True, 31 | ) # return true links 32 | 33 | 34 | # Initialise the Keras. 35 | class NNClassifier(KerasAdapter, BaseClassifier): 36 | """Neural network classifier.""" 37 | 38 | def __init__(self, *args, **kwargs): 39 | super().__init__() 40 | 41 | model = tf.keras.Sequential() 42 | model.add(layers.Dense(16, input_dim=8, activation="relu")) 43 | model.add(layers.Dense(8, activation="relu")) 44 | model.add(layers.Dense(1, activation="sigmoid")) 45 | model.compile( 46 | optimizer=tf.train.AdamOptimizer(0.001), 47 | loss="binary_crossentropy", 48 | metrics=["accuracy"], 49 | ) 50 | 51 | self.kernel = model 52 | 53 | 54 | cl = NNClassifier() 55 | cl.fit(X_data, links_true) 56 | 57 | # evaluate the model 58 | links_pred = cl.predict(X_data) 59 | print("Predicted number of links:", len(links_pred)) 60 | 61 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data)) 62 | print("Confusion matrix:\n", cm) 63 | 64 | # compute the F-score for this classification 65 | fscore = rl.fscore(cm) 66 | print("fscore", fscore) 67 | recall = rl.recall(links_true, links_pred) 68 | print("recall", recall) 69 | precision = rl.precision(links_true, links_pred) 70 | print("precision", precision) 71 | 72 | # Predict the match probability for each pair in the dataset. 73 | probs = cl.prob(X_data) 74 | print(probs[0:10]) 75 | -------------------------------------------------------------------------------- /examples/supervised_learning_prob.py: -------------------------------------------------------------------------------- 1 | """Example: Supervised learning with the Naive Bayes algorithm. 2 | 3 | """ 4 | 5 | 6 | import numpy as np 7 | 8 | import recordlinkage as rl 9 | from recordlinkage.datasets import binary_vectors 10 | 11 | # create a dataset with the following settings 12 | n_pairs = 50000 13 | n_matches = 7000 14 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92]) 15 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09]) 16 | 17 | # Create the dataset and return the true links. 
18 | X_data, links_true = binary_vectors( 19 | n_pairs, # the number of candidate links 20 | n_matches, # the number of true links 21 | m=m_simulate, # the m probabilities 22 | u=u_simulate, # the u probabilities 23 | random_state=535, # set seed 24 | return_links=True, 25 | ) # return true links 26 | 27 | # Initialise the NaiveBayesClassifier. 28 | cl = rl.NaiveBayesClassifier() 29 | cl.fit(X_data, links_true) 30 | 31 | # Print the parameters that are trained (m, u and p). Note that the estimates 32 | # are very good. 33 | print("p probability P(Match):", cl.p) 34 | print("m probabilities P(x_i=1|Match):", cl.m_probs) 35 | print("u probabilities P(x_i=1|Non-Match):", cl.u_probs) 36 | print("log m probabilities P(x_i=1|Match):", cl.log_m_probs) 37 | print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs) 38 | print("log weights of features:", cl.log_weights) 39 | print("weights of features:", cl.weights) 40 | 41 | # evaluate the model 42 | links_pred = cl.predict(X_data) 43 | print("Predicted number of links:", len(links_pred)) 44 | 45 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data)) 46 | print("Confusion matrix:\n", cm) 47 | 48 | # compute the F-score for this classification 49 | fscore = rl.fscore(cm) 50 | print("fscore", fscore) 51 | recall = rl.recall(links_true, links_pred) 52 | print("recall", recall) 53 | precision = rl.precision(links_true, links_pred) 54 | print("precision", precision) 55 | 56 | # Predict the match probability for each pair in the dataset. 57 | probs = cl.prob(X_data) 58 | -------------------------------------------------------------------------------- /examples/unsupervised_learning_prob.py: -------------------------------------------------------------------------------- 1 | """Example: Unsupervised learning with the ECM algorithm. 2 | 3 | Train data is often hard to collect in record linkage or data matching 4 | problems. The Expectation-Conditional Maximisation (ECM) algorithm is the most 5 | well known algorithm for unsupervised data matching. The algorithm preforms 6 | relatively well compared to supervised methods. 7 | 8 | """ 9 | 10 | 11 | import numpy as np 12 | 13 | import recordlinkage as rl 14 | from recordlinkage.datasets import binary_vectors 15 | 16 | # create a dataset with the following settings 17 | n_pairs = 50000 18 | n_matches = 7000 19 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92]) 20 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09]) 21 | 22 | # Create the dataset and return the true links. 23 | X_data, links_true = binary_vectors( 24 | n_pairs, # the number of candidate links 25 | n_matches, # the number of true links 26 | m=m_simulate, # the m probabilities 27 | u=u_simulate, # the u probabilities 28 | random_state=535, # set seed 29 | return_links=True, 30 | ) # return true links 31 | 32 | # Initialise the Expectation-Conditional Maximisation classifier. 33 | cl = rl.ECMClassifier() 34 | cl.fit(X_data) 35 | 36 | # Print the parameters that are trained (m, u and p). Note that the estimates 37 | # are very good. 
38 | print("p probability P(Match):", cl.p) 39 | print("m probabilities P(x_i=1|Match):", cl.m_probs) 40 | print("u probabilities P(x_i=1|Non-Match):", cl.u_probs) 41 | print("log m probabilities P(x_i=1|Match):", cl.log_m_probs) 42 | print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs) 43 | print("log weights of features:", cl.log_weights) 44 | print("weights of features:", cl.weights) 45 | 46 | # evaluate the model 47 | links_pred = cl.predict(X_data) 48 | print("Predicted number of links:", len(links_pred)) 49 | 50 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data)) 51 | print("Confusion matrix:\n", cm) 52 | 53 | # compute the F-score for this classification 54 | fscore = rl.fscore(cm) 55 | print("fscore", fscore) 56 | recall = rl.recall(links_true, links_pred) 57 | print("recall", recall) 58 | precision = rl.precision(links_true, links_pred) 59 | print("precision", precision) 60 | 61 | # Predict the match probability for each pair in the dataset. 62 | probs = cl.prob(X_data) 63 | print(probs) 64 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "recordlinkage" 3 | description = "A record linkage toolkit for linking and deduplication" 4 | authors = [ 5 | { name = "Jonathan de Bruin", email = "jonathandebruinos@gmail.com" } 6 | ] 7 | readme = "README.md" 8 | classifiers = [ 9 | "Development Status :: 4 - Beta", 10 | "License :: OSI Approved :: BSD License", 11 | "Programming Language :: Python :: 3.8", 12 | "Programming Language :: Python :: 3.9", 13 | "Programming Language :: Python :: 3.10", 14 | "Programming Language :: Python :: 3.11" 15 | ] 16 | license = {text = "BSD-3-Clause"} 17 | dependencies = [ 18 | "jellyfish>=1", 19 | "numpy>=1.13", 20 | "pandas>=1,<3", 21 | "scipy>=1", 22 | "scikit-learn>=1", 23 | "joblib" 24 | ] 25 | dynamic = ["version"] 26 | requires-python = ">=3.8" 27 | 28 | [project.urls] 29 | homepage = "https://recordlinkage.readthedocs.io/" 30 | repository = "https://github.com/J535D165/recordlinkage" 31 | 32 | [project.optional-dependencies] 33 | all = ["networkx>=2", "bottleneck", "numexpr"] 34 | lint = ["ruff"] 35 | docs = ["sphinx", "nbsphinx", "sphinx-rtd-theme", "ipykernel"] 36 | test = ["pytest"] 37 | 38 | [build-system] 39 | build-backend = 'setuptools.build_meta' 40 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 41 | 42 | [tool.setuptools] 43 | packages = ["recordlinkage"] 44 | 45 | [tool.setuptools.package-data] 46 | "*" = ["*.csv"] 47 | 48 | [tool.setuptools_scm] 49 | write_to = "recordlinkage/_version.py" 50 | 51 | [tool.ruff] 52 | select = ["E", "F", "UP", "I", "B"] 53 | ignore = ["B006"] 54 | exclude = ["docs/conf.py"] 55 | 56 | [tool.ruff.isort] 57 | force-single-line = true 58 | -------------------------------------------------------------------------------- /recordlinkage/__init__.py: -------------------------------------------------------------------------------- 1 | # register the configuration 2 | import recordlinkage.config_init # noqa 3 | from recordlinkage.classifiers import FellegiSunter 4 | from recordlinkage.classifiers import KMeansClassifier 5 | from recordlinkage.classifiers import LogisticRegressionClassifier 6 | from recordlinkage.classifiers import NaiveBayesClassifier 7 | from recordlinkage.classifiers import SVMClassifier 8 | from recordlinkage.classifiers import ECMClassifier 9 | from recordlinkage.measures import reduction_ratio 10 | 
from recordlinkage.measures import max_pairs 11 | from recordlinkage.measures import full_index_size 12 | from recordlinkage.measures import true_positives 13 | from recordlinkage.measures import true_negatives 14 | from recordlinkage.measures import false_positives 15 | from recordlinkage.measures import false_negatives 16 | from recordlinkage.measures import confusion_matrix 17 | from recordlinkage.measures import precision 18 | from recordlinkage.measures import recall 19 | from recordlinkage.measures import accuracy 20 | from recordlinkage.measures import specificity 21 | from recordlinkage.measures import fscore 22 | from recordlinkage.network import OneToOneLinking 23 | from recordlinkage.network import OneToManyLinking 24 | from recordlinkage.network import ConnectedComponents 25 | from recordlinkage import rl_logging as logging 26 | from recordlinkage.annotation import read_annotation_file 27 | from recordlinkage.annotation import write_annotation_file 28 | from recordlinkage.api import Compare 29 | from recordlinkage.api import Index 30 | from recordlinkage.config import describe_option 31 | from recordlinkage.config import get_option 32 | from recordlinkage.config import option_context 33 | from recordlinkage.config import options 34 | from recordlinkage.config import reset_option 35 | from recordlinkage.config import set_option 36 | from recordlinkage.utils import index_split 37 | from recordlinkage.utils import split_index 38 | 39 | try: 40 | from recordlinkage._version import __version__ 41 | from recordlinkage._version import __version_tuple__ 42 | except ImportError: 43 | __version__ = "0.0.0" 44 | __version_tuple__ = (0, 0, 0) 45 | 46 | 47 | __all__ = [ 48 | "logging", 49 | "read_annotation_file", 50 | "write_annotation_file", 51 | "Compare", 52 | "Index", 53 | "describe_option", 54 | "get_option", 55 | "option_context", 56 | "options", 57 | "reset_option", 58 | "set_option", 59 | "index_split", 60 | "split_index", 61 | "FellegiSunter", 62 | "KMeansClassifier", 63 | "LogisticRegressionClassifier", 64 | "NaiveBayesClassifier", 65 | "SVMClassifier", 66 | "ECMClassifier", 67 | "reduction_ratio", 68 | "max_pairs", 69 | "full_index_size", 70 | "true_positives", 71 | "true_negatives", 72 | "false_positives", 73 | "false_negatives", 74 | "confusion_matrix", 75 | "precision", 76 | "recall", 77 | "accuracy", 78 | "specificity", 79 | "fscore", 80 | "OneToOneLinking", 81 | "OneToManyLinking", 82 | "ConnectedComponents", 83 | ] 84 | -------------------------------------------------------------------------------- /recordlinkage/_lib/numeric.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define R 6371 6 | #define TO_RAD (3.1415926536 / 180) 7 | 8 | double euclidean_dist(double x, double y) 9 | { 10 | return fabs(y - x); 11 | } 12 | 13 | double haversine_dist(double th1, double ph1, double th2, double ph2) 14 | { 15 | double dx, dy, dz; 16 | ph1 -= ph2; 17 | ph1 *= TO_RAD, th1 *= TO_RAD, th2 *= TO_RAD; 18 | 19 | dz = sin(th1) - sin(th2); 20 | dx = cos(ph1) * cos(th1) - cos(th2); 21 | dy = sin(ph1) * cos(th1); 22 | return asin(sqrt(dx * dx + dy * dy + dz * dz) / 2) * 2 * R; 23 | } 24 | 25 | double step_sim(double d, double offset, double origin) 26 | { 27 | if (fabs(d - origin) <= offset) 28 | { 29 | return 1.0; 30 | } else 31 | { 32 | return 0.0; 33 | } 34 | } 35 | 36 | double linear_sim(double d, double scale, double offset, double origin) 37 | { 38 | 39 | double d_norm; 40 | 41 | // normalise the 
distance measure 42 | d_norm = fabs(d - origin); 43 | 44 | if (d_norm <= offset) 45 | { 46 | return 1.0; 47 | } 48 | else if (d_norm >= offset + 2 * scale) 49 | { 50 | return 0.0; 51 | } 52 | else 53 | { 54 | return 1.0 - (d_norm - offset) / (2 * scale); 55 | } 56 | } 57 | 58 | 59 | double squared_sim(double d, double scale, double offset, double origin) 60 | { 61 | 62 | double d_norm; 63 | 64 | // normalise the distance measure 65 | d_norm = fabs(d - origin); 66 | 67 | if (d_norm <= offset) 68 | { 69 | return 1.0; 70 | } 71 | else if (d_norm >= offset + sqrt(2.0) * scale) 72 | { 73 | return 0.0; 74 | } 75 | else 76 | { 77 | return 1.0 - 0.5 * exp(2.0 * log((d_norm - offset)/scale)); 78 | } 79 | } 80 | 81 | 82 | double exp_sim(double d, double scale, double offset, double origin) 83 | { 84 | 85 | double d_norm; 86 | 87 | // normalise the distance measure 88 | d_norm = fabs(d - origin); 89 | 90 | if (d_norm <= offset) 91 | { 92 | return 1.0; 93 | } 94 | else 95 | { 96 | return pow(2.0, - (d_norm-offset) / scale); 97 | } 98 | } 99 | 100 | 101 | double gauss_sim(double d, double scale, double offset, double origin) 102 | { 103 | 104 | double d_norm; 105 | 106 | // normalise the distance measure 107 | d_norm = fabs(d - origin); 108 | 109 | if (d_norm <= offset) 110 | { 111 | return 1.0; 112 | } 113 | else 114 | { 115 | return pow(2.0, - pow((d_norm-offset) / scale, 2.0)); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /recordlinkage/_lib/numeric.h: -------------------------------------------------------------------------------- 1 | 2 | // numeric distance functions 3 | double euclidean_dist(double x, double y); 4 | double haversine_dist(double th1, double ph1, double th2, double ph2); 5 | 6 | // numeric similarity functions 7 | double step_sim(double d, double offset, double origin); 8 | double linear_sim(double d, double scale, double offset, double origin); 9 | double squared_sim(double d, double scale, double offset, double origin); 10 | double exp_sim(double d, double scale, double offset, double origin); 11 | double gauss_sim(double d, double scale, double offset, double origin); 12 | -------------------------------------------------------------------------------- /recordlinkage/adapters.py: -------------------------------------------------------------------------------- 1 | """Module to wrap external machine learning models.""" 2 | 3 | __all__ = ["SKLearnAdapter", "KerasAdapter"] 4 | 5 | 6 | class SKLearnAdapter: 7 | """SciKit-learn adapter for record pair classification. 8 | 9 | SciKit-learn adapter for record pair classification with SciKit-learn 10 | models. 11 | """ 12 | 13 | @property 14 | def classifier(self): 15 | # raise warning 16 | return self.kernel 17 | 18 | @classifier.setter 19 | def classifier(self, classifier): 20 | self.kernel = classifier 21 | 22 | def _predict(self, features): 23 | """Predict matches and non-matches. 24 | 25 | Parameters 26 | ---------- 27 | features : numpy.ndarray 28 | The data to predict the class of. 29 | 30 | Returns 31 | ------- 32 | numpy.ndarray 33 | The predicted classes. 34 | """ 35 | 36 | from sklearn.exceptions import NotFittedError 37 | 38 | try: 39 | prediction = self.kernel.predict(features) 40 | except NotFittedError as err: 41 | raise NotFittedError( 42 | "{} is not fitted yet. 
Call 'fit' with appropriate " 43 | "arguments before using this method.".format(type(self).__name__) 44 | ) from err 45 | 46 | return prediction 47 | 48 | def _fit(self, features, y=None): 49 | if y is None: # unsupervised 50 | self.kernel.fit(features) 51 | else: 52 | self.kernel.fit(features, y) 53 | 54 | def _prob_match(self, features): 55 | """Compute match probabilities. 56 | 57 | Parameters 58 | ---------- 59 | features : numpy.ndarray 60 | The data to train the model on. 61 | 62 | Returns 63 | ------- 64 | numpy.ndarray 65 | The match probabilties. 66 | """ 67 | 68 | # compute the probabilities 69 | probs = self.kernel.predict_proba(features) 70 | 71 | # get the position of match probabilities 72 | classes = list(self.kernel.classes_) 73 | match_class_position = classes.index(1) 74 | 75 | return probs[:, match_class_position] 76 | 77 | 78 | class KerasAdapter: 79 | """Keras adapter for record pair classification. 80 | 81 | Keras adapter for record pair classification with Keras models. 82 | """ 83 | 84 | @property 85 | def classifier(self): 86 | # raise warning 87 | return self.kernel 88 | 89 | @classifier.setter 90 | def classifier(self, classifier): 91 | self.kernel = classifier 92 | 93 | def _predict(self, features): 94 | """Predict matches and non-matches. 95 | 96 | Parameters 97 | ---------- 98 | features : numpy.ndarray 99 | The data to predict the class of. 100 | 101 | Returns 102 | ------- 103 | numpy.ndarray 104 | The predicted classes. 105 | """ 106 | 107 | from sklearn.exceptions import NotFittedError 108 | 109 | try: 110 | prediction = self.kernel.predict_classes(features)[:, 0] 111 | except NotFittedError as err: 112 | raise NotFittedError( 113 | "{} is not fitted yet. Call 'fit' with appropriate " 114 | "arguments before using this method.".format(type(self).__name__) 115 | ) from err 116 | 117 | return prediction 118 | 119 | def _fit(self, features, y=None): 120 | self.kernel.fit(features, y) 121 | 122 | def _prob_match(self, features): 123 | """Compute match probabilities. 124 | 125 | Parameters 126 | ---------- 127 | features : numpy.ndarray 128 | The data to train the model on. 129 | 130 | Returns 131 | ------- 132 | numpy.ndarray 133 | The match probabilties. 
134 | """ 135 | 136 | # compute the probabilities 137 | probs = self.kernel.predict_proba(features)[:, 0] 138 | 139 | return probs 140 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/recordlinkage/algorithms/__init__.py -------------------------------------------------------------------------------- /recordlinkage/algorithms/c_numeric.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | 3 | import numpy as np 4 | 5 | cimport numpy as np 6 | 7 | 8 | cdef extern from "../_lib/numeric.h": 9 | 10 | # numeric distance functions 11 | double euclidean_dist(double x, double y) 12 | double haversine_dist(double th1, double ph1, double th2, double ph2) 13 | 14 | # numeric similarity functions 15 | double step_sim(double d, double offset, double origin) 16 | double linear_sim(double d, double scale, double offset, double origin) 17 | double squared_sim(double d, double scale, double offset, double origin) 18 | double exp_sim(double d, double scale, double offset, double origin) 19 | double gauss_sim(double d, double scale, double offset, double origin) 20 | 21 | 22 | @cython.boundscheck(False) # turn off bounds-checking for entire function 23 | @cython.wraparound(False) # turn off negative index wrapping for entire function 24 | def euclidean_distance(np.ndarray[np.float64_t, ndim=1] x, np.ndarray[np.float64_t, ndim=1] y): 25 | 26 | cdef int n_rows = x.shape[0] 27 | 28 | cdef np.ndarray result = np.zeros(n_rows, dtype=np.float64) 29 | 30 | for k in range(n_rows): 31 | result[k] = euclidean_dist(x[k], y[k]) 32 | 33 | return result 34 | 35 | 36 | @cython.boundscheck(False) # turn off bounds-checking for entire function 37 | @cython.wraparound(False) # turn off negative index wrapping for entire function 38 | def haversine_distance(np.ndarray[np.float64_t, ndim=1] th1, np.ndarray[np.float64_t, ndim=1] ph1, np.ndarray[np.float64_t, ndim=1] th2, np.ndarray[np.float64_t, ndim=1] ph2): 39 | 40 | cdef int n_rows = th1.shape[0] 41 | 42 | cdef np.ndarray result = np.zeros(n_rows, dtype=np.float64) 43 | 44 | for k in range(n_rows): 45 | result[k] = haversine_dist(th1[k], ph1[k], th2[k], ph2[k]) 46 | 47 | return result 48 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/compare.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas 3 | 4 | 5 | def _compare_exact(s1, s2, agree_value=1, disagree_value=0, missing_value=0): 6 | # dtypes can be hard if the passed parameters (agreement, disagreement, 7 | # missing_value) are of different types. 8 | # http://chris.friedline.net/2015-12-15-rutgers/lessons/python2/03-data-types-and-format.html 9 | 10 | # Convert to pandas.Series if (numpy) arrays are passed. 
11 | if not isinstance(s1, pandas.Series): 12 | s1 = pandas.Series(s1, index=s1.index) 13 | 14 | if not isinstance(s2, pandas.Series): 15 | s2 = pandas.Series(s2, index=s2.index) 16 | 17 | # Values or agree/disagree 18 | if agree_value == "value": 19 | compare = s1.copy() 20 | compare[s1 != s2] = disagree_value 21 | 22 | else: 23 | compare = pandas.Series(disagree_value, index=s1.index) 24 | compare[s1 == s2] = agree_value 25 | 26 | # Only when disagree value is not identical with the missing value 27 | if disagree_value != missing_value: 28 | compare[(s1.isnull() | s2.isnull())] = missing_value 29 | 30 | return compare 31 | 32 | 33 | def _compare_dates( 34 | s1, s2, swap_month_day=0.5, swap_months="default", errors="coerce", *args, **kwargs 35 | ): 36 | # validate datatypes 37 | if str(s1.dtype) != "datetime64[ns]": 38 | raise ValueError("Left column is not of type datetime64[ns]") 39 | 40 | if str(s2.dtype) != "datetime64[ns]": 41 | raise ValueError("Right column is not of type datetime64[ns]") 42 | 43 | c = (s1 == s2).astype(np.int64) # start with int64 (will become float64) 44 | 45 | # The case is which there is a swap_month_day value given. 46 | if swap_month_day and swap_month_day != 0: 47 | c[ 48 | (s1.dt.year == s2.dt.year) 49 | & (s1.dt.month == s2.dt.day) 50 | & (s1.dt.day == s2.dt.month) 51 | & (c != 1) 52 | ] = swap_month_day 53 | 54 | if swap_months and swap_months != 0: 55 | if swap_months == "default": 56 | swap_months = [(6, 7, 0.5), (7, 6, 0.5), (9, 10, 0.5), (10, 9, 0.5)] 57 | else: 58 | try: 59 | if not all([len(x) == 3 for x in swap_months]): 60 | raise Exception 61 | except Exception as err: 62 | raise ValueError( 63 | "swap_months must be a list of (first month, \ 64 | second month, value) tuples or lists. " 65 | ) from err 66 | 67 | for month1, month2, value in swap_months: 68 | # if isinstance(value, float): 69 | # c = c.astype(np.float64) 70 | # elif isinstance(value, int): 71 | # c = c.astype(np.int64) 72 | # else: 73 | # c = c.astype(object) 74 | 75 | c[ 76 | (s1.dt.year == s2.dt.year) 77 | & (s1.dt.month == month1) 78 | & (s2.dt.month == month2) 79 | & (s1.dt.day == s2.dt.day) 80 | & (c != 1) 81 | ] = value 82 | 83 | c = pandas.Series(c) 84 | c[s1.isnull() | s2.isnull()] = np.nan 85 | 86 | return c 87 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas 3 | 4 | 5 | # Numerical distance algorithms 6 | def _1d_distance(s1, s2): 7 | return pandas.eval("s2-s1") 8 | 9 | 10 | def _haversine_distance(lat1, lng1, lat2, lng2): 11 | # degrees to radians conversion 12 | to_rad = np.deg2rad(1) # noqa 13 | 14 | # numeric expression to use with numexpr package 15 | expr = ( 16 | "2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+" 17 | "cos(lat1*to_rad)*cos(lat2*to_rad)*" 18 | "(sin((lng2*to_rad-lng1*to_rad)/2))**2))" 19 | ) 20 | 21 | return pandas.eval(expr) 22 | -------------------------------------------------------------------------------- /recordlinkage/algorithms/indexing.py: -------------------------------------------------------------------------------- 1 | """Algorithms for indexing.""" 2 | 3 | import numpy as np 4 | 5 | from recordlinkage.measures import full_index_size 6 | 7 | 8 | def _map_tril_1d_on_2d(indices, dims): 9 | """Map 1d indices on lower triangular matrix in 2d.""" 10 | 11 | N = (dims * dims - dims) / 2 12 | 13 | m = np.ceil(np.sqrt(2 * N)) 14 | c = m - 
np.round(np.sqrt(2 * (N - indices))) - 1 15 | r = np.mod(indices + (c + 1) * (c + 2) / 2 - 1, m) + 1 16 | 17 | return np.array([r, c], dtype=np.int64) 18 | 19 | 20 | def random_pairs_with_replacement(n, shape, random_state=None): 21 | """make random record pairs""" 22 | 23 | if not isinstance(random_state, np.random.RandomState): 24 | random_state = np.random.RandomState(random_state) 25 | 26 | n_max = full_index_size(shape) 27 | 28 | if n_max <= 0: 29 | raise ValueError("n_max must be larger than 0") 30 | 31 | # make random pairs 32 | indices = random_state.randint(0, n_max, n, dtype=np.int64) 33 | 34 | if len(shape) == 1: 35 | return _map_tril_1d_on_2d(indices, shape[0]) 36 | else: 37 | return np.array(np.unravel_index(indices, shape)) 38 | 39 | 40 | def random_pairs_without_replacement(n, shape, random_state=None): 41 | """Return record pairs for dense sample. 42 | 43 | Sample random record pairs without replacement bounded by the 44 | maximum number of record pairs (based on shape). This algorithm is 45 | efficient and fast for relative small samples. 46 | """ 47 | 48 | n_max = full_index_size(shape) 49 | 50 | if not isinstance(random_state, np.random.RandomState): 51 | random_state = np.random.RandomState(random_state) 52 | 53 | if not isinstance(n, int) or n <= 0 or n > n_max: 54 | raise ValueError("n must be a integer satisfying 0 n_max: 80 | raise ValueError("n must be a integer satisfying 0= 0.0 61 | assert cv["random"].max() <= 1.0 62 | 63 | 64 | class TestRandomDiscrete: 65 | def test_random_desc_standalone(self): 66 | arr1 = [1, 2, 3, 4, 5] 67 | arr2 = [1, 2, 3, 4, 5] 68 | pairs = pd.MultiIndex.from_product([arr1, arr2]) 69 | 70 | c = RandomDiscrete() 71 | r = c.compute(pairs) 72 | 73 | assert r.shape[0] == len(arr1) * len(arr2) 74 | 75 | def test_random_desc(self): 76 | df_a = pd.DataFrame({"v": list("abcde")}) 77 | df_b = pd.DataFrame({"v": list("abcde")}) 78 | 79 | pairs = Full().index(df_a, df_b) 80 | 81 | c = recordlinkage.Compare() 82 | c.exact("v", "v") 83 | c.add(RandomDiscrete(label="random")) 84 | cv = c.compute(pairs, df_a, df_b) 85 | 86 | assert isinstance(cv, pd.DataFrame) 87 | 88 | assert cv["random"].notnull().all() 89 | assert cv["random"].isin([0, 1]).all() 90 | -------------------------------------------------------------------------------- /recordlinkage/contrib/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Jonathan de Bruin 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 2. Redistributions in binary form must reproduce the above copyright notice, 9 | # this list of conditions and the following disclaimer in the documentation 10 | # and/or other materials provided with the distribution. 11 | # 3. Neither the name of the copyright holder nor the names of its 12 | # contributors may be used to endorse or promote products derived from this 13 | # software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | 27 | 28 | from recordlinkage.contrib.index.neighbourhoodblock.neighbourhoodblock import ( 29 | NeighbourhoodBlock, 30 | ) 31 | 32 | __all__ = ["NeighbourhoodBlock"] 33 | -------------------------------------------------------------------------------- /recordlinkage/contrib/index/neighbourhoodblock/README.rst: -------------------------------------------------------------------------------- 1 | Neighbourhood blocking 2 | ====================== 3 | 4 | Example 5 | ------- 6 | 7 | In the following example, the record pairs are made for two historical 8 | datasets with census data. The datasets are named ``census_data_1980`` 9 | and ``census_data_1990``. The index includes record pairs with matches 10 | in (at least) any 3 out of the 5 nominated fields. Proximity matching is 11 | allowed in the first two fields, and up to one wildcard match of a 12 | missing value is also allowed. 13 | 14 | .. code:: python 15 | 16 | from recordlinkage.contrib.index import NeighbourhoodBlock 17 | 18 | keys = ['first_name', 'surname', 'date_of_birth', 'address', 'ssid'] 19 | windows = [9, 3, 1, 1, 1] 20 | 21 | indexer = NeighbourhoodBlock( 22 | keys, windows=windows, max_nulls=1, max_non_matches=2) 23 | indexer.index(census_data_1980, census_data_1990) 24 | 25 | Authors 26 | ------- 27 | 28 | - Daniel Elias -------------------------------------------------------------------------------- /recordlinkage/contrib/index/neighbourhoodblock/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 2. Redistributions in binary form must reproduce the above copyright notice, 9 | # this list of conditions and the following disclaimer in the documentation 10 | # and/or other materials provided with the distribution. 11 | # 3. Neither the name of the copyright holder nor the names of its 12 | # contributors may be used to endorse or promote products derived from this 13 | # software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /recordlinkage/contrib/index/neighbourhoodblock/test_neighbourhoodblock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from operator import eq 4 | from operator import gt 5 | 6 | import numpy as np 7 | import pytest 8 | 9 | from recordlinkage.contrib.index import NeighbourhoodBlock 10 | from recordlinkage.index import Block 11 | from recordlinkage.index import Full 12 | from recordlinkage.index import SortedNeighbourhood 13 | from tests.test_indexing import TestData 14 | 15 | 16 | class TestNeighbourhoodBlock(TestData): 17 | """General unittest for the NeighbourhoodBlocking indexing class.""" 18 | 19 | @classmethod 20 | def setup_class(cls): 21 | TestData.setup_class() 22 | 23 | def incomplete_df_copy(df, nan_proportion=0.1): 24 | "copy of DataFrame with some cells set to NaN" 25 | nan_count = int(round(len(df) * nan_proportion)) 26 | 27 | def with_nulls(vals): 28 | vals = vals.copy() 29 | vals.iloc[ 30 | np.random.choice(len(df), size=nan_count, replace=False) 31 | ] = np.nan 32 | return vals 33 | 34 | return df.copy() if nan_count <= 0 else df.apply(with_nulls) 35 | 36 | np.random.seed(0) 37 | cls.incomplete_a = incomplete_df_copy(cls.a) 38 | cls.incomplete_b = incomplete_df_copy(cls.b) 39 | 40 | def assert_index_comparisons(self, pairwise_comparison, indexers, *args, **kwargs): 41 | indexes = [ndxr.index(*args, **kwargs) for ndxr in indexers] 42 | for index1, index2 in zip(indexes, indexes[1:]): 43 | pairs1, pairs2 = map(set, [index1, index2]) 44 | assert ( 45 | (len(pairs1) == len(index1)) 46 | and (len(pairs2) == len(index2)) 47 | and pairwise_comparison(pairs1, pairs2) 48 | ) 49 | 50 | def test_dedup_vs_full(self): 51 | indexers = [ 52 | NeighbourhoodBlock(max_non_matches=len(self.a.columns)), 53 | Full(), 54 | ] 55 | self.assert_index_comparisons(eq, indexers, self.a) 56 | 57 | def test_link_vs_full(self): 58 | indexers = [ 59 | NeighbourhoodBlock(max_non_matches=len(self.a.columns)), 60 | Full(), 61 | ] 62 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 63 | 64 | def test_dedup_single_blocking_key_vs_block(self): 65 | indexers = [ 66 | NeighbourhoodBlock("var_block10", max_nulls=1), 67 | NeighbourhoodBlock( 68 | left_on="var_block10", right_on="var_block10", max_nulls=1 69 | ), 70 | Block("var_block10"), 71 | ] 72 | self.assert_index_comparisons(eq, indexers, self.a) 73 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 74 | 75 | def test_link_single_blocking_key_vs_block(self): 76 | indexers = [ 77 | NeighbourhoodBlock("var_arange", max_nulls=1), 78 | NeighbourhoodBlock( 79 | left_on="var_arange", right_on="var_arange", max_nulls=1 80 | ), 81 | Block("var_arange"), 82 | ] 83 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 84 | self.assert_index_comparisons( 85 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 86 | ) 87 
| 88 | def test_dedup_multiple_blocking_keys_vs_block(self): 89 | indexers = [ 90 | NeighbourhoodBlock(["var_single", "var_block10"], max_nulls=1), 91 | NeighbourhoodBlock( 92 | left_on=["var_single", "var_block10"], 93 | right_on=["var_single", "var_block10"], 94 | max_nulls=1, 95 | ), 96 | Block(["var_single", "var_block10"]), 97 | ] 98 | self.assert_index_comparisons(eq, indexers, self.a) 99 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 100 | 101 | def test_link_multiple_blocking_keys_vs_block(self): 102 | indexers = [ 103 | NeighbourhoodBlock(["var_arange", "var_block10"], max_nulls=1), 104 | NeighbourhoodBlock( 105 | left_on=["var_arange", "var_block10"], 106 | right_on=["var_arange", "var_block10"], 107 | max_nulls=1, 108 | ), 109 | Block(["var_arange", "var_block10"]), 110 | ] 111 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 112 | self.assert_index_comparisons( 113 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 114 | ) 115 | 116 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 117 | def test_dedup_single_sorting_key_vs_sortedneighbourhood(self, window): 118 | indexers = [ 119 | NeighbourhoodBlock("var_arange", max_nulls=1, windows=window), 120 | NeighbourhoodBlock( 121 | left_on="var_arange", right_on="var_arange", max_nulls=1, windows=window 122 | ), 123 | SortedNeighbourhood("var_arange", window=window), 124 | ] 125 | self.assert_index_comparisons(eq, indexers, self.a) 126 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 127 | 128 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 129 | def test_link_single_sorting_key_vs_sortedneighbourhood(self, window): 130 | indexers = [ 131 | NeighbourhoodBlock("var_arange", max_nulls=1, windows=window), 132 | NeighbourhoodBlock( 133 | left_on="var_arange", right_on="var_arange", max_nulls=1, windows=window 134 | ), 135 | SortedNeighbourhood("var_arange", window=window), 136 | ] 137 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 138 | self.assert_index_comparisons( 139 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 140 | ) 141 | 142 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 143 | def test_dedup_with_blocking_vs_sortedneighbourhood(self, window): 144 | indexers = [ 145 | NeighbourhoodBlock( 146 | ["var_arange", "var_block10"], max_nulls=1, windows=[window, 1] 147 | ), 148 | NeighbourhoodBlock( 149 | left_on=["var_arange", "var_block10"], 150 | right_on=["var_arange", "var_block10"], 151 | max_nulls=1, 152 | windows=[window, 1], 153 | ), 154 | SortedNeighbourhood("var_arange", block_on="var_block10", window=window), 155 | ] 156 | self.assert_index_comparisons(eq, indexers, self.a) 157 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a) 158 | 159 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) 160 | def test_link_with_blocking_vs_sortedneighbourhood(self, window): 161 | indexers = [ 162 | NeighbourhoodBlock( 163 | ["var_arange", "var_block10"], max_nulls=1, windows=[window, 1] 164 | ), 165 | NeighbourhoodBlock( 166 | left_on=["var_arange", "var_block10"], 167 | right_on=["var_arange", "var_block10"], 168 | max_nulls=1, 169 | windows=[window, 1], 170 | ), 171 | SortedNeighbourhood("var_arange", block_on="var_block10", window=window), 172 | ] 173 | self.assert_index_comparisons(eq, indexers, self.a, self.b) 174 | self.assert_index_comparisons( 175 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b 176 | ) 177 | -------------------------------------------------------------------------------- 
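The tests above assert that, on complete data, ``NeighbourhoodBlock`` with a single blocking key yields the same candidate pairs as ``Block``. A minimal sketch of that equivalence on toy data (the ``postcode`` column and its values are invented for illustration; the comparison is on sets because the pair order may differ):

.. code:: python

    import pandas as pd

    from recordlinkage.contrib.index import NeighbourhoodBlock
    from recordlinkage.index import Block

    df_a = pd.DataFrame({"postcode": ["1011", "1011", "2022", "3033"]})
    df_b = pd.DataFrame({"postcode": ["1011", "2022", "2022", "9099"]})

    # exact blocking on one key; max_nulls=1 mirrors the tests above
    nb_pairs = NeighbourhoodBlock("postcode", max_nulls=1).index(df_a, df_b)
    block_pairs = Block("postcode").index(df_a, df_b)

    # same candidate record pairs as plain blocking on complete data
    assert set(nb_pairs) == set(block_pairs)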
/recordlinkage/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from recordlinkage.datasets.external import clear_data_home 2 | from recordlinkage.datasets.external import get_data_home 3 | from recordlinkage.datasets.external import load_krebsregister 4 | from recordlinkage.datasets.febrl import load_febrl1 5 | from recordlinkage.datasets.febrl import load_febrl2 6 | from recordlinkage.datasets.febrl import load_febrl3 7 | from recordlinkage.datasets.febrl import load_febrl4 8 | from recordlinkage.datasets.generate import binary_vectors 9 | 10 | __all__ = [ 11 | "clear_data_home", 12 | "get_data_home", 13 | "load_krebsregister", 14 | "load_febrl1", 15 | "load_febrl2", 16 | "load_febrl3", 17 | "load_febrl4", 18 | "binary_vectors", 19 | ] 20 | -------------------------------------------------------------------------------- /recordlinkage/datasets/external.py: -------------------------------------------------------------------------------- 1 | # The function get_data_home() and clear_data_home() are based on 2 | # SciKit-Learn https://git.io/fjT70. See the 3-clause BSD license. 3 | 4 | import shutil 5 | import zipfile 6 | from io import BytesIO 7 | from os import environ 8 | from pathlib import Path 9 | from urllib.request import urlopen 10 | 11 | import pandas 12 | 13 | 14 | def get_data_home(data_home=None): 15 | """Return the path of the Record Linkage data folder. 16 | 17 | This folder is used by some large dataset loaders to avoid 18 | downloading the data several times. By default the data dir 19 | is set to a folder named 'rl_data' in the user 20 | home folder. 21 | Alternatively, it can be set by the 'RL_DATA' environment 22 | variable or programmatically by giving an explicit folder 23 | path. The '~' symbol is expanded to the user home folder. 24 | 25 | If the folder does not already exist, it is automatically 26 | created. 27 | 28 | Parameters 29 | ---------- 30 | data_home : str | None 31 | The path to recordlinkage data folder. 32 | """ 33 | if data_home is None: 34 | data_home = environ.get("RL_DATA", Path("~", "rl_data")) 35 | data_home = Path(data_home).expanduser() 36 | 37 | if not data_home.exists(): 38 | data_home.mkdir(parents=True, exist_ok=True) 39 | 40 | return data_home 41 | 42 | 43 | def clear_data_home(data_home=None): 44 | """Delete all the content of the data home cache. 45 | 46 | Parameters 47 | ---------- 48 | data_home : str | None 49 | The path to recordlinkage data folder. 50 | """ 51 | data_home = get_data_home(data_home) 52 | shutil.rmtree(str(data_home)) 53 | 54 | 55 | def load_krebsregister(block=None, missing_values=None, shuffle=True): 56 | """Load the Krebsregister dataset. 57 | 58 | This dataset of comparison patterns was obtained in a 59 | epidemiological cancer study in Germany. The comparison patterns 60 | were created by the Institute for Medical Biostatistics, 61 | Epidemiology and Informatics (IMBEI) and the University Medical 62 | Center of Johannes Gutenberg University (Mainz, Germany). The 63 | dataset is available for research online. 64 | 65 | "The records represent individual data including first and 66 | family name, sex, date of birth and postal code, which were 67 | collected through iterative insertions in the course of 68 | several years. The comparison patterns in this data set are 69 | based on a sample of 100.000 records dating from 2005 to 2008. 
70 | Data pairs were classified as "match" or "non-match" during 71 | an extensive manual review where several documentarists were 72 | involved. The resulting classification formed the basis for 73 | assessing the quality of the registry's own record linkage 74 | procedure. 75 | 76 | In order to limit the amount of patterns a blocking procedure 77 | was applied, which selects only record pairs that meet 78 | specific agreement conditions. The results of the following 79 | six blocking iterations were merged together: 80 | 81 | - Phonetic equality of first name and family name, equality of 82 | date of birth. 83 | - Phonetic equality of first name, equality of day of birth. 84 | - Phonetic equality of first name, equality of month of birth. 85 | - Phonetic equality of first name, equality of year of birth. 86 | - Equality of complete date of birth. 87 | - Phonetic equality of family name, equality of sex. 88 | 89 | This procedure resulted in 5.749.132 record pairs, of which 90 | 20.931 are matches. The data set is split into 10 blocks of 91 | (approximately) equal size and ratio of matches to 92 | non-matches." 93 | 94 | Parameters 95 | ---------- 96 | block : int, list 97 | An integer or a list with integers between 1 and 10. The 98 | blocks are the blocks explained in the description. Default 99 | all 1 to 10. 100 | missing_values : object, int, float 101 | The value of the missing values. Default NaN. 102 | shuffle : bool 103 | Shuffle the record pairs. Default True. 104 | 105 | Returns 106 | ------- 107 | (pandas.DataFrame, pandas.MultiIndex) 108 | A pandas.DataFrame with comparison vectors and a 109 | pandas.MultiIndex with the indices of the matches. 110 | 111 | """ 112 | 113 | if block is None: 114 | block = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 115 | 116 | # If the data is not found, download it. 117 | for i in range(1, 11): 118 | filepath = Path(get_data_home(), "krebsregister", f"block_{i}.zip") 119 | 120 | if not filepath.is_file(): 121 | _download_krebsregister() 122 | break 123 | 124 | if isinstance(block, (list, tuple)): 125 | data = pandas.concat([_krebsregister_block(bl) for bl in block]) 126 | else: 127 | data = _krebsregister_block(block) 128 | 129 | if shuffle: 130 | data = data.sample(frac=1, random_state=535) 131 | 132 | match_index = data.index[data["is_match"]] 133 | del data["is_match"] 134 | 135 | if pandas.notnull(missing_values): 136 | data.fillna(missing_values, inplace=True) 137 | 138 | return data, match_index 139 | 140 | 141 | def _download_krebsregister(): 142 | zip_file_url = ( 143 | "http://archive.ics.uci.edu/ml/" "machine-learning-databases/00210/donation.zip" 144 | ) 145 | 146 | folder = Path(get_data_home(), "krebsregister") 147 | 148 | try: 149 | print(f"Downloading data to {folder}.") 150 | r = urlopen(zip_file_url).read() 151 | 152 | # unzip the content and put it in the krebsregister folder 153 | z = zipfile.ZipFile(BytesIO(r)) 154 | z.extractall(str(folder)) 155 | 156 | print("Data download succesfull.") 157 | 158 | except Exception as e: 159 | print("Issue with downloading the data:", e) 160 | 161 | 162 | def _krebsregister_block(block): 163 | if block not in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: 164 | raise ValueError( 165 | "Argument 'block' has to be integer in " 166 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] or list of integers." 
167 | ) 168 | 169 | fp_i = Path(get_data_home(), "krebsregister", f"block_{block}.zip") 170 | 171 | data_block = pandas.read_csv( 172 | fp_i, index_col=["id_1", "id_2"], na_values="?", compression="zip" 173 | ) 174 | 175 | data_block.columns = [ 176 | "cmp_firstname1", 177 | "cmp_firstname2", 178 | "cmp_lastname1", 179 | "cmp_lastname2", 180 | "cmp_sex", 181 | "cmp_birthday", 182 | "cmp_birthmonth", 183 | "cmp_birthyear", 184 | "cmp_zipcode", 185 | "is_match", 186 | ] 187 | data_block.index.names = ["id1", "id2"] 188 | 189 | return data_block 190 | -------------------------------------------------------------------------------- /recordlinkage/datasets/febrl.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy 4 | import pandas 5 | 6 | 7 | def _febrl_load_data(filename): 8 | # Internal function for loading febrl data 9 | 10 | filepath = Path(Path(__file__).parent, "febrl", filename) 11 | 12 | febrl_data = pandas.read_csv( 13 | filepath, 14 | index_col="rec_id", 15 | sep=",", 16 | engine="c", 17 | skipinitialspace=True, 18 | encoding="utf-8", 19 | dtype={ 20 | "street_number": object, 21 | "date_of_birth": object, 22 | "soc_sec_id": object, 23 | "postcode": object, 24 | }, 25 | ) 26 | 27 | return febrl_data 28 | 29 | 30 | def _febrl_links(df): 31 | """Get the links of a FEBRL dataset.""" 32 | 33 | index = df.index.to_series() 34 | keys = index.str.extract(r"rec-(\d+)", expand=True)[0] 35 | 36 | index_int = numpy.arange(len(df)) 37 | 38 | df_helper = pandas.DataFrame({"key": keys, "index": index_int}) 39 | 40 | # merge the two frame and make MultiIndex. 41 | pairs_df = df_helper.merge(df_helper, on="key")[["index_x", "index_y"]] 42 | pairs_df = pairs_df[pairs_df["index_x"] > pairs_df["index_y"]] 43 | 44 | return pandas.MultiIndex( 45 | levels=[df.index.values, df.index.values], 46 | codes=[pairs_df["index_x"].values, pairs_df["index_y"].values], 47 | names=[None, None], 48 | verify_integrity=False, 49 | ) 50 | 51 | 52 | def load_febrl1(return_links=False): 53 | """Load the FEBRL 1 dataset. 54 | 55 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 56 | distributed with a dataset generator and four datasets generated 57 | with the generator. This function returns the first Febrl dataset 58 | as a :class:`pandas.DataFrame`. 59 | 60 | *"This data set contains 1000 records (500 original and 61 | 500 duplicates, with exactly one duplicate per original 62 | record."* 63 | 64 | Parameters 65 | ---------- 66 | return_links: bool 67 | When True, the function returns also the true links. 68 | 69 | Returns 70 | ------- 71 | pandas.DataFrame 72 | A :class:`pandas.DataFrame` with Febrl dataset1.csv. When 73 | return_links is True, the function returns also the true 74 | links. The true links are all links in the lower triangular 75 | part of the matrix. 76 | 77 | """ 78 | 79 | df = _febrl_load_data("dataset1.csv") 80 | 81 | if return_links: 82 | links = _febrl_links(df) 83 | return df, links 84 | else: 85 | return df 86 | 87 | 88 | def load_febrl2(return_links=False): 89 | """Load the FEBRL 2 dataset. 90 | 91 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 92 | distributed with a dataset generator and four datasets generated 93 | with the generator. This function returns the second Febrl dataset 94 | as a :class:`pandas.DataFrame`. 
95 | 96 | *"This data set contains 5000 records (4000 originals and 97 | 1000 duplicates), with a maximum of 5 duplicates based on 98 | one original record (and a poisson distribution of 99 | duplicate records). Distribution of duplicates: 100 | 19 originals records have 5 duplicate records 101 | 47 originals records have 4 duplicate records 102 | 107 originals records have 3 duplicate records 103 | 141 originals records have 2 duplicate records 104 | 114 originals records have 1 duplicate record 105 | 572 originals records have no duplicate record"* 106 | 107 | Parameters 108 | ---------- 109 | return_links: bool 110 | When True, the function returns also the true links. 111 | 112 | Returns 113 | ------- 114 | pandas.DataFrame 115 | A :class:`pandas.DataFrame` with Febrl dataset2.csv. When 116 | return_links is True, the function returns also the true 117 | links. The true links are all links in the lower triangular 118 | part of the matrix. 119 | 120 | """ 121 | 122 | df = _febrl_load_data("dataset2.csv") 123 | 124 | if return_links: 125 | links = _febrl_links(df) 126 | return df, links 127 | else: 128 | return df 129 | 130 | 131 | def load_febrl3(return_links=False): 132 | """Load the FEBRL 3 dataset. 133 | 134 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 135 | distributed with a dataset generator and four datasets generated 136 | with the generator. This function returns the third Febrl dataset 137 | as a :class:`pandas.DataFrame`. 138 | 139 | *"This data set contains 5000 records (2000 originals and 140 | 3000 duplicates), with a maximum of 5 duplicates based on 141 | one original record (and a Zipf distribution of duplicate 142 | records). Distribution of duplicates: 143 | 168 originals records have 5 duplicate records 144 | 161 originals records have 4 duplicate records 145 | 212 originals records have 3 duplicate records 146 | 256 originals records have 2 duplicate records 147 | 368 originals records have 1 duplicate record 148 | 1835 originals records have no duplicate record"* 149 | 150 | Parameters 151 | ---------- 152 | return_links: bool 153 | When True, the function returns also the true links. 154 | 155 | Returns 156 | ------- 157 | pandas.DataFrame 158 | A :class:`pandas.DataFrame` with Febrl dataset3.csv. When 159 | return_links is True, the function returns also the true 160 | links. The true links are all links in the lower triangular 161 | part of the matrix. 162 | 163 | """ 164 | 165 | df = _febrl_load_data("dataset3.csv") 166 | 167 | if return_links: 168 | links = _febrl_links(df) 169 | return df, links 170 | else: 171 | return df 172 | 173 | 174 | def load_febrl4(return_links=False): 175 | """Load the FEBRL 4 datasets. 176 | 177 | The Freely Extensible Biomedical Record Linkage (Febrl) package is 178 | distributed with a dataset generator and four datasets generated 179 | with the generator. This function returns the fourth Febrl dataset 180 | as a :class:`pandas.DataFrame`. 181 | 182 | *"Generated as one data set with 10000 records (5000 183 | originals and 5000 duplicates, with one duplicate per 184 | original), the originals have been split from the 185 | duplicates, into dataset4a.csv (containing the 5000 186 | original records) and dataset4b.csv (containing the 187 | 5000 duplicate records) These two data sets can be 188 | used for testing linkage procedures."* 189 | 190 | Parameters 191 | ---------- 192 | return_links: bool 193 | When True, the function returns also the true links. 
194 | 195 | Returns 196 | ------- 197 | (pandas.DataFrame, pandas.DataFrame) 198 | A :class:`pandas.DataFrame` with Febrl dataset4a.csv and a pandas 199 | dataframe with Febrl dataset4b.csv. When return_links is True, 200 | the function returns also the true links. 201 | 202 | """ 203 | 204 | df_a = _febrl_load_data("dataset4a.csv") 205 | df_b = _febrl_load_data("dataset4b.csv") 206 | 207 | if return_links: 208 | links = pandas.MultiIndex.from_arrays( 209 | [ 210 | [f"rec-{i}-org" for i in range(0, 5000)], 211 | [f"rec-{i}-dup-0" for i in range(0, 5000)], 212 | ] 213 | ) 214 | return df_a, df_b, links 215 | else: 216 | return df_a, df_b 217 | -------------------------------------------------------------------------------- /recordlinkage/datasets/generate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def binary_vectors( 6 | n, 7 | n_match, 8 | m=[0.9] * 8, 9 | u=[0.1] * 8, 10 | random_state=None, 11 | return_links=False, 12 | dtype=np.int8, 13 | ): 14 | """Generate random binary comparison vectors. 15 | 16 | This function is used to generate random comparison vectors. The 17 | result of each comparison is a binary value (0 or 1). 18 | 19 | Parameters 20 | ---------- 21 | n : int 22 | The total number of comparison vectors. 23 | n_match : int 24 | The number of matching record pairs. 25 | m : list, default [0.9] * 8, optional 26 | A list of m probabilities of each partially identifying 27 | variable. The m probability is the probability that an 28 | identifier in matching record pairs agrees. 29 | u : list, default [0.9] * 8, optional 30 | A list of u probabilities of each partially identifying 31 | variable. The u probability is the probability that an 32 | identifier in non-matching record pairs agrees. 33 | random_state : int or numpy.random.RandomState, optional 34 | Seed for the random number generator with an integer or numpy 35 | RandomState object. 36 | return_links: bool 37 | When True, the function returns also the true links. 38 | dtype: numpy.dtype 39 | The dtype of each column in the returned DataFrame. 40 | 41 | Returns 42 | ------- 43 | pandas.DataFrame 44 | A dataframe with comparison vectors. 
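A minimal usage sketch (values chosen arbitrarily; with the default `m` and `u` the result has 8 comparison columns, and `return_links=True` also returns the index of the simulated matches):

    >>> from recordlinkage.datasets import binary_vectors
    >>> X, links = binary_vectors(1000, 100, random_state=42,
    ...                           return_links=True)
    >>> X.shape
    (1000, 8)
    >>> len(links)
    100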
45 | 46 | 47 | """ 48 | 49 | if len(m) != len(u): 50 | raise ValueError("the length of 'm' is not equal the length of 'u'") 51 | 52 | if n_match >= n or n_match < 0: 53 | raise ValueError("the number of matches is bounded by [0, n]") 54 | 55 | # set the random seed 56 | np.random.seed(random_state) 57 | 58 | matches = [] 59 | nonmatches = [] 60 | 61 | sample_set = np.array([0, 1], dtype=dtype) 62 | 63 | for i, _ in enumerate(m): 64 | p_mi = [1 - m[i], m[i]] 65 | p_ui = [1 - u[i], u[i]] 66 | 67 | comp_mi = np.random.choice(sample_set, (n_match, 1), p=p_mi) 68 | comp_ui = np.random.choice(sample_set, (n - n_match, 1), p=p_ui) 69 | 70 | nonmatches.append(comp_ui) 71 | matches.append(comp_mi) 72 | 73 | match_block = np.concatenate(matches, axis=1) 74 | nonmatch_block = np.concatenate(nonmatches, axis=1) 75 | 76 | data_np = np.concatenate((match_block, nonmatch_block), axis=0) 77 | index_np = np.random.randint(1001, 1001 + n * 2, (n, 2)) 78 | 79 | data_col_names = ["c_%s" % (i + 1) for i in range(len(m))] 80 | data_mi = pd.MultiIndex.from_arrays([index_np[:, 0], index_np[:, 1]]) 81 | data_df = pd.DataFrame(data_np, index=data_mi, columns=data_col_names) 82 | 83 | features = data_df.sample(frac=1, random_state=random_state) 84 | 85 | if return_links: 86 | links = data_mi[:n_match] 87 | return features, links 88 | else: 89 | return features 90 | -------------------------------------------------------------------------------- /recordlinkage/deprecated.py: -------------------------------------------------------------------------------- 1 | """Home of all deprecated functions and classes.""" 2 | -------------------------------------------------------------------------------- /recordlinkage/network.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from recordlinkage.types import is_pandas_2d_multiindex 4 | from recordlinkage.types import is_pandas_multiindex 5 | 6 | 7 | class OneToOneLinking: 8 | """[EXPERIMENTAL] One-to-one linking 9 | 10 | A record from dataset A can match at most one record from dataset 11 | B. For example, (a1, a2) are records from A and (b1, b2) are records 12 | from B. A linkage of (a1, b1), (a1, b2), (a2, b1), (a2, b2) is not 13 | one-to-one connected. One of the results of one-to-one linking can 14 | be (a1, b1), (a2, b2). 15 | 16 | Parameters 17 | ---------- 18 | method : str 19 | The method to solve the problem. Only 'greedy' is supported at 20 | the moment. 21 | 22 | Note 23 | ---- 24 | 25 | This class is experimental and might change in future versions. 
26 | 27 | """ 28 | 29 | def __init__(self, method="greedy"): 30 | super().__init__() 31 | 32 | self.method = method 33 | 34 | @classmethod 35 | def _bool_duplicated(cls, links, level): 36 | return links.get_level_values(level).duplicated() 37 | 38 | def _compute_greedy(self, links): 39 | result = [] 40 | set_a = set() 41 | set_b = set() 42 | 43 | for index_a, index_b in links: 44 | if index_a not in set_a and index_b not in set_b: 45 | result.append((index_a, index_b)) 46 | set_a.add(index_a) 47 | set_b.add(index_b) 48 | 49 | return pd.MultiIndex.from_tuples(result) 50 | 51 | def _compute(self, links): 52 | if not is_pandas_2d_multiindex(links): 53 | if not is_pandas_multiindex(links): 54 | raise TypeError("expected pandas.MultiIndex") 55 | elif not is_pandas_2d_multiindex(links): 56 | raise ValueError( 57 | "pandas.MultiIndex has incorrect number of " 58 | "levels (expected 2 levels)" 59 | ) 60 | 61 | if self.method == "greedy": 62 | return self._compute_greedy(links) 63 | else: 64 | raise ValueError(f"unknown matching method {self.method}") 65 | 66 | def compute(self, links): 67 | """Compute the one-to-one linking. 68 | 69 | Parameters 70 | ---------- 71 | links : pandas.MultiIndex 72 | The pairs to apply linking to. 73 | 74 | Returns 75 | ------- 76 | pandas.MultiIndex 77 | A one-to-one matched MultiIndex of record pairs. 78 | 79 | """ 80 | 81 | return self._compute(links) 82 | 83 | 84 | class OneToManyLinking(OneToOneLinking): 85 | """[EXPERIMENTAL] One-to-many linking 86 | 87 | A record from dataset A can link multiple records from dataset B, 88 | but a record from B can link to only one record of dataset A. Use 89 | the `level` argument to switch A and B. 90 | 91 | Parameters 92 | ---------- 93 | level : int 94 | The level of the MultiIndex to have the one relations. The 95 | options are 0 or 1 (incication the level of the MultiIndex). 96 | Default 0. 97 | method : str 98 | The method to solve the problem. Only 'greedy' is supported at 99 | the moment. 100 | 101 | Example 102 | ------- 103 | 104 | Consider a MultiIndex with record pairs constructed from datasets A 105 | and B. To link a record from B to at most one record of B, use the 106 | following syntax: 107 | 108 | > one_to_many = OneToManyLinking(0) 109 | > one_to_many.compute(links) 110 | 111 | To link a record from B to at most one record 112 | of B, use: 113 | 114 | > one_to_many = OneToManyLinking(1) 115 | > one_to_many.compute(links) 116 | 117 | Note 118 | ---- 119 | 120 | This class is experimental and might change in future versions. 121 | 122 | """ 123 | 124 | def __init__(self, level=0, method="greedy"): 125 | super().__init__(method=method) 126 | 127 | self.level = level 128 | 129 | def _compute_greedy(self, links): 130 | source_dupl_bool = self._bool_duplicated(links, self.level) 131 | return links[~source_dupl_bool] 132 | 133 | def compute(self, links): 134 | """Compute the one-to-many matching. 135 | 136 | Parameters 137 | ---------- 138 | links : pandas.MultiIndex 139 | The pairs to apply linking to. 140 | 141 | Returns 142 | ------- 143 | pandas.MultiIndex 144 | A one-to-many matched MultiIndex of record pairs. 145 | 146 | """ 147 | 148 | return self._compute(links) 149 | 150 | 151 | class ConnectedComponents: 152 | """[EXPERIMENTAL] Connected record pairs 153 | 154 | This class identifies connected record pairs. Connected components 155 | are especially used in detecting duplicates in a single dataset. 156 | 157 | Note 158 | ---- 159 | 160 | This class is experimental and might change in future versions. 
161 | """ 162 | 163 | def __init__(self): 164 | super().__init__() 165 | 166 | def compute(self, links): 167 | """Return the connected components. 168 | 169 | Parameters 170 | ---------- 171 | links : pandas.MultiIndex 172 | The links to apply one-to-one matching on. 173 | 174 | Returns 175 | ------- 176 | list of pandas.MultiIndex 177 | A list with pandas.MultiIndex objects. Each MultiIndex 178 | object represents a set of connected record pairs. 179 | 180 | """ 181 | 182 | try: 183 | import networkx as nx 184 | except ImportError as err: 185 | raise Exception("'networkx' module is needed for this operation") from err 186 | 187 | graph_pairs = nx.Graph() 188 | graph_pairs.add_edges_from(links.values) 189 | connected_pairs = ( 190 | graph_pairs.subgraph(c).copy() for c in nx.connected_components(graph_pairs) 191 | ) 192 | 193 | links_result = [ 194 | pd.MultiIndex.from_tuples(subgraph.edges()) for subgraph in connected_pairs 195 | ] 196 | 197 | return links_result 198 | -------------------------------------------------------------------------------- /recordlinkage/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from recordlinkage.preprocessing.cleaning import clean 2 | from recordlinkage.preprocessing.cleaning import phonenumbers 3 | from recordlinkage.preprocessing.cleaning import value_occurence 4 | from recordlinkage.preprocessing.encoding import _list_phonetic_algorithms 5 | from recordlinkage.preprocessing.encoding import phonetic 6 | 7 | phonetic_algorithms = _list_phonetic_algorithms() 8 | """List of available phonetic algorithms.""" 9 | 10 | __all__ = [ 11 | "phonetic_algorithms", 12 | "clean", 13 | "phonetic", 14 | "value_occurence", 15 | "phonenumbers", 16 | ] 17 | -------------------------------------------------------------------------------- /recordlinkage/preprocessing/cleaning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from sklearn.feature_extraction.text import strip_accents_ascii 4 | from sklearn.feature_extraction.text import strip_accents_unicode 5 | 6 | 7 | def clean( 8 | s, 9 | lowercase=True, 10 | replace_by_none=r"[^ \-\_A-Za-z0-9]+", 11 | replace_by_whitespace=r"[\-\_]", 12 | strip_accents=None, 13 | remove_brackets=True, 14 | encoding="utf-8", 15 | decode_error="strict", 16 | ): 17 | """Clean string variables. 18 | 19 | Clean strings in the Series by removing unwanted tokens, 20 | whitespace and brackets. 21 | 22 | Parameters 23 | ---------- 24 | s : pandas.Series 25 | A Series to clean. 26 | lower : bool, optional 27 | Convert strings in the Series to lowercase. Default True. 28 | replace_by_none : str, optional 29 | The matches of this regular expression are replaced by ''. 30 | replace_by_whitespace : str, optional 31 | The matches of this regular expression are replaced by a 32 | whitespace. 33 | remove_brackets : bool, optional 34 | Remove all content between brackets and the bracket 35 | themselves. Default True. 36 | strip_accents : {'ascii', 'unicode', None}, optional 37 | Remove accents during the preprocessing step. 'ascii' is a 38 | fast method that only works on characters that have an direct 39 | ASCII mapping. 'unicode' is a slightly slower method that 40 | works on any characters. None (default) does nothing. 41 | encoding : str, optional 42 | If bytes are given, this encoding is used to decode. Default 43 | is 'utf-8'. 
44 | decode_error : {'strict', 'ignore', 'replace'}, optional 45 | Instruction on what to do if a byte Series is given that 46 | contains characters not of the given `encoding`. By default, 47 | it is 'strict', meaning that a UnicodeDecodeError will be 48 | raised. Other values are 'ignore' and 'replace'. 49 | 50 | Example 51 | ------- 52 | >>> import pandas 53 | >>> from recordlinkage.preprocessing import clean 54 | >>> 55 | >>> names = ['Mary-ann', 56 | 'Bob :)', 57 | 'Angel', 58 | 'Bob (alias Billy)', 59 | None] 60 | >>> s = pandas.Series(names) 61 | >>> print(clean(s)) 62 | 0 mary ann 63 | 1 bob 64 | 2 angel 65 | 3 bob 66 | 4 NaN 67 | dtype: object 68 | 69 | Returns 70 | ------- 71 | pandas.Series: 72 | A cleaned Series of strings. 73 | 74 | """ 75 | 76 | if s.shape[0] == 0: 77 | return s 78 | 79 | # Lower s if lower is True 80 | if lowercase is True: 81 | s = s.str.lower() 82 | 83 | # Accent stripping based on https://github.com/scikit-learn/ 84 | # scikit-learn/blob/412996f/sklearn/feature_extraction/text.py 85 | # BSD license 86 | if not strip_accents: 87 | pass 88 | elif callable(strip_accents): 89 | strip_accents_fn = strip_accents 90 | elif strip_accents == "ascii": 91 | strip_accents_fn = strip_accents_ascii 92 | elif strip_accents == "unicode": 93 | strip_accents_fn = strip_accents_unicode 94 | else: 95 | raise ValueError(f"Invalid value for 'strip_accents': {strip_accents}") 96 | 97 | # Remove accents etc 98 | if strip_accents: 99 | 100 | def strip_accents_fn_wrapper(x): 101 | if sys.version_info[0] >= 3: 102 | if isinstance(x, str): 103 | return strip_accents_fn(x) 104 | else: 105 | return x 106 | else: 107 | if isinstance(x, unicode): # noqa 108 | return strip_accents_fn(x) 109 | else: 110 | return x 111 | 112 | # encoding 113 | s = s.apply( 114 | lambda x: x.decode(encoding, decode_error) if type(x) == bytes else x 115 | ) 116 | s = s.map(lambda x: strip_accents_fn_wrapper(x)) 117 | 118 | # Remove all content between brackets 119 | if remove_brackets is True: 120 | s = s.str.replace(r"(\[.*?\]|\(.*?\)|\{.*?\})", "", regex=True) 121 | 122 | # Remove the special characters 123 | if replace_by_none: 124 | s = s.str.replace(replace_by_none, "", regex=True) 125 | 126 | if replace_by_whitespace: 127 | s = s.str.replace(replace_by_whitespace, " ", regex=True) 128 | 129 | # Remove multiple whitespaces 130 | s = s.str.replace(r"\s\s+", " ", regex=True) 131 | 132 | # Strip s 133 | s = s.str.lstrip().str.rstrip() 134 | 135 | return s 136 | 137 | 138 | def phonenumbers(s): 139 | """Clean phonenumbers by removing all non-numbers (except +). 140 | 141 | Parameters 142 | ---------- 143 | s: pandas.Series 144 | A Series to clean. 145 | 146 | Returns 147 | ------- 148 | pandas.Series 149 | A Series with cleaned phonenumbers. 150 | 151 | """ 152 | 153 | # Remove all special tokens 154 | s = s.astype(object).str.replace("[^0-9+]+", "", regex=True) 155 | 156 | return s 157 | 158 | 159 | def value_occurence(s): 160 | """Count the number of times each value occurs. 161 | 162 | This function returns the counts for each row, in contrast with 163 | `pandas.value_counts `_. 165 | 166 | Returns 167 | ------- 168 | pandas.Series 169 | A Series with value counts. 
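A small illustration (the counts are returned per row, aligned with the input):

    >>> import pandas as pd
    >>> from recordlinkage.preprocessing import value_occurence
    >>> value_occurence(pd.Series(["amsterdam", "rotterdam", "amsterdam"])).tolist()
    [2, 1, 2]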
170 | 171 | """ 172 | 173 | # https://github.com/pydata/pandas/issues/3729 174 | value_count = s.fillna("NAN") 175 | 176 | return value_count.groupby(by=value_count).transform("count") 177 | -------------------------------------------------------------------------------- /recordlinkage/preprocessing/encoding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import jellyfish 4 | import numpy as np 5 | import pandas 6 | 7 | _phonetic_algorithms = [ 8 | {"name": "Soundex", "callback": jellyfish.soundex, "argument_names": ["soundex"]}, 9 | { 10 | "name": "NYSIIS", 11 | "callback": jellyfish.nysiis, 12 | "argument_names": ["nysiis", "nyssis"], 13 | }, 14 | { 15 | "name": "Metaphone", 16 | "callback": jellyfish.metaphone, 17 | "argument_names": ["metaphone"], 18 | }, 19 | { 20 | "name": "Match Rating", 21 | "callback": jellyfish.match_rating_codex, 22 | "argument_names": [ 23 | "match_rating", 24 | "match rating", 25 | "matchrating", 26 | "match_rating_codex", 27 | "matchratingcodex", 28 | ], 29 | }, 30 | ] 31 | 32 | 33 | def _list_phonetic_algorithms(): 34 | """Return list of available phonetic algorithms.""" 35 | 36 | return [alg["argument_names"][0] for alg in _phonetic_algorithms] 37 | 38 | 39 | def phonetic(s, method, concat=True, encoding="utf-8", decode_error="strict"): 40 | """Convert names or strings into phonetic codes. 41 | 42 | The implemented algorithms are `soundex 43 | `_, `nysiis 44 | `_, `metaphone 46 | `_ or `match_rating 47 | `_. 48 | 49 | Parameters 50 | ---------- 51 | s : pandas.Series 52 | A pandas.Series with string values (often names) to encode. 53 | method: str 54 | The algorithm that is used to phonetically encode the values. 55 | The possible options are "soundex", "nysiis", "metaphone" or 56 | "match_rating". 57 | concat: bool, optional 58 | Remove whitespace before phonetic encoding. 59 | encoding: str, optional 60 | If bytes are given, this encoding is used to decode. Default 61 | is 'utf-8'. 62 | decode_error: {'strict', 'ignore', 'replace'}, optional 63 | Instruction on what to do if a byte Series is given that 64 | contains characters not of the given `encoding`. By default, 65 | it is 'strict', meaning that a UnicodeDecodeError will be 66 | raised. Other values are 'ignore' and 'replace'. 67 | 68 | Returns 69 | ------- 70 | pandas.Series 71 | A Series with phonetic encoded values. 72 | 73 | """ 74 | 75 | # encoding 76 | if sys.version_info[0] == 2: 77 | s = s.apply( 78 | lambda x: x.decode(encoding, decode_error) if type(x) == bytes else x 79 | ) 80 | 81 | if concat: 82 | s = s.str.replace(r"[\-\_\s]", "", regex=True) 83 | 84 | for alg in _phonetic_algorithms: 85 | if method in alg["argument_names"]: 86 | phonetic_callback = alg["callback"] 87 | break 88 | else: 89 | raise ValueError(f"The algorithm '{method}' is not known.") 90 | 91 | return s.str.upper().apply( 92 | lambda x: phonetic_callback(x) if pandas.notnull(x) else np.nan 93 | ) 94 | -------------------------------------------------------------------------------- /recordlinkage/rl_logging.py: -------------------------------------------------------------------------------- 1 | """Logging utilities.""" 2 | 3 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================ 17 | # 18 | # Modifications copyright Jonathan de Bruin 2017 19 | 20 | # pylint: disable=unused-import 21 | 22 | import logging as _logging 23 | import sys as _sys 24 | from logging import DEBUG # noqa 25 | from logging import ERROR # noqa 26 | from logging import FATAL # noqa 27 | from logging import INFO # noqa 28 | from logging import WARN # noqa 29 | 30 | # Determine whether we are in an interactive environment 31 | _interactive = False 32 | try: 33 | # This is only defined in interactive shells 34 | if _sys.ps1: 35 | _interactive = True 36 | except AttributeError: 37 | # Even now, we may be in an interactive shell with `python -i`. 38 | _interactive = _sys.flags.interactive 39 | 40 | # Scope the tensorflow logger to not conflict with users' loggers 41 | _logger = _logging.getLogger("recordlinkage") 42 | 43 | # If we are in an interactive environment (like jupyter), set loglevel to info 44 | # and pipe the output to stdout 45 | if _interactive: 46 | _logger.setLevel(WARN) 47 | _logging_target = _sys.stdout 48 | else: 49 | _logging_target = _sys.stderr 50 | 51 | # Add the output handler 52 | _handler = _logging.StreamHandler(_logging_target) 53 | _handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT, None)) 54 | _logger.addHandler(_handler) 55 | 56 | log = _logger.log 57 | debug = _logger.debug 58 | error = _logger.error 59 | fatal = _logger.fatal 60 | info = _logger.info 61 | warning = _logger.warning 62 | 63 | 64 | def get_verbosity(): 65 | """Return how much logging output will be produced.""" 66 | return _logger.getEffectiveLevel() 67 | 68 | 69 | def set_verbosity(verbosity): 70 | """Sets the threshold for what messages will be logged.""" 71 | _logger.setLevel(verbosity) 72 | -------------------------------------------------------------------------------- /recordlinkage/standardise/__init__.py: -------------------------------------------------------------------------------- 1 | # This module is renamed into preprocessing. Please use the preprocessing 2 | # module instead of this module. 
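# For example (illustrative), recordlinkage.standardise.clean(series) still
# works but simply forwards to recordlinkage.preprocessing.clean(series) and
# emits a DeprecationWarning via the thin wrapper functions defined below.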
3 | 4 | import warnings 5 | 6 | from recordlinkage.preprocessing import clean as _clean 7 | from recordlinkage.preprocessing import phonenumbers as _phonenumbers 8 | from recordlinkage.preprocessing import phonetic as _phonetic 9 | from recordlinkage.preprocessing import value_occurence as _value_occurence 10 | 11 | 12 | def _depr_warn(): 13 | warnings.warn( 14 | "module recordlinkage.standardise is deprecated, use " 15 | "recordlinkage.preprocessing instead", 16 | DeprecationWarning, 17 | stacklevel=2, 18 | ) 19 | 20 | 21 | def clean(*args, **kwargs): 22 | _depr_warn() 23 | 24 | return _clean(*args, **kwargs) 25 | 26 | 27 | def phonenumbers(*args, **kwargs): 28 | _depr_warn() 29 | 30 | return _phonenumbers(*args, **kwargs) 31 | 32 | 33 | def value_occurence(*args, **kwargs): 34 | _depr_warn() 35 | 36 | return _value_occurence(*args, **kwargs) 37 | 38 | 39 | def phonetic(*args, **kwargs): 40 | _depr_warn() 41 | 42 | return _phonetic(*args, **kwargs) 43 | -------------------------------------------------------------------------------- /recordlinkage/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | basic inference routines 3 | 4 | most functions taken from pandas (https://github.com/pandas-dev/pandas) 5 | License BSD 6 | 7 | """ 8 | 9 | import collections 10 | import re 11 | from numbers import Number 12 | 13 | import numpy 14 | import pandas 15 | 16 | string_and_binary_types = (str, bytes) 17 | 18 | 19 | def is_number(obj): 20 | return isinstance(obj, (Number, numpy.number)) 21 | 22 | 23 | def is_string_like(obj): 24 | return isinstance(obj, str) 25 | 26 | 27 | def _iterable_not_string(x): 28 | return isinstance(x, collections.Iterable) and not isinstance(x, str) 29 | 30 | 31 | def is_iterator(obj): 32 | return hasattr(obj, "__next__") 33 | 34 | 35 | def is_re(obj): 36 | return isinstance(obj, re._pattern_type) 37 | 38 | 39 | def is_re_compilable(obj): 40 | try: 41 | re.compile(obj) 42 | except TypeError: 43 | return False 44 | else: 45 | return True 46 | 47 | 48 | def is_list_like(arg): 49 | return hasattr(arg, "__iter__") and not isinstance(arg, string_and_binary_types) 50 | 51 | 52 | def is_dict_like(arg): 53 | return hasattr(arg, "__getitem__") and hasattr(arg, "keys") 54 | 55 | 56 | def is_named_tuple(arg): 57 | return isinstance(arg, tuple) and hasattr(arg, "_fields") 58 | 59 | 60 | def is_hashable(arg): 61 | """Return True if hash(arg) will succeed, False otherwise. 62 | 63 | Some types will pass a test against collections.Hashable but fail when they 64 | are actually hashed with hash(). 65 | 66 | Distinguish between these and other types by trying the call to hash() and 67 | seeing if they raise TypeError. 
68 | 69 | Examples 70 | -------- 71 | >>> a = ([],) 72 | >>> isinstance(a, collections.Hashable) 73 | True 74 | >>> is_hashable(a) 75 | False 76 | """ 77 | 78 | # unfortunately, we can't use isinstance(arg, collections.Hashable), which 79 | # can be faster than calling hash, because numpy scalars on Python 3 fail 80 | # this test 81 | 82 | # reconsider this decision once this numpy bug is fixed: 83 | # https://github.com/numpy/numpy/issues/5562 84 | 85 | try: 86 | hash(arg) 87 | except TypeError: 88 | return False 89 | else: 90 | return True 91 | 92 | 93 | def is_sequence(x): 94 | try: 95 | iter(x) 96 | len(x) # it has a length 97 | return not isinstance(x, string_and_binary_types) 98 | except (TypeError, AttributeError): 99 | return False 100 | 101 | 102 | def is_pandas_like(x): 103 | return isinstance(x, (pandas.Series, pandas.DataFrame)) 104 | 105 | 106 | def is_pandas_multiindex(x): 107 | return isinstance(x, (pandas.MultiIndex)) 108 | 109 | 110 | def is_pandas_2d_multiindex(x): 111 | return is_pandas_multiindex(x) and x.nlevels == 2 112 | 113 | 114 | def is_numpy_like(x): 115 | return isinstance(x, (numpy.ndarray)) 116 | -------------------------------------------------------------------------------- /recordlinkage/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from functools import wraps 3 | 4 | import numpy 5 | import pandas 6 | 7 | import recordlinkage.config as cf 8 | 9 | 10 | # Errors and Exception handlers 11 | class IndexError(Exception): 12 | """Error class for errors related to indexing.""" 13 | 14 | pass 15 | 16 | 17 | class LearningError(Exception): 18 | """Learning error""" 19 | 20 | 21 | class DeprecationHelper: 22 | """Deprecation helper for classes and functions. 23 | 24 | Based on https://stackoverflow.com/a/9008509/8727928 25 | """ 26 | 27 | def __init__(self, new_target, msg=None): 28 | self.new_target = new_target 29 | self.msg = msg 30 | 31 | def _warn(self): 32 | from warnings import warn 33 | 34 | if self.msg is None: 35 | msg = "This class will get deprecated." 36 | else: 37 | msg = self.msg 38 | 39 | warn(msg, DeprecationWarning, stacklevel=1) 40 | 41 | def __call__(self, *args, **kwargs): 42 | self._warn() 43 | return self.new_target(*args, **kwargs) 44 | 45 | def __getattr__(self, attr): 46 | self._warn() 47 | return getattr(self.new_target, attr) 48 | 49 | 50 | def return_type_deprecator(func): 51 | @wraps(func) 52 | def func_wrapper(*args, **kwargs): 53 | return_type = kwargs.pop("return_type", None) 54 | if return_type is not None: 55 | warnings.warn( 56 | "The argument 'return_type' is deprecated in the next " 57 | "version. Use recordlinkage.set_option('classification." 
58 | "return_type', '{}') instead.".format(return_type), 59 | DeprecationWarning, 60 | stacklevel=2, 61 | ) 62 | with cf.option_context("classification.return_type", return_type): 63 | return func(*args, **kwargs) 64 | else: 65 | return func(*args, **kwargs) 66 | 67 | return func_wrapper 68 | 69 | 70 | # Checks and conversions 71 | def is_label_dataframe(label, df): 72 | """check column label existance""" 73 | 74 | setdiff = set(label) - set(df.columns.tolist()) 75 | 76 | if len(setdiff) == 0: 77 | return True 78 | else: 79 | return False 80 | 81 | 82 | def get_length(x): 83 | """Return int or len(x)""" 84 | 85 | try: 86 | return int(x) 87 | except Exception: 88 | return len(x) 89 | 90 | 91 | def listify(x, none_value=[]): 92 | """Make a list of the argument if it is not a list.""" 93 | 94 | if isinstance(x, list): 95 | return x 96 | elif isinstance(x, tuple): 97 | return list(x) 98 | elif x is None: 99 | return none_value 100 | else: 101 | return [x] 102 | 103 | 104 | def unique(x): 105 | """Convert a list in a unique list.""" 106 | 107 | return list(set(x)) 108 | 109 | 110 | def merge_dicts(*dict_args): 111 | """ 112 | Given any number of dicts, shallow copy and merge into a new dict, 113 | precedence goes to key value pairs in latter dicts. 114 | """ 115 | result = {} 116 | for dictionary in dict_args: 117 | result.update(dictionary) 118 | return result 119 | 120 | 121 | def multi_index_to_frame(index): 122 | """ 123 | Replicates MultiIndex.to_frame, which was introduced in pandas 0.21, 124 | for the sake of backwards compatibility. 125 | """ 126 | return pandas.DataFrame(index.tolist(), index=index, columns=index.names) 127 | 128 | 129 | def index_split(index, chunks): 130 | """Function to split pandas.Index and pandas.MultiIndex objects. 131 | 132 | Split :class:`pandas.Index` and :class:`pandas.MultiIndex` objects 133 | into chunks. This function is based on :func:`numpy.array_split`. 134 | 135 | Parameters 136 | ---------- 137 | index : pandas.Index, pandas.MultiIndex 138 | A pandas.Index or pandas.MultiIndex to split into chunks. 139 | chunks : int 140 | The number of parts to split the index into. 141 | 142 | Returns 143 | ------- 144 | list 145 | A list with chunked pandas.Index or pandas.MultiIndex objects. 146 | 147 | """ 148 | 149 | Ntotal = index.shape[0] 150 | Nsections = int(chunks) 151 | if Nsections <= 0: 152 | raise ValueError("number sections must be larger than 0.") 153 | Neach_section, extras = divmod(Ntotal, Nsections) 154 | section_sizes = ( 155 | [0] + extras * [Neach_section + 1] + (Nsections - extras) * [Neach_section] 156 | ) 157 | div_points = numpy.array(section_sizes).cumsum() 158 | 159 | sub_ind = [] 160 | for i in range(Nsections): 161 | st = div_points[i] 162 | end = div_points[i + 1] 163 | sub_ind.append(index[st:end]) 164 | 165 | return sub_ind 166 | 167 | 168 | def split_index(*args, **kwargs): 169 | warnings.warn( 170 | "Function will be removed in the future. Use index_split.", 171 | DeprecationWarning, 172 | stacklevel=2, 173 | ) 174 | 175 | return index_split(*args, **kwargs) 176 | 177 | 178 | def frame_indexing(frame, multi_index, level_i, indexing_type="label"): 179 | """Index dataframe based on one level of MultiIndex. 180 | 181 | Arguments 182 | --------- 183 | frame : pandas.DataFrame 184 | The datafrme to select records from. 185 | multi_index : pandas.MultiIndex 186 | A pandas multiindex were one fo the levels is used to sample the 187 | dataframe with. 188 | level_i : int, str 189 | The level of the multiIndex to index on. 
190 | indexing_type : str 191 | The type of indexing. The value can be 'label' or 'position'. 192 | Default 'label'. 193 | 194 | """ 195 | 196 | if indexing_type == "label": 197 | data = frame.loc[multi_index.get_level_values(level_i)] 198 | data.index = multi_index 199 | elif indexing_type == "position": 200 | data = frame.iloc[multi_index.get_level_values(level_i)] 201 | data.index = multi_index 202 | else: 203 | raise ValueError("indexing_type needs to be 'label' or 'position'") 204 | 205 | return data 206 | 207 | 208 | def fillna(series_or_arr, missing_value=0.0): 209 | """Fill missing values in pandas objects and numpy arrays. 210 | 211 | Arguments 212 | --------- 213 | series_or_arr : pandas.Series, numpy.ndarray 214 | The numpy array or pandas series for which the missing values 215 | need to be replaced. 216 | missing_value : float, int, str 217 | The value to replace the missing value with. Default 0.0. 218 | 219 | Returns 220 | ------- 221 | pandas.Series, numpy.ndarray 222 | The numpy array or pandas series with the missing values 223 | filled. 224 | """ 225 | 226 | if pandas.notnull(missing_value): 227 | if isinstance(series_or_arr, (numpy.ndarray)): 228 | series_or_arr[numpy.isnan(series_or_arr)] = missing_value 229 | else: 230 | series_or_arr.fillna(missing_value, inplace=True) 231 | 232 | return series_or_arr 233 | -------------------------------------------------------------------------------- /tests/test_annotator.py: -------------------------------------------------------------------------------- 1 | import recordlinkage as rl 2 | from recordlinkage.datasets import load_febrl1 3 | from recordlinkage.datasets import load_febrl4 4 | from recordlinkage.index import Block 5 | 6 | 7 | def test_annotation_link(tmp_path): 8 | path = tmp_path / "febrl_annotation_link.json" 9 | 10 | # get febrl4 file 11 | df_a, df_b, matches = load_febrl4(return_links=True) 12 | 13 | # get record pairs 14 | indexer = Block("given_name", "given_name") 15 | pairs = indexer.index(df_a, df_b) 16 | 17 | # create annotation file 18 | # write an annotation file for the Febrl4 dataset. 19 | rl.write_annotation_file(path, pairs[0:10], df_a, df_b) 20 | 21 | # read the result 22 | result = rl.read_annotation_file(path) 23 | 24 | assert result.links is None 25 | assert result.distinct is None 26 | 27 | 28 | def test_annotation_dedup(tmp_path): 29 | path = tmp_path / "febrl_annotation_dedup.json" 30 | 31 | # get febrl4 file 32 | df_a, matches = load_febrl1(return_links=True) 33 | 34 | # get record pairs 35 | indexer = Block("given_name", "given_name") 36 | pairs = indexer.index(df_a) 37 | 38 | # create annotation file 39 | # write an annotation file for the Febrl4 dataset. 
40 | rl.write_annotation_file(path, pairs[0:10], df_a) 41 | 42 | # read the result 43 | result = rl.read_annotation_file(path) 44 | 45 | assert result.links is None 46 | assert result.distinct is None 47 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from os import environ 4 | from pathlib import Path 5 | 6 | import numpy 7 | import pandas 8 | import pytest 9 | 10 | from recordlinkage.datasets import binary_vectors 11 | from recordlinkage.datasets import clear_data_home 12 | from recordlinkage.datasets import get_data_home 13 | from recordlinkage.datasets import load_febrl1 14 | from recordlinkage.datasets import load_febrl2 15 | from recordlinkage.datasets import load_febrl3 16 | from recordlinkage.datasets import load_febrl4 17 | from recordlinkage.datasets import load_krebsregister 18 | 19 | FEBRL_DEDUP = [ 20 | # nlinks = 500 21 | (load_febrl1, 1000, 500), 22 | # nlinks=19*6*5/2+47*5*4/2+107*4*3/2+141*3*2/2+114 23 | (load_febrl2, 5000, 1934), 24 | # nlinks=168*6*5/2+161*5*4/2+212*4*3/2+256*3*2/2+368 25 | (load_febrl3, 5000, 6538), 26 | ] 27 | 28 | 29 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP) 30 | def test_febrl_dedup(dataset, nrows, nlinks): 31 | df = dataset() 32 | assert isinstance(df, pandas.DataFrame) 33 | assert len(df) == nrows 34 | 35 | 36 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP) 37 | def test_febrl_dedup_links(dataset, nrows, nlinks): 38 | df, links = dataset(return_links=True) 39 | assert isinstance(df, pandas.DataFrame) 40 | assert len(df) == nrows 41 | assert len(links) == nlinks 42 | assert isinstance(links, pandas.MultiIndex) 43 | 44 | 45 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP) 46 | def test_febrl_dedup_tril(dataset, nrows, nlinks): 47 | df, links = dataset(return_links=True) 48 | 49 | s_level_1 = pandas.Series(numpy.arange(len(df)), index=df.index) 50 | s_level_2 = pandas.Series(numpy.arange(len(df)), index=df.index) 51 | 52 | x1 = s_level_1.loc[links.get_level_values(0)] 53 | x2 = s_level_2.loc[links.get_level_values(1)] 54 | 55 | assert numpy.all(x1.values > x2.values) 56 | 57 | 58 | def test_febrl4(): 59 | dfa, dfb = load_febrl4() 60 | assert isinstance(dfa, pandas.DataFrame) 61 | assert isinstance(dfb, pandas.DataFrame) 62 | assert len(dfa) == 5000 63 | assert len(dfb) == 5000 64 | 65 | 66 | def test_febrl_links(): 67 | dfa, dfb, links = load_febrl4(return_links=True) 68 | assert isinstance(dfa, pandas.DataFrame) 69 | assert isinstance(dfb, pandas.DataFrame) 70 | assert len(dfa) == 5000 71 | assert len(dfb) == 5000 72 | assert isinstance(links, pandas.MultiIndex) 73 | 74 | 75 | @pytest.mark.skip(reason="Causes undeterministic problems") 76 | def test_krebs_dataset_download(): 77 | # remove downloaded datasets 78 | clear_data_home() 79 | 80 | krebs_data, krebs_matches = load_krebsregister() 81 | 82 | for i in range(1, 11): 83 | assert Path(get_data_home(), "krebsregister", f"block_{i}.zip").is_file() 84 | 85 | # count the number of recordss 86 | assert type(krebs_data), pandas.DataFrame 87 | assert type(krebs_matches), pandas.MultiIndex 88 | assert len(krebs_data) == 5749132 89 | assert len(krebs_matches) == 20931 90 | 91 | 92 | @pytest.mark.skip(reason="Causes undeterministic problems") 93 | def test_krebs_dataset_environ(tmpdir): 94 | path = Path(str(tmpdir)).expanduser() 95 | environ["RL_DATA"] = str(path) 96 | 97 | 
krebs_data, krebs_matches = load_krebsregister() 98 | 99 | for i in range(1, 11): 100 | assert Path(path, "krebsregister", f"block_{i}.zip").is_file() 101 | 102 | 103 | @pytest.mark.skip(reason="Causes non-deterministic problems") 104 | def test_krebs_dataset(): 105 | krebs_data_block1, krebs_matches_block1 = load_krebsregister(1) 106 | krebs_data_block10, krebs_matches_block10 = load_krebsregister(10) 107 | 108 | assert len(krebs_data_block1) > 0 109 | assert len(krebs_data_block10) > 0 110 | 111 | # load a non-existing block 112 | with pytest.raises(ValueError): 113 | load_krebsregister(11) 114 | 115 | # missing values 116 | krebs_block10, matches = load_krebsregister(10, missing_values=0) 117 | assert krebs_block10.isnull().sum().sum() == 0 118 | 119 | 120 | @pytest.mark.skip(reason="Causes non-deterministic problems") 121 | def test_krebs_missings(): 122 | # missing values 123 | krebs_block10, matches = load_krebsregister(10, missing_values=0) 124 | assert krebs_block10.isnull().sum().sum() == 0 125 | 126 | 127 | @pytest.mark.skip(reason="Causes non-deterministic problems") 128 | def test_krebs_shuffle(): 129 | # load the dataset without shuffling the records 130 | krebs_block10, matches = load_krebsregister(10, shuffle=False) 131 | 132 | 133 | def test_random_comparison_vectors(): 134 | # Test the generation of a random dataset 135 | 136 | n_record_pairs = 10000 137 | n_matches = 500 138 | 139 | df = binary_vectors( 140 | n_record_pairs, n_matches, m=[0.8] * 8, u=[0.2] * 8, random_state=535 141 | ) 142 | 143 | # Check the result is a DataFrame with MultiIndex 144 | assert isinstance(df, pandas.DataFrame) 145 | assert isinstance(df.index, pandas.MultiIndex) 146 | 147 | # Test the length of the dataframe 148 | assert len(df) == n_record_pairs 149 | 150 | 151 | def test_random_comparison_vectors_1value_col(): 152 | m = numpy.array([1, 0.81, 0.85, 0]) 153 | u = numpy.array([1, 0.23, 0.50, 0]) 154 | 155 | # Create the train dataset.
156 | X_train, y_train = binary_vectors( 157 | 1000, 500, m=m, u=u, random_state=535, return_links=True 158 | ) 159 | 160 | assert len(X_train.iloc[:, 0].unique()) == 1 161 | assert X_train.iloc[:, 0].unique()[0] == 1 162 | 163 | assert len(X_train.iloc[:, 3].unique()) == 1 164 | assert X_train.iloc[:, 3].unique()[0] == 0 165 | 166 | assert len(X_train.iloc[:, 1].unique()) == 2 167 | assert len(X_train.iloc[:, 2].unique()) == 2 168 | -------------------------------------------------------------------------------- /tests/test_generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/tests/test_generate.py -------------------------------------------------------------------------------- /tests/test_measures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import numpy 5 | import pandas 6 | 7 | import recordlinkage as rl 8 | 9 | FULL_INDEX = pandas.MultiIndex.from_product( 10 | [[1, 2, 3], [1, 2, 3]], names=["first", "second"] # 3x3 matrix 11 | ) 12 | LINKS_TRUE = pandas.MultiIndex.from_tuples( 13 | [(1, 1), (2, 2), (3, 3)], names=["first", "second"] # the diagonal 14 | ) 15 | LINKS_PRED = pandas.MultiIndex.from_tuples( 16 | [(1, 1), (2, 1), (3, 1), (1, 2)], names=["first", "second"] # L shape 17 | ) 18 | 19 | 20 | class TestMeasures: 21 | def test_confusion_matrix(self): 22 | result_len = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 23 | result_full_index = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, FULL_INDEX) 24 | expected = numpy.array([[1, 2], [3, 3]]) 25 | 26 | numpy.testing.assert_array_equal(result_len, expected) 27 | numpy.testing.assert_array_equal(result_full_index, expected) 28 | 29 | def test_tp_fp_tn_fn(self): 30 | tp = rl.true_positives(LINKS_TRUE, LINKS_PRED) 31 | assert tp == 1 32 | fp = rl.false_positives(LINKS_TRUE, LINKS_PRED) 33 | assert fp == 3 34 | tn = rl.true_negatives(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 35 | assert tn == 3 36 | fn = rl.false_negatives(LINKS_TRUE, LINKS_PRED) 37 | assert fn == 2 38 | 39 | def test_recall(self): 40 | # confusion matrix 41 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED) 42 | 43 | assert rl.recall(LINKS_TRUE, LINKS_PRED) == 1 / 3 44 | assert rl.recall(cm) == 1 / 3 45 | 46 | def test_precision(self): 47 | # confusion matrix 48 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 49 | 50 | assert rl.precision(LINKS_TRUE, LINKS_PRED) == 1 / 4 51 | assert rl.precision(cm) == 1 / 4 52 | 53 | def test_accuracy(self): 54 | # confusion matrix 55 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 56 | 57 | assert rl.accuracy(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 4 / 9 58 | assert rl.accuracy(cm) == 4 / 9 59 | assert rl.accuracy(LINKS_TRUE, LINKS_PRED, FULL_INDEX) == 4 / 9 60 | 61 | def test_specificity(self): 62 | # confusion matrix 63 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 64 | 65 | assert rl.specificity(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 1 / 2 66 | assert rl.specificity(cm) == 1 / 2 67 | assert rl.specificity(LINKS_TRUE, LINKS_PRED, FULL_INDEX) == 1 / 2 68 | 69 | def test_fscore(self): 70 | # confusion matrix 71 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) 72 | prec = rl.precision(LINKS_TRUE, LINKS_PRED) 73 | rec = rl.recall(LINKS_TRUE, LINKS_PRED) 74 | expected = float(2 * prec * rec / (prec + rec)) 75 | 76 | assert 
rl.fscore(LINKS_TRUE, LINKS_PRED) == expected 77 | assert rl.fscore(cm) == expected 78 | 79 | def test_full_index_size(self): 80 | df_a = pandas.DataFrame(numpy.arange(10)) 81 | df_b = pandas.DataFrame(numpy.arange(10)) 82 | 83 | assert rl.full_index_size(df_a) == 45 84 | assert rl.full_index_size(len(df_a)) == 45 85 | assert rl.full_index_size((len(df_a),)) == 45 86 | assert rl.full_index_size([len(df_a)]) == 45 87 | 88 | assert rl.full_index_size(df_a, df_b) == 100 89 | assert rl.full_index_size(len(df_a), len(df_b)) == 100 90 | assert rl.full_index_size((len(df_a), len(df_b))) == 100 91 | assert rl.full_index_size([len(df_a), len(df_b)]) == 100 92 | 93 | def test_reduction_ratio(self): 94 | df_a = pandas.DataFrame(numpy.arange(10)) 95 | df_b = pandas.DataFrame(numpy.arange(10)) 96 | candidate_pairs_link = pandas.MultiIndex.from_product( 97 | [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]] 98 | ) 99 | candidate_pairs_dedup = pandas.MultiIndex.from_arrays( 100 | [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]] 101 | ) 102 | 103 | assert rl.reduction_ratio(candidate_pairs_dedup, df_a) == 8 / 9 104 | assert rl.reduction_ratio(candidate_pairs_dedup, (df_a)) == 8 / 9 105 | assert rl.reduction_ratio(candidate_pairs_dedup, (df_a,)) == 8 / 9 106 | 107 | assert rl.reduction_ratio(candidate_pairs_link, df_a, df_b) == 3 / 4 108 | assert rl.reduction_ratio(candidate_pairs_link, (df_a, df_b)) == 3 / 4 109 | assert rl.reduction_ratio(candidate_pairs_link, [df_a, df_b]) == 3 / 4 110 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | # testing utils from pandas 7 | import pandas.testing as pdt 8 | import pytest 9 | 10 | import recordlinkage as rl 11 | from recordlinkage import index_split 12 | 13 | 14 | def test_multiindex_split(): 15 | index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)]) 16 | result = index_split(index, 3) 17 | 18 | assert len(result) == 3 19 | 20 | for i, result_index_chunk in enumerate(result): 21 | expected_index_chunk = index[i * 10 : (i + 1) * 10] 22 | pdt.assert_index_equal(result_index_chunk, expected_index_chunk) 23 | 24 | assert len(result_index_chunk.levels) == 2 25 | assert len(result_index_chunk.codes) == 2 26 | 27 | 28 | def test_options(): 29 | # global set 30 | rl.options.indexing.pairs = "multiindex" 31 | assert rl.get_option("indexing.pairs") == "multiindex" 32 | 33 | 34 | def test_options_context(): 35 | with rl.option_context("indexing.pairs", "multiindex"): 36 | rl.options.indexing.pairs = "multiindex" 37 | assert rl.get_option("indexing.pairs") == "multiindex" 38 | 39 | 40 | def test_options_incorrect_values(): 41 | # incorrect value 42 | with pytest.raises(ValueError): 43 | rl.options.indexing.pairs = "non_existing" 44 | -------------------------------------------------------------------------------- /tests/test_network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | import pandas as pd 6 | 7 | # testing utils from pandas 8 | import pandas.testing as pdt 9 | import pytest 10 | 11 | try: 12 | import networkx # noqa 13 | except ImportError: 14 | pass 15 | 16 | from recordlinkage import ConnectedComponents 17 | from recordlinkage import OneToManyLinking 18 | from recordlinkage import OneToOneLinking 19 | 20 | 21 | def test_one_to_many_linking(): 22 | sample = 
pd.MultiIndex.from_tuples( 23 | [ 24 | (1, 1), 25 | (2, 2), 26 | (3, 3), 27 | (3, 4), 28 | (3, 5), 29 | (4, 4), 30 | (5, 5), 31 | (6, 5), 32 | (7, 7), 33 | (7, 7), 34 | (7, 8), 35 | ] 36 | ) 37 | one_to_many = OneToManyLinking() 38 | sample_one_to_many = one_to_many.compute(sample) 39 | 40 | expected = pd.MultiIndex.from_tuples( 41 | [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 7)] 42 | ) 43 | pdt.assert_index_equal(sample_one_to_many, expected) 44 | 45 | 46 | def test_one_to_one_linking(): 47 | sample = pd.MultiIndex.from_tuples( 48 | [ 49 | (1, 1), 50 | (2, 2), 51 | (3, 3), 52 | (3, 4), 53 | (3, 5), 54 | (4, 4), 55 | (5, 5), 56 | (6, 5), 57 | (7, 7), 58 | (7, 6), 59 | (7, 8), 60 | ] 61 | ) 62 | 63 | # test OneToOneLinking 64 | one_to_one = OneToOneLinking() 65 | sample_one_to_one = one_to_one.compute(sample) 66 | 67 | expected = pd.MultiIndex.from_tuples( 68 | [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (7, 7)] 69 | ) 70 | pdt.assert_index_equal(sample_one_to_one, expected) 71 | 72 | 73 | @pytest.mark.skipif( 74 | "networkx" not in sys.modules, reason="Requires the Networkx library" 75 | ) 76 | def test_connected_components(): 77 | sample = pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 4), (5, 6), (5, 7), (8, 9)]) 78 | 79 | # test ConnectedComponents 80 | connected = ConnectedComponents() 81 | sample_connected = connected.compute(sample) 82 | 83 | expected = [ 84 | pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), 85 | pd.MultiIndex.from_tuples([(5, 6), (5, 7)]), 86 | pd.MultiIndex.from_tuples([(8, 9)]), 87 | ] 88 | 89 | for i, _mi in enumerate(expected): 90 | pdt.assert_index_equal(sample_connected[i], expected[i]) 91 | --------------------------------------------------------------------------------
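
A minimal usage sketch of the post-processing API exercised in tests/test_network.py above. It relies only on the classes and the compute() method shown in those tests; the candidate pairs below are illustrative values, not taken from the test data, and the behaviour noted in the comments is the behaviour demonstrated by the expected results in the tests.

import pandas as pd

from recordlinkage import ConnectedComponents
from recordlinkage import OneToManyLinking
from recordlinkage import OneToOneLinking

# Candidate links as a pandas MultiIndex, e.g. the output of a classifier.
links = pd.MultiIndex.from_tuples([(1, 1), (2, 1), (2, 2), (3, 4)])

# Keep each record in at most one pair (first occurrence wins, as in the tests).
one_to_one = OneToOneLinking().compute(links)

# Deduplicate only on the first level: at most one pair per record of the first frame.
one_to_many = OneToManyLinking().compute(links)

# Group transitively linked pairs into clusters (requires the networkx library).
clusters = ConnectedComponents().compute(links)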