├── .coveragerc
├── .github
│   └── workflows
│       ├── ci-workflow.yml
│       ├── python-package.yml
│       └── render-docs.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmarks
│   ├── __init__.py
│   ├── bench_comparing.py
│   └── bench_indexing.py
├── docs
│   ├── Makefile
│   ├── about.rst
│   ├── annotation.rst
│   ├── changelog.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── guides
│   │   ├── classifiers.rst
│   │   ├── data_deduplication.ipynb
│   │   └── link_two_dataframes.ipynb
│   ├── images
│   │   ├── elas_1705.png
│   │   ├── indexing_basic.png
│   │   ├── indexing_plot.py
│   │   └── recordlinkage-banner-transparent.svg
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── performance.rst
│   ├── ref-classifiers.rst
│   ├── ref-compare.rst
│   ├── ref-datasets.rst
│   ├── ref-evaluation.rst
│   ├── ref-index.rst
│   ├── ref-misc.rst
│   └── ref-preprocessing.rst
├── examples
│   ├── README.rst
│   ├── dedup_deterministic.py
│   ├── linking_deterministic.py
│   ├── supervised_keras.py
│   ├── supervised_learning_prob.py
│   └── unsupervised_learning_prob.py
├── pyproject.toml
├── recordlinkage
│   ├── __init__.py
│   ├── _lib
│   │   ├── numeric.c
│   │   └── numeric.h
│   ├── adapters.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── c_numeric.pyx
│   │   ├── compare.py
│   │   ├── distance.py
│   │   ├── indexing.py
│   │   ├── nb_sklearn.py
│   │   ├── numeric.py
│   │   └── string.py
│   ├── annotation.py
│   ├── api.py
│   ├── base.py
│   ├── classifiers.py
│   ├── compare.py
│   ├── config.py
│   ├── config_init.py
│   ├── contrib
│   │   ├── README.rst
│   │   ├── __init__.py
│   │   ├── compare
│   │   │   ├── __init__.py
│   │   │   └── random
│   │   │       ├── README.rst
│   │   │       ├── __init__.py
│   │   │       ├── random.py
│   │   │       └── test_random.py
│   │   └── index
│   │       ├── __init__.py
│   │       └── neighbourhoodblock
│   │           ├── README.rst
│   │           ├── __init__.py
│   │           ├── neighbourhoodblock.py
│   │           └── test_neighbourhoodblock.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── external.py
│   │   ├── febrl.py
│   │   ├── febrl
│   │   │   ├── dataset1.csv
│   │   │   ├── dataset2.csv
│   │   │   ├── dataset3.csv
│   │   │   ├── dataset4a.csv
│   │   │   └── dataset4b.csv
│   │   └── generate.py
│   ├── deprecated.py
│   ├── index.py
│   ├── measures.py
│   ├── network.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── cleaning.py
│   │   └── encoding.py
│   ├── rl_logging.py
│   ├── standardise
│   │   └── __init__.py
│   ├── types.py
│   └── utils.py
└── tests
    ├── test_annotator.py
    ├── test_classify.py
    ├── test_compare.py
    ├── test_datasets.py
    ├── test_generate.py
    ├── test_indexing.py
    ├── test_measures.py
    ├── test_misc.py
    ├── test_network.py
    └── test_preprocessing.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 |
4 | [report]
5 | exclude_lines =
6 | if self.debug:
7 | pragma: no cover
8 | raise NotImplementedError
9 | if __name__ == .__main__.:
10 |
11 | ignore_errors = False
12 |
13 | omit =
14 | tests/*
15 | docs/*
16 | recordlinkage/_version.py
17 | recordlinkage/types.py
--------------------------------------------------------------------------------
/.github/workflows/ci-workflow.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 | strategy:
10 | fail-fast: false
11 | matrix:
12 | python-version: ["3.8", "3.9", "3.10", "3.11"]
13 | pandas-version: ["1.0", "2.0"]
14 | steps:
15 | - uses: actions/checkout@v2
16 | - name: Set up Python ${{ matrix.python-version }}
17 | uses: actions/setup-python@v1
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 | - name: Install pandas
21 | run: |
22 | pip install pandas~=${{ matrix.pandas-version }}
23 | - name: Package recordlinkage
24 | run: |
25 | pip install --upgrade pip
26 | pip install build
27 | python -m build
28 | - name: Install recordlinkage
29 | run: |
30 | pip install networkx>=2
31 | pip install ./dist/recordlinkage-*.whl
32 | - name: Test with pytest
33 | run: |
34 | pip install pytest
35 | # remove recordlinkage to prevent relative imports (use installed package)
36 | # this is like wrapping stuff in a src folder
37 | rm -r recordlinkage/
38 | pytest
39 | lint:
40 | runs-on: ubuntu-latest
41 | steps:
42 | - uses: actions/checkout@v2
43 | - uses: actions/setup-python@v1
44 | - name: Install ruff
45 | run: |
46 | pip install ruff
47 | - name: Lint with ruff
48 | run: |
49 | ruff .
50 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | permissions:
8 | contents: read
9 |
10 | jobs:
11 | deploy:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v3
15 | - name: Set up Python
16 | uses: actions/setup-python@v4
17 | with:
18 | python-version: '3.x'
19 | - name: Install dependencies
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install build
23 | - name: Build package
24 | run: python -m build
25 | - name: Publish package
26 | uses: pypa/gh-action-pypi-publish@release/v1
27 | with:
28 | user: __token__
29 | password: ${{ secrets.pypi_password }}
30 |
--------------------------------------------------------------------------------
/.github/workflows/render-docs.yml:
--------------------------------------------------------------------------------
1 | name: Build HTML with Sphinx
2 | on: [push, pull_request]
3 | jobs:
4 | html-sphinx:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - name: Clone repo
8 | uses: actions/checkout@v2
9 | - name: Set up Python
10 | uses: actions/setup-python@v2
11 | with:
12 | python-version: '3.10'
13 | - name: Install recordlinkage and docs tools
14 | run: |
15 | sudo apt install pandoc
16 | python -m pip install .[docs]
17 | - name: Build HTML
18 | run: |
19 | python -m sphinx -W --keep-going --color docs/ _build/html/
20 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | recordlinkage/datasets/krebsregister/*
3 |
4 | recordlinkage/_version.py
5 |
6 |
7 | .DS_Store
8 | */.DS_Store
9 |
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions and Cython .pyx compilations
16 | *.so
17 | algorithms/*.c
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | *.bat
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | .pytest_cache/*
55 | coverage.xml
56 | *,cover
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # IPython Notebook
72 | .ipynb_checkpoints
73 |
74 | # dotenv
75 | .env
76 |
77 | # virtualenv
78 | venv/
79 | ENV/
80 |
81 | /tests/sandbox
82 | # ASV
83 | .asv/
84 |
85 | # PyCharm IDE
86 | /sandbox
87 | /cover
88 | /coverage-report
89 | .idea/
90 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.4.0
4 | hooks:
5 | - id: trailing-whitespace
6 | - id: end-of-file-fixer
7 | - id: check-yaml
8 | - id: check-added-large-files
9 | - repo: https://github.com/charliermarsh/ruff-pre-commit
10 | rev: v0.0.278
11 | hooks:
12 | - id: ruff
13 | - repo: https://github.com/psf/black
14 | rev: 23.7.0
15 | hooks:
16 | - id: black
17 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-22.04
5 | tools:
6 | python: "3.11"
7 |
8 | sphinx:
9 | configuration: docs/conf.py
10 |
11 | python:
12 | install:
13 | - method: pip
14 | path: .
15 | extra_requirements:
16 | - docs
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016-2018, Jonathan de Bruin
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * Neither the name of the copyright holder nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include recordlinkage/datasets/febrl *.csv
2 | recursive-include recordlinkage/datasets/krebsregister *.csv
3 |
4 | global-exclude test_*.py
5 | global-exclude *_test.py
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # RecordLinkage: powerful and modular Python record linkage toolkit
6 |
7 | [](https://pypi.python.org/pypi/recordlinkage/)
8 | [](https://github.com/J535D165/recordlinkage/actions)
9 | [](https://codecov.io/gh/J535D165/recordlinkage)
10 | [](https://recordlinkage.readthedocs.io/en/latest/?badge=latest)
11 | [](https://doi.org/10.5281/zenodo.3559042)
12 |
13 | **RecordLinkage** is a powerful and modular record linkage toolkit to
14 | link records in or between data sources. The toolkit provides most of
15 | the tools needed for record linkage and deduplication. The package
16 | contains indexing methods, functions to compare records and classifiers.
17 | The package is developed for research and the linking of small or medium
18 | sized files.
19 |
20 | This project is inspired by the [Freely Extensible Biomedical Record
21 | Linkage (FEBRL)](https://sourceforge.net/projects/febrl/) project, which
22 | is a great project. In contrast with FEBRL, the recordlinkage project
23 | uses [pandas](http://pandas.pydata.org/) and
24 | [numpy](http://www.numpy.org/) for data handling and computations. The
25 | use of *pandas*, a flexible and powerful data analysis and manipulation
26 | library for Python, makes the record linkage process much easier and
27 | faster. The extensive *pandas* library can be used to integrate your
28 | record linkage directly into existing data manipulation projects.
29 |
30 | One of the aims of this project is to make an easily extensible record
31 | linkage framework. It is easy to include your own indexing algorithms,
32 | comparison/similarity measures and classifiers.
33 |
34 | ## Basic linking example
35 |
36 | Import the `recordlinkage` module with all important tools for record
37 | linkage and import the data manipulation framework **pandas**.
38 |
39 | ``` python
40 | import recordlinkage
41 | import pandas
42 | ```
43 |
44 | Load your data into pandas DataFrames.
45 |
46 | ``` python
47 | df_a = pandas.DataFrame(YOUR_FIRST_DATASET)
48 | df_b = pandas.DataFrame(YOUR_SECOND_DATASET)
49 | ```
50 |
51 | Comparing all records can be computationally intensive. Therefore, we
52 | make a set of candidate links with one of the built-in indexing techniques
53 | like **blocking**. In this example, only pairs of records that agree on
54 | the surname are returned.
55 |
56 | ``` python
57 | indexer = recordlinkage.Index()
58 | indexer.block('surname')
59 | candidate_links = indexer.index(df_a, df_b)
60 | ```
61 |
62 | For each candidate link, compare the records with one of the comparison
63 | or similarity algorithms in the Compare class.
64 |
65 | ``` python
66 | c = recordlinkage.Compare()
67 |
68 | c.string('name_a', 'name_b', method='jarowinkler', threshold=0.85)
69 | c.exact('sex', 'gender')
70 | c.date('dob', 'date_of_birth')
71 | c.string('str_name', 'streetname', method='damerau_levenshtein', threshold=0.7)
72 | c.exact('place', 'placename')
73 | c.numeric('income', 'income', method='gauss', offset=3, scale=3, missing_value=0.5)
74 |
75 | # The comparison vectors
76 | feature_vectors = c.compute(candidate_links, df_a, df_b)
77 | ```
78 |
79 | Classify the candidate links into matching or distinct pairs based on
80 | their comparison result with one of the [classification
81 | algorithms](https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html).
82 | The following code classifies candidate pairs with a Logistic Regression
83 | classifier. This (supervised machine learning) algorithm requires
84 | training data.
85 |
86 | ``` python
87 | logrg = recordlinkage.LogisticRegressionClassifier()
88 | logrg.fit(TRAINING_COMPARISON_VECTORS, TRAINING_PAIRS)
89 |
90 | logrg.predict(feature_vectors)
91 | ```
92 |
93 | The following code shows the classification of candidate pairs with the
94 | Expectation-Conditional Maximisation (ECM) algorithm. This variant of
95 | the Expectation-Maximisation algorithm doesn't require training data
96 | (unsupervised machine learning).
97 |
98 | ``` python
99 | ecm = recordlinkage.ECMClassifier()
100 | ecm.fit_predict(feature_vectors)
101 | ```
102 |
103 | ## Main Features
104 |
105 | The main features of this Python record linkage toolkit are:
106 |
107 | - Clean and standardise data with easy-to-use tools (see the example after this list)
108 | - Make pairs of records with smart indexing methods such as
109 | **blocking** and **sorted neighbourhood indexing**
110 | - Compare records with a large number of comparison and similarity
111 | measures for different types of variables such as strings, numbers
112 | and dates.
113 | - Several classification algorithms, both supervised and
114 | unsupervised.
115 | - Common record linkage evaluation tools
116 | - Several built-in datasets.
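
For example, cleaning and phonetic encoding are available in the
`recordlinkage.preprocessing` module. A minimal sketch (the `surname` column is
illustrative):

``` python
from recordlinkage.preprocessing import clean, phonetic

# normalise case and strip punctuation from a name column
df_a['surname_clean'] = clean(df_a['surname'])

# a phonetic encoding (e.g. Soundex) makes exact blocking on names more robust
df_a['surname_soundex'] = phonetic(df_a['surname_clean'], method='soundex')
```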
117 |
118 | ## Documentation
119 |
120 | The most recent documentation and API reference can be found at
121 | [recordlinkage.readthedocs.org](http://recordlinkage.readthedocs.org/en/latest/).
122 | The documentation provides some basic usage examples like
123 | [deduplication](http://recordlinkage.readthedocs.io/en/latest/guides/data_deduplication.html)
124 | and
125 | [linking](http://recordlinkage.readthedocs.io/en/latest/guides/link_two_dataframes.html)
126 | census data. More examples are coming soon. If you do have interesting
127 | examples to share, let us know.
128 |
129 | ## Installation
130 |
131 | The Python Record Linkage Toolkit requires Python 3.8 or higher. Install the
132 | package easily with pip
133 |
134 | ``` sh
135 | pip install recordlinkage
136 | ```
137 |
138 | The toolkit depends on popular packages like
139 | [Pandas](https://github.com/pydata/pandas),
140 | [Numpy](http://www.numpy.org), [Scipy](https://www.scipy.org/) and,
141 | [Scikit-learn](http://scikit-learn.org/). A complete list of
142 | dependencies can be found in the [installation
143 | manual](https://recordlinkage.readthedocs.io/en/latest/installation.html)
144 | as well as recommended and optional dependencies.
145 |
146 | ## License
147 |
148 | The license for this record linkage tool is BSD-3-Clause.
149 |
150 | ## Citation
151 |
152 | Please cite this package when it is used in an academic context. Ensure
153 | that the DOI and version match the installed version. Citation styles
154 | can be found on the publisher's website
155 | [10.5281/zenodo.3559042](https://doi.org/10.5281/zenodo.3559042).
156 |
157 | ``` text
158 | @software{de_bruin_j_2019_3559043,
159 | author = {De Bruin, J},
160 | title = {{Python Record Linkage Toolkit: A toolkit for
161 | record linkage and duplicate detection in Python}},
162 | month = dec,
163 | year = 2019,
164 | publisher = {Zenodo},
165 | version = {v0.14},
166 | doi = {10.5281/zenodo.3559043},
167 | url = {https://doi.org/10.5281/zenodo.3559043}
168 | }
169 | ```
170 |
171 | ## Need help?
172 |
173 | Stuck on your record linkage code or problem? Any other questions? Don't
174 | hesitate to send me an email ().
175 |
--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/benchmarks/__init__.py
--------------------------------------------------------------------------------
/benchmarks/bench_comparing.py:
--------------------------------------------------------------------------------
1 | import recordlinkage as rl
2 | from recordlinkage.datasets import load_febrl1
3 | from recordlinkage.datasets import load_febrl4
4 |
5 |
6 | class CompareRecordLinkage:
7 | timeout = 30 * 60
8 |
9 | def setup(self):
10 | # download data
11 | self.A, self.B = load_febrl4()
12 |
13 | # make pairs
14 | c_pairs = rl.FullIndex()
15 | pairs = c_pairs.index(self.A, self.B)
16 |
17 | # different sizes of pairs
18 | self.pairs_xsmall = pairs[0:5000]
19 | self.pairs_small = pairs[0:50000]
20 | self.pairs_medium = pairs[0:500000]
21 | self.pairs_large = pairs[0:5000000]
22 |
23 | def time_global_xsmall(self):
24 | c_compare = rl.Compare(self.pairs_xsmall, self.A, self.B)
25 | c_compare.string("given_name", "given_name", method="jaro")
26 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
27 | c_compare.date("date_of_birth", "date_of_birth")
28 | c_compare.exact("suburb", "suburb")
29 | c_compare.exact("state", "state")
30 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
31 |
32 | def time_global_small(self):
33 | c_compare = rl.Compare(self.pairs_small, self.A, self.B)
34 | c_compare.string("given_name", "given_name", method="jaro")
35 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
36 | c_compare.date("date_of_birth", "date_of_birth")
37 | c_compare.exact("suburb", "suburb")
38 | c_compare.exact("state", "state")
39 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
40 |
41 | def time_global_medium(self):
42 | c_compare = rl.Compare(self.pairs_medium, self.A, self.B)
43 | c_compare.string("given_name", "given_name", method="jaro")
44 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
45 | c_compare.date("date_of_birth", "date_of_birth")
46 | c_compare.exact("suburb", "suburb")
47 | c_compare.exact("state", "state")
48 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
49 |
50 | def time_global_large(self):
51 | c_compare = rl.Compare(self.pairs_large, self.A, self.B)
52 | c_compare.string("given_name", "given_name", method="jaro")
53 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
54 | c_compare.date("date_of_birth", "date_of_birth")
55 | c_compare.exact("suburb", "suburb")
56 | c_compare.exact("state", "state")
57 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
58 |
59 |
60 | class CompareDeduplication:
61 | timeout = 30 * 60
62 |
63 | def setup(self):
64 | # download data
65 | self.A = load_febrl1()
66 |
67 | # make pairs
68 | c_pairs = rl.FullIndex()
69 | pairs = c_pairs.index(self.A)
70 |
71 | # different sizes of pairs
72 | self.pairs_xsmall = pairs[0:5000]
73 | self.pairs_small = pairs[0:50000]
74 | self.pairs_medium = pairs[0:500000]
75 | self.pairs_large = pairs[0:5000000]
76 |
77 | def time_global_xsmall(self):
78 | c_compare = rl.Compare(self.pairs_xsmall, self.A)
79 | c_compare.string("given_name", "given_name", method="jaro")
80 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
81 | c_compare.date("date_of_birth", "date_of_birth")
82 | c_compare.exact("suburb", "suburb")
83 | c_compare.exact("state", "state")
84 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
85 |
86 | def time_global_small(self):
87 | c_compare = rl.Compare(self.pairs_small, self.A)
88 | c_compare.string("given_name", "given_name", method="jaro")
89 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
90 | c_compare.date("date_of_birth", "date_of_birth")
91 | c_compare.exact("suburb", "suburb")
92 | c_compare.exact("state", "state")
93 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
94 |
95 | def time_global_medium(self):
96 | c_compare = rl.Compare(self.pairs_medium, self.A)
97 | c_compare.string("given_name", "given_name", method="jaro")
98 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
99 | c_compare.date("date_of_birth", "date_of_birth")
100 | c_compare.exact("suburb", "suburb")
101 | c_compare.exact("state", "state")
102 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
103 |
104 | def time_global_large(self):
105 | c_compare = rl.Compare(self.pairs_large, self.A)
106 | c_compare.string("given_name", "given_name", method="jaro")
107 | c_compare.string("surname", "surname", method="jarowinkler", threshold=0.85)
108 | c_compare.date("date_of_birth", "date_of_birth")
109 | c_compare.exact("suburb", "suburb")
110 | c_compare.exact("state", "state")
111 | c_compare.string("address_1", "address_1", method="levenshtein", threshold=0.85)
112 |
113 |
114 | class CompareAlgorithms:
115 | timeout = 30 * 60
116 |
117 | def setup(self):
118 | # download data
119 | self.A, self.B = load_febrl4()
120 |
121 | # add a numeric column (postcode as float)
122 | self.A["postcode"] = self.A["postcode"].astype(float)
123 | self.B["postcode"] = self.B["postcode"].astype(float)
124 |
125 | # make pairs
126 | c_pairs = rl.FullIndex()
127 | self.pairs = c_pairs.index(self.A, self.B)[0:50000]
128 |
129 | # ************* STRING *************
130 |
131 | def time_string_jaro(self):
132 | c_compare = rl.Compare(self.pairs, self.A, self.B)
133 | c_compare.string("given_name", "given_name", method="jaro")
134 |
135 | def time_string_jarowinkler(self):
136 | c_compare = rl.Compare(self.pairs, self.A, self.B)
137 | c_compare.string("given_name", "given_name", method="jarowinkler")
138 |
139 | def time_string_qgram(self):
140 | c_compare = rl.Compare(self.pairs, self.A, self.B)
141 | c_compare.string("given_name", "given_name", method="qgram")
142 |
143 | def time_string_cosine(self):
144 | c_compare = rl.Compare(self.pairs, self.A, self.B)
145 | c_compare.string("given_name", "given_name", method="cosine")
146 |
147 | def time_string_levenshtein(self):
148 | c_compare = rl.Compare(self.pairs, self.A, self.B)
149 | c_compare.string("given_name", "given_name", method="levenshtein")
150 |
151 | # ************* Exact *************
152 |
153 | def time_exact(self):
154 | c_compare = rl.Compare(self.pairs, self.A, self.B)
155 | c_compare.exact("state", "state")
156 |
157 | # ************* NUMERIC *************
158 |
159 | def time_numeric_gauss(self):
160 | c_compare = rl.Compare(self.pairs, self.A, self.B)
161 | c_compare.numeric("postcode", "postcode", method="gauss", scale=2)
162 |
--------------------------------------------------------------------------------
/benchmarks/bench_indexing.py:
--------------------------------------------------------------------------------
1 | import recordlinkage as rl
2 | from recordlinkage.datasets import load_febrl1
3 | from recordlinkage.datasets import load_febrl4
4 |
5 |
6 | class PairsRecordLinkage:
7 | timeout = 30 * 60
8 |
9 | def setup(self):
10 | # download data
11 | self.A, self.B = load_febrl4()
12 |
13 | def time_full_index(self):
14 | # setup class
15 | c_pairs = rl.FullIndex()
16 |
17 | # Make pairs
18 | c_pairs.index(self.A, self.B)
19 |
20 | def time_block_index(self):
21 | # setup class
22 | c_pairs = rl.BlockIndex("given_name")
23 |
24 | # Make pairs
25 | c_pairs.index(self.A, self.B)
26 |
27 | def time_sni_index(self):
28 | # setup class
29 | c_pairs = rl.SortedNeighbourhoodIndex(on="given_name", w=5)
30 |
31 | # Make pairs
32 | c_pairs.index(self.A, self.B)
33 |
34 | def time_random_index(self):
35 | # setup class
36 | c_pairs = rl.RandomIndex(2500)
37 |
38 | # Make pairs
39 | c_pairs.index(self.A, self.B)
40 |
41 |
42 | class PairsDeduplication:
43 | timeout = 30 * 60
44 |
45 | def setup(self):
46 | # download data
47 | self.A = load_febrl1()
48 |
49 | def time_full_index(self):
50 | # setup class
51 | c_pairs = rl.FullIndex()
52 |
53 | # Make pairs
54 | c_pairs.index(self.A)
55 |
56 | def time_block_index(self):
57 | # setup class
58 | c_pairs = rl.BlockIndex("given_name")
59 |
60 | # Make pairs
61 | c_pairs.index(self.A)
62 |
63 | def time_sni_index(self):
64 | # setup class
65 | c_pairs = rl.SortedNeighbourhoodIndex(on="given_name", w=5)
66 |
67 | # Make pairs
68 | c_pairs.index(self.A)
69 |
70 | def time_random_index(self):
71 | # setup class
72 | c_pairs = rl.RandomIndex(2500)
73 |
74 | # Make pairs
75 | c_pairs.index(self.A)
76 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/about.rst:
--------------------------------------------------------------------------------
1 | *****
2 | About
3 | *****
4 |
5 | Introduction
6 | ============
7 |
8 | The **Python Record Linkage Toolkit** is a library to link records in or
9 | between data sources. The toolkit provides most of the tools needed for
10 | record linkage and deduplication. The package contains indexing methods,
11 | functions to compare records and classifiers. The package is developed
12 | for research and the linking of small or medium sized files.
13 |
14 | The project is inspired by the `Freely Extensible Biomedical Record Linkage
15 | (FEBRL) <https://sourceforge.net/projects/febrl/>`__ project, which is a great
16 | project. In contrast with FEBRL, the recordlinkage project makes extensive use
17 | of data manipulation tools like `pandas <http://pandas.pydata.org/>`__ and
18 | `numpy <http://www.numpy.org/>`__. The use of *pandas*, a flexible and
19 | powerful data analysis and manipulation library for Python, makes the record
20 | linkage process much easier and faster. The extensive *pandas* library can be
21 | used to integrate your record linkage directly into existing data manipulation
22 | projects.
23 |
24 | One of the aims of this project is to make an extensible record linkage
25 | framework. It is easy to include your own indexing algorithms,
26 | comparison/similarity measures and classifiers. The main features of the
27 | Python Record Linkage Toolkit are:
28 |
29 | - Clean and standardise data with easy to use tools
30 | - Make pairs of records with smart indexing methods such as
31 | **blocking** and **sorted neighbourhood indexing**
32 | - Compare records with a large number of comparison and similarity measures
33 | for different types of variables such as strings, numbers and dates.
34 | - Several classification algorithms, both supervised and
35 | unsupervised.
36 | - Common record linkage evaluation tools
37 | - Several built-in datasets.
38 |
39 |
40 | What is record linkage?
41 | =======================
42 |
43 | The term record linkage is used to indicate the procedure of bringing together
44 | information from two or more records that are believed to belong to the same
45 | entity. Record linkage is used to link data from multiple data sources or to
46 | find duplicates in a single data source. In computer science, record linkage
47 | is also known as data matching or deduplication (in the case of searching for
48 | duplicate records within a single file).
49 |
50 | In record linkage, the attributes of the entity (stored in a record) are used
51 | to link two or more records. Attributes can be unique entity identifiers (SSN,
52 | license plate number), but also attributes like (sur)name, date of birth and
53 | car model/colour. The record linkage procedure can be represented as a
54 | workflow [Christen, 2012]. The steps are: cleaning, indexing, comparing,
55 | classifying and evaluation. If needed, the classified record pairs flow back
56 | to improve the previous step. The Python Record Linkage Toolkit follows this
57 | workflow.
58 |
59 | .. seealso::
60 |
61 | *Christen, Peter. 2012. Data matching: concepts and techniques for record
62 | linkage, entity resolution, and duplicate detection. Springer Science &
63 | Business Media.*
64 |
65 | *Fellegi, Ivan P and Alan B Sunter. 1969. “A theory for record linkage.”
66 | Journal of the American Statistical Association 64(328):1183–1210.*
67 |
68 | *Dunn, Halbert L. 1946. “Record linkage.” American Journal of Public
69 | Health and the Nations Health 36(12):1412–1416.*
70 |
71 | *Herzog, Thomas N, Fritz J Scheuren and William E Winkler. 2007. Data
72 | quality and record linkage techniques. Vol. 1 Springer.*
73 |
74 | How to link records?
75 | ====================
76 |
77 | Import the ``recordlinkage`` module with all important tools for record
78 | linkage and import the data manipulation framework **pandas**.
79 |
80 | .. code:: python
81 |
82 | import recordlinkage
83 | import pandas
84 |
85 | Consider that you try to link two datasets with personal information
86 | like name, sex and date of birth. Load these datasets into a pandas
87 | ``DataFrame``.
88 |
89 | .. code:: python
90 |
91 | df_a = pandas.DataFrame(YOUR_FIRST_DATASET)
92 | df_b = pandas.DataFrame(YOUR_SECOND_DATASET)
93 |
94 | Comparing all records can be computationally intensive. Therefore, we
95 | make a smart set of candidate links with one of the built-in indexing
96 | techniques like **blocking**. Only record pairs agreeing on the
97 | surname are included.
98 |
99 | .. code:: python
100 |
101 | indexer = recordlinkage.Index()
102 | indexer.block('surname')
103 | candidate_links = indexer.index(df_a, df_b)
104 |
105 | Each ``candidate_link`` needs to be compared on the comparable attributes.
106 | This can be done easily with the Compare class and the available comparison
107 | and similarity measures.
108 |
109 | .. code:: python
110 |
111 | compare = recordlinkage.Compare()
112 |
113 | compare.string('name', 'name', method='jarowinkler', threshold=0.85)
114 | compare.exact('sex', 'gender')
115 | compare.exact('dob', 'date_of_birth')
116 | compare.string('streetname', 'streetname', method='damerau_levenshtein', threshold=0.7)
117 | compare.exact('place', 'placename')
118 | compare.exact('haircolor', 'haircolor', missing_value=9)
119 |
120 | # The comparison vectors
121 | compare_vectors = compare.compute(candidate_links, df_a, df_b)
122 |
123 | This record linkage package contains several classification algorithms.
124 | Many of the algorithms need training data (supervised learning), while
125 | others are unsupervised. An example of supervised learning:
126 |
127 | .. code:: python
128 |
129 | true_linkage = pandas.Series(YOUR_GOLDEN_DATA, index=pandas.MultiIndex(YOUR_MULTI_INDEX))
130 |
131 | logrg = recordlinkage.LogisticRegressionClassifier()
132 | logrg.fit(compare_vectors[true_linkage.index], true_linkage)
133 |
134 | logrg.predict(compare_vectors)
135 |
136 | and an example of unsupervised learning (the well known ECM-algorithm):
137 |
138 | .. code:: python
139 |
140 | ecm = recordlinkage.ECMClassifier()
141 | ecm.fit_predict(compare_vectors)
142 |
143 |
144 |
--------------------------------------------------------------------------------
/docs/annotation.rst:
--------------------------------------------------------------------------------
1 | **********
2 | Annotation
3 | **********
4 |
5 | Manually labeled record pairs are useful in training and validation tasks.
6 | Training data is usually not available in record linkage applications because
7 | it is highly dataset and sample-specific. The Python Record Linkage Toolkit
8 | comes with a `browser-based user interface`_ for manually classifying record
9 | pairs. A hosted version of `RecordLinkage ANNOTATOR`_ can be found on Github.
10 |
11 | .. _`browser-based user interface`: https://github.com/J535D165/recordlinkage-annotator
12 | .. _`RecordLinkage ANNOTATOR`: https://j535d165.github.io/recordlinkage-annotator/
13 |
14 | .. image:: https://github.com/J535D165/recordlinkage-annotator/blob/master/images/annotator_review.png?raw=true
15 | :alt: Review screen of RecordLinkage ANNOTATOR
16 | :target: https://j535d165.github.io/recordlinkage-annotator/
17 |
18 | Generate annotation file
19 | ========================
20 |
21 | The `RecordLinkage ANNOTATOR`_ software requires a structured annotation
22 | file. The required schema_ of the annotation file is open. The function
23 | :func:`recordlinkage.write_annotation_file` can be used to render and save an
24 | annotation file. The function can be used for both linking and deduplication
25 | purposes.
26 |
27 | .. _schema: https://github.com/J535D165/recordlinkage-annotator/tree/master/schema
28 |
29 | .. autofunction:: recordlinkage.write_annotation_file
30 |
31 | Linking
32 | -------
33 |
34 | This is a simple example of the code to render an annotation
35 | file for linking records:
36 |
37 | .. code:: python
38 |
39 | import recordlinkage as rl
40 | from recordlinkage.index import Block
41 | from recordlinkage.datasets import load_febrl4
42 |
43 | df_a, df_b = load_febrl4()
44 |
45 | blocker = Block("surname", "surname")
46 | pairs = blocker.index(df_a, df_b)
47 |
48 | rl.write_annotation_file(
49 | "annotation_demo_linking.json",
50 | pairs[0:50],
51 | df_a,
52 | df_b,
53 | dataset_a_name="Febrl4 A",
54 | dataset_b_name="Febrl4 B"
55 | )
56 |
57 | Deduplication
58 | -------------
59 |
60 | This is a simple example of the code to render an annotation
61 | file for duplicate detection:
62 |
63 | .. code:: python
64 |
65 | import recordlinkage as rl
66 | from recordlinkage.index import Block
67 | from recordlinkage.datasets import load_febrl1
68 |
69 | df_a = load_febrl1()
70 |
71 | blocker = Block("surname", "surname")
72 | pairs = blocker.index(df_a)
73 |
74 | rl.write_annotation_file(
75 | "annotation_demo_dedup.json",
76 | pairs[0:50],
77 | df_a,
78 | dataset_a_name="Febrl1 A"
79 | )
80 |
81 |
82 | Manual labeling
83 | ===============
84 |
85 | Go to `RecordLinkage ANNOTATOR`_ or start the server yourself.
86 |
87 | Choose the annotation file on the landing screen or use the drag and drop
88 | functionality. A new screen shows the first record pair to label. Start
89 | labeling the data manually. Use the button `Match` for record pairs belonging
90 | to the same entity. Use `Distinct` for record pairs belonging to different
91 | entities. After all records are labeled by hand, the result can be saved to a
92 | file.
93 |
94 |
95 | Export/read annotation file
96 | ===========================
97 |
98 | After labeling all record pairs, you can export the annotation file to a JSON
99 | file. Use the function :func:`recordlinkage.read_annotation_file` to read the
100 | results.
101 |
102 | .. code:: python
103 |
104 | import recordlinkage as rl
105 |
106 | result = rl.read_annotation_file('my_annotation.json')
107 | print(result.links)
108 |
109 | The function :func:`recordlinkage.read_annotation_file` reads the file and returns
110 | an :class:`recordlinkage.annotation.AnnotationResult` object. This object contains
111 | links and distinct attributes that return a :class:`pandas.MultiIndex` object.
112 |
113 | .. autofunction:: recordlinkage.read_annotation_file
114 |
115 |
116 | .. autoclass:: recordlinkage.annotation.AnnotationResult
117 | :members:
118 | :inherited-members:
119 |
120 |
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | *************
2 | Release notes
3 | *************
4 |
5 | Version 0.15
6 | ============
7 |
8 | - Remove deprecated recordlinkage classes (`#173`_)
9 | - Bump min Python version to 3.6, ideally 3.8+ (`#171`_)
10 | - Bump min pandas version to >=1
11 | - Resolve deprecation warnings for numpy and pandas
12 | - Happy lint, sort imports, format code with yapf
13 | - Remove unnecessary np.sort in SNI algorithm (`#141`_)
14 | - Fix bug for cosine and qgram string comparisons with threshold (`#135`_)
15 | - Fix several typos in docs (`#151`_)(`#152`_)(`#153`_)(`#154`_)(`#163`_)(`#164`_)
16 | - Fix random indexer (`#158`_)
17 | - Fix various deprecation warnings and broken docs build (`#170`_)
18 | - Fix broken docs build due to pandas depr warnings (`#169`_)
19 | - Fix broken build and removed warning messages (`#168`_)
20 | - Update narrative
21 | - Replace Travis by Github Actions (`#132`_)
22 | - Fix broken test NotFittedError
23 | - Fix bug in low memory random sampling and add more tests (`#130`_)
24 | - Add extras_require to setup.py for deps management
25 | - Add banner to README and update title
26 | - Add Binder and Colab buttons at tutorials (`#174`_)
27 |
28 | Special thanks to Tomasz Waleń @twalen and other contributors for their
29 | work on this release.
30 |
31 | .. _#173: https://github.com/J535D165/recordlinkage/pull/173
32 | .. _#171: https://github.com/J535D165/recordlinkage/pull/171
33 | .. _#141: https://github.com/J535D165/recordlinkage/pull/141
34 | .. _#135: https://github.com/J535D165/recordlinkage/pull/135
35 | .. _#151: https://github.com/J535D165/recordlinkage/pull/151
36 | .. _#152: https://github.com/J535D165/recordlinkage/pull/152
37 | .. _#153: https://github.com/J535D165/recordlinkage/pull/153
38 | .. _#154: https://github.com/J535D165/recordlinkage/pull/154
39 | .. _#163: https://github.com/J535D165/recordlinkage/pull/163
40 | .. _#164: https://github.com/J535D165/recordlinkage/pull/164
41 | .. _#158: https://github.com/J535D165/recordlinkage/pull/158
42 | .. _#170: https://github.com/J535D165/recordlinkage/pull/170
43 | .. _#169: https://github.com/J535D165/recordlinkage/pull/169
44 | .. _#168: https://github.com/J535D165/recordlinkage/pull/168
45 | .. _#132: https://github.com/J535D165/recordlinkage/pull/132
46 | .. _#130: https://github.com/J535D165/recordlinkage/pull/130
47 | .. _#174: https://github.com/J535D165/recordlinkage/pull/174
48 |
49 | Version 0.14
50 | ============
51 |
52 | - Drop Python 2.7 and Python 3.4 support. (`#91`_)
53 | - Upgrade minimal pandas version to 0.23.
54 | - Simplify the use of all cpus in parallel mode. (`#102`_)
55 | - Store large example datasets in user home folder or use environment
56 | variable. Before, example datasets were stored in the package. (see
57 | issue `#42`_) (`#92`_)
58 | - Add support to write and read annotation files for recordlinkage ANNOTATOR.
59 | See the docs and https://github.com/J535D165/recordlinkage-annotator for
60 | more information.
61 | - Replace `.labels` by `.codes` for `pandas.MultiIndex` objects for newer
62 | versions of pandas (>0.24). (`#103`_)
63 | - Fix totals for pandas.MultiIndex input on confusion matrix and accuracy
64 | metrics. (see issue `#84`_) (`#109`_)
65 | - Initialize Compare with (a list of) features (Bug). (`#124`_)
66 | - Various updates in relation to deprecation warnings in third-party
67 | libraries such as sklearn, pandas and networkx.
68 |
69 | .. _#42: https://github.com/J535D165/recordlinkage/issues/42
70 | .. _#84: https://github.com/J535D165/recordlinkage/issues/84
71 |
72 | .. _#91: https://github.com/J535D165/recordlinkage/pull/91
73 | .. _#92: https://github.com/J535D165/recordlinkage/pull/92
74 | .. _#102: https://github.com/J535D165/recordlinkage/pull/102
75 | .. _#103: https://github.com/J535D165/recordlinkage/pull/103
76 | .. _#109: https://github.com/J535D165/recordlinkage/pull/109
77 | .. _#124: https://github.com/J535D165/recordlinkage/pull/124
78 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | import datetime
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath(".."))
11 |
12 | # -- Project information -----------------------------------------------------
13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
14 |
15 | project = "Python Record Linkage Toolkit"
16 | copyright = f"2016-{datetime.datetime.now().year}, Jonathan de Bruin"
17 | author = "Jonathan de Bruin"
18 |
19 | version = "0.15"
20 | release = "0.15"
21 |
22 | # -- General configuration ---------------------------------------------------
23 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
24 |
25 | extensions = [
26 | "sphinx.ext.autodoc",
27 | "sphinx.ext.doctest",
28 | "sphinx.ext.napoleon",
29 | "sphinx.ext.intersphinx",
30 | "IPython.sphinxext.ipython_console_highlighting",
31 | "IPython.sphinxext.ipython_directive",
32 | "nbsphinx",
33 | ]
34 |
35 | templates_path = ['_templates']
36 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
37 |
38 | autodoc_member_order = "bysource"
39 |
40 | intersphinx_mapping = {
41 | "python": ("https://docs.python.org/3/", None),
42 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
43 | "numpy": ("https://numpy.org/doc/stable/", None),
44 | "sklearn": ("https://scikit-learn.org/stable/", None),
45 | }
46 |
47 |
48 | # -- Options for HTML output -------------------------------------------------
49 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
50 |
51 | html_theme = "sphinx_rtd_theme"
52 | html_static_path = ['_static']
53 |
54 | master_doc = "index"
55 | pygments_style = "sphinx"
56 |
57 | todo_include_todos = False
58 |
59 | # -- Options for HTML output ----------------------------------------------
60 |
61 | html_static_path = []
62 | html_domain_indices = False
63 |
64 | # Output file base name for HTML help builder.
65 | htmlhelp_basename = "RecordLinkageToolkitdoc"
66 |
67 | # -- Napoleon options ---------------------------------------------------
68 |
69 | napoleon_google_docstring = False
70 | napoleon_numpy_docstring = True
71 | napoleon_include_private_with_doc = False
72 | napoleon_include_special_with_doc = False
73 | napoleon_use_admonition_for_examples = False
74 | napoleon_use_admonition_for_notes = True
75 | napoleon_use_admonition_for_references = True
76 | napoleon_use_ivar = False
77 | napoleon_use_param = True
78 | napoleon_use_rtype = False
79 |
80 | # -- NBSphinx options ----------------------------------------------------
81 |
82 | # nbsphinx_execute = 'never'
83 |
84 | # This is processed by Jinja2 and inserted before each notebook
85 | nbsphinx_prolog = r"""
86 | {% set docname = 'docs/' + env.doc2path(env.docname, base=None) %}
87 |
88 | .. note::
89 |
90 | This page was generated from `{{ docname|e }} `_.
91 | Run an online interactive version of this page with |binder| or |colab|.
92 |
93 | .. |binder| image:: https://mybinder.org/badge_logo.svg
94 | :target: https://mybinder.org/v2/gh/J535D165/recordlinkage/v{{ env.config.release|e }}?filepath={{ docname|e }}
95 |
96 | .. |colab| image:: https://colab.research.google.com/assets/colab-badge.svg
97 | :target: https://githubtocolab.com/J535D165/recordlinkage/blob/v{{ env.config.release|e }}/{{ docname|e }}
98 |
99 | """
100 |
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | ************
2 | Contributing
3 | ************
4 |
5 | Thanks for your interest in contributing to the Python Record Linkage Toolkit.
6 | There is a lot of work to do. See `Github <https://github.com/J535D165/recordlinkage>`_
7 | for the contributors to this package.
8 |
9 | The workflow for contributing is as follows:
10 |
11 | - clone https://github.com/J535D165/recordlinkage.git
12 | - Make a branch with your modifications/contributions
13 | - Write tests
14 | - Run all tests
15 | - Do a pull request
16 |
17 | Testing
18 | =======
19 |
20 | Install `pytest`:
21 |
22 | .. code:: sh
23 |
24 | pip install pytest
25 |
26 | Run the following command to test the package
27 |
28 | .. code:: sh
29 |
30 | python -m pytest tests/
31 |
32 | Performance
33 | ===========
34 |
35 | Performance is very important in record linkage. The performance is monitored
36 | for all serious modifications of the core API. The performance monitoring is
37 | performed with `Airspeed Velocity <https://asv.readthedocs.io/>`_
38 | (asv).
39 |
40 | Install Airspeed Velocity:
41 |
42 | .. code:: sh
43 |
44 | pip install asv
45 |
46 | Run the following command from the root of the repository to test the
47 | performance of the current version of the package:
48 |
49 | .. code:: sh
50 |
51 | asv run
52 |
53 | Run the following command to test all versions since tag v0.6.0
54 |
55 | .. code:: sh
56 |
57 | asv run --skip-existing-commits v0.6.0..master
58 |
59 |
60 |
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/docs/images/elas_1705.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/docs/images/elas_1705.png
--------------------------------------------------------------------------------
/docs/images/indexing_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/docs/images/indexing_basic.png
--------------------------------------------------------------------------------
/docs/images/indexing_plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.colors as mlc
2 | import matplotlib.pyplot as mlp
3 | import numpy as np
4 |
5 | figure, axes = mlp.subplots(nrows=1, ncols=2, figsize=(8, 5))
6 |
7 | # linking
8 | db_a = ["A1", "A2", "A3", "A4", "A5", "A6"]
9 | db_b = ["B1", "B2", "B3", "B4", "B5", "B6"]
10 |
11 | img = np.ones((len(db_a), len(db_b)), dtype=float)
12 |
13 | color_map = mlc.LinearSegmentedColormap.from_list(
14 | "ColorMap", [(0.984, 0.501, 0.447), (1.000, 1.000, 1.000)]
15 | )
16 | axes[0].imshow(img, cmap=color_map, interpolation="none")
17 |
18 | axes[0].set_xlabel("Dataset A", fontsize=13)
19 | axes[0].set_xticks(np.arange(0, len(db_b), 1))
20 | axes[0].set_xticks(np.arange(-0.5, len(db_b), 1), minor=True)
21 | axes[0].set_xticklabels(db_a)
22 |
23 | axes[0].set_ylabel("Dataset B", fontsize=13)
24 | axes[0].set_yticks(np.arange(0, len(db_a), 1))
25 | axes[0].set_yticks(np.arange(-0.5, len(db_a), 1), minor=True)
26 | axes[0].set_yticklabels(db_b)
27 |
28 | axes[0].grid(which="minor", color="k")
29 |
30 | axes[0].set_title("Linking A and B", fontsize=15, fontweight="bold")
31 |
32 | # dedup
33 | db_a = ["A1", "A2", "A3", "A4", "A5", "A6"]
34 | db_b = ["A1", "A2", "A3", "A4", "A5", "A6"]
35 |
36 | img = np.ones((len(db_a), len(db_b)), dtype=float)
37 | img = np.triu(img, 1)
38 |
39 | color_map = mlc.LinearSegmentedColormap.from_list(
40 | "ColorMap", [(1.000, 1.000, 1.000), (0.984, 0.501, 0.447)]
41 | )
42 | axes[1].imshow(img, cmap=color_map, interpolation="none")
43 |
44 | axes[1].set_xlabel("Dataset A", fontsize=13)
45 | axes[1].set_xticks(np.arange(0, len(db_b), 1))
46 | axes[1].set_xticks(np.arange(-0.5, len(db_b), 1), minor=True)
47 | axes[1].set_xticklabels(db_a)
48 |
49 | axes[1].set_ylabel("Dataset A", fontsize=13)
50 | axes[1].set_yticks(np.arange(0, len(db_a), 1))
51 | axes[1].set_yticks(np.arange(-0.5, len(db_a), 1), minor=True)
52 | axes[1].set_yticklabels(db_b)
53 |
54 | axes[1].grid(which="minor", color="k")
55 |
56 | axes[1].set_title("Duplicate detection A", fontsize=15, fontweight="bold")
57 |
58 | figure.tight_layout()
59 |
60 | mlp.savefig("indexing_basic.png", dpi=150)
61 |
--------------------------------------------------------------------------------
/docs/images/recordlinkage-banner-transparent.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | *******************************************
2 | Python Record Linkage Toolkit Documentation
3 | *******************************************
4 |
5 | .. figure:: /images/recordlinkage-banner-transparent.svg
6 | :width: 100%
7 |
8 | All you need to start linking records.
9 |
10 | .. toctree::
11 | :caption: First steps
12 | :maxdepth: 2
13 |
14 | about
15 | installation
16 | guides/link_two_dataframes.ipynb
17 | guides/data_deduplication.ipynb
18 |
19 | .. toctree::
20 | :caption: Record linkage
21 | :maxdepth: 2
22 |
23 | ref-preprocessing
24 | ref-index
25 | ref-compare
26 | ref-classifiers
27 | ref-evaluation
28 | ref-datasets
29 | ref-misc
30 |
31 | .. toctree::
32 | :caption: Miscellaneous
33 | :maxdepth: 2
34 |
35 | annotation
36 | guides/classifiers.rst
37 | performance.rst
38 |
39 | .. toctree::
40 | :caption: Developers
41 | :maxdepth: 1
42 |
43 | contributing
44 | changelog
45 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | ************
2 | Installation
3 | ************
4 |
5 | Python version support
6 | ======================
7 |
8 | The Python Record Linkage Toolkit supports the versions of Python that Pandas
9 | supports as well. You can find the supported Python versions in the Pandas
10 | documentation.
11 |
12 | Installation
13 | ============
14 |
15 | The Python Record Linkage Toolkit requires Python 3.6 or higher. Install the
16 | package easily with pip
17 |
18 | .. code:: sh
19 |
20 | pip install recordlinkage
21 |
22 | You can also clone the project on Github.
23 |
24 | To install all recommended and optional dependencies, run
25 |
26 | .. code:: sh
27 |
28 | pip install recordlinkage['all']
29 |
30 | Dependencies
31 | ============
32 |
33 | The Python Record Linkage Toolkit depends on the following packages:
34 |
35 | - `numpy <https://www.numpy.org>`__
36 | - `pandas <https://pandas.pydata.org>`__
37 | - `scipy <https://www.scipy.org>`__
38 | - `sklearn <https://scikit-learn.org>`__
39 | - `jellyfish <https://github.com/jamesturk/jellyfish>`__
40 | - `joblib`
41 |
42 | Recommended dependencies
43 | ------------------------
44 |
45 | - `numexpr <https://github.com/pydata/numexpr>`__ - accelerating certain numerical operations
46 | - `bottleneck <https://github.com/pydata/bottleneck>`__ - accelerating certain types of nan evaluations
47 |
48 | Optional dependencies
49 | ----------------------
50 |
51 | - networkx - for network operations like connected components
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/performance.rst:
--------------------------------------------------------------------------------
1 |
2 | Performance
3 | ===========
4 |
5 | Performance plays an important role in record linkage. Record linkage problems
6 | scale quadratically with the size of the dataset(s). The number of record
7 | pairs can be enormous and so is the number of comparisons. The Python Record
8 | Linkage Toolkit can be used for large-scale record linkage applications.
9 | Nevertheless, the toolkit was developed with experimentation first and
10 | performance second. This page provides tips and tricks to improve
11 | the performance.
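
To get a feeling for this quadratic growth, a back-of-the-envelope sketch (the
dataset sizes are arbitrary):

.. code:: python

    n_a, n_b = 100_000, 50_000

    # linking two datasets with a full index
    print(n_a * n_b)             # 5,000,000,000 candidate pairs

    # deduplicating a single dataset of n_a records
    print(n_a * (n_a - 1) // 2)  # 4,999,950,000 candidate pairs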
12 |
13 | Do you know more tricks? Let us know!
14 |
15 | Indexing
16 | --------
17 |
18 | Block on multiple columns
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~
20 |
21 | Blocking is an effective way to increase the performance of your record
22 | linkage. If the performance of your implementation is still poor, decrease the
23 | number of pairs by blocking on multiple variables. This implies that the
24 | record pair agrees on two or more variables. In the following example, the
25 | record pairs agree on the given name **and** surname.
26 |
27 | .. code:: python
28 |
29 | from recordlinkage.index import Block
30 | indexer = Block(left_on=['first_name', 'surname'],
31 | right_on=['name', 'surname'])
32 | pairs = indexer.index(dfA, dfB)
33 |
34 | You might exclude more links than desired. This can be solved by
35 | repeating the process with different blocking variables.
36 |
37 | .. code:: python
38 |
39 | indexer = recordlinkage.Index()
40 | indexer.block(left_on=['first_name', 'surname'],
41 | right_on=['name', 'surname'])
42 | indexer.block(left_on=['first_name', 'age'],
43 | right_on=['name', 'age'])
44 | pairs = indexer.index(dfA, dfB)
45 |
46 | .. note:: Besides the sorted neighbourhood itself, Sorted Neighbourhood
47 | indexing also supports blocking on additional variables.
48 |
49 | Make record pairs
50 | ~~~~~~~~~~~~~~~~~
51 |
52 | The structure of the Python Record Linkage Toolkit has a drawback for the
53 | performance. In the indexation step (the step in which record pairs are
54 | selected), only the index of both records is stored. The entire records
55 | are not stored. This results in less memory usage. The drawback is that the
56 | records need to be queried from the data.
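
A sketch of what this querying looks like, assuming ``pairs`` is the
``pandas.MultiIndex`` returned by an indexer and ``dfA``/``dfB`` are the
original DataFrames:

.. code:: python

    # the candidate pairs hold only record identifiers;
    # pull the actual records back from the source data
    records_a = dfA.loc[pairs.get_level_values(0)]
    records_b = dfB.loc[pairs.get_level_values(1)]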
57 |
58 |
59 | Comparing
60 | ---------
61 |
62 | Compare only discriminating variables
63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
64 |
65 | Not all variables may be worth comparing in a record linkage. Some variables
66 | do not discriminate the links from the non-links or have only minor effects.
67 | These variables can be excluded. Only discriminating and informative variables should be
68 | included.
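
A rough way to spot weak variables is to inspect the computed comparison
vectors: a column on which (almost) all candidate pairs agree, or (almost) none
do, adds little information. A sketch, assuming ``features`` is the DataFrame
returned by ``Compare.compute``:

.. code:: python

    # mean agreement/similarity per variable over all candidate pairs
    print(features.mean())

    # variables with (near) zero variance barely discriminate
    print(features.var().sort_values())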
69 |
70 | Prevent string comparisons
71 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
72 |
73 | String similarity measures and phonetic encodings are computationally
74 | expensive. Phonetic encoding takes place on the original data, while string
75 | similarity measures are applied on the record pairs. After phonetic encoding
76 | of the string variables, exact comparing can be used instead of computing the
77 | string similarity of all record pairs. If the number of candidate pairs is
78 | much larger than the number of records in both datasets together, then
79 | consider using phonetic encoding of string variables instead of string
80 | comparison.
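A minimal sketch of this approach, assuming DataFrames ``dfA`` and ``dfB`` with a
``surname`` column and candidate ``pairs`` from an indexing step:

.. code:: python

    import recordlinkage
    from recordlinkage.preprocessing import phonetic

    # encode the strings once, on the original data
    dfA["surname_enc"] = phonetic(dfA["surname"], method="soundex")
    dfB["surname_enc"] = phonetic(dfB["surname"], method="soundex")

    # compare the encodings exactly instead of computing string similarities
    comparer = recordlinkage.Compare()
    comparer.exact("surname_enc", "surname_enc", label="surname")
    features = comparer.compute(pairs, dfA, dfB)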
81 |
82 | String comparing
83 | ~~~~~~~~~~~~~~~~
84 |
85 | Comparing strings is computationally expensive. The Python Record Linkage
86 | Toolkit uses the package ``jellyfish`` for string comparisons. The package
87 | ships a fast (Rust) implementation next to a pure Python implementation. Make
88 | sure the fast implementation is installed (``import jellyfish.rustyfish``
89 | should not raise an exception).
90 |
91 | There can be a large difference in the performance of different string
92 | comparison algorithms. The Jaro and Jaro-Winkler methods are faster than the
93 | Levenshtein distance and much faster than the Damerau-Levenshtein distance.
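For example, selecting the Jaro-Winkler method for a string comparison (a sketch
using the ``Compare.string`` shorthand; ``pairs``, ``dfA`` and ``dfB`` are
assumed to come from the previous steps):

.. code:: python

    comparer = recordlinkage.Compare()
    comparer.string("given_name", "given_name",
                    method="jarowinkler", threshold=0.85,
                    label="given_name")
    features = comparer.compute(pairs, dfA, dfB)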
94 |
95 | Indexing with large files
96 | ~~~~~~~~~~~~~~~~~~~~~~~~~
97 |
98 | Sometimes, the input files are very large. In that case, it can be hard
99 | to make an index without running out of memory in the indexing step or
100 | in the comparing step. ``recordlinkage`` has a method to deal with large
101 | files. It is reasonably fast, although it is not primarily developed for
102 | speed; SQL databases may outperform this method. It is developed above all
103 | for usability. The idea is to split one of the input files into small
104 | blocks, compute the record pairs for each block, and iterate over the
105 | blocks. Consider full indexing:
106 |
107 | .. code:: python
108 |
109 | import recordlinkage
110 | import numpy
111 |
112 | cl = recordlinkage.index.Full()
113 |
114 |     for dfB_subset in numpy.array_split(dfB, 10):
115 |
116 | # a subset of record pairs
117 | pairs_subset = cl.index(dfA, dfB_subset)
118 |
119 | # Your analysis on pairs_subset here
120 |
121 |
122 |
--------------------------------------------------------------------------------
/docs/ref-classifiers.rst:
--------------------------------------------------------------------------------
1 | *****************
2 | 3. Classification
3 | *****************
4 |
5 | Classifiers
6 | ===========
7 |
8 | Classification is the step in the record linkage process where record pairs are
9 | classified into matches, non-matches and possible matches [Christen2012]_.
10 | Classification algorithms can be supervised or unsupervised (with or without
11 | training data).
12 |
13 |
14 | .. seealso::
15 |
16 | .. [Christen2012] Christen, Peter. 2012. Data matching: concepts and
17 | techniques for record linkage, entity resolution, and duplicate
18 | detection. Springer Science & Business Media.
19 |
20 | Supervised
21 | ----------
22 |
23 | .. autoclass:: recordlinkage.LogisticRegressionClassifier
24 | :members:
25 | :inherited-members:
26 |
27 | .. autoclass:: recordlinkage.NaiveBayesClassifier
28 | :members:
29 | :inherited-members:
30 |
31 | .. autoclass:: recordlinkage.SVMClassifier
32 | :members:
33 | :inherited-members:
34 |
35 | Unsupervised
36 | ------------
37 |
38 | .. autoclass:: recordlinkage.ECMClassifier
39 | :members:
40 | :inherited-members:
41 |
42 | .. autoclass:: recordlinkage.KMeansClassifier
43 | :members:
44 | :inherited-members:
45 |
46 |
47 | Adapters
48 | ========
49 |
50 | Adapters can be used to wrap machine learning models from external packages
51 | like scikit-learn and Keras. For example, this makes it possible to classify
52 | record pairs with a neural network developed in Keras.
53 |
54 | .. autoclass:: recordlinkage.adapters.SKLearnAdapter
55 |
56 |
57 | .. code:: python
58 |
59 |     # import the scikit-learn classifier
60 |     from sklearn.ensemble import RandomForestClassifier
61 |
62 |     # import BaseClassifier and the scikit-learn adapter from recordlinkage
63 |     from recordlinkage.base import BaseClassifier
64 |     from recordlinkage.adapters import SKLearnAdapter
65 |     from recordlinkage.datasets import binary_vectors
66 |
67 |     class RandomForest(SKLearnAdapter, BaseClassifier):
68 |
69 |         def __init__(self, *args, **kwargs):
70 |             super(RandomForest, self).__init__()
71 |
72 |             # set the kernel
73 |             self.kernel = RandomForestClassifier(*args, **kwargs)
74 |
75 |
76 |     # make a sample dataset
77 |     features, links = binary_vectors(10000, 2000, return_links=True)
78 |
79 |     # initialise the random forest
80 |     cl = RandomForest(n_estimators=20)
81 | cl.fit(features, links)
82 |
83 | # predict the matches
84 | cl.predict(...)
85 |
86 |
87 | .. autoclass:: recordlinkage.adapters.KerasAdapter
88 |
89 | Example of a Keras model used for classification.
90 |
91 | .. code:: python
92 |
93 |     import tensorflow as tf
94 |     from tensorflow.keras import layers
95 |     from recordlinkage.base import BaseClassifier
96 |     from recordlinkage.adapters import KerasAdapter
97 | class NNClassifier(KerasAdapter, BaseClassifier):
98 | """Neural network classifier."""
99 | def __init__(self):
100 | super(NNClassifier, self).__init__()
101 |
102 | model = tf.keras.Sequential()
103 | model.add(layers.Dense(16, input_dim=8, activation='relu'))
104 | model.add(layers.Dense(8, activation='relu'))
105 | model.add(layers.Dense(1, activation='sigmoid'))
106 | model.compile(
107 |             optimizer=tf.keras.optimizers.Adam(0.001),
108 | loss='binary_crossentropy',
109 | metrics=['accuracy']
110 | )
111 |
112 | self.kernel = model
113 |
114 | # initialise the model
115 | cl = NNClassifier()
116 | # fit the model to the data
117 | cl.fit(X_train, links_true)
118 | # predict the class of the data
119 | cl.predict(X_pred)
120 |
121 |
122 | User-defined algorithms
123 | =======================
124 |
125 | User-defined classifiers can be built on :class:`recordlinkage.base.BaseClassifier`.
126 | scikit-learn based models may want to use
127 | :class:`recordlinkage.adapters.SKLearnAdapter` as an additional base class.
128 |
129 | .. autoclass:: recordlinkage.base.BaseClassifier
130 | :members:
131 | :inherited-members:
132 |
133 | Probabilistic models can use the Fellegi and Sunter base class. This class is
134 | used for the :class:`recordlinkage.ECMClassifier` and the
135 | :class:`recordlinkage.NaiveBayesClassifier`.
136 |
137 | .. autoclass:: recordlinkage.classifiers.FellegiSunter
138 | :members:
139 | :inherited-members:
140 |
141 | Examples
142 | ========
143 |
144 | Unsupervised learning with the ECM algorithm: see the `example on Github <https://github.com/J535D165/recordlinkage/examples/unsupervised_learning.py>`_.
145 |
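A condensed version of that example, using simulated comparison vectors from
:func:`recordlinkage.datasets.binary_vectors`:

.. code:: python

    import recordlinkage as rl
    from recordlinkage.datasets import binary_vectors

    # simulated comparison vectors and the true links
    features, links_true = binary_vectors(10000, 2000, return_links=True)

    # fit the unsupervised ECM classifier and predict the matches
    cl = rl.ECMClassifier()
    cl.fit(features)
    links_pred = cl.predict(features)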
146 |
147 | Network
148 | =======
149 |
150 | The Python Record Linkage Toolkit provides network/graph analysis tools for
151 | classification of record pairs into matches and distinct pairs. The toolkit
152 | provides the functionality for one-to-one linking and one-to-many linking. It
153 | is also possible to detect all connected components, which is useful in data
154 | deduplication.
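For example, a minimal sketch that enforces a one-to-one linking on a set of
predicted links (assuming a :class:`pandas.MultiIndex` of predicted links called
``links_pred``):

.. code:: python

    one_to_one = recordlinkage.OneToOneLinking()
    links_one_to_one = one_to_one.compute(links_pred)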
155 |
156 | .. autoclass:: recordlinkage.OneToOneLinking
157 | :members:
158 | :inherited-members:
159 |
160 | .. autoclass:: recordlinkage.OneToManyLinking
161 | :members:
162 | :inherited-members:
163 |
164 | .. autoclass:: recordlinkage.ConnectedComponents
165 | :members:
166 | :inherited-members:
167 |
--------------------------------------------------------------------------------
/docs/ref-datasets.rst:
--------------------------------------------------------------------------------
1 | ********
2 | Datasets
3 | ********
4 |
5 | The Python Record Linkage Toolkit contains several open public datasets. Four
6 | datasets were generated by the developers of Febrl. Tools to generate your own
7 | datasets are under development.
8 |
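For example, loading the fourth Febrl dataset, which consists of two files to be
linked:

.. code:: python

    from recordlinkage.datasets import load_febrl4

    df_a, df_b = load_febrl4()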
9 | .. autofunction:: recordlinkage.datasets.load_krebsregister
10 |
11 | .. autofunction:: recordlinkage.datasets.load_febrl1
12 |
13 | .. autofunction:: recordlinkage.datasets.load_febrl2
14 |
15 | .. autofunction:: recordlinkage.datasets.load_febrl3
16 |
17 | .. autofunction:: recordlinkage.datasets.load_febrl4
18 |
19 | .. autofunction:: recordlinkage.datasets.binary_vectors
20 |
--------------------------------------------------------------------------------
/docs/ref-evaluation.rst:
--------------------------------------------------------------------------------
1 | *************
2 | 4. Evaluation
3 | *************
4 |
5 | Evaluation of classifications plays an important role in record linkage.
6 | Express your classification quality in terms of accuracy, recall and F-score
7 | based on ``true positives``, ``false positives``, ``true negatives`` and
8 | ``false negatives``.
9 |
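For example, evaluating predicted links against the true links (a sketch,
assuming ``links_true``, ``links_pred`` and the ``candidate_links`` from the
indexing step):

.. code:: python

    import recordlinkage as rl

    conf = rl.confusion_matrix(links_true, links_pred, total=len(candidate_links))
    fscore = rl.fscore(conf)
    recall = rl.recall(links_true, links_pred)
    precision = rl.precision(links_true, links_pred)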
10 | .. autofunction:: recordlinkage.reduction_ratio
11 | .. autofunction:: recordlinkage.true_positives
12 | .. autofunction:: recordlinkage.true_negatives
13 | .. autofunction:: recordlinkage.false_positives
14 | .. autofunction:: recordlinkage.false_negatives
15 | .. autofunction:: recordlinkage.confusion_matrix
16 | .. autofunction:: recordlinkage.precision
17 | .. autofunction:: recordlinkage.recall
18 | .. autofunction:: recordlinkage.accuracy
19 | .. autofunction:: recordlinkage.specificity
20 | .. autofunction:: recordlinkage.fscore
21 | .. autofunction:: recordlinkage.max_pairs
22 | .. autofunction:: recordlinkage.full_index_size
--------------------------------------------------------------------------------
/docs/ref-index.rst:
--------------------------------------------------------------------------------
1 | ***********
2 | 1. Indexing
3 | ***********
4 |
5 | The indexing module is used to make pairs of records. These pairs are called
6 | candidate links or candidate matches. There are several indexing algorithms
7 | available such as blocking and sorted neighborhood indexing. See
8 | [christen2012]_ and [christen2008]_ for background information about
9 | indexation.
10 |
11 | .. [christen2012] Christen, P. (2012). Data matching: concepts and
12 | techniques for record linkage, entity resolution, and duplicate
13 | detection. Springer Science & Business Media.
14 | .. [christen2008] Christen, P. (2008). Febrl - A Freely Available Record
15 | Linkage System with a Graphical User Interface.
16 |
17 | The indexing module can be used for both linking and duplicate detection. In
18 | case of duplicate detection, only pairs in the upper triangular part of the
19 | matrix are returned. This means that the first record in each record pair has
20 | the larger identifier of the two. For example, `("A2", "A1")`, `(5, 2)` and `("acb",
21 | "abc")`. The following image shows the record pairs for a complete set of
22 | record pairs.
23 |
24 | .. figure:: /images/indexing_basic.png
25 | :width: 100%
26 |
27 | :class:`recordlinkage.Index` object
28 | ===================================
29 |
30 | .. autoclass:: recordlinkage.Index
31 |
32 | .. automethod:: recordlinkage.Index.add
33 | .. automethod:: recordlinkage.Index.index
34 | .. automethod:: recordlinkage.Index.full
35 | .. automethod:: recordlinkage.Index.block
36 | .. automethod:: recordlinkage.Index.sortedneighbourhood
37 | .. automethod:: recordlinkage.Index.random
38 |
39 |
40 |
41 | Algorithms
42 | ==========
43 |
44 | The Python Record Linkage Toolkit contains basic and advanced indexing (or
45 | blocking) algorithms to make record pairs. The algorithms are Python classes.
46 | Popular algorithms in the toolkit are:
47 |
48 | - :class:`recordlinkage.index.Full`,
49 | - :class:`recordlinkage.index.Block`,
50 | - :class:`recordlinkage.index.SortedNeighbourhood`
51 |
52 | The algorithms are available in the submodule `recordlinkage.index`. Import
53 | the algorithms in the following way (use blocking algorithm as example):
54 |
55 | .. code:: python
56 |
57 | from recordlinkage.index import Block
58 |
59 | The full reference for the indexing algorithms in the toolkit is given below.
60 |
61 | .. automodule:: recordlinkage.index
62 | :members:
63 | :inherited-members:
64 |
65 | User-defined algorithms
66 | =======================
67 |
68 | A user-defined algorithm can be based on
69 | :class:`recordlinkage.base.BaseIndexAlgorithm`, the abstract base class for
70 | indexing algorithms. The classes
71 |
72 | - :class:`recordlinkage.index.Full`
73 | - :class:`recordlinkage.index.Block`
74 | - :class:`recordlinkage.index.SortedNeighbourhood`
75 | - :class:`recordlinkage.index.Random`
76 |
77 | are inherited from this abstract base class. You can use BaseIndexAlgorithm to
78 | create a user-defined/custom algorithm.
79 |
80 | To create a custom algorithm, subclass the
81 | :class:`recordlinkage.base.BaseIndexAlgorithm`. In the subclass, overwrite the
82 | :meth:`recordlinkage.base.BaseIndexAlgorithm._link_index` method in case of
83 | linking two datasets. This method accepts two (tuples of)
84 | :class:`pandas.Series` objects as arguments. Based on these Series objects,
85 | you create record pairs. The record pairs need to be returned in a 2-level
86 | :class:`pandas.MultiIndex` object. The :attr:`pandas.MultiIndex.names` are the
87 | name of the index of DataFrame A and the name of the index of DataFrame B,
88 | respectively. Overwrite the
89 | :meth:`recordlinkage.base.BaseIndexAlgorithm._dedup_index` method in case of
90 | finding links within a single dataset (deduplication). This method accepts a
91 | single (tuple of) :class:`pandas.Series` object as argument.
92 |
93 | The algorithm for linking data frames can be used for finding duplicates as
94 | well. In this situation, DataFrame B is a copy of DataFrame A. Under the hood,
95 | the base class removes pairs like (record_i, record_i) and keeps only one of
96 | (record_i, record_j) and (record_j, record_i). As a result, only unique
97 | combinations are returned. If you do have a specific algorithm for finding
98 | duplicates, then you can overwrite the _dedup_index method. This method
99 | accepts only one argument (DataFrame A) and the internal base class does not
100 | look for combinations as explained above.
101 |
102 | .. autoclass:: recordlinkage.base.BaseIndexAlgorithm
103 | :members:
104 | :private-members:
105 |
106 | Examples
107 | ========
108 |
109 | .. code:: python
110 |
111 | import recordlinkage as rl
112 | from recordlinkage.datasets import load_febrl4
113 | from recordlinkage.index import Block
114 |
115 | df_a, df_b = load_febrl4()
116 |
117 | indexer = rl.Index()
118 | indexer.add(Block('given_name', 'given_name'))
119 | indexer.add(Block('surname', 'surname'))
120 | indexer.index(df_a, df_b)
121 |
122 | Equivalent code:
123 |
124 | .. code:: python
125 |
126 | import recordlinkage as rl
127 | from recordlinkage.datasets import load_febrl4
128 |
129 | df_a, df_b = load_febrl4()
130 |
131 | indexer = rl.Index()
132 | indexer.block('given_name', 'given_name')
133 | indexer.block('surname', 'surname')
134 |     indexer.index(df_a, df_b)
135 |
136 | This example shows how to implement a custom indexing algorithm. The algorithm
137 | returns all record pairs for which the given name starts with the letter ‘W’.
138 |
139 | .. code:: python
140 |
141 |     import pandas
142 | from recordlinkage.datasets import load_febrl4
143 |
144 | df_a, df_b = load_febrl4()
145 |
146 | from recordlinkage.base import BaseIndexAlgorithm
147 |
148 | class FirstLetterWIndex(BaseIndexAlgorithm):
149 | """Custom class for indexing"""
150 |
151 | def _link_index(self, df_a, df_b):
152 | """Make pairs with given names starting with the letter 'w'."""
153 |
154 | # Select records with names starting with a w.
155 | name_a_w = df_a[df_a['given_name'].str.startswith('w') == True]
156 | name_b_w = df_b[df_b['given_name'].str.startswith('w') == True]
157 |
158 | # Make a product of the two numpy arrays
159 | return pandas.MultiIndex.from_product(
160 | [name_a_w.index.values, name_b_w.index.values],
161 | names=[df_a.index.name, df_b.index.name]
162 | )
163 |
164 | indexer = FirstLetterWIndex()
165 | candidate_pairs = indexer.index(df_a, df_b)
166 |
167 |     print('Returns a', type(candidate_pairs).__name__)
168 |     print('Number of candidate record pairs starting with the letter w:', len(candidate_pairs))
169 |
170 | The custom index class below does not restrict the first letter to ‘w’; instead, the first letter is an argument (named `letter`). This letter is set when the class is initialised.
171 |
172 | .. code:: python
173 |
174 | class FirstLetterIndex(BaseIndexAlgorithm):
175 | """Custom class for indexing"""
176 |
177 | def __init__(self, letter):
178 | super(FirstLetterIndex, self).__init__()
179 |
180 | # the letter to save
181 | self.letter = letter
182 |
183 | def _link_index(self, df_a, df_b):
184 | """Make record pairs that agree on the first letter of the given name."""
185 |
186 | # Select records with names starting with a 'letter'.
187 | a_startswith_w = df_a[df_a['given_name'].str.startswith(self.letter) == True]
188 | b_startswith_w = df_b[df_b['given_name'].str.startswith(self.letter) == True]
189 |
190 | # Make a product of the two numpy arrays
191 | return pandas.MultiIndex.from_product(
192 | [a_startswith_w.index.values, b_startswith_w.index.values],
193 | names=[df_a.index.name, df_b.index.name]
194 | )
195 |
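The parameterised class can then be used in the same way. For example, with the
(arbitrarily chosen) letter 's':

.. code:: python

    indexer = FirstLetterIndex("s")
    candidate_pairs = indexer.index(df_a, df_b)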
--------------------------------------------------------------------------------
/docs/ref-misc.rst:
--------------------------------------------------------------------------------
1 | *************
2 | Miscellaneous
3 | *************
4 |
5 | .. autofunction:: recordlinkage.index_split
6 |
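For example, splitting a large set of candidate pairs into smaller chunks (a
sketch, assuming a :class:`pandas.MultiIndex` called ``candidate_pairs``):

.. code:: python

    import recordlinkage as rl

    # split the candidate pairs into 10 roughly equal chunks
    chunks = rl.index_split(candidate_pairs, 10)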
7 | .. autofunction:: recordlinkage.get_option
8 | .. autofunction:: recordlinkage.set_option
9 | .. autofunction:: recordlinkage.reset_option
10 | .. autofunction:: recordlinkage.describe_option
--------------------------------------------------------------------------------
/docs/ref-preprocessing.rst:
--------------------------------------------------------------------------------
1 | ****************
2 | 0. Preprocessing
3 | ****************
4 |
5 | Preprocessing data, like cleaning and standardising, may increase your record
6 | linkage accuracy. The Python Record Linkage Toolkit contains several tools for
7 | data preprocessing. The preprocessing and standardising functions are
8 | available in the submodule `recordlinkage.preprocessing`. Import the
9 | algorithms in the following way:
10 |
11 | .. code:: python
12 |
13 | from recordlinkage.preprocessing import clean, phonetic
14 |
15 | Cleaning
16 | ========
17 |
18 | The Python Record Linkage Toolkit has some cleaning functions, of which
19 | :func:`recordlinkage.preprocessing.clean` is the most generic. Pandas
20 | itself is also very useful for (string) data cleaning. See the pandas
21 | documentation on this topic: `Working with Text Data `_.
22 |
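For example, a minimal sketch of cleaning a column of names (the example values
are made up):

.. code:: python

    import pandas
    from recordlinkage.preprocessing import clean

    names = pandas.Series(["Mary-ann", "Bob :)", "JOHN  smith", None])
    names_clean = clean(names)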
23 | .. autofunction:: recordlinkage.preprocessing.clean
24 | .. autofunction:: recordlinkage.preprocessing.phonenumbers
25 | .. autofunction:: recordlinkage.preprocessing.value_occurence
26 |
27 | Phonetic encoding
28 | =================
29 |
30 | Phonetic algorithms are algorithms for indexing words by their
31 | pronunciation. The most well-known algorithm is the `Soundex
32 | `_ algorithm. The Python Record Linkage
33 | Toolkit supports multiple algorithms through the
34 | :func:`recordlinkage.preprocessing.phonetic` function.
35 |
36 | .. note::
37 |
38 | Use phonetic algorithms in advance of the indexing and comparing step.
39 |     In most situations, this results in better performance.
40 |
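For example, encoding a column of surnames with the NYSIIS algorithm (a sketch,
assuming a DataFrame ``df`` with a ``surname`` column):

.. code:: python

    from recordlinkage.preprocessing import phonetic

    surname_encoded = phonetic(df["surname"], method="nysiis")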
41 | .. autofunction:: recordlinkage.preprocessing.phonetic
42 | .. autoattribute:: recordlinkage.preprocessing.phonetic_algorithms
43 |
--------------------------------------------------------------------------------
/examples/README.rst:
--------------------------------------------------------------------------------
1 | Python Record Linkage Toolkit examples
2 | ======================================
3 |
4 | This folder contains examples of record linkage with the Python Record Linkage
5 | Toolkit. The examples are licensed under the BSD 3-Clause "New" or "Revised"
6 | License. Contributions are appreciated.
7 |
8 | Basic
9 | -----
10 |
11 | `Deterministic deduplication`_
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 |
14 | Example of deterministic record linkage to find duplicated records in a
15 | dataset. In this example, the model isn't trained with training data.
16 |
17 | `Deterministic linkage`_
18 | ~~~~~~~~~~~~~~~~~~~~~~~~
19 |
20 | Example of deterministic record linkage to find links between two datasets. In
21 | this example, the model isn't trained with training data.
22 |
23 | `Supervised Fellegi and Sunter with Naive Bayes classifier`_
24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 |
26 | An implementation of the Fellegi and Sunter (1969) classification model in a
27 | supervised way.
28 |
29 | `Unsupervised Fellegi and Sunter with ECM classifier`_
30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31 |
32 | An implementation of the Fellegi and Sunter (1969) classification model in an
33 | unsupervised way. The training of model parameters is done with the
34 | Expectation-Conditional Maximisation algorithm.
35 |
36 |
37 | Advanced
38 | --------
39 |
40 | `Record linkage with Neural Networks`_
41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42 |
43 | This example shows how Neural Networks can be used to classify record pairs.
44 | The Neural Network is implemented in Keras.
45 |
46 | .. _`Deterministic deduplication`: /examples/dedup_deterministic.py
47 | .. _`Deterministic linkage`: /examples/linking_deterministic.py
48 | .. _`Record linkage with Neural Networks`: /examples/supervised_keras.py
49 | .. _`Supervised Fellegi and Sunter with Naive Bayes classifier`: /examples/supervised_learning_prob.py
50 | .. _`Unsupervised Fellegi and Sunter with ECM classifier`: /examples/unsupervised_learning_prob.py
51 |
--------------------------------------------------------------------------------
/examples/dedup_deterministic.py:
--------------------------------------------------------------------------------
1 | """Example: Deterministic record linkage to find links in a single file.
2 |
3 | In deterministic record linkage, each compared attribute gets a certain
4 | weight (coefficient). The higher the weight, the more discriminating the
5 | variable is. A low weight indicates a less discriminating variable. For
6 | example, the given name has a higher weight than the hometown.
7 |
8 | This example uses the FEBRL3 dataset. This dataset contains records about
9 | individuals.
10 |
11 | Deterministic RL parameters are:
12 | intercept = -9.5
13 | coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]
14 |
15 | """
16 |
17 |
18 | import recordlinkage as rl
19 | from recordlinkage.compare import Exact
20 | from recordlinkage.compare import String
21 | from recordlinkage.datasets import load_febrl3
22 | from recordlinkage.index import Block
23 |
24 | # set logging
25 | rl.logging.set_verbosity(rl.logging.INFO)
26 |
27 | # load dataset
28 | print("Loading data...")
29 | dfA, true_links = load_febrl3(return_links=True)
30 | print(len(dfA), "records in dataset A")
31 | print(len(true_links), "links in dataset A")
32 |
33 | # start indexing
34 | print("Build index...")
35 | indexer = rl.Index()
36 | indexer.add(Block("given_name"))
37 | indexer.add(Block("surname"))
38 | indexer.add(Block("soc_sec_id"))
39 | candidate_links = indexer.index(dfA)
40 |
41 | # start comparing
42 | print("Start comparing...")
43 | comparer = rl.Compare()
44 | comparer.add(Exact("given_name", "given_name", label="given_name"))
45 | comparer.add(
46 | String("surname", "surname", method="jarowinkler", threshold=0.85, label="surname")
47 | )
48 | comparer.add(Exact("date_of_birth", "date_of_birth", label="date_of_birth"))
49 | comparer.add(Exact("suburb", "suburb", label="suburb"))
50 | comparer.add(Exact("state", "state", label="state"))
51 | comparer.add(String("address_1", "address_1", threshold=0.85, label="address_1"))
52 | comparer.add(String("address_2", "address_2", threshold=0.85, label="address_2"))
53 | features = comparer.compute(candidate_links, dfA)
54 |
55 | print("feature shape", features.shape)
56 |
57 | # use the Logistic Regression Classifier
58 | # this classifier is equivalent to the deterministic record linkage approach
59 | intercept = -9.5
60 | coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]
61 |
62 | print("Deterministic classifier")
63 | print("intercept", intercept)
64 | print("coefficients", coefficients)
65 |
66 | logreg = rl.LogisticRegressionClassifier(coefficients=coefficients, intercept=intercept)
67 | links = logreg.predict(features)
68 |
69 | print(len(links), "links/matches")
70 |
71 | # return the confusion matrix
72 | conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links))
73 | print("confusion matrix")
74 | print(conf_logreg)
75 |
76 | # compute the F-score for this classification
77 | fscore = rl.fscore(conf_logreg)
78 | print("fscore", fscore)
79 | recall = rl.recall(true_links, links)
80 | print("recall", recall)
81 | precision = rl.precision(true_links, links)
82 | print("precision", precision)
83 |
--------------------------------------------------------------------------------
/examples/linking_deterministic.py:
--------------------------------------------------------------------------------
1 | """This example demonstrates deterministic record linkage to link two files.
2 |
3 | In deterministic record linkage, each compared attribute gets a certain
4 | weight (coefficient). The higher the weight, the more discriminating the
5 | variable is. A low weight indicates a less discriminating variable. For
6 | example, the given name has a higher weight than the hometown.
7 |
8 | This example uses FEBRL4 datasets. These datasets contain records about
9 | individuals.
10 |
11 | Deterministic RL parameters are:
12 | intercept = -11.0
13 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0]
14 |
15 | """
16 |
17 |
18 | import recordlinkage as rl
19 | from recordlinkage.compare import Exact
20 | from recordlinkage.compare import String
21 | from recordlinkage.datasets import load_febrl4
22 | from recordlinkage.index import Block
23 |
24 | # set logging
25 | rl.logging.set_verbosity(rl.logging.INFO)
26 |
27 | # load datasets
28 | print("Loading data...")
29 | dfA, dfB, true_links = load_febrl4(return_links=True)
30 | print(len(dfA), "records in dataset A")
31 | print(len(dfB), "records in dataset B")
32 | print(len(true_links), "links between dataset A and B")
33 |
34 | # start indexing
35 | print("Build index...")
36 | indexer = rl.Index()
37 | indexer.add(Block("given_name"))
38 | indexer.add(Block("surname"))
39 | indexer.add(Block("soc_sec_id"))
40 | candidate_links = indexer.index(dfA, dfB)
41 |
42 | # start comparing
43 | print("Start comparing...")
44 | comparer = rl.Compare()
45 | comparer.add(Exact("given_name", "given_name", label="given_name"))
46 | comparer.add(
47 | String("surname", "surname", method="jarowinkler", threshold=0.85, label="surname")
48 | )
49 | comparer.add(Exact("date_of_birth", "date_of_birth", label="date_of_birth"))
50 | comparer.add(Exact("suburb", "suburb", label="suburb"))
51 | comparer.add(Exact("state", "state", label="state"))
52 | comparer.add(String("address_1", "address_1", threshold=0.85, label="address_1"))
53 | comparer.add(String("address_2", "address_2", threshold=0.85, label="address_2"))
54 | features = comparer.compute(candidate_links, dfA, dfB)
55 |
56 | print("feature shape", features.shape)
57 |
58 | # use the Logistic Regression Classifier
59 | # this classifier is equivalent to the deterministic record linkage approach
60 | intercept = -11.0
61 | coefficients = [1.5, 1.5, 8.0, 6.0, 2.5, 6.5, 5.0]
62 |
63 | print("Deterministic classifier")
64 | print("intercept", intercept)
65 | print("coefficients", coefficients)
66 |
67 | logreg = rl.LogisticRegressionClassifier(coefficients=coefficients, intercept=intercept)
68 | links = logreg.predict(features)
69 |
70 | print(len(links), "links/matches")
71 |
72 | # return the confusion matrix
73 | conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links))
74 | print("confusion matrix")
75 | print(conf_logreg)
76 |
77 | # compute the F-score for this classification
78 | fscore = rl.fscore(conf_logreg)
79 | print("fscore", fscore)
80 | recall = rl.recall(true_links, links)
81 | print("recall", recall)
82 | precision = rl.precision(true_links, links)
83 | print("precision", precision)
84 |
--------------------------------------------------------------------------------
/examples/supervised_keras.py:
--------------------------------------------------------------------------------
1 | """Example: Supervised learning with Neural Networks."""
2 |
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 |
7 | try:
8 | from tensorflow.keras import layers
9 | except ModuleNotFoundError as err:
10 | raise ModuleNotFoundError("Please upgrade tensorflow.") from err
11 |
12 | import recordlinkage as rl
13 | from recordlinkage.adapters import KerasAdapter
14 | from recordlinkage.base import BaseClassifier
15 | from recordlinkage.datasets import binary_vectors
16 |
17 | # create a dataset with the following settings
18 | n_pairs = 50000
19 | n_matches = 7000
20 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92])
21 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09])
22 |
23 | # Create the dataset and return the true links.
24 | X_data, links_true = binary_vectors(
25 | n_pairs, # the number of candidate links
26 | n_matches, # the number of true links
27 | m=m_simulate, # the m probabilities
28 | u=u_simulate, # the u probabilities
29 | random_state=535, # set seed
30 | return_links=True,
31 | ) # return true links
32 |
33 |
34 | # Initialise the Keras.
35 | class NNClassifier(KerasAdapter, BaseClassifier):
36 | """Neural network classifier."""
37 |
38 | def __init__(self, *args, **kwargs):
39 | super().__init__()
40 |
41 | model = tf.keras.Sequential()
42 | model.add(layers.Dense(16, input_dim=8, activation="relu"))
43 | model.add(layers.Dense(8, activation="relu"))
44 | model.add(layers.Dense(1, activation="sigmoid"))
45 | model.compile(
46 |             optimizer=tf.keras.optimizers.Adam(0.001),
47 | loss="binary_crossentropy",
48 | metrics=["accuracy"],
49 | )
50 |
51 | self.kernel = model
52 |
53 |
54 | cl = NNClassifier()
55 | cl.fit(X_data, links_true)
56 |
57 | # evaluate the model
58 | links_pred = cl.predict(X_data)
59 | print("Predicted number of links:", len(links_pred))
60 |
61 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
62 | print("Confusion matrix:\n", cm)
63 |
64 | # compute the F-score for this classification
65 | fscore = rl.fscore(cm)
66 | print("fscore", fscore)
67 | recall = rl.recall(links_true, links_pred)
68 | print("recall", recall)
69 | precision = rl.precision(links_true, links_pred)
70 | print("precision", precision)
71 |
72 | # Predict the match probability for each pair in the dataset.
73 | probs = cl.prob(X_data)
74 | print(probs[0:10])
75 |
--------------------------------------------------------------------------------
/examples/supervised_learning_prob.py:
--------------------------------------------------------------------------------
1 | """Example: Supervised learning with the Naive Bayes algorithm.
2 |
3 | """
4 |
5 |
6 | import numpy as np
7 |
8 | import recordlinkage as rl
9 | from recordlinkage.datasets import binary_vectors
10 |
11 | # create a dataset with the following settings
12 | n_pairs = 50000
13 | n_matches = 7000
14 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92])
15 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09])
16 |
17 | # Create the dataset and return the true links.
18 | X_data, links_true = binary_vectors(
19 | n_pairs, # the number of candidate links
20 | n_matches, # the number of true links
21 | m=m_simulate, # the m probabilities
22 | u=u_simulate, # the u probabilities
23 | random_state=535, # set seed
24 | return_links=True,
25 | ) # return true links
26 |
27 | # Initialise the NaiveBayesClassifier.
28 | cl = rl.NaiveBayesClassifier()
29 | cl.fit(X_data, links_true)
30 |
31 | # Print the parameters that are trained (m, u and p). Note that the estimates
32 | # are very good.
33 | print("p probability P(Match):", cl.p)
34 | print("m probabilities P(x_i=1|Match):", cl.m_probs)
35 | print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
36 | print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
37 | print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
38 | print("log weights of features:", cl.log_weights)
39 | print("weights of features:", cl.weights)
40 |
41 | # evaluate the model
42 | links_pred = cl.predict(X_data)
43 | print("Predicted number of links:", len(links_pred))
44 |
45 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
46 | print("Confusion matrix:\n", cm)
47 |
48 | # compute the F-score for this classification
49 | fscore = rl.fscore(cm)
50 | print("fscore", fscore)
51 | recall = rl.recall(links_true, links_pred)
52 | print("recall", recall)
53 | precision = rl.precision(links_true, links_pred)
54 | print("precision", precision)
55 |
56 | # Predict the match probability for each pair in the dataset.
57 | probs = cl.prob(X_data)
58 |
--------------------------------------------------------------------------------
/examples/unsupervised_learning_prob.py:
--------------------------------------------------------------------------------
1 | """Example: Unsupervised learning with the ECM algorithm.
2 |
3 | Train data is often hard to collect in record linkage or data matching
4 | problems. The Expectation-Conditional Maximisation (ECM) algorithm is the most
5 | well known algorithm for unsupervised data matching. The algorithm performs
6 | relatively well compared to supervised methods.
7 |
8 | """
9 |
10 |
11 | import numpy as np
12 |
13 | import recordlinkage as rl
14 | from recordlinkage.datasets import binary_vectors
15 |
16 | # create a dataset with the following settings
17 | n_pairs = 50000
18 | n_matches = 7000
19 | m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92])
20 | u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09])
21 |
22 | # Create the dataset and return the true links.
23 | X_data, links_true = binary_vectors(
24 | n_pairs, # the number of candidate links
25 | n_matches, # the number of true links
26 | m=m_simulate, # the m probabilities
27 | u=u_simulate, # the u probabilities
28 | random_state=535, # set seed
29 | return_links=True,
30 | ) # return true links
31 |
32 | # Initialise the Expectation-Conditional Maximisation classifier.
33 | cl = rl.ECMClassifier()
34 | cl.fit(X_data)
35 |
36 | # Print the parameters that are trained (m, u and p). Note that the estimates
37 | # are very good.
38 | print("p probability P(Match):", cl.p)
39 | print("m probabilities P(x_i=1|Match):", cl.m_probs)
40 | print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
41 | print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
42 | print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
43 | print("log weights of features:", cl.log_weights)
44 | print("weights of features:", cl.weights)
45 |
46 | # evaluate the model
47 | links_pred = cl.predict(X_data)
48 | print("Predicted number of links:", len(links_pred))
49 |
50 | cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
51 | print("Confusion matrix:\n", cm)
52 |
53 | # compute the F-score for this classification
54 | fscore = rl.fscore(cm)
55 | print("fscore", fscore)
56 | recall = rl.recall(links_true, links_pred)
57 | print("recall", recall)
58 | precision = rl.precision(links_true, links_pred)
59 | print("precision", precision)
60 |
61 | # Predict the match probability for each pair in the dataset.
62 | probs = cl.prob(X_data)
63 | print(probs)
64 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "recordlinkage"
3 | description = "A record linkage toolkit for linking and deduplication"
4 | authors = [
5 | { name = "Jonathan de Bruin", email = "jonathandebruinos@gmail.com" }
6 | ]
7 | readme = "README.md"
8 | classifiers = [
9 | "Development Status :: 4 - Beta",
10 | "License :: OSI Approved :: BSD License",
11 | "Programming Language :: Python :: 3.8",
12 | "Programming Language :: Python :: 3.9",
13 | "Programming Language :: Python :: 3.10",
14 | "Programming Language :: Python :: 3.11"
15 | ]
16 | license = {text = "BSD-3-Clause"}
17 | dependencies = [
18 | "jellyfish>=1",
19 | "numpy>=1.13",
20 | "pandas>=1,<3",
21 | "scipy>=1",
22 | "scikit-learn>=1",
23 | "joblib"
24 | ]
25 | dynamic = ["version"]
26 | requires-python = ">=3.8"
27 |
28 | [project.urls]
29 | homepage = "https://recordlinkage.readthedocs.io/"
30 | repository = "https://github.com/J535D165/recordlinkage"
31 |
32 | [project.optional-dependencies]
33 | all = ["networkx>=2", "bottleneck", "numexpr"]
34 | lint = ["ruff"]
35 | docs = ["sphinx", "nbsphinx", "sphinx-rtd-theme", "ipykernel"]
36 | test = ["pytest"]
37 |
38 | [build-system]
39 | build-backend = 'setuptools.build_meta'
40 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
41 |
42 | [tool.setuptools]
43 | packages = ["recordlinkage"]
44 |
45 | [tool.setuptools.package-data]
46 | "*" = ["*.csv"]
47 |
48 | [tool.setuptools_scm]
49 | write_to = "recordlinkage/_version.py"
50 |
51 | [tool.ruff]
52 | select = ["E", "F", "UP", "I", "B"]
53 | ignore = ["B006"]
54 | exclude = ["docs/conf.py"]
55 |
56 | [tool.ruff.isort]
57 | force-single-line = true
58 |
--------------------------------------------------------------------------------
/recordlinkage/__init__.py:
--------------------------------------------------------------------------------
1 | # register the configuration
2 | import recordlinkage.config_init # noqa
3 | from recordlinkage.classifiers import FellegiSunter
4 | from recordlinkage.classifiers import KMeansClassifier
5 | from recordlinkage.classifiers import LogisticRegressionClassifier
6 | from recordlinkage.classifiers import NaiveBayesClassifier
7 | from recordlinkage.classifiers import SVMClassifier
8 | from recordlinkage.classifiers import ECMClassifier
9 | from recordlinkage.measures import reduction_ratio
10 | from recordlinkage.measures import max_pairs
11 | from recordlinkage.measures import full_index_size
12 | from recordlinkage.measures import true_positives
13 | from recordlinkage.measures import true_negatives
14 | from recordlinkage.measures import false_positives
15 | from recordlinkage.measures import false_negatives
16 | from recordlinkage.measures import confusion_matrix
17 | from recordlinkage.measures import precision
18 | from recordlinkage.measures import recall
19 | from recordlinkage.measures import accuracy
20 | from recordlinkage.measures import specificity
21 | from recordlinkage.measures import fscore
22 | from recordlinkage.network import OneToOneLinking
23 | from recordlinkage.network import OneToManyLinking
24 | from recordlinkage.network import ConnectedComponents
25 | from recordlinkage import rl_logging as logging
26 | from recordlinkage.annotation import read_annotation_file
27 | from recordlinkage.annotation import write_annotation_file
28 | from recordlinkage.api import Compare
29 | from recordlinkage.api import Index
30 | from recordlinkage.config import describe_option
31 | from recordlinkage.config import get_option
32 | from recordlinkage.config import option_context
33 | from recordlinkage.config import options
34 | from recordlinkage.config import reset_option
35 | from recordlinkage.config import set_option
36 | from recordlinkage.utils import index_split
37 | from recordlinkage.utils import split_index
38 |
39 | try:
40 | from recordlinkage._version import __version__
41 | from recordlinkage._version import __version_tuple__
42 | except ImportError:
43 | __version__ = "0.0.0"
44 | __version_tuple__ = (0, 0, 0)
45 |
46 |
47 | __all__ = [
48 | "logging",
49 | "read_annotation_file",
50 | "write_annotation_file",
51 | "Compare",
52 | "Index",
53 | "describe_option",
54 | "get_option",
55 | "option_context",
56 | "options",
57 | "reset_option",
58 | "set_option",
59 | "index_split",
60 | "split_index",
61 | "FellegiSunter",
62 | "KMeansClassifier",
63 | "LogisticRegressionClassifier",
64 | "NaiveBayesClassifier",
65 | "SVMClassifier",
66 | "ECMClassifier",
67 | "reduction_ratio",
68 | "max_pairs",
69 | "full_index_size",
70 | "true_positives",
71 | "true_negatives",
72 | "false_positives",
73 | "false_negatives",
74 | "confusion_matrix",
75 | "precision",
76 | "recall",
77 | "accuracy",
78 | "specificity",
79 | "fscore",
80 | "OneToOneLinking",
81 | "OneToManyLinking",
82 | "ConnectedComponents",
83 | ]
84 |
--------------------------------------------------------------------------------
/recordlinkage/_lib/numeric.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <math.h>
4 |
5 | #define R 6371
6 | #define TO_RAD (3.1415926536 / 180)
7 |
8 | double euclidean_dist(double x, double y)
9 | {
10 | return fabs(y - x);
11 | }
12 |
13 | double haversine_dist(double th1, double ph1, double th2, double ph2)
14 | {
15 | double dx, dy, dz;
16 | ph1 -= ph2;
17 | ph1 *= TO_RAD, th1 *= TO_RAD, th2 *= TO_RAD;
18 |
19 | dz = sin(th1) - sin(th2);
20 | dx = cos(ph1) * cos(th1) - cos(th2);
21 | dy = sin(ph1) * cos(th1);
22 | return asin(sqrt(dx * dx + dy * dy + dz * dz) / 2) * 2 * R;
23 | }
24 |
25 | double step_sim(double d, double offset, double origin)
26 | {
27 | if (fabs(d - origin) <= offset)
28 | {
29 | return 1.0;
30 | } else
31 | {
32 | return 0.0;
33 | }
34 | }
35 |
36 | double linear_sim(double d, double scale, double offset, double origin)
37 | {
38 |
39 | double d_norm;
40 |
41 | // normalise the distance measure
42 | d_norm = fabs(d - origin);
43 |
44 | if (d_norm <= offset)
45 | {
46 | return 1.0;
47 | }
48 | else if (d_norm >= offset + 2 * scale)
49 | {
50 | return 0.0;
51 | }
52 | else
53 | {
54 | return 1.0 - (d_norm - offset) / (2 * scale);
55 | }
56 | }
57 |
58 |
59 | double squared_sim(double d, double scale, double offset, double origin)
60 | {
61 |
62 | double d_norm;
63 |
64 | // normalise the distance measure
65 | d_norm = fabs(d - origin);
66 |
67 | if (d_norm <= offset)
68 | {
69 | return 1.0;
70 | }
71 | else if (d_norm >= offset + sqrt(2.0) * scale)
72 | {
73 | return 0.0;
74 | }
75 | else
76 | {
77 | return 1.0 - 0.5 * exp(2.0 * log((d_norm - offset)/scale));
78 | }
79 | }
80 |
81 |
82 | double exp_sim(double d, double scale, double offset, double origin)
83 | {
84 |
85 | double d_norm;
86 |
87 | // normalise the distance measure
88 | d_norm = fabs(d - origin);
89 |
90 | if (d_norm <= offset)
91 | {
92 | return 1.0;
93 | }
94 | else
95 | {
96 | return pow(2.0, - (d_norm-offset) / scale);
97 | }
98 | }
99 |
100 |
101 | double gauss_sim(double d, double scale, double offset, double origin)
102 | {
103 |
104 | double d_norm;
105 |
106 | // normalise the distance measure
107 | d_norm = fabs(d - origin);
108 |
109 | if (d_norm <= offset)
110 | {
111 | return 1.0;
112 | }
113 | else
114 | {
115 | return pow(2.0, - pow((d_norm-offset) / scale, 2.0));
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/recordlinkage/_lib/numeric.h:
--------------------------------------------------------------------------------
1 |
2 | // numeric distance functions
3 | double euclidean_dist(double x, double y);
4 | double haversine_dist(double th1, double ph1, double th2, double ph2);
5 |
6 | // numeric similarity functions
7 | double step_sim(double d, double offset, double origin);
8 | double linear_sim(double d, double scale, double offset, double origin);
9 | double squared_sim(double d, double scale, double offset, double origin);
10 | double exp_sim(double d, double scale, double offset, double origin);
11 | double gauss_sim(double d, double scale, double offset, double origin);
12 |
--------------------------------------------------------------------------------
/recordlinkage/adapters.py:
--------------------------------------------------------------------------------
1 | """Module to wrap external machine learning models."""
2 |
3 | __all__ = ["SKLearnAdapter", "KerasAdapter"]
4 |
5 |
6 | class SKLearnAdapter:
7 | """SciKit-learn adapter for record pair classification.
8 |
9 | SciKit-learn adapter for record pair classification with SciKit-learn
10 | models.
11 | """
12 |
13 | @property
14 | def classifier(self):
15 | # raise warning
16 | return self.kernel
17 |
18 | @classifier.setter
19 | def classifier(self, classifier):
20 | self.kernel = classifier
21 |
22 | def _predict(self, features):
23 | """Predict matches and non-matches.
24 |
25 | Parameters
26 | ----------
27 | features : numpy.ndarray
28 | The data to predict the class of.
29 |
30 | Returns
31 | -------
32 | numpy.ndarray
33 | The predicted classes.
34 | """
35 |
36 | from sklearn.exceptions import NotFittedError
37 |
38 | try:
39 | prediction = self.kernel.predict(features)
40 | except NotFittedError as err:
41 | raise NotFittedError(
42 | "{} is not fitted yet. Call 'fit' with appropriate "
43 | "arguments before using this method.".format(type(self).__name__)
44 | ) from err
45 |
46 | return prediction
47 |
48 | def _fit(self, features, y=None):
49 | if y is None: # unsupervised
50 | self.kernel.fit(features)
51 | else:
52 | self.kernel.fit(features, y)
53 |
54 | def _prob_match(self, features):
55 | """Compute match probabilities.
56 |
57 | Parameters
58 | ----------
59 | features : numpy.ndarray
60 | The data to train the model on.
61 |
62 | Returns
63 | -------
64 | numpy.ndarray
65 |             The match probabilities.
66 | """
67 |
68 | # compute the probabilities
69 | probs = self.kernel.predict_proba(features)
70 |
71 | # get the position of match probabilities
72 | classes = list(self.kernel.classes_)
73 | match_class_position = classes.index(1)
74 |
75 | return probs[:, match_class_position]
76 |
77 |
78 | class KerasAdapter:
79 | """Keras adapter for record pair classification.
80 |
81 | Keras adapter for record pair classification with Keras models.
82 | """
83 |
84 | @property
85 | def classifier(self):
86 | # raise warning
87 | return self.kernel
88 |
89 | @classifier.setter
90 | def classifier(self, classifier):
91 | self.kernel = classifier
92 |
93 | def _predict(self, features):
94 | """Predict matches and non-matches.
95 |
96 | Parameters
97 | ----------
98 | features : numpy.ndarray
99 | The data to predict the class of.
100 |
101 | Returns
102 | -------
103 | numpy.ndarray
104 | The predicted classes.
105 | """
106 |
107 | from sklearn.exceptions import NotFittedError
108 |
109 | try:
110 | prediction = self.kernel.predict_classes(features)[:, 0]
111 | except NotFittedError as err:
112 | raise NotFittedError(
113 | "{} is not fitted yet. Call 'fit' with appropriate "
114 | "arguments before using this method.".format(type(self).__name__)
115 | ) from err
116 |
117 | return prediction
118 |
119 | def _fit(self, features, y=None):
120 | self.kernel.fit(features, y)
121 |
122 | def _prob_match(self, features):
123 | """Compute match probabilities.
124 |
125 | Parameters
126 | ----------
127 | features : numpy.ndarray
128 | The data to train the model on.
129 |
130 | Returns
131 | -------
132 | numpy.ndarray
133 |             The match probabilities.
134 | """
135 |
136 | # compute the probabilities
137 | probs = self.kernel.predict_proba(features)[:, 0]
138 |
139 | return probs
140 |
--------------------------------------------------------------------------------
/recordlinkage/algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/recordlinkage/algorithms/__init__.py
--------------------------------------------------------------------------------
/recordlinkage/algorithms/c_numeric.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 |
3 | import numpy as np
4 |
5 | cimport numpy as np
6 |
7 |
8 | cdef extern from "../_lib/numeric.h":
9 |
10 | # numeric distance functions
11 | double euclidean_dist(double x, double y)
12 | double haversine_dist(double th1, double ph1, double th2, double ph2)
13 |
14 | # numeric similarity functions
15 | double step_sim(double d, double offset, double origin)
16 | double linear_sim(double d, double scale, double offset, double origin)
17 | double squared_sim(double d, double scale, double offset, double origin)
18 | double exp_sim(double d, double scale, double offset, double origin)
19 | double gauss_sim(double d, double scale, double offset, double origin)
20 |
21 |
22 | @cython.boundscheck(False) # turn off bounds-checking for entire function
23 | @cython.wraparound(False) # turn off negative index wrapping for entire function
24 | def euclidean_distance(np.ndarray[np.float64_t, ndim=1] x, np.ndarray[np.float64_t, ndim=1] y):
25 |
26 | cdef int n_rows = x.shape[0]
27 |
28 | cdef np.ndarray result = np.zeros(n_rows, dtype=np.float64)
29 |
30 | for k in range(n_rows):
31 | result[k] = euclidean_dist(x[k], y[k])
32 |
33 | return result
34 |
35 |
36 | @cython.boundscheck(False) # turn off bounds-checking for entire function
37 | @cython.wraparound(False) # turn off negative index wrapping for entire function
38 | def haversine_distance(np.ndarray[np.float64_t, ndim=1] th1, np.ndarray[np.float64_t, ndim=1] ph1, np.ndarray[np.float64_t, ndim=1] th2, np.ndarray[np.float64_t, ndim=1] ph2):
39 |
40 | cdef int n_rows = th1.shape[0]
41 |
42 | cdef np.ndarray result = np.zeros(n_rows, dtype=np.float64)
43 |
44 | for k in range(n_rows):
45 | result[k] = haversine_dist(th1[k], ph1[k], th2[k], ph2[k])
46 |
47 | return result
48 |
--------------------------------------------------------------------------------
/recordlinkage/algorithms/compare.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas
3 |
4 |
5 | def _compare_exact(s1, s2, agree_value=1, disagree_value=0, missing_value=0):
6 | # dtypes can be hard if the passed parameters (agreement, disagreement,
7 | # missing_value) are of different types.
8 | # http://chris.friedline.net/2015-12-15-rutgers/lessons/python2/03-data-types-and-format.html
9 |
10 | # Convert to pandas.Series if (numpy) arrays are passed.
11 | if not isinstance(s1, pandas.Series):
12 | s1 = pandas.Series(s1, index=s1.index)
13 |
14 | if not isinstance(s2, pandas.Series):
15 | s2 = pandas.Series(s2, index=s2.index)
16 |
17 | # Values or agree/disagree
18 | if agree_value == "value":
19 | compare = s1.copy()
20 | compare[s1 != s2] = disagree_value
21 |
22 | else:
23 | compare = pandas.Series(disagree_value, index=s1.index)
24 | compare[s1 == s2] = agree_value
25 |
26 | # Only when disagree value is not identical with the missing value
27 | if disagree_value != missing_value:
28 | compare[(s1.isnull() | s2.isnull())] = missing_value
29 |
30 | return compare
31 |
32 |
33 | def _compare_dates(
34 | s1, s2, swap_month_day=0.5, swap_months="default", errors="coerce", *args, **kwargs
35 | ):
36 | # validate datatypes
37 | if str(s1.dtype) != "datetime64[ns]":
38 | raise ValueError("Left column is not of type datetime64[ns]")
39 |
40 | if str(s2.dtype) != "datetime64[ns]":
41 | raise ValueError("Right column is not of type datetime64[ns]")
42 |
43 | c = (s1 == s2).astype(np.int64) # start with int64 (will become float64)
44 |
45 | # The case is which there is a swap_month_day value given.
46 | if swap_month_day and swap_month_day != 0:
47 | c[
48 | (s1.dt.year == s2.dt.year)
49 | & (s1.dt.month == s2.dt.day)
50 | & (s1.dt.day == s2.dt.month)
51 | & (c != 1)
52 | ] = swap_month_day
53 |
54 | if swap_months and swap_months != 0:
55 | if swap_months == "default":
56 | swap_months = [(6, 7, 0.5), (7, 6, 0.5), (9, 10, 0.5), (10, 9, 0.5)]
57 | else:
58 | try:
59 | if not all([len(x) == 3 for x in swap_months]):
60 | raise Exception
61 | except Exception as err:
62 | raise ValueError(
63 | "swap_months must be a list of (first month, \
64 | second month, value) tuples or lists. "
65 | ) from err
66 |
67 | for month1, month2, value in swap_months:
68 | # if isinstance(value, float):
69 | # c = c.astype(np.float64)
70 | # elif isinstance(value, int):
71 | # c = c.astype(np.int64)
72 | # else:
73 | # c = c.astype(object)
74 |
75 | c[
76 | (s1.dt.year == s2.dt.year)
77 | & (s1.dt.month == month1)
78 | & (s2.dt.month == month2)
79 | & (s1.dt.day == s2.dt.day)
80 | & (c != 1)
81 | ] = value
82 |
83 | c = pandas.Series(c)
84 | c[s1.isnull() | s2.isnull()] = np.nan
85 |
86 | return c
87 |
--------------------------------------------------------------------------------
/recordlinkage/algorithms/distance.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas
3 |
4 |
5 | # Numerical distance algorithms
6 | def _1d_distance(s1, s2):
7 | return pandas.eval("s2-s1")
8 |
9 |
10 | def _haversine_distance(lat1, lng1, lat2, lng2):
11 | # degrees to radians conversion
12 | to_rad = np.deg2rad(1) # noqa
13 |
14 | # numeric expression to use with numexpr package
15 | expr = (
16 | "2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2+"
17 | "cos(lat1*to_rad)*cos(lat2*to_rad)*"
18 | "(sin((lng2*to_rad-lng1*to_rad)/2))**2))"
19 | )
20 |
21 | return pandas.eval(expr)
22 |
--------------------------------------------------------------------------------
/recordlinkage/algorithms/indexing.py:
--------------------------------------------------------------------------------
1 | """Algorithms for indexing."""
2 |
3 | import numpy as np
4 |
5 | from recordlinkage.measures import full_index_size
6 |
7 |
8 | def _map_tril_1d_on_2d(indices, dims):
9 | """Map 1d indices on lower triangular matrix in 2d."""
10 |
11 | N = (dims * dims - dims) / 2
12 |
13 | m = np.ceil(np.sqrt(2 * N))
14 | c = m - np.round(np.sqrt(2 * (N - indices))) - 1
15 | r = np.mod(indices + (c + 1) * (c + 2) / 2 - 1, m) + 1
16 |
17 | return np.array([r, c], dtype=np.int64)
18 |
19 |
20 | def random_pairs_with_replacement(n, shape, random_state=None):
21 | """make random record pairs"""
22 |
23 | if not isinstance(random_state, np.random.RandomState):
24 | random_state = np.random.RandomState(random_state)
25 |
26 | n_max = full_index_size(shape)
27 |
28 | if n_max <= 0:
29 | raise ValueError("n_max must be larger than 0")
30 |
31 | # make random pairs
32 | indices = random_state.randint(0, n_max, n, dtype=np.int64)
33 |
34 | if len(shape) == 1:
35 | return _map_tril_1d_on_2d(indices, shape[0])
36 | else:
37 | return np.array(np.unravel_index(indices, shape))
38 |
39 |
40 | def random_pairs_without_replacement(n, shape, random_state=None):
41 | """Return record pairs for dense sample.
42 |
43 | Sample random record pairs without replacement bounded by the
44 | maximum number of record pairs (based on shape). This algorithm is
45 | efficient and fast for relative small samples.
46 | """
47 |
48 | n_max = full_index_size(shape)
49 |
50 | if not isinstance(random_state, np.random.RandomState):
51 | random_state = np.random.RandomState(random_state)
52 |
53 | if not isinstance(n, int) or n <= 0 or n > n_max:
54 | raise ValueError("n must be a integer satisfying 0 n_max:
80 | raise ValueError("n must be a integer satisfying 0= 0.0
61 | assert cv["random"].max() <= 1.0
62 |
63 |
64 | class TestRandomDiscrete:
65 | def test_random_desc_standalone(self):
66 | arr1 = [1, 2, 3, 4, 5]
67 | arr2 = [1, 2, 3, 4, 5]
68 | pairs = pd.MultiIndex.from_product([arr1, arr2])
69 |
70 | c = RandomDiscrete()
71 | r = c.compute(pairs)
72 |
73 | assert r.shape[0] == len(arr1) * len(arr2)
74 |
75 | def test_random_desc(self):
76 | df_a = pd.DataFrame({"v": list("abcde")})
77 | df_b = pd.DataFrame({"v": list("abcde")})
78 |
79 | pairs = Full().index(df_a, df_b)
80 |
81 | c = recordlinkage.Compare()
82 | c.exact("v", "v")
83 | c.add(RandomDiscrete(label="random"))
84 | cv = c.compute(pairs, df_a, df_b)
85 |
86 | assert isinstance(cv, pd.DataFrame)
87 |
88 | assert cv["random"].notnull().all()
89 | assert cv["random"].isin([0, 1]).all()
90 |
--------------------------------------------------------------------------------
/recordlinkage/contrib/index/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Jonathan de Bruin
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions are met:
5 | #
6 | # 1. Redistributions of source code must retain the above copyright notice,
7 | # this list of conditions and the following disclaimer.
8 | # 2. Redistributions in binary form must reproduce the above copyright notice,
9 | # this list of conditions and the following disclaimer in the documentation
10 | # and/or other materials provided with the distribution.
11 | # 3. Neither the name of the copyright holder nor the names of its
12 | # contributors may be used to endorse or promote products derived from this
13 | # software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 | # POSSIBILITY OF SUCH DAMAGE.
26 |
27 |
28 | from recordlinkage.contrib.index.neighbourhoodblock.neighbourhoodblock import (
29 | NeighbourhoodBlock,
30 | )
31 |
32 | __all__ = ["NeighbourhoodBlock"]
33 |
--------------------------------------------------------------------------------
/recordlinkage/contrib/index/neighbourhoodblock/README.rst:
--------------------------------------------------------------------------------
1 | Neighbourhood blocking
2 | ======================
3 |
4 | Example
5 | -------
6 |
7 | In the following example, the record pairs are made for two historical
8 | datasets with census data. The datasets are named ``census_data_1980``
9 | and ``census_data_1990``. The index includes record pairs with matches
10 | in (at least) any 3 out of the 5 nominated fields. Proximity matching is
11 | allowed in the first two fields, and up to one wildcard match of a
12 | missing value is also allowed.
13 |
14 | .. code:: python
15 |
16 | from recordlinkage.contrib.index import NeighbourhoodBlock
17 |
18 | keys = ['first_name', 'surname', 'date_of_birth', 'address', 'ssid']
19 | windows = [9, 3, 1, 1, 1]
20 |
21 | indexer = NeighbourhoodBlock(
22 | keys, windows=windows, max_nulls=1, max_non_matches=2)
23 | indexer.index(census_data_1980, census_data_1990)
24 |
25 | Authors
26 | -------
27 |
28 | - Daniel Elias
--------------------------------------------------------------------------------
/recordlinkage/contrib/index/neighbourhoodblock/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions are met:
5 | #
6 | # 1. Redistributions of source code must retain the above copyright notice,
7 | # this list of conditions and the following disclaimer.
8 | # 2. Redistributions in binary form must reproduce the above copyright notice,
9 | # this list of conditions and the following disclaimer in the documentation
10 | # and/or other materials provided with the distribution.
11 | # 3. Neither the name of the copyright holder nor the names of its
12 | # contributors may be used to endorse or promote products derived from this
13 | # software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 | # POSSIBILITY OF SUCH DAMAGE.
26 |
--------------------------------------------------------------------------------
/recordlinkage/contrib/index/neighbourhoodblock/test_neighbourhoodblock.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from operator import eq
4 | from operator import gt
5 |
6 | import numpy as np
7 | import pytest
8 |
9 | from recordlinkage.contrib.index import NeighbourhoodBlock
10 | from recordlinkage.index import Block
11 | from recordlinkage.index import Full
12 | from recordlinkage.index import SortedNeighbourhood
13 | from tests.test_indexing import TestData
14 |
15 |
16 | class TestNeighbourhoodBlock(TestData):
17 | """General unittest for the NeighbourhoodBlocking indexing class."""
18 |
19 | @classmethod
20 | def setup_class(cls):
21 | TestData.setup_class()
22 |
23 | def incomplete_df_copy(df, nan_proportion=0.1):
24 | "copy of DataFrame with some cells set to NaN"
25 | nan_count = int(round(len(df) * nan_proportion))
26 |
27 | def with_nulls(vals):
28 | vals = vals.copy()
29 | vals.iloc[
30 | np.random.choice(len(df), size=nan_count, replace=False)
31 | ] = np.nan
32 | return vals
33 |
34 | return df.copy() if nan_count <= 0 else df.apply(with_nulls)
35 |
36 | np.random.seed(0)
37 | cls.incomplete_a = incomplete_df_copy(cls.a)
38 | cls.incomplete_b = incomplete_df_copy(cls.b)
39 |
40 | def assert_index_comparisons(self, pairwise_comparison, indexers, *args, **kwargs):
41 | indexes = [ndxr.index(*args, **kwargs) for ndxr in indexers]
42 | for index1, index2 in zip(indexes, indexes[1:]):
43 | pairs1, pairs2 = map(set, [index1, index2])
44 | assert (
45 | (len(pairs1) == len(index1))
46 | and (len(pairs2) == len(index2))
47 | and pairwise_comparison(pairs1, pairs2)
48 | )
49 |
50 | def test_dedup_vs_full(self):
51 | indexers = [
52 | NeighbourhoodBlock(max_non_matches=len(self.a.columns)),
53 | Full(),
54 | ]
55 | self.assert_index_comparisons(eq, indexers, self.a)
56 |
57 | def test_link_vs_full(self):
58 | indexers = [
59 | NeighbourhoodBlock(max_non_matches=len(self.a.columns)),
60 | Full(),
61 | ]
62 | self.assert_index_comparisons(eq, indexers, self.a, self.b)
63 |
64 | def test_dedup_single_blocking_key_vs_block(self):
65 | indexers = [
66 | NeighbourhoodBlock("var_block10", max_nulls=1),
67 | NeighbourhoodBlock(
68 | left_on="var_block10", right_on="var_block10", max_nulls=1
69 | ),
70 | Block("var_block10"),
71 | ]
72 | self.assert_index_comparisons(eq, indexers, self.a)
73 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a)
74 |
75 | def test_link_single_blocking_key_vs_block(self):
76 | indexers = [
77 | NeighbourhoodBlock("var_arange", max_nulls=1),
78 | NeighbourhoodBlock(
79 | left_on="var_arange", right_on="var_arange", max_nulls=1
80 | ),
81 | Block("var_arange"),
82 | ]
83 | self.assert_index_comparisons(eq, indexers, self.a, self.b)
84 | self.assert_index_comparisons(
85 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b
86 | )
87 |
88 | def test_dedup_multiple_blocking_keys_vs_block(self):
89 | indexers = [
90 | NeighbourhoodBlock(["var_single", "var_block10"], max_nulls=1),
91 | NeighbourhoodBlock(
92 | left_on=["var_single", "var_block10"],
93 | right_on=["var_single", "var_block10"],
94 | max_nulls=1,
95 | ),
96 | Block(["var_single", "var_block10"]),
97 | ]
98 | self.assert_index_comparisons(eq, indexers, self.a)
99 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a)
100 |
101 | def test_link_multiple_blocking_keys_vs_block(self):
102 | indexers = [
103 | NeighbourhoodBlock(["var_arange", "var_block10"], max_nulls=1),
104 | NeighbourhoodBlock(
105 | left_on=["var_arange", "var_block10"],
106 | right_on=["var_arange", "var_block10"],
107 | max_nulls=1,
108 | ),
109 | Block(["var_arange", "var_block10"]),
110 | ]
111 | self.assert_index_comparisons(eq, indexers, self.a, self.b)
112 | self.assert_index_comparisons(
113 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b
114 | )
115 |
116 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11])
117 | def test_dedup_single_sorting_key_vs_sortedneighbourhood(self, window):
118 | indexers = [
119 | NeighbourhoodBlock("var_arange", max_nulls=1, windows=window),
120 | NeighbourhoodBlock(
121 | left_on="var_arange", right_on="var_arange", max_nulls=1, windows=window
122 | ),
123 | SortedNeighbourhood("var_arange", window=window),
124 | ]
125 | self.assert_index_comparisons(eq, indexers, self.a)
126 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a)
127 |
128 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11])
129 | def test_link_single_sorting_key_vs_sortedneighbourhood(self, window):
130 | indexers = [
131 | NeighbourhoodBlock("var_arange", max_nulls=1, windows=window),
132 | NeighbourhoodBlock(
133 | left_on="var_arange", right_on="var_arange", max_nulls=1, windows=window
134 | ),
135 | SortedNeighbourhood("var_arange", window=window),
136 | ]
137 | self.assert_index_comparisons(eq, indexers, self.a, self.b)
138 | self.assert_index_comparisons(
139 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b
140 | )
141 |
142 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11])
143 | def test_dedup_with_blocking_vs_sortedneighbourhood(self, window):
144 | indexers = [
145 | NeighbourhoodBlock(
146 | ["var_arange", "var_block10"], max_nulls=1, windows=[window, 1]
147 | ),
148 | NeighbourhoodBlock(
149 | left_on=["var_arange", "var_block10"],
150 | right_on=["var_arange", "var_block10"],
151 | max_nulls=1,
152 | windows=[window, 1],
153 | ),
154 | SortedNeighbourhood("var_arange", block_on="var_block10", window=window),
155 | ]
156 | self.assert_index_comparisons(eq, indexers, self.a)
157 | self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a)
158 |
159 | @pytest.mark.parametrize("window", [3, 5, 7, 9, 11])
160 | def test_link_with_blocking_vs_sortedneighbourhood(self, window):
161 | indexers = [
162 | NeighbourhoodBlock(
163 | ["var_arange", "var_block10"], max_nulls=1, windows=[window, 1]
164 | ),
165 | NeighbourhoodBlock(
166 | left_on=["var_arange", "var_block10"],
167 | right_on=["var_arange", "var_block10"],
168 | max_nulls=1,
169 | windows=[window, 1],
170 | ),
171 | SortedNeighbourhood("var_arange", block_on="var_block10", window=window),
172 | ]
173 | self.assert_index_comparisons(eq, indexers, self.a, self.b)
174 | self.assert_index_comparisons(
175 | gt, indexers[-2:], self.incomplete_a, self.incomplete_b
176 | )
177 |
--------------------------------------------------------------------------------
/recordlinkage/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from recordlinkage.datasets.external import clear_data_home
2 | from recordlinkage.datasets.external import get_data_home
3 | from recordlinkage.datasets.external import load_krebsregister
4 | from recordlinkage.datasets.febrl import load_febrl1
5 | from recordlinkage.datasets.febrl import load_febrl2
6 | from recordlinkage.datasets.febrl import load_febrl3
7 | from recordlinkage.datasets.febrl import load_febrl4
8 | from recordlinkage.datasets.generate import binary_vectors
9 |
10 | __all__ = [
11 | "clear_data_home",
12 | "get_data_home",
13 | "load_krebsregister",
14 | "load_febrl1",
15 | "load_febrl2",
16 | "load_febrl3",
17 | "load_febrl4",
18 | "binary_vectors",
19 | ]
20 |
--------------------------------------------------------------------------------
/recordlinkage/datasets/external.py:
--------------------------------------------------------------------------------
 1 | # The functions get_data_home() and clear_data_home() are based on
2 | # SciKit-Learn https://git.io/fjT70. See the 3-clause BSD license.
3 |
4 | import shutil
5 | import zipfile
6 | from io import BytesIO
7 | from os import environ
8 | from pathlib import Path
9 | from urllib.request import urlopen
10 |
11 | import pandas
12 |
13 |
14 | def get_data_home(data_home=None):
15 | """Return the path of the Record Linkage data folder.
16 |
17 | This folder is used by some large dataset loaders to avoid
18 | downloading the data several times. By default the data dir
19 | is set to a folder named 'rl_data' in the user
20 | home folder.
21 | Alternatively, it can be set by the 'RL_DATA' environment
22 | variable or programmatically by giving an explicit folder
23 | path. The '~' symbol is expanded to the user home folder.
24 |
25 | If the folder does not already exist, it is automatically
26 | created.
27 |
28 | Parameters
29 | ----------
30 | data_home : str | None
31 | The path to recordlinkage data folder.
32 | """
33 | if data_home is None:
34 | data_home = environ.get("RL_DATA", Path("~", "rl_data"))
35 | data_home = Path(data_home).expanduser()
36 |
37 | if not data_home.exists():
38 | data_home.mkdir(parents=True, exist_ok=True)
39 |
40 | return data_home
41 |
42 |
43 | def clear_data_home(data_home=None):
44 | """Delete all the content of the data home cache.
45 |
46 | Parameters
47 | ----------
48 | data_home : str | None
49 | The path to recordlinkage data folder.
50 | """
51 | data_home = get_data_home(data_home)
52 | shutil.rmtree(str(data_home))
53 |
54 |
55 | def load_krebsregister(block=None, missing_values=None, shuffle=True):
56 | """Load the Krebsregister dataset.
57 |
 58 | This dataset of comparison patterns was obtained in an
59 | epidemiological cancer study in Germany. The comparison patterns
60 | were created by the Institute for Medical Biostatistics,
61 | Epidemiology and Informatics (IMBEI) and the University Medical
62 | Center of Johannes Gutenberg University (Mainz, Germany). The
63 | dataset is available for research online.
64 |
65 | "The records represent individual data including first and
66 | family name, sex, date of birth and postal code, which were
67 | collected through iterative insertions in the course of
68 | several years. The comparison patterns in this data set are
69 | based on a sample of 100.000 records dating from 2005 to 2008.
70 | Data pairs were classified as "match" or "non-match" during
71 | an extensive manual review where several documentarists were
72 | involved. The resulting classification formed the basis for
73 | assessing the quality of the registry's own record linkage
74 | procedure.
75 |
76 | In order to limit the amount of patterns a blocking procedure
77 | was applied, which selects only record pairs that meet
78 | specific agreement conditions. The results of the following
79 | six blocking iterations were merged together:
80 |
81 | - Phonetic equality of first name and family name, equality of
82 | date of birth.
83 | - Phonetic equality of first name, equality of day of birth.
84 | - Phonetic equality of first name, equality of month of birth.
85 | - Phonetic equality of first name, equality of year of birth.
86 | - Equality of complete date of birth.
87 | - Phonetic equality of family name, equality of sex.
88 |
89 | This procedure resulted in 5.749.132 record pairs, of which
90 | 20.931 are matches. The data set is split into 10 blocks of
91 | (approximately) equal size and ratio of matches to
92 | non-matches."
93 |
94 | Parameters
95 | ----------
96 | block : int, list
97 | An integer or a list with integers between 1 and 10. The
98 | blocks are the blocks explained in the description. Default
99 | all 1 to 10.
100 | missing_values : object, int, float
101 | The value of the missing values. Default NaN.
102 | shuffle : bool
103 | Shuffle the record pairs. Default True.
104 |
105 | Returns
106 | -------
107 | (pandas.DataFrame, pandas.MultiIndex)
108 | A pandas.DataFrame with comparison vectors and a
109 | pandas.MultiIndex with the indices of the matches.
110 |
111 | """
112 |
113 | if block is None:
114 | block = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
115 |
116 | # If the data is not found, download it.
117 | for i in range(1, 11):
118 | filepath = Path(get_data_home(), "krebsregister", f"block_{i}.zip")
119 |
120 | if not filepath.is_file():
121 | _download_krebsregister()
122 | break
123 |
124 | if isinstance(block, (list, tuple)):
125 | data = pandas.concat([_krebsregister_block(bl) for bl in block])
126 | else:
127 | data = _krebsregister_block(block)
128 |
129 | if shuffle:
130 | data = data.sample(frac=1, random_state=535)
131 |
132 | match_index = data.index[data["is_match"]]
133 | del data["is_match"]
134 |
135 | if pandas.notnull(missing_values):
136 | data.fillna(missing_values, inplace=True)
137 |
138 | return data, match_index
139 |
140 |
141 | def _download_krebsregister():
142 | zip_file_url = (
143 | "http://archive.ics.uci.edu/ml/" "machine-learning-databases/00210/donation.zip"
144 | )
145 |
146 | folder = Path(get_data_home(), "krebsregister")
147 |
148 | try:
149 | print(f"Downloading data to {folder}.")
150 | r = urlopen(zip_file_url).read()
151 |
152 | # unzip the content and put it in the krebsregister folder
153 | z = zipfile.ZipFile(BytesIO(r))
154 | z.extractall(str(folder))
155 |
156 | print("Data download succesfull.")
157 |
158 | except Exception as e:
159 | print("Issue with downloading the data:", e)
160 |
161 |
162 | def _krebsregister_block(block):
163 | if block not in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
164 | raise ValueError(
165 | "Argument 'block' has to be integer in "
166 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] or list of integers."
167 | )
168 |
169 | fp_i = Path(get_data_home(), "krebsregister", f"block_{block}.zip")
170 |
171 | data_block = pandas.read_csv(
172 | fp_i, index_col=["id_1", "id_2"], na_values="?", compression="zip"
173 | )
174 |
175 | data_block.columns = [
176 | "cmp_firstname1",
177 | "cmp_firstname2",
178 | "cmp_lastname1",
179 | "cmp_lastname2",
180 | "cmp_sex",
181 | "cmp_birthday",
182 | "cmp_birthmonth",
183 | "cmp_birthyear",
184 | "cmp_zipcode",
185 | "is_match",
186 | ]
187 | data_block.index.names = ["id1", "id2"]
188 |
189 | return data_block
190 |
--------------------------------------------------------------------------------
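
A minimal usage sketch for load_krebsregister as documented above; the block and missing_values arguments follow the docstring, and the first call downloads the data into the rl_data folder.

    from recordlinkage.datasets import load_krebsregister

    # load blocks 1 and 2 only and fill missing comparison values with 0
    krebs_x, krebs_true_links = load_krebsregister(block=[1, 2], missing_values=0)

    print(krebs_x.shape)          # comparison vectors for the selected blocks
    print(len(krebs_true_links))  # number of true matches among them
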
/recordlinkage/datasets/febrl.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import numpy
4 | import pandas
5 |
6 |
7 | def _febrl_load_data(filename):
8 | # Internal function for loading febrl data
9 |
10 | filepath = Path(Path(__file__).parent, "febrl", filename)
11 |
12 | febrl_data = pandas.read_csv(
13 | filepath,
14 | index_col="rec_id",
15 | sep=",",
16 | engine="c",
17 | skipinitialspace=True,
18 | encoding="utf-8",
19 | dtype={
20 | "street_number": object,
21 | "date_of_birth": object,
22 | "soc_sec_id": object,
23 | "postcode": object,
24 | },
25 | )
26 |
27 | return febrl_data
28 |
29 |
30 | def _febrl_links(df):
31 | """Get the links of a FEBRL dataset."""
32 |
33 | index = df.index.to_series()
34 | keys = index.str.extract(r"rec-(\d+)", expand=True)[0]
35 |
36 | index_int = numpy.arange(len(df))
37 |
38 | df_helper = pandas.DataFrame({"key": keys, "index": index_int})
39 |
40 | # merge the two frame and make MultiIndex.
41 | pairs_df = df_helper.merge(df_helper, on="key")[["index_x", "index_y"]]
42 | pairs_df = pairs_df[pairs_df["index_x"] > pairs_df["index_y"]]
43 |
44 | return pandas.MultiIndex(
45 | levels=[df.index.values, df.index.values],
46 | codes=[pairs_df["index_x"].values, pairs_df["index_y"].values],
47 | names=[None, None],
48 | verify_integrity=False,
49 | )
50 |
51 |
52 | def load_febrl1(return_links=False):
53 | """Load the FEBRL 1 dataset.
54 |
55 | The Freely Extensible Biomedical Record Linkage (Febrl) package is
56 | distributed with a dataset generator and four datasets generated
57 | with the generator. This function returns the first Febrl dataset
58 | as a :class:`pandas.DataFrame`.
59 |
60 | *"This data set contains 1000 records (500 original and
61 | 500 duplicates, with exactly one duplicate per original
62 | record."*
63 |
64 | Parameters
65 | ----------
66 | return_links: bool
67 | When True, the function returns also the true links.
68 |
69 | Returns
70 | -------
71 | pandas.DataFrame
72 | A :class:`pandas.DataFrame` with Febrl dataset1.csv. When
73 | return_links is True, the function returns also the true
74 | links. The true links are all links in the lower triangular
75 | part of the matrix.
76 |
77 | """
78 |
79 | df = _febrl_load_data("dataset1.csv")
80 |
81 | if return_links:
82 | links = _febrl_links(df)
83 | return df, links
84 | else:
85 | return df
86 |
87 |
88 | def load_febrl2(return_links=False):
89 | """Load the FEBRL 2 dataset.
90 |
91 | The Freely Extensible Biomedical Record Linkage (Febrl) package is
92 | distributed with a dataset generator and four datasets generated
93 | with the generator. This function returns the second Febrl dataset
94 | as a :class:`pandas.DataFrame`.
95 |
96 | *"This data set contains 5000 records (4000 originals and
97 | 1000 duplicates), with a maximum of 5 duplicates based on
98 | one original record (and a poisson distribution of
99 | duplicate records). Distribution of duplicates:
100 | 19 originals records have 5 duplicate records
101 | 47 originals records have 4 duplicate records
102 | 107 originals records have 3 duplicate records
103 | 141 originals records have 2 duplicate records
104 | 114 originals records have 1 duplicate record
105 | 572 originals records have no duplicate record"*
106 |
107 | Parameters
108 | ----------
109 | return_links: bool
110 | When True, the function returns also the true links.
111 |
112 | Returns
113 | -------
114 | pandas.DataFrame
115 | A :class:`pandas.DataFrame` with Febrl dataset2.csv. When
116 | return_links is True, the function returns also the true
117 | links. The true links are all links in the lower triangular
118 | part of the matrix.
119 |
120 | """
121 |
122 | df = _febrl_load_data("dataset2.csv")
123 |
124 | if return_links:
125 | links = _febrl_links(df)
126 | return df, links
127 | else:
128 | return df
129 |
130 |
131 | def load_febrl3(return_links=False):
132 | """Load the FEBRL 3 dataset.
133 |
134 | The Freely Extensible Biomedical Record Linkage (Febrl) package is
135 | distributed with a dataset generator and four datasets generated
136 | with the generator. This function returns the third Febrl dataset
137 | as a :class:`pandas.DataFrame`.
138 |
139 | *"This data set contains 5000 records (2000 originals and
140 | 3000 duplicates), with a maximum of 5 duplicates based on
141 | one original record (and a Zipf distribution of duplicate
142 | records). Distribution of duplicates:
143 | 168 originals records have 5 duplicate records
144 | 161 originals records have 4 duplicate records
145 | 212 originals records have 3 duplicate records
146 | 256 originals records have 2 duplicate records
147 | 368 originals records have 1 duplicate record
148 | 1835 originals records have no duplicate record"*
149 |
150 | Parameters
151 | ----------
152 | return_links: bool
153 | When True, the function returns also the true links.
154 |
155 | Returns
156 | -------
157 | pandas.DataFrame
158 | A :class:`pandas.DataFrame` with Febrl dataset3.csv. When
159 | return_links is True, the function returns also the true
160 | links. The true links are all links in the lower triangular
161 | part of the matrix.
162 |
163 | """
164 |
165 | df = _febrl_load_data("dataset3.csv")
166 |
167 | if return_links:
168 | links = _febrl_links(df)
169 | return df, links
170 | else:
171 | return df
172 |
173 |
174 | def load_febrl4(return_links=False):
175 | """Load the FEBRL 4 datasets.
176 |
177 | The Freely Extensible Biomedical Record Linkage (Febrl) package is
178 | distributed with a dataset generator and four datasets generated
179 | with the generator. This function returns the fourth Febrl dataset
180 | as a :class:`pandas.DataFrame`.
181 |
182 | *"Generated as one data set with 10000 records (5000
183 | originals and 5000 duplicates, with one duplicate per
184 | original), the originals have been split from the
185 | duplicates, into dataset4a.csv (containing the 5000
186 | original records) and dataset4b.csv (containing the
187 | 5000 duplicate records) These two data sets can be
188 | used for testing linkage procedures."*
189 |
190 | Parameters
191 | ----------
192 | return_links: bool
193 | When True, the function returns also the true links.
194 |
195 | Returns
196 | -------
197 | (pandas.DataFrame, pandas.DataFrame)
198 | A :class:`pandas.DataFrame` with Febrl dataset4a.csv and a pandas
199 | dataframe with Febrl dataset4b.csv. When return_links is True,
200 | the function returns also the true links.
201 |
202 | """
203 |
204 | df_a = _febrl_load_data("dataset4a.csv")
205 | df_b = _febrl_load_data("dataset4b.csv")
206 |
207 | if return_links:
208 | links = pandas.MultiIndex.from_arrays(
209 | [
210 | [f"rec-{i}-org" for i in range(0, 5000)],
211 | [f"rec-{i}-dup-0" for i in range(0, 5000)],
212 | ]
213 | )
214 | return df_a, df_b, links
215 | else:
216 | return df_a, df_b
217 |
--------------------------------------------------------------------------------
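
A short sketch of the loaders above, using the fourth FEBRL dataset; the counts follow directly from the quoted dataset description (5000 originals, 5000 duplicates, one duplicate per original).

    from recordlinkage.datasets import load_febrl4

    df_a, df_b, true_links = load_febrl4(return_links=True)

    # one duplicate per original record, so both frames and the links have equal length
    assert len(df_a) == len(df_b) == len(true_links) == 5000
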
/recordlinkage/datasets/generate.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | def binary_vectors(
6 | n,
7 | n_match,
8 | m=[0.9] * 8,
9 | u=[0.1] * 8,
10 | random_state=None,
11 | return_links=False,
12 | dtype=np.int8,
13 | ):
14 | """Generate random binary comparison vectors.
15 |
16 | This function is used to generate random comparison vectors. The
17 | result of each comparison is a binary value (0 or 1).
18 |
19 | Parameters
20 | ----------
21 | n : int
22 | The total number of comparison vectors.
23 | n_match : int
24 | The number of matching record pairs.
25 | m : list, default [0.9] * 8, optional
26 | A list of m probabilities of each partially identifying
27 | variable. The m probability is the probability that an
28 | identifier in matching record pairs agrees.
 29 | u : list, default [0.1] * 8, optional
30 | A list of u probabilities of each partially identifying
31 | variable. The u probability is the probability that an
32 | identifier in non-matching record pairs agrees.
33 | random_state : int or numpy.random.RandomState, optional
34 | Seed for the random number generator with an integer or numpy
35 | RandomState object.
36 | return_links: bool
37 | When True, the function returns also the true links.
38 | dtype: numpy.dtype
39 | The dtype of each column in the returned DataFrame.
40 |
41 | Returns
42 | -------
43 | pandas.DataFrame
44 | A dataframe with comparison vectors.
45 |
46 |
47 | """
48 |
49 | if len(m) != len(u):
50 | raise ValueError("the length of 'm' is not equal the length of 'u'")
51 |
52 | if n_match >= n or n_match < 0:
53 | raise ValueError("the number of matches is bounded by [0, n]")
54 |
55 | # set the random seed
56 | np.random.seed(random_state)
57 |
58 | matches = []
59 | nonmatches = []
60 |
61 | sample_set = np.array([0, 1], dtype=dtype)
62 |
63 | for i, _ in enumerate(m):
64 | p_mi = [1 - m[i], m[i]]
65 | p_ui = [1 - u[i], u[i]]
66 |
67 | comp_mi = np.random.choice(sample_set, (n_match, 1), p=p_mi)
68 | comp_ui = np.random.choice(sample_set, (n - n_match, 1), p=p_ui)
69 |
70 | nonmatches.append(comp_ui)
71 | matches.append(comp_mi)
72 |
73 | match_block = np.concatenate(matches, axis=1)
74 | nonmatch_block = np.concatenate(nonmatches, axis=1)
75 |
76 | data_np = np.concatenate((match_block, nonmatch_block), axis=0)
77 | index_np = np.random.randint(1001, 1001 + n * 2, (n, 2))
78 |
79 | data_col_names = ["c_%s" % (i + 1) for i in range(len(m))]
80 | data_mi = pd.MultiIndex.from_arrays([index_np[:, 0], index_np[:, 1]])
81 | data_df = pd.DataFrame(data_np, index=data_mi, columns=data_col_names)
82 |
83 | features = data_df.sample(frac=1, random_state=random_state)
84 |
85 | if return_links:
86 | links = data_mi[:n_match]
87 | return features, links
88 | else:
89 | return features
90 |
--------------------------------------------------------------------------------
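
A minimal sketch of binary_vectors as defined above; the sample sizes and seed are arbitrary, and the default m = [0.9] * 8 and u = [0.1] * 8 probabilities give eight binary columns c_1 .. c_8.

    from recordlinkage.datasets import binary_vectors

    # 1000 comparison vectors of which 200 belong to matching record pairs
    features, true_links = binary_vectors(1000, 200, random_state=42, return_links=True)

    print(features.shape)    # (1000, 8)
    print(len(true_links))   # 200
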
/recordlinkage/deprecated.py:
--------------------------------------------------------------------------------
1 | """Home of all deprecated functions and classes."""
2 |
--------------------------------------------------------------------------------
/recordlinkage/network.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from recordlinkage.types import is_pandas_2d_multiindex
4 | from recordlinkage.types import is_pandas_multiindex
5 |
6 |
7 | class OneToOneLinking:
8 | """[EXPERIMENTAL] One-to-one linking
9 |
10 | A record from dataset A can match at most one record from dataset
11 | B. For example, (a1, a2) are records from A and (b1, b2) are records
12 | from B. A linkage of (a1, b1), (a1, b2), (a2, b1), (a2, b2) is not
13 | one-to-one connected. One of the results of one-to-one linking can
14 | be (a1, b1), (a2, b2).
15 |
16 | Parameters
17 | ----------
18 | method : str
19 | The method to solve the problem. Only 'greedy' is supported at
20 | the moment.
21 |
22 | Note
23 | ----
24 |
25 | This class is experimental and might change in future versions.
26 |
27 | """
28 |
29 | def __init__(self, method="greedy"):
30 | super().__init__()
31 |
32 | self.method = method
33 |
34 | @classmethod
35 | def _bool_duplicated(cls, links, level):
36 | return links.get_level_values(level).duplicated()
37 |
38 | def _compute_greedy(self, links):
39 | result = []
40 | set_a = set()
41 | set_b = set()
42 |
43 | for index_a, index_b in links:
44 | if index_a not in set_a and index_b not in set_b:
45 | result.append((index_a, index_b))
46 | set_a.add(index_a)
47 | set_b.add(index_b)
48 |
49 | return pd.MultiIndex.from_tuples(result)
50 |
51 | def _compute(self, links):
52 | if not is_pandas_2d_multiindex(links):
53 | if not is_pandas_multiindex(links):
54 | raise TypeError("expected pandas.MultiIndex")
55 | elif not is_pandas_2d_multiindex(links):
56 | raise ValueError(
57 | "pandas.MultiIndex has incorrect number of "
58 | "levels (expected 2 levels)"
59 | )
60 |
61 | if self.method == "greedy":
62 | return self._compute_greedy(links)
63 | else:
64 | raise ValueError(f"unknown matching method {self.method}")
65 |
66 | def compute(self, links):
67 | """Compute the one-to-one linking.
68 |
69 | Parameters
70 | ----------
71 | links : pandas.MultiIndex
72 | The pairs to apply linking to.
73 |
74 | Returns
75 | -------
76 | pandas.MultiIndex
77 | A one-to-one matched MultiIndex of record pairs.
78 |
79 | """
80 |
81 | return self._compute(links)
82 |
83 |
84 | class OneToManyLinking(OneToOneLinking):
85 | """[EXPERIMENTAL] One-to-many linking
86 |
87 | A record from dataset A can link multiple records from dataset B,
88 | but a record from B can link to only one record of dataset A. Use
89 | the `level` argument to switch A and B.
90 |
91 | Parameters
92 | ----------
93 | level : int
94 | The level of the MultiIndex to have the one relations. The
 95 | options are 0 or 1 (indicating the level of the MultiIndex).
96 | Default 0.
97 | method : str
98 | The method to solve the problem. Only 'greedy' is supported at
99 | the moment.
100 |
101 | Example
102 | -------
103 |
104 | Consider a MultiIndex with record pairs constructed from datasets A
 105 | and B. To link a record from A to at most one record of B, use the
106 | following syntax:
107 |
108 | > one_to_many = OneToManyLinking(0)
109 | > one_to_many.compute(links)
110 |
111 | To link a record from B to at most one record
 112 | of A, use:
113 |
114 | > one_to_many = OneToManyLinking(1)
115 | > one_to_many.compute(links)
116 |
117 | Note
118 | ----
119 |
120 | This class is experimental and might change in future versions.
121 |
122 | """
123 |
124 | def __init__(self, level=0, method="greedy"):
125 | super().__init__(method=method)
126 |
127 | self.level = level
128 |
129 | def _compute_greedy(self, links):
130 | source_dupl_bool = self._bool_duplicated(links, self.level)
131 | return links[~source_dupl_bool]
132 |
133 | def compute(self, links):
134 | """Compute the one-to-many matching.
135 |
136 | Parameters
137 | ----------
138 | links : pandas.MultiIndex
139 | The pairs to apply linking to.
140 |
141 | Returns
142 | -------
143 | pandas.MultiIndex
144 | A one-to-many matched MultiIndex of record pairs.
145 |
146 | """
147 |
148 | return self._compute(links)
149 |
150 |
151 | class ConnectedComponents:
152 | """[EXPERIMENTAL] Connected record pairs
153 |
154 | This class identifies connected record pairs. Connected components
155 | are especially used in detecting duplicates in a single dataset.
156 |
157 | Note
158 | ----
159 |
160 | This class is experimental and might change in future versions.
161 | """
162 |
163 | def __init__(self):
164 | super().__init__()
165 |
166 | def compute(self, links):
167 | """Return the connected components.
168 |
169 | Parameters
170 | ----------
171 | links : pandas.MultiIndex
172 | The links to apply one-to-one matching on.
173 |
174 | Returns
175 | -------
176 | list of pandas.MultiIndex
177 | A list with pandas.MultiIndex objects. Each MultiIndex
178 | object represents a set of connected record pairs.
179 |
180 | """
181 |
182 | try:
183 | import networkx as nx
184 | except ImportError as err:
185 | raise Exception("'networkx' module is needed for this operation") from err
186 |
187 | graph_pairs = nx.Graph()
188 | graph_pairs.add_edges_from(links.values)
189 | connected_pairs = (
190 | graph_pairs.subgraph(c).copy() for c in nx.connected_components(graph_pairs)
191 | )
192 |
193 | links_result = [
194 | pd.MultiIndex.from_tuples(subgraph.edges()) for subgraph in connected_pairs
195 | ]
196 |
197 | return links_result
198 |
--------------------------------------------------------------------------------
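
A small sketch of the greedy one-to-one linking described above; the candidate pairs are made up for illustration. Because a1 and b1 each occur in more than one pair, the greedy pass keeps only the first pair per record.

    import pandas as pd

    from recordlinkage.network import OneToOneLinking

    links = pd.MultiIndex.from_tuples(
        [("a1", "b1"), ("a1", "b2"), ("a2", "b1"), ("a2", "b2")]
    )

    one_to_one = OneToOneLinking(method="greedy")
    print(one_to_one.compute(links).tolist())  # [('a1', 'b1'), ('a2', 'b2')]
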
/recordlinkage/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | from recordlinkage.preprocessing.cleaning import clean
2 | from recordlinkage.preprocessing.cleaning import phonenumbers
3 | from recordlinkage.preprocessing.cleaning import value_occurence
4 | from recordlinkage.preprocessing.encoding import _list_phonetic_algorithms
5 | from recordlinkage.preprocessing.encoding import phonetic
6 |
7 | phonetic_algorithms = _list_phonetic_algorithms()
8 | """List of available phonetic algorithms."""
9 |
10 | __all__ = [
11 | "phonetic_algorithms",
12 | "clean",
13 | "phonetic",
14 | "value_occurence",
15 | "phonenumbers",
16 | ]
17 |
--------------------------------------------------------------------------------
/recordlinkage/preprocessing/cleaning.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from sklearn.feature_extraction.text import strip_accents_ascii
4 | from sklearn.feature_extraction.text import strip_accents_unicode
5 |
6 |
7 | def clean(
8 | s,
9 | lowercase=True,
10 | replace_by_none=r"[^ \-\_A-Za-z0-9]+",
11 | replace_by_whitespace=r"[\-\_]",
12 | strip_accents=None,
13 | remove_brackets=True,
14 | encoding="utf-8",
15 | decode_error="strict",
16 | ):
17 | """Clean string variables.
18 |
19 | Clean strings in the Series by removing unwanted tokens,
20 | whitespace and brackets.
21 |
22 | Parameters
23 | ----------
24 | s : pandas.Series
25 | A Series to clean.
 26 | lowercase : bool, optional
27 | Convert strings in the Series to lowercase. Default True.
28 | replace_by_none : str, optional
29 | The matches of this regular expression are replaced by ''.
30 | replace_by_whitespace : str, optional
31 | The matches of this regular expression are replaced by a
32 | whitespace.
33 | remove_brackets : bool, optional
 34 | Remove all content between brackets and the brackets
35 | themselves. Default True.
36 | strip_accents : {'ascii', 'unicode', None}, optional
37 | Remove accents during the preprocessing step. 'ascii' is a
 38 | fast method that only works on characters that have a direct
39 | ASCII mapping. 'unicode' is a slightly slower method that
40 | works on any characters. None (default) does nothing.
41 | encoding : str, optional
42 | If bytes are given, this encoding is used to decode. Default
43 | is 'utf-8'.
44 | decode_error : {'strict', 'ignore', 'replace'}, optional
45 | Instruction on what to do if a byte Series is given that
46 | contains characters not of the given `encoding`. By default,
47 | it is 'strict', meaning that a UnicodeDecodeError will be
48 | raised. Other values are 'ignore' and 'replace'.
49 |
50 | Example
51 | -------
52 | >>> import pandas
53 | >>> from recordlinkage.preprocessing import clean
54 | >>>
55 | >>> names = ['Mary-ann',
56 | 'Bob :)',
57 | 'Angel',
58 | 'Bob (alias Billy)',
59 | None]
60 | >>> s = pandas.Series(names)
61 | >>> print(clean(s))
62 | 0 mary ann
63 | 1 bob
64 | 2 angel
65 | 3 bob
66 | 4 NaN
67 | dtype: object
68 |
69 | Returns
70 | -------
71 | pandas.Series:
72 | A cleaned Series of strings.
73 |
74 | """
75 |
76 | if s.shape[0] == 0:
77 | return s
78 |
79 | # Lower s if lower is True
80 | if lowercase is True:
81 | s = s.str.lower()
82 |
83 | # Accent stripping based on https://github.com/scikit-learn/
84 | # scikit-learn/blob/412996f/sklearn/feature_extraction/text.py
85 | # BSD license
86 | if not strip_accents:
87 | pass
88 | elif callable(strip_accents):
89 | strip_accents_fn = strip_accents
90 | elif strip_accents == "ascii":
91 | strip_accents_fn = strip_accents_ascii
92 | elif strip_accents == "unicode":
93 | strip_accents_fn = strip_accents_unicode
94 | else:
95 | raise ValueError(f"Invalid value for 'strip_accents': {strip_accents}")
96 |
97 | # Remove accents etc
98 | if strip_accents:
99 |
100 | def strip_accents_fn_wrapper(x):
101 | if sys.version_info[0] >= 3:
102 | if isinstance(x, str):
103 | return strip_accents_fn(x)
104 | else:
105 | return x
106 | else:
107 | if isinstance(x, unicode): # noqa
108 | return strip_accents_fn(x)
109 | else:
110 | return x
111 |
112 | # encoding
113 | s = s.apply(
114 | lambda x: x.decode(encoding, decode_error) if type(x) == bytes else x
115 | )
116 | s = s.map(lambda x: strip_accents_fn_wrapper(x))
117 |
118 | # Remove all content between brackets
119 | if remove_brackets is True:
120 | s = s.str.replace(r"(\[.*?\]|\(.*?\)|\{.*?\})", "", regex=True)
121 |
122 | # Remove the special characters
123 | if replace_by_none:
124 | s = s.str.replace(replace_by_none, "", regex=True)
125 |
126 | if replace_by_whitespace:
127 | s = s.str.replace(replace_by_whitespace, " ", regex=True)
128 |
129 | # Remove multiple whitespaces
130 | s = s.str.replace(r"\s\s+", " ", regex=True)
131 |
132 | # Strip s
133 | s = s.str.lstrip().str.rstrip()
134 |
135 | return s
136 |
137 |
138 | def phonenumbers(s):
139 | """Clean phonenumbers by removing all non-numbers (except +).
140 |
141 | Parameters
142 | ----------
143 | s: pandas.Series
144 | A Series to clean.
145 |
146 | Returns
147 | -------
148 | pandas.Series
149 | A Series with cleaned phonenumbers.
150 |
151 | """
152 |
153 | # Remove all special tokens
154 | s = s.astype(object).str.replace("[^0-9+]+", "", regex=True)
155 |
156 | return s
157 |
158 |
159 | def value_occurence(s):
160 | """Count the number of times each value occurs.
161 |
162 | This function returns the counts for each row, in contrast with
 163 | pandas.value_counts.
165 |
166 | Returns
167 | -------
168 | pandas.Series
169 | A Series with value counts.
170 |
171 | """
172 |
173 | # https://github.com/pydata/pandas/issues/3729
174 | value_count = s.fillna("NAN")
175 |
176 | return value_count.groupby(by=value_count).transform("count")
177 |
--------------------------------------------------------------------------------
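
A minimal sketch of the two smaller helpers above, phonenumbers and value_occurence; the sample Series are made up for illustration.

    import pandas as pd

    from recordlinkage.preprocessing import phonenumbers
    from recordlinkage.preprocessing import value_occurence

    phones = pd.Series(["+31 6 1234-5678", "(020) 555 0199", None])
    print(phonenumbers(phones).tolist())    # ['+31612345678', '0205550199', nan]

    names = pd.Series(["bob", "mary", "bob", "bob"])
    print(value_occurence(names).tolist())  # [3, 1, 3, 3]
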
/recordlinkage/preprocessing/encoding.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import jellyfish
4 | import numpy as np
5 | import pandas
6 |
7 | _phonetic_algorithms = [
8 | {"name": "Soundex", "callback": jellyfish.soundex, "argument_names": ["soundex"]},
9 | {
10 | "name": "NYSIIS",
11 | "callback": jellyfish.nysiis,
12 | "argument_names": ["nysiis", "nyssis"],
13 | },
14 | {
15 | "name": "Metaphone",
16 | "callback": jellyfish.metaphone,
17 | "argument_names": ["metaphone"],
18 | },
19 | {
20 | "name": "Match Rating",
21 | "callback": jellyfish.match_rating_codex,
22 | "argument_names": [
23 | "match_rating",
24 | "match rating",
25 | "matchrating",
26 | "match_rating_codex",
27 | "matchratingcodex",
28 | ],
29 | },
30 | ]
31 |
32 |
33 | def _list_phonetic_algorithms():
34 | """Return list of available phonetic algorithms."""
35 |
36 | return [alg["argument_names"][0] for alg in _phonetic_algorithms]
37 |
38 |
39 | def phonetic(s, method, concat=True, encoding="utf-8", decode_error="strict"):
40 | """Convert names or strings into phonetic codes.
41 |
 42 | The implemented algorithms are soundex, nysiis,
 43 | metaphone or match_rating.
48 |
49 | Parameters
50 | ----------
51 | s : pandas.Series
52 | A pandas.Series with string values (often names) to encode.
53 | method: str
54 | The algorithm that is used to phonetically encode the values.
55 | The possible options are "soundex", "nysiis", "metaphone" or
56 | "match_rating".
57 | concat: bool, optional
58 | Remove whitespace before phonetic encoding.
59 | encoding: str, optional
60 | If bytes are given, this encoding is used to decode. Default
61 | is 'utf-8'.
62 | decode_error: {'strict', 'ignore', 'replace'}, optional
63 | Instruction on what to do if a byte Series is given that
64 | contains characters not of the given `encoding`. By default,
65 | it is 'strict', meaning that a UnicodeDecodeError will be
66 | raised. Other values are 'ignore' and 'replace'.
67 |
68 | Returns
69 | -------
70 | pandas.Series
71 | A Series with phonetic encoded values.
72 |
73 | """
74 |
75 | # encoding
76 | if sys.version_info[0] == 2:
77 | s = s.apply(
78 | lambda x: x.decode(encoding, decode_error) if type(x) == bytes else x
79 | )
80 |
81 | if concat:
82 | s = s.str.replace(r"[\-\_\s]", "", regex=True)
83 |
84 | for alg in _phonetic_algorithms:
85 | if method in alg["argument_names"]:
86 | phonetic_callback = alg["callback"]
87 | break
88 | else:
89 | raise ValueError(f"The algorithm '{method}' is not known.")
90 |
91 | return s.str.upper().apply(
92 | lambda x: phonetic_callback(x) if pandas.notnull(x) else np.nan
93 | )
94 |
--------------------------------------------------------------------------------
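
A minimal sketch of the phonetic encoder above with the soundex method; the names are made up, and spelling variants of the same name typically collapse onto one code (here 'Jon' and 'John' both encode to J500).

    import pandas as pd

    from recordlinkage.preprocessing import phonetic

    names = pd.Series(["Jonathan", "Jon", "John", None])

    # missing values stay missing, everything else is encoded via jellyfish
    print(phonetic(names, method="soundex").tolist())
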
/recordlinkage/rl_logging.py:
--------------------------------------------------------------------------------
1 | """Logging utilities."""
2 |
3 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # ============================================================================
17 | #
18 | # Modifications copyright Jonathan de Bruin 2017
19 |
20 | # pylint: disable=unused-import
21 |
22 | import logging as _logging
23 | import sys as _sys
24 | from logging import DEBUG # noqa
25 | from logging import ERROR # noqa
26 | from logging import FATAL # noqa
27 | from logging import INFO # noqa
28 | from logging import WARN # noqa
29 |
30 | # Determine whether we are in an interactive environment
31 | _interactive = False
32 | try:
33 | # This is only defined in interactive shells
34 | if _sys.ps1:
35 | _interactive = True
36 | except AttributeError:
37 | # Even now, we may be in an interactive shell with `python -i`.
38 | _interactive = _sys.flags.interactive
39 |
40 | # Scope the tensorflow logger to not conflict with users' loggers
41 | _logger = _logging.getLogger("recordlinkage")
42 |
43 | # If we are in an interactive environment (like jupyter), set loglevel to info
44 | # and pipe the output to stdout
45 | if _interactive:
46 | _logger.setLevel(WARN)
47 | _logging_target = _sys.stdout
48 | else:
49 | _logging_target = _sys.stderr
50 |
51 | # Add the output handler
52 | _handler = _logging.StreamHandler(_logging_target)
53 | _handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT, None))
54 | _logger.addHandler(_handler)
55 |
56 | log = _logger.log
57 | debug = _logger.debug
58 | error = _logger.error
59 | fatal = _logger.fatal
60 | info = _logger.info
61 | warning = _logger.warning
62 |
63 |
64 | def get_verbosity():
65 | """Return how much logging output will be produced."""
66 | return _logger.getEffectiveLevel()
67 |
68 |
69 | def set_verbosity(verbosity):
70 | """Sets the threshold for what messages will be logged."""
71 | _logger.setLevel(verbosity)
72 |
--------------------------------------------------------------------------------
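
A small usage sketch for the logging module above; the verbosity constants are the standard logging levels re-exported at the top of the file.

    from recordlinkage import rl_logging

    rl_logging.set_verbosity(rl_logging.INFO)
    print(rl_logging.get_verbosity())  # 20, the numeric value of logging.INFO
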
/recordlinkage/standardise/__init__.py:
--------------------------------------------------------------------------------
1 | # This module is renamed into preprocessing. Please use the preprocessing
2 | # module instead of this module.
3 |
4 | import warnings
5 |
6 | from recordlinkage.preprocessing import clean as _clean
7 | from recordlinkage.preprocessing import phonenumbers as _phonenumbers
8 | from recordlinkage.preprocessing import phonetic as _phonetic
9 | from recordlinkage.preprocessing import value_occurence as _value_occurence
10 |
11 |
12 | def _depr_warn():
13 | warnings.warn(
14 | "module recordlinkage.standardise is deprecated, use "
15 | "recordlinkage.preprocessing instead",
16 | DeprecationWarning,
17 | stacklevel=2,
18 | )
19 |
20 |
21 | def clean(*args, **kwargs):
22 | _depr_warn()
23 |
24 | return _clean(*args, **kwargs)
25 |
26 |
27 | def phonenumbers(*args, **kwargs):
28 | _depr_warn()
29 |
30 | return _phonenumbers(*args, **kwargs)
31 |
32 |
33 | def value_occurence(*args, **kwargs):
34 | _depr_warn()
35 |
36 | return _value_occurence(*args, **kwargs)
37 |
38 |
39 | def phonetic(*args, **kwargs):
40 | _depr_warn()
41 |
42 | return _phonetic(*args, **kwargs)
43 |
--------------------------------------------------------------------------------
/recordlinkage/types.py:
--------------------------------------------------------------------------------
1 | """
2 | basic inference routines
3 |
4 | most functions taken from pandas (https://github.com/pandas-dev/pandas)
5 | License BSD
6 |
7 | """
8 |
 9 | import collections.abc
10 | import re
11 | from numbers import Number
12 |
13 | import numpy
14 | import pandas
15 |
16 | string_and_binary_types = (str, bytes)
17 |
18 |
19 | def is_number(obj):
20 | return isinstance(obj, (Number, numpy.number))
21 |
22 |
23 | def is_string_like(obj):
24 | return isinstance(obj, str)
25 |
26 |
27 | def _iterable_not_string(x):
 28 | return isinstance(x, collections.abc.Iterable) and not isinstance(x, str)
29 |
30 |
31 | def is_iterator(obj):
32 | return hasattr(obj, "__next__")
33 |
34 |
35 | def is_re(obj):
 36 | return isinstance(obj, re.Pattern)
37 |
38 |
39 | def is_re_compilable(obj):
40 | try:
41 | re.compile(obj)
42 | except TypeError:
43 | return False
44 | else:
45 | return True
46 |
47 |
48 | def is_list_like(arg):
49 | return hasattr(arg, "__iter__") and not isinstance(arg, string_and_binary_types)
50 |
51 |
52 | def is_dict_like(arg):
53 | return hasattr(arg, "__getitem__") and hasattr(arg, "keys")
54 |
55 |
56 | def is_named_tuple(arg):
57 | return isinstance(arg, tuple) and hasattr(arg, "_fields")
58 |
59 |
60 | def is_hashable(arg):
61 | """Return True if hash(arg) will succeed, False otherwise.
62 |
63 | Some types will pass a test against collections.Hashable but fail when they
64 | are actually hashed with hash().
65 |
66 | Distinguish between these and other types by trying the call to hash() and
67 | seeing if they raise TypeError.
68 |
69 | Examples
70 | --------
71 | >>> a = ([],)
 72 | >>> isinstance(a, collections.abc.Hashable)
73 | True
74 | >>> is_hashable(a)
75 | False
76 | """
77 |
78 | # unfortunately, we can't use isinstance(arg, collections.Hashable), which
79 | # can be faster than calling hash, because numpy scalars on Python 3 fail
80 | # this test
81 |
82 | # reconsider this decision once this numpy bug is fixed:
83 | # https://github.com/numpy/numpy/issues/5562
84 |
85 | try:
86 | hash(arg)
87 | except TypeError:
88 | return False
89 | else:
90 | return True
91 |
92 |
93 | def is_sequence(x):
94 | try:
95 | iter(x)
96 | len(x) # it has a length
97 | return not isinstance(x, string_and_binary_types)
98 | except (TypeError, AttributeError):
99 | return False
100 |
101 |
102 | def is_pandas_like(x):
103 | return isinstance(x, (pandas.Series, pandas.DataFrame))
104 |
105 |
106 | def is_pandas_multiindex(x):
107 | return isinstance(x, (pandas.MultiIndex))
108 |
109 |
110 | def is_pandas_2d_multiindex(x):
111 | return is_pandas_multiindex(x) and x.nlevels == 2
112 |
113 |
114 | def is_numpy_like(x):
115 | return isinstance(x, (numpy.ndarray))
116 |
--------------------------------------------------------------------------------
/recordlinkage/utils.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from functools import wraps
3 |
4 | import numpy
5 | import pandas
6 |
7 | import recordlinkage.config as cf
8 |
9 |
10 | # Errors and Exception handlers
11 | class IndexError(Exception):
12 | """Error class for errors related to indexing."""
13 |
14 | pass
15 |
16 |
17 | class LearningError(Exception):
18 | """Learning error"""
19 |
20 |
21 | class DeprecationHelper:
22 | """Deprecation helper for classes and functions.
23 |
24 | Based on https://stackoverflow.com/a/9008509/8727928
25 | """
26 |
27 | def __init__(self, new_target, msg=None):
28 | self.new_target = new_target
29 | self.msg = msg
30 |
31 | def _warn(self):
32 | from warnings import warn
33 |
34 | if self.msg is None:
35 | msg = "This class will get deprecated."
36 | else:
37 | msg = self.msg
38 |
39 | warn(msg, DeprecationWarning, stacklevel=1)
40 |
41 | def __call__(self, *args, **kwargs):
42 | self._warn()
43 | return self.new_target(*args, **kwargs)
44 |
45 | def __getattr__(self, attr):
46 | self._warn()
47 | return getattr(self.new_target, attr)
48 |
49 |
50 | def return_type_deprecator(func):
51 | @wraps(func)
52 | def func_wrapper(*args, **kwargs):
53 | return_type = kwargs.pop("return_type", None)
54 | if return_type is not None:
55 | warnings.warn(
56 | "The argument 'return_type' is deprecated in the next "
57 | "version. Use recordlinkage.set_option('classification."
58 | "return_type', '{}') instead.".format(return_type),
59 | DeprecationWarning,
60 | stacklevel=2,
61 | )
62 | with cf.option_context("classification.return_type", return_type):
63 | return func(*args, **kwargs)
64 | else:
65 | return func(*args, **kwargs)
66 |
67 | return func_wrapper
68 |
69 |
70 | # Checks and conversions
71 | def is_label_dataframe(label, df):
72 | """check column label existance"""
73 |
74 | setdiff = set(label) - set(df.columns.tolist())
75 |
76 | if len(setdiff) == 0:
77 | return True
78 | else:
79 | return False
80 |
81 |
82 | def get_length(x):
83 | """Return int or len(x)"""
84 |
85 | try:
86 | return int(x)
87 | except Exception:
88 | return len(x)
89 |
90 |
91 | def listify(x, none_value=[]):
92 | """Make a list of the argument if it is not a list."""
93 |
94 | if isinstance(x, list):
95 | return x
96 | elif isinstance(x, tuple):
97 | return list(x)
98 | elif x is None:
99 | return none_value
100 | else:
101 | return [x]
102 |
103 |
104 | def unique(x):
105 | """Convert a list in a unique list."""
106 |
107 | return list(set(x))
108 |
109 |
110 | def merge_dicts(*dict_args):
111 | """
112 | Given any number of dicts, shallow copy and merge into a new dict,
113 | precedence goes to key value pairs in latter dicts.
114 | """
115 | result = {}
116 | for dictionary in dict_args:
117 | result.update(dictionary)
118 | return result
119 |
120 |
121 | def multi_index_to_frame(index):
122 | """
123 | Replicates MultiIndex.to_frame, which was introduced in pandas 0.21,
124 | for the sake of backwards compatibility.
125 | """
126 | return pandas.DataFrame(index.tolist(), index=index, columns=index.names)
127 |
128 |
129 | def index_split(index, chunks):
130 | """Function to split pandas.Index and pandas.MultiIndex objects.
131 |
132 | Split :class:`pandas.Index` and :class:`pandas.MultiIndex` objects
133 | into chunks. This function is based on :func:`numpy.array_split`.
134 |
135 | Parameters
136 | ----------
137 | index : pandas.Index, pandas.MultiIndex
138 | A pandas.Index or pandas.MultiIndex to split into chunks.
139 | chunks : int
140 | The number of parts to split the index into.
141 |
142 | Returns
143 | -------
144 | list
145 | A list with chunked pandas.Index or pandas.MultiIndex objects.
146 |
147 | """
148 |
149 | Ntotal = index.shape[0]
150 | Nsections = int(chunks)
151 | if Nsections <= 0:
152 | raise ValueError("number sections must be larger than 0.")
153 | Neach_section, extras = divmod(Ntotal, Nsections)
154 | section_sizes = (
155 | [0] + extras * [Neach_section + 1] + (Nsections - extras) * [Neach_section]
156 | )
157 | div_points = numpy.array(section_sizes).cumsum()
158 |
159 | sub_ind = []
160 | for i in range(Nsections):
161 | st = div_points[i]
162 | end = div_points[i + 1]
163 | sub_ind.append(index[st:end])
164 |
165 | return sub_ind
166 |
167 |
168 | def split_index(*args, **kwargs):
169 | warnings.warn(
170 | "Function will be removed in the future. Use index_split.",
171 | DeprecationWarning,
172 | stacklevel=2,
173 | )
174 |
175 | return index_split(*args, **kwargs)
176 |
177 |
178 | def frame_indexing(frame, multi_index, level_i, indexing_type="label"):
179 | """Index dataframe based on one level of MultiIndex.
180 |
181 | Arguments
182 | ---------
183 | frame : pandas.DataFrame
 184 | The dataframe to select records from.
185 | multi_index : pandas.MultiIndex
 186 | A pandas MultiIndex where one of the levels is used to sample the
187 | dataframe with.
188 | level_i : int, str
189 | The level of the multiIndex to index on.
190 | indexing_type : str
191 | The type of indexing. The value can be 'label' or 'position'.
192 | Default 'label'.
193 |
194 | """
195 |
196 | if indexing_type == "label":
197 | data = frame.loc[multi_index.get_level_values(level_i)]
198 | data.index = multi_index
199 | elif indexing_type == "position":
200 | data = frame.iloc[multi_index.get_level_values(level_i)]
201 | data.index = multi_index
202 | else:
203 | raise ValueError("indexing_type needs to be 'label' or 'position'")
204 |
205 | return data
206 |
207 |
208 | def fillna(series_or_arr, missing_value=0.0):
209 | """Fill missing values in pandas objects and numpy arrays.
210 |
211 | Arguments
212 | ---------
213 | series_or_arr : pandas.Series, numpy.ndarray
214 | The numpy array or pandas series for which the missing values
215 | need to be replaced.
216 | missing_value : float, int, str
217 | The value to replace the missing value with. Default 0.0.
218 |
219 | Returns
220 | -------
221 | pandas.Series, numpy.ndarray
222 | The numpy array or pandas series with the missing values
223 | filled.
224 | """
225 |
226 | if pandas.notnull(missing_value):
227 | if isinstance(series_or_arr, (numpy.ndarray)):
228 | series_or_arr[numpy.isnan(series_or_arr)] = missing_value
229 | else:
230 | series_or_arr.fillna(missing_value, inplace=True)
231 |
232 | return series_or_arr
233 |
--------------------------------------------------------------------------------
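
A quick sketch of index_split from the utilities above; the index is made up, and the chunk sizes follow the numpy.array_split-style computation in the function.

    import pandas as pd

    from recordlinkage.utils import index_split

    index = pd.Index(range(10))
    chunks = index_split(index, 3)
    print([list(c) for c in chunks])  # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
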
/tests/test_annotator.py:
--------------------------------------------------------------------------------
1 | import recordlinkage as rl
2 | from recordlinkage.datasets import load_febrl1
3 | from recordlinkage.datasets import load_febrl4
4 | from recordlinkage.index import Block
5 |
6 |
7 | def test_annotation_link(tmp_path):
8 | path = tmp_path / "febrl_annotation_link.json"
9 |
10 | # get febrl4 file
11 | df_a, df_b, matches = load_febrl4(return_links=True)
12 |
13 | # get record pairs
14 | indexer = Block("given_name", "given_name")
15 | pairs = indexer.index(df_a, df_b)
16 |
17 | # create annotation file
18 | # write an annotation file for the Febrl4 dataset.
19 | rl.write_annotation_file(path, pairs[0:10], df_a, df_b)
20 |
21 | # read the result
22 | result = rl.read_annotation_file(path)
23 |
24 | assert result.links is None
25 | assert result.distinct is None
26 |
27 |
28 | def test_annotation_dedup(tmp_path):
29 | path = tmp_path / "febrl_annotation_dedup.json"
30 |
31 | # get febrl1 file
32 | df_a, matches = load_febrl1(return_links=True)
33 |
34 | # get record pairs
35 | indexer = Block("given_name", "given_name")
36 | pairs = indexer.index(df_a)
37 |
38 | # create annotation file
39 | # write an annotation file for the Febrl1 dataset.
40 | rl.write_annotation_file(path, pairs[0:10], df_a)
41 |
42 | # read the result
43 | result = rl.read_annotation_file(path)
44 |
45 | assert result.links is None
46 | assert result.distinct is None
47 |
--------------------------------------------------------------------------------
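
The asserts above reflect that a freshly written annotation file has not been labelled yet, so reading it back gives `links` and `distinct` as `None`. The same round trip, condensed (the file name is illustrative):

    import recordlinkage as rl
    from recordlinkage.datasets import load_febrl4
    from recordlinkage.index import Block

    df_a, df_b = load_febrl4()
    pairs = Block("given_name", "given_name").index(df_a, df_b)

    # write the first ten candidate pairs to an annotation file ...
    rl.write_annotation_file("annotation.json", pairs[0:10], df_a, df_b)

    # ... and read the (still unlabelled) result back
    result = rl.read_annotation_file("annotation.json")
    assert result.links is None and result.distinct is None
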
/tests/test_datasets.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from os import environ
4 | from pathlib import Path
5 |
6 | import numpy
7 | import pandas
8 | import pytest
9 |
10 | from recordlinkage.datasets import binary_vectors
11 | from recordlinkage.datasets import clear_data_home
12 | from recordlinkage.datasets import get_data_home
13 | from recordlinkage.datasets import load_febrl1
14 | from recordlinkage.datasets import load_febrl2
15 | from recordlinkage.datasets import load_febrl3
16 | from recordlinkage.datasets import load_febrl4
17 | from recordlinkage.datasets import load_krebsregister
18 |
19 | FEBRL_DEDUP = [
20 | # nlinks = 500
21 | (load_febrl1, 1000, 500),
22 | # nlinks=19*6*5/2+47*5*4/2+107*4*3/2+141*3*2/2+114
23 | (load_febrl2, 5000, 1934),
24 | # nlinks=168*6*5/2+161*5*4/2+212*4*3/2+256*3*2/2+368
25 | (load_febrl3, 5000, 6538),
26 | ]
27 |
28 |
29 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
30 | def test_febrl_dedup(dataset, nrows, nlinks):
31 | df = dataset()
32 | assert isinstance(df, pandas.DataFrame)
33 | assert len(df) == nrows
34 |
35 |
36 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
37 | def test_febrl_dedup_links(dataset, nrows, nlinks):
38 | df, links = dataset(return_links=True)
39 | assert isinstance(df, pandas.DataFrame)
40 | assert len(df) == nrows
41 | assert len(links) == nlinks
42 | assert isinstance(links, pandas.MultiIndex)
43 |
44 |
45 | @pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
46 | def test_febrl_dedup_tril(dataset, nrows, nlinks):
47 | df, links = dataset(return_links=True)
48 |
49 | s_level_1 = pandas.Series(numpy.arange(len(df)), index=df.index)
50 | s_level_2 = pandas.Series(numpy.arange(len(df)), index=df.index)
51 |
52 | x1 = s_level_1.loc[links.get_level_values(0)]
53 | x2 = s_level_2.loc[links.get_level_values(1)]
54 |
55 | assert numpy.all(x1.values > x2.values)
56 |
57 |
58 | def test_febrl4():
59 | dfa, dfb = load_febrl4()
60 | assert isinstance(dfa, pandas.DataFrame)
61 | assert isinstance(dfb, pandas.DataFrame)
62 | assert len(dfa) == 5000
63 | assert len(dfb) == 5000
64 |
65 |
66 | def test_febrl_links():
67 | dfa, dfb, links = load_febrl4(return_links=True)
68 | assert isinstance(dfa, pandas.DataFrame)
69 | assert isinstance(dfb, pandas.DataFrame)
70 | assert len(dfa) == 5000
71 | assert len(dfb) == 5000
72 | assert isinstance(links, pandas.MultiIndex)
73 |
74 |
75 | @pytest.mark.skip(reason="Causes non-deterministic problems")
76 | def test_krebs_dataset_download():
77 | # remove downloaded datasets
78 | clear_data_home()
79 |
80 | krebs_data, krebs_matches = load_krebsregister()
81 |
82 | for i in range(1, 11):
83 | assert Path(get_data_home(), "krebsregister", f"block_{i}.zip").is_file()
84 |
85 | # count the number of records
86 | assert isinstance(krebs_data, pandas.DataFrame)
87 | assert isinstance(krebs_matches, pandas.MultiIndex)
88 | assert len(krebs_data) == 5749132
89 | assert len(krebs_matches) == 20931
90 |
91 |
92 | @pytest.mark.skip(reason="Causes non-deterministic problems")
93 | def test_krebs_dataset_environ(tmpdir):
94 | path = Path(str(tmpdir)).expanduser()
95 | environ["RL_DATA"] = str(path)
96 |
97 | krebs_data, krebs_matches = load_krebsregister()
98 |
99 | for i in range(1, 11):
100 | assert Path(path, "krebsregister", f"block_{i}.zip").is_file()
101 |
102 |
103 | @pytest.mark.skip(reason="Causes non-deterministic problems")
104 | def test_krebs_dataset():
105 | krebs_data_block1, krebs_matches_block1 = load_krebsregister(1)
106 | krebs_data_block10, krebs_matches_block10 = load_krebsregister(10)
107 |
108 | assert len(krebs_data_block1) > 0
109 | assert len(krebs_data_block10) > 0
110 |
111 | # load not existing block
112 | with pytest.raises(ValueError):
113 | load_krebsregister(11)
114 |
115 | # missing values
116 | krebs_block10, matches = load_krebsregister(10, missing_values=0)
117 | assert krebs_block10.isnull().sum().sum() == 0
118 |
119 |
120 | @pytest.mark.skip(reason="Causes non-deterministic problems")
121 | def test_krebs_missings():
122 | # missing values
123 | krebs_block10, matches = load_krebsregister(10, missing_values=0)
124 | assert krebs_block10.isnull().sum().sum() == 0
125 |
126 |
127 | @pytest.mark.skip(reason="Causes non-deterministic problems")
128 | def test_krebs_shuffle():
130 | # shuffling disabled
130 | krebs_block10, matches = load_krebsregister(10, shuffle=False)
131 |
132 |
133 | def test_random_comparison_vectors():
134 | # Test the generation of a random dataset
135 |
136 | n_record_pairs = 10000
137 | n_matches = 500
138 |
139 | df = binary_vectors(
140 | n_record_pairs, n_matches, m=[0.8] * 8, u=[0.2] * 8, random_state=535
141 | )
142 |
143 | # Check the result is a DataFrame with MultiIndex
144 | assert isinstance(df, pandas.DataFrame)
145 | assert isinstance(df.index, pandas.MultiIndex)
146 |
147 | # Test the length of the dataframe
148 | assert len(df) == n_record_pairs
149 |
150 |
151 | def test_random_comparison_vectors_1value_col():
152 | m = numpy.array([1, 0.81, 0.85, 0])
153 | u = numpy.array([1, 0.23, 0.50, 0])
154 |
155 | # Create the train dataset.
156 | X_train, y_train = binary_vectors(
157 | 1000, 500, m=m, u=u, random_state=535, return_links=True
158 | )
159 |
160 | assert len(X_train.iloc[:, 0].unique()) == 1
161 | assert X_train.iloc[:, 0].unique()[0] == 1
162 |
163 | assert len(X_train.iloc[:, 3].unique()) == 1
164 | assert X_train.iloc[:, 3].unique()[0] == 0
165 |
166 | assert len(X_train.iloc[:, 1].unique()) == 2
167 | assert len(X_train.iloc[:, 2].unique()) == 2
168 |
--------------------------------------------------------------------------------
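
In the synthetic comparison vectors used by the last two tests, `m` and `u` are per-feature probabilities: `m[i]` is the probability that feature `i` agrees (equals 1) for a matching pair, `u[i]` the same for a non-matching pair, which is why `m=1, u=1` produces a constant-1 column and `m=0, u=0` a constant-0 column. A small sketch with illustrative numbers:

    from recordlinkage.datasets import binary_vectors

    # 1,000 candidate pairs, of which 100 are true matches; four features with
    # high agreement probability among matches (m) and low among non-matches (u)
    X, links = binary_vectors(
        1000,
        100,
        m=[0.9, 0.8, 0.85, 0.7],
        u=[0.1, 0.2, 0.15, 0.3],
        random_state=42,
        return_links=True,
    )

    # the feature means over the matching pairs should sit close to m
    print(X.loc[links].mean())
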
/tests/test_generate.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J535D165/recordlinkage/b93d97641952f8c85106be5794ca93b1f1298fbc/tests/test_generate.py
--------------------------------------------------------------------------------
/tests/test_measures.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | import numpy
5 | import pandas
6 |
7 | import recordlinkage as rl
8 |
9 | FULL_INDEX = pandas.MultiIndex.from_product(
10 | [[1, 2, 3], [1, 2, 3]], names=["first", "second"] # 3x3 matrix
11 | )
12 | LINKS_TRUE = pandas.MultiIndex.from_tuples(
13 | [(1, 1), (2, 2), (3, 3)], names=["first", "second"] # the diagonal
14 | )
15 | LINKS_PRED = pandas.MultiIndex.from_tuples(
16 | [(1, 1), (2, 1), (3, 1), (1, 2)], names=["first", "second"] # L shape
17 | )
18 |
19 |
20 | class TestMeasures:
21 | def test_confusion_matrix(self):
22 | result_len = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
23 | result_full_index = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, FULL_INDEX)
24 | expected = numpy.array([[1, 2], [3, 3]])
25 |
26 | numpy.testing.assert_array_equal(result_len, expected)
27 | numpy.testing.assert_array_equal(result_full_index, expected)
28 |
29 | def test_tp_fp_tn_fn(self):
30 | tp = rl.true_positives(LINKS_TRUE, LINKS_PRED)
31 | assert tp == 1
32 | fp = rl.false_positives(LINKS_TRUE, LINKS_PRED)
33 | assert fp == 3
34 | tn = rl.true_negatives(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
35 | assert tn == 3
36 | fn = rl.false_negatives(LINKS_TRUE, LINKS_PRED)
37 | assert fn == 2
38 |
39 | def test_recall(self):
40 | # confusion matrix
41 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED)
42 |
43 | assert rl.recall(LINKS_TRUE, LINKS_PRED) == 1 / 3
44 | assert rl.recall(cm) == 1 / 3
45 |
46 | def test_precision(self):
47 | # confusion matrix
48 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
49 |
50 | assert rl.precision(LINKS_TRUE, LINKS_PRED) == 1 / 4
51 | assert rl.precision(cm) == 1 / 4
52 |
53 | def test_accuracy(self):
54 | # confusion matrix
55 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
56 |
57 | assert rl.accuracy(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 4 / 9
58 | assert rl.accuracy(cm) == 4 / 9
59 | assert rl.accuracy(LINKS_TRUE, LINKS_PRED, FULL_INDEX) == 4 / 9
60 |
61 | def test_specificity(self):
62 | # confusion matrix
63 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
64 |
65 | assert rl.specificity(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 1 / 2
66 | assert rl.specificity(cm) == 1 / 2
67 | assert rl.specificity(LINKS_TRUE, LINKS_PRED, FULL_INDEX) == 1 / 2
68 |
69 | def test_fscore(self):
70 | # confusion matrix
71 | cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
72 | prec = rl.precision(LINKS_TRUE, LINKS_PRED)
73 | rec = rl.recall(LINKS_TRUE, LINKS_PRED)
74 | expected = float(2 * prec * rec / (prec + rec))
75 |
76 | assert rl.fscore(LINKS_TRUE, LINKS_PRED) == expected
77 | assert rl.fscore(cm) == expected
78 |
79 | def test_full_index_size(self):
80 | df_a = pandas.DataFrame(numpy.arange(10))
81 | df_b = pandas.DataFrame(numpy.arange(10))
82 |
83 | assert rl.full_index_size(df_a) == 45
84 | assert rl.full_index_size(len(df_a)) == 45
85 | assert rl.full_index_size(len(df_a)) == 45
86 | assert rl.full_index_size([len(df_a)]) == 45
87 |
88 | assert rl.full_index_size(df_a, df_b) == 100
89 | assert rl.full_index_size(len(df_a), len(df_b)) == 100
90 | assert rl.full_index_size((len(df_a), len(df_b))) == 100
91 | assert rl.full_index_size([len(df_a), len(df_b)]) == 100
92 |
93 | def test_reduction_ratio(self):
94 | df_a = pandas.DataFrame(numpy.arange(10))
95 | df_b = pandas.DataFrame(numpy.arange(10))
96 | candidate_pairs_link = pandas.MultiIndex.from_product(
97 | [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]
98 | )
99 | candidate_pairs_dedup = pandas.MultiIndex.from_arrays(
100 | [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]
101 | )
102 |
103 | assert rl.reduction_ratio(candidate_pairs_dedup, df_a) == 8 / 9
104 | assert rl.reduction_ratio(candidate_pairs_dedup, (df_a)) == 8 / 9
105 | assert rl.reduction_ratio(candidate_pairs_dedup, (df_a,)) == 8 / 9
106 |
107 | assert rl.reduction_ratio(candidate_pairs_link, df_a, df_b) == 3 / 4
108 | assert rl.reduction_ratio(candidate_pairs_link, (df_a, df_b)) == 3 / 4
109 | assert rl.reduction_ratio(candidate_pairs_link, [df_a, df_b]) == 3 / 4
110 |
--------------------------------------------------------------------------------
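
The fixture numbers follow directly from the 3x3 full index; a short check of the bookkeeping behind the expected confusion matrix and the derived metrics (nothing here beyond what the tests already assert):

    import pandas
    import recordlinkage as rl

    FULL_INDEX = pandas.MultiIndex.from_product([[1, 2, 3], [1, 2, 3]])
    LINKS_TRUE = pandas.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)])
    LINKS_PRED = pandas.MultiIndex.from_tuples([(1, 1), (2, 1), (3, 1), (1, 2)])

    # TP: predicted and true   -> {(1, 1)}                 -> 1
    # FN: true, not predicted  -> {(2, 2), (3, 3)}         -> 2
    # FP: predicted, not true  -> {(2, 1), (3, 1), (1, 2)} -> 3
    # TN: everything else      -> 9 - 1 - 2 - 3            -> 3
    cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
    # cm == [[TP, FN], [FP, TN]] == [[1, 2], [3, 3]]

    # recall    = TP / (TP + FN) = 1 / 3
    # precision = TP / (TP + FP) = 1 / 4
    # accuracy  = (TP + TN) / 9  = 4 / 9
    print(rl.recall(cm), rl.precision(cm), rl.accuracy(cm))

The same bookkeeping explains the reduction ratios at the end of the file: with 45 possible pairs when deduplicating ten records and 5 candidate pairs, the ratio is 1 - 5/45 = 8/9; with 100 possible pairs when linking two ten-record frames and 25 candidates, it is 1 - 25/100 = 3/4.
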
/tests/test_misc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | # testing utils from pandas
7 | import pandas.testing as pdt
8 | import pytest
9 |
10 | import recordlinkage as rl
11 | from recordlinkage import index_split
12 |
13 |
14 | def test_multiindex_split():
15 | index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)])
16 | result = index_split(index, 3)
17 |
18 | assert len(result) == 3
19 |
20 | for i, result_index_chunk in enumerate(result):
21 | expected_index_chunk = index[i * 10 : (i + 1) * 10]
22 | pdt.assert_index_equal(result_index_chunk, expected_index_chunk)
23 |
24 | assert len(result_index_chunk.levels) == 2
25 | assert len(result_index_chunk.codes) == 2
26 |
27 |
28 | def test_options():
29 | # global set
30 | rl.options.indexing.pairs = "multiindex"
31 | assert rl.get_option("indexing.pairs") == "multiindex"
32 |
33 |
34 | def test_options_context():
35 | with rl.option_context("indexing.pairs", "multiindex"):
36 | rl.options.indexing.pairs = "multiindex"
37 | assert rl.get_option("indexing.pairs") == "multiindex"
38 |
39 |
40 | def test_options_incorrect_values():
41 | # incorrect value
42 | with pytest.raises(ValueError):
43 | rl.options.indexing.pairs = "non_existing"
44 |
--------------------------------------------------------------------------------
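
The options tests exercise the pandas-style configuration interface. A minimal usage sketch, limited to the `indexing.pairs` option shown above:

    import recordlinkage as rl

    # read the current value of an option
    current = rl.get_option("indexing.pairs")

    # set it globally ...
    rl.options.indexing.pairs = "multiindex"

    # ... or only for the duration of a block
    with rl.option_context("indexing.pairs", "multiindex"):
        assert rl.get_option("indexing.pairs") == "multiindex"

    # assigning a value the option does not accept raises a ValueError
    # rl.options.indexing.pairs = "non_existing"
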
/tests/test_network.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 |
5 | import pandas as pd
6 |
7 | # testing utils from pandas
8 | import pandas.testing as pdt
9 | import pytest
10 |
11 | try:
12 | import networkx # noqa
13 | except ImportError:
14 | pass
15 |
16 | from recordlinkage import ConnectedComponents
17 | from recordlinkage import OneToManyLinking
18 | from recordlinkage import OneToOneLinking
19 |
20 |
21 | def test_one_to_many_linking():
22 | sample = pd.MultiIndex.from_tuples(
23 | [
24 | (1, 1),
25 | (2, 2),
26 | (3, 3),
27 | (3, 4),
28 | (3, 5),
29 | (4, 4),
30 | (5, 5),
31 | (6, 5),
32 | (7, 7),
33 | (7, 7),
34 | (7, 8),
35 | ]
36 | )
37 | one_to_many = OneToManyLinking()
38 | sample_one_to_many = one_to_many.compute(sample)
39 |
40 | expected = pd.MultiIndex.from_tuples(
41 | [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 7)]
42 | )
43 | pdt.assert_index_equal(sample_one_to_many, expected)
44 |
45 |
46 | def test_one_to_one_linking():
47 | sample = pd.MultiIndex.from_tuples(
48 | [
49 | (1, 1),
50 | (2, 2),
51 | (3, 3),
52 | (3, 4),
53 | (3, 5),
54 | (4, 4),
55 | (5, 5),
56 | (6, 5),
57 | (7, 7),
58 | (7, 6),
59 | (7, 8),
60 | ]
61 | )
62 |
63 | # test OneToOneLinking
64 | one_to_one = OneToOneLinking()
65 | sample_one_to_one = one_to_one.compute(sample)
66 |
67 | expected = pd.MultiIndex.from_tuples(
68 | [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (7, 7)]
69 | )
70 | pdt.assert_index_equal(sample_one_to_one, expected)
71 |
72 |
73 | @pytest.mark.skipif(
74 | "networkx" not in sys.modules, reason="Requires the Networkx library"
75 | )
76 | def test_connected_components():
77 | sample = pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 4), (5, 6), (5, 7), (8, 9)])
78 |
79 | # test ConnectedComponents
80 | connected = ConnectedComponents()
81 | sample_connected = connected.compute(sample)
82 |
83 | expected = [
84 | pd.MultiIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
85 | pd.MultiIndex.from_tuples([(5, 6), (5, 7)]),
86 | pd.MultiIndex.from_tuples([(8, 9)]),
87 | ]
88 |
89 | for i, _mi in enumerate(expected):
90 | pdt.assert_index_equal(sample_connected[i], expected[i])
91 |
--------------------------------------------------------------------------------
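
A compact sketch of the three post-processing classes tested above, applied to one set of candidate matches (`ConnectedComponents` needs the optional networkx dependency); the pairs below are illustrative:

    import pandas as pd

    from recordlinkage import ConnectedComponents, OneToManyLinking, OneToOneLinking

    # candidate matches with conflicts: left-hand record 3 is paired with 3, 4 and 5
    matches = pd.MultiIndex.from_tuples([(1, 1), (3, 3), (3, 4), (3, 5), (5, 5), (6, 5)])

    # keep one partner per left-hand record; right-hand records may be reused
    one_to_many = OneToManyLinking().compute(matches)

    # keep at most one partner per record on either side
    one_to_one = OneToOneLinking().compute(matches)

    # group all transitively connected records (requires networkx)
    components = ConnectedComponents().compute(matches)
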