├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── LICENSE ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst ├── readme.rst └── source │ ├── fuzzymatcher.rst │ └── modules.rst ├── examples.ipynb ├── fuzzymatcher ├── __init__.py ├── data_getter_abc.py ├── data_getter_cartesian.py ├── data_getter_sqlite.py ├── data_preprocessor_abc.py ├── data_preprocessor_default.py ├── matcher.py ├── record.py ├── scorer_abc.py ├── scorer_default.py ├── tokencomparison.py └── utils.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── create_fake_dataset.ipynb ├── data ├── _cities.csv ├── _first_names.csv ├── _surnames.csv ├── las_ons.csv ├── las_os.csv ├── left_1.csv ├── left_2.csv ├── left_3.csv ├── left_4.csv ├── left_5_nas.csv ├── left_token_escape.csv ├── right_1.csv ├── right_2.csv ├── right_3.csv ├── right_4.csv ├── right_5_nas.csv └── right_token_escape.csv ├── datagetter_performance.txt ├── generate_test_data.py ├── realexample_performance.txt ├── test_accuracy.py ├── test_colnames.py └── test_misc.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.5, 3.6, 3.7] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | # - name: Lint with flake8 25 | # run: | 26 | # pip install flake8 27 | # # stop the build if there are Python syntax errors or undefined names 28 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide 30 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with unittest 32 | run: | 33 | pip install coverage 34 | coverage run -m unittest discover 35 | coverage xml -o codecov_report.xml 36 | - uses: codecov/codecov-action@v1.0.2 37 | with: 38 | token: ${{secrets.CODECOV_TOKEN}} #required 39 | file: ./codecov_report.xml #optional 40 | flags: unittests #optional 41 | name: codecov-umbrella #optional 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Robin Linacre 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://badge.fury.io/py/fuzzymatcher.svg 2 | :target: https://badge.fury.io/py/fuzzymatcher 3 | 4 | .. image:: https://codecov.io/gh/RobinL/fuzzymatcher/branch/dev/graph/badge.svg 5 | :target: https://codecov.io/gh/RobinL/fuzzymatcher 6 | 7 | 8 | fuzzymatcher 9 | ====================================== 10 | 11 | **Note: fuzzymatcher is no longer actively maintained. Please see** `splink `_ **for a more accurate, scalable and performant solution** 12 | 13 | A Python package that allows the user to fuzzy match two pandas dataframes based on one or more common fields. 14 | 15 | Fuzzymatches uses ``sqlite3``'s Full Text Search to find potential matches. 16 | 17 | It then uses `probabilistic record linkage `_ to score matches. 18 | 19 | Finally it outputs a list of the matches it has found and associated score. 20 | 21 | 22 | Installation 23 | ------------ 24 | 25 | ``pip install fuzzymatcher`` 26 | 27 | Note that you will need a build of sqlite which includes FTS4. This seems to be widely included by default, but otherwise `see here `_. 28 | 29 | Usage 30 | ----- 31 | 32 | See `examples.ipynb `_ for examples of usage and the output. 33 | 34 | You can run these examples interactively `here `_. 
35 | 36 | Simple example 37 | -------------- 38 | 39 | Suppose you have a table called ``df_left`` which looks like this: 40 | 41 | ==== ============= 42 | id ons_name 43 | ==== ============= 44 | 0 Darlington 45 | 1 Monmouthshire 46 | 2 Havering 47 | 3 Knowsley 48 | 4 Charnwood 49 | ... etc. 50 | ==== ============= 51 | 52 | And you want to link it to a table ``df_right`` that looks like this: 53 | 54 | ==== ========================= 55 | id os_name 56 | ==== ========================= 57 | 0 Darlington (B) 58 | 1 Havering London Boro 59 | 2 Sir Fynwy - Monmouthshire 60 | 3 Knowsley District (B) 61 | 4 Charnwood District (B) 62 | ... etc. 63 | ==== ========================= 64 | 65 | You can write: 66 | 67 | .. code:: python 68 | 69 | import fuzzymatcher 70 | fuzzymatcher.fuzzy_left_join(df_left, df_right, left_on = "ons_name", right_on = "os_name") 71 | 72 | And you'll get: 73 | 74 | ================== ============= ========================= 75 | best_match_score ons_name os_name 76 | ================== ============= ========================= 77 | 0.178449 Darlington Darlington (B) 78 | 0.133371 Monmouthshire Sir Fynwy - Monmouthshire 79 | 0.102473 Havering Havering London Boro 80 | 0.155775 Knowsley Knowsley District (B) 81 | 0.155775 Charnwood Charnwood District (B) 82 | ... etc. etc. 83 | ================== ============= ========================= 84 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = fuzzymatcher 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # fuzzymatcher documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Nov 15 15:39:19 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('.')) 23 | sys.path.insert(0, os.path.abspath('../')) 24 | 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 
35 | extensions = ['sphinx.ext.autodoc', 36 | 'sphinx.ext.githubpages', 37 | 'sphinx.ext.napoleon'] 38 | 39 | napoleon_google_docstring = True 40 | napoleon_numpy_docstring = False 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix(es) of source filenames. 46 | # You can specify multiple suffix as a list of string: 47 | # 48 | # source_suffix = ['.rst', '.md'] 49 | source_suffix = '.rst' 50 | 51 | # The master toctree document. 52 | master_doc = 'index' 53 | 54 | # General information about the project. 55 | project = 'fuzzymatcher' 56 | copyright = '2017, Robin Linacre' 57 | author = 'Robin Linacre' 58 | 59 | # The version info for the project you're documenting, acts as replacement for 60 | # |version| and |release|, also used in various other places throughout the 61 | # built documents. 62 | # 63 | # The short X.Y version. 64 | version = '0.1' 65 | # The full version, including alpha/beta/rc tags. 66 | release = '0.1' 67 | 68 | # The language for content autogenerated by Sphinx. Refer to documentation 69 | # for a list of supported languages. 70 | # 71 | # This is also used if you do content translation via gettext catalogs. 72 | # Usually you set "language" from the command line for these cases. 73 | language = None 74 | 75 | # List of patterns, relative to source directory, that match files and 76 | # directories to ignore when looking for source files. 77 | # This patterns also effect to html_static_path and html_extra_path 78 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 79 | 80 | # The name of the Pygments (syntax highlighting) style to use. 81 | pygments_style = 'sphinx' 82 | 83 | # If true, `todo` and `todoList` produce output, else they produce nothing. 84 | todo_include_todos = False 85 | 86 | 87 | # -- Options for HTML output ---------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 90 | # a list of builtin themes. 91 | # 92 | html_theme = 'sphinx_rtd_theme' 93 | 94 | # Theme options are theme-specific and customize the look and feel of a theme 95 | # further. For a list of options available for each theme, see the 96 | # documentation. 97 | # 98 | # html_theme_options = {} 99 | 100 | # Add any paths that contain custom static files (such as style sheets) here, 101 | # relative to this directory. They are copied after the builtin static files, 102 | # so a file named "default.css" will overwrite the builtin "default.css". 103 | html_static_path = ['_static'] 104 | 105 | 106 | # -- Options for HTMLHelp output ------------------------------------------ 107 | 108 | # Output file base name for HTML help builder. 109 | htmlhelp_basename = 'fuzzymatcherdoc' 110 | 111 | 112 | # -- Options for LaTeX output --------------------------------------------- 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'fuzzymatcher.tex', 'fuzzymatcher Documentation', 137 | 'Robin Linacre', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output --------------------------------------- 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 
145 | man_pages = [ 146 | (master_doc, 'fuzzymatcher', 'fuzzymatcher Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'fuzzymatcher', 'fuzzymatcher Documentation', 158 | author, 'fuzzymatcher', 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. fuzzymatcher documentation master file, created by 2 | sphinx-quickstart on Wed Nov 15 15:39:19 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to fuzzymatcher's documentation! 7 | ======================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | readme 14 | 15 | Another title goes here 16 | ======================= 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst -------------------------------------------------------------------------------- /docs/source/fuzzymatcher.rst: -------------------------------------------------------------------------------- 1 | fuzzymatcher package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | fuzzymatcher\.data\_getter\_abc module 8 | -------------------------------------- 9 | 10 | .. 
automodule:: fuzzymatcher.data_getter_abc 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | fuzzymatcher\.data\_getter\_sqlite module 16 | ----------------------------------------- 17 | 18 | .. automodule:: fuzzymatcher.data_getter_sqlite 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | fuzzymatcher\.data\_preprocessor\_abc module 24 | -------------------------------------------- 25 | 26 | .. automodule:: fuzzymatcher.data_preprocessor_abc 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | fuzzymatcher\.data\_preprocessor\_default module 32 | ------------------------------------------------ 33 | 34 | .. automodule:: fuzzymatcher.data_preprocessor_default 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | fuzzymatcher\.matcher module 40 | ---------------------------- 41 | 42 | .. automodule:: fuzzymatcher.matcher 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | fuzzymatcher\.record module 48 | --------------------------- 49 | 50 | .. automodule:: fuzzymatcher.record 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | fuzzymatcher\.scorer\_abc module 56 | -------------------------------- 57 | 58 | .. automodule:: fuzzymatcher.scorer_abc 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | fuzzymatcher\.scorer\_default module 64 | ------------------------------------ 65 | 66 | .. automodule:: fuzzymatcher.scorer_default 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | 72 | Module contents 73 | --------------- 74 | 75 | .. automodule:: fuzzymatcher 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | fuzzymatcher 2 | ============ 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | expected_usage 8 | fuzzymatcher 9 | setup 10 | tests 11 | try 12 | -------------------------------------------------------------------------------- /examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `fuzzymatcher` examples" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Basic usage - `link_table`\n", 15 | "\n", 16 | "In the most basic usage, the user provides `fuzzymatcher` with two pandas dataframes, indicating which columns to join on.\n", 17 | "\n", 18 | "The central output of `fuzzymatcher` is the `link_table`.\n", 19 | "\n", 20 | "For each record in the left table, the link table includes one or more possible matching records from the right table.\n", 21 | "\n", 22 | "The user can then inspect the link table and decide which matches to retain, e.g. by choosing a score threshold ( `match_score > chosen_threshold` ) or just choosing the best match ( `match_rank == 1` )" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import logging\n", 32 | "logging.basicConfig(level=logging.DEBUG)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import fuzzymatcher\n", 42 | "import pandas as pd\n", 43 | "\n", 44 | "df_left = pd.read_csv(\"tests/data/left_1.csv\")\n", 45 | "df_left" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df_right = pd.read_csv(\"tests/data/right_1.csv\")\n", 55 | "df_right" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# Columns to match on from 
df_left\n", 65 | "left_on = [\"fname\", \"mname\", \"lname\", \"dob\"]\n", 66 | "\n", 67 | "# Columns to match on from df_right\n", 68 | "right_on = [\"name\", \"middlename\", \"surname\", \"date\"]\n", 69 | "\n", 70 | "# Note that if left_id_col or right_id_col are admitted a unique id will be autogenerated\n", 71 | "fuzzymatcher.link_table(df_left, df_right, left_on, right_on, left_id_col = \"id\", right_id_col = \"id\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Basic usage - `fuzzy_left_join`\n", 79 | "\n", 80 | "A second option is to use `fuzzy_left_join`, which automatically links the two dataframes based on the highest-scoring match." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import fuzzymatcher\n", 90 | "import pandas as pd\n", 91 | "\n", 92 | "df_left = pd.read_csv(\"tests/data/left_1.csv\")\n", 93 | "df_right = pd.read_csv(\"tests/data/right_1.csv\")\n", 94 | "left_on = [\"fname\", \"lname\", \"dob\"]\n", 95 | "right_on = [\"name\", \"surname\", \"date\"]\n", 96 | "\n", 97 | "fuzzymatcher.fuzzy_left_join(df_left, df_right, left_on, right_on)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Basic example with real data\n", 105 | "### Matching the names of Local Authorities provided by Office for National Statistics with the names provided by Ordnance Survey\n", 106 | "\n", 107 | "We would usually join this data on the Local Authority District (LAD) Codes (e.g. E06000001 = Hartlepool), but sometimes these are unavailable. In this example, we fuzzy match on the name, but provide the LAD code to demonstate it has worked." 
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "ons = pd.read_csv(\"tests/data/las_ons.csv\")\n", 117 | "os = pd.read_csv(\"tests/data/las_os.csv\")\n", 118 | "\n", 119 | "df_joined = fuzzymatcher.fuzzy_left_join(ons, os, left_on = \"lad16nm\", right_on = \"name\")\n", 120 | "rename = {\"lad16cd\": \"ons_code\", \"code\": \"os_code\", \"lad16nm\": \"ons_name\", \"name\": \"os_name\"}\n", 121 | "df_joined = df_joined.rename(columns=rename)\n", 122 | "col_order = [\"best_match_score\", \"ons_name\", \"os_name\", \"ons_code\", \"os_code\"]\n", 123 | "df_joined[col_order].sample(5)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "We can get a sense of match quality by measuring how often the fuzzy matcher got it right:" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "num_records = len(df_joined)\n", 140 | "correct_binary = (df_joined[\"ons_code\"] == df_joined[\"os_code\"])\n", 141 | "perc_correct = correct_binary.sum()/num_records\n", 142 | "\n", 143 | "\"The percentage of codes which were correctly matched was {:,.1f}%\".format(perc_correct*100)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "# Advanced usage - configuring the matcher\n", 151 | "\n", 152 | "`fuzzymatcher` uses a number of components, each one of which can be re-written or adapted by the user:\n", 153 | "\n", 154 | "* **`data_preprocessor`**: Responsible for normalising strings, removing punctuation etc.\n", 155 | "* **`datagetter`**: Responsible for finding a list of possible matches for each df_left record in df_right\n", 156 | "* **`scorer`**: Responsible for computing a match score, given a record from df_left and df_right respectively\n", 157 | "\n", 158 | "The main `link_table` and 
`fuzzy_left_join` convenience functions use these components under the hood. See [here](https://github.com/RobinL/fuzzymatcher/blob/master/fuzzymatcher/__init__.py) for how this work.\n", 159 | "\n", 160 | "This section provides a few examples of how an advanced user can compose these components to create a custom matcher" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Example 1: Replacing the default sqlite datagetter with the cartesian datagetter" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from fuzzymatcher.data_getter_cartesian import DataGetterCartesian\n", 177 | "from fuzzymatcher.matcher import Matcher\n", 178 | "\n", 179 | "dg = DataGetterCartesian()\n", 180 | "\n", 181 | "m = Matcher(data_getter = dg)\n", 182 | "\n", 183 | "df_left = pd.read_csv(\"tests/data/left_3.csv\")\n", 184 | "df_right = pd.read_csv(\"tests/data/right_3.csv\")\n", 185 | "\n", 186 | "on = [\"first_name\", \"surname\", \"dob\", \"city\"]\n", 187 | "\n", 188 | "m.add_data(df_left, df_right, on, on)\n", 189 | "\n", 190 | "m.match_all()\n", 191 | "lt = m.get_formatted_link_table()\n", 192 | "print(\"Length of Cartesian join table: {:,.0f}\".format(len(lt))) # Note, because df_left and df_right are 100 records each, this table is 10,000 records long\n", 193 | "lt.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "The Cartesian matcher considers more potential matches, but its performance is considerably worse" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "# Performance\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "df_left = pd.read_csv(\"tests/data/left_4.csv\")\n", 217 | "# df_left = df_left[:1000]\n", 
218 | "df_right = pd.read_csv(\"tests/data/right_4.csv\")\n", 219 | "# df_right = df_right[:1000]\n", 220 | "on = [\"first_name\", \"surname\", \"dob\", \"city\"]\n", 221 | "\n", 222 | "lt = fuzzymatcher.link_table(df_left, df_right, on, on)\n", 223 | "lt.head(5)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def link_table_percentage_correct(link_table):\n", 233 | " \"\"\"\n", 234 | " In this test dataset, we know what the link should be\n", 235 | " Therefore we can compute a measure of performance\n", 236 | " \"\"\"\n", 237 | " lt = link_table.copy()\n", 238 | " lt = lt[lt[\"match_rank\"] == 1]\n", 239 | " lt[\"__id_left\"] = lt[\"__id_left\"].str.replace(\"_left\", \"\")\n", 240 | " lt[\"__id_right\"] = lt[\"__id_right\"].str.replace(\"_right\", \"\")\n", 241 | " lt[\"link_correct\"] = (lt[\"__id_left\"] == lt[\"__id_right\"])\n", 242 | "\n", 243 | " return lt[\"link_correct\"].sum()/len(lt)\n", 244 | "\n", 245 | "\"Percent matches correct: {:,.1f}%\".format(link_table_percentage_correct(lt)*100)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "### Note that in this particular case we can improve the match rate by including initials and allowing inversion of first name and surname \n", 253 | "\n", 254 | "(Within a field, the matcher pays no attention to token order)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "df_left[\"full_name\"] = df_left[\"first_name\"] + \" \" + df_left[\"surname\"]\n", 264 | "df_right[\"full_name\"] = df_right[\"first_name\"] + \" \" + df_right[\"surname\"]\n", 265 | "df_left[\"initials\"] = df_left[\"first_name\"].str[0] + df_left[\"surname\"].str[0]\n", 266 | "df_right[\"initials\"] = df_right[\"first_name\"].str[0] + df_right[\"surname\"].str[0]\n", 267 | "\n", 268 | "on = 
[\"full_name\", \"initials\", \"dob\", \"city\"]\n", 269 | "\n", 270 | "lt = fuzzymatcher.link_table(df_left, df_right, on, on)\n", 271 | "lt.head(5)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "\"Percent matches correct: {:,.1f}%\".format(link_table_percentage_correct(lt)*100)" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.6.1" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 2 305 | } 306 | -------------------------------------------------------------------------------- /fuzzymatcher/__init__.py: -------------------------------------------------------------------------------- 1 | from fuzzymatcher.data_preprocessor_default import DataPreprocessor 2 | from fuzzymatcher.data_getter_sqlite import DataGetter 3 | from fuzzymatcher.scorer_default import Scorer 4 | 5 | from fuzzymatcher.matcher import Matcher 6 | 7 | import pandas as pd 8 | import importlib 9 | 10 | 11 | def link_table(df_left, 12 | df_right, 13 | left_on, 14 | right_on, 15 | left_id_col = None, 16 | right_id_col = None): 17 | 18 | dp = DataPreprocessor() 19 | dg = DataGetter() 20 | s = Scorer() 21 | 22 | m = Matcher(dp, dg, s) 23 | m.add_data(df_left, df_right, left_on, right_on, left_id_col, right_id_col) 24 | m.match_all() 25 | 26 | return m.get_formatted_link_table() 27 | 28 | def fuzzy_left_join(df_left, 29 | df_right, 30 | left_on, 31 | right_on, 32 | left_id_col = None, 33 | right_id_col = None): 34 | 35 | dp = DataPreprocessor() 36 | dg = DataGetter() 37 | s = Scorer() 38 | 39 | m 
= Matcher(dp, dg, s) 40 | m.add_data(df_left, df_right, left_on, right_on, left_id_col, right_id_col) 41 | m.match_all() 42 | 43 | return m.get_left_join_table() -------------------------------------------------------------------------------- /fuzzymatcher/data_getter_abc.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | class DataGetterABC: 4 | 5 | __metaclass__ = abc.ABCMeta 6 | 7 | """ 8 | A DataGetter handles the retrieval of data from df_right (the dataframe in which to search for matches) 9 | It retrieves a list of potential match ids 10 | """ 11 | 12 | @abc.abstractmethod 13 | def add_data(self, df_search_within): 14 | 15 | """Adds the data in 'df_search_within'. 16 | 17 | Args: 18 | df_search_within: The search space i.e. the whole dataset we search within 19 | to find potential matches 20 | 21 | Returns: 22 | None 23 | """ 24 | 25 | 26 | 27 | @abc.abstractmethod 28 | def get_potential_match_ids_from_record(self, rec_find_match_for): 29 | 30 | """Retrieves lists of potential matches to a record 31 | 32 | Args: 33 | rec_find_match_for: The record for which we're trying to find a match 34 | 35 | Returns: 36 | A list of rec_potential_match records which represent the potential matches 37 | to the rec_find_match_for 38 | 39 | """ 40 | -------------------------------------------------------------------------------- /fuzzymatcher/data_getter_cartesian.py: -------------------------------------------------------------------------------- 1 | import random 2 | import sqlite3 3 | 4 | from fuzzymatcher.record import Record 5 | from fuzzymatcher.data_getter_abc import DataGetterABC 6 | 7 | class DataGetterCartesian(DataGetterABC): 8 | 9 | """ 10 | The DataGetter class handles the retrieval of record_ids from 'df_right' 11 | 12 | This Cartesian datagetter is the simplest, most thorough, but least efficient implementation 13 | where every record_id in 'df_right' is returned, compared and scored against 'df_left', 
class DataGetterCartesian(DataGetterABC):
    """Exhaustive DataGetter over 'df_right'.

    Every right record is compared and scored against each left record:
    simple and thorough, but O(n*m), so only suited to small datasets.
    """

    def add_data(self, matcher):
        """Register the matcher so this datagetter can reach its records and scorer.

        Args:
            matcher: The matcher object

        Returns:
            None
        """
        self.matcher = matcher

    def get_potential_match_ids_from_record(self, rec_left):
        """Score every right record as a potential match for rec_left.

        Args:
            rec_left: The record for which we're trying to find a match

        Returns:
            None; scored candidates are written into rec_left.potential_matches
        """
        score = self.matcher.scorer.score_match
        for right_id in self.matcher.right_records:
            rec_left.potential_matches[right_id] = score(rec_left.record_id, right_id)
    def add_data(self, matcher):

        """Adds the data in 'matcher.df_search_within' to a sqlite database
        and create a connection to the database to be used by the data getter
        Also registers the match object on the datagetter.

        Args:
            matcher. The matcher object

        Returns:
            None
        """

        self.matcher = matcher

        # Turn right_records into strings and add to db
        rows = []
        for key, record in matcher.right_records.items():
            row = {}
            row["id"] = record.record_id
            row["_concat_all"] = record.clean_string  # all cleaned tokens joined with spaces
            row["_concat_all_alternatives"] = record.get_concat_string(record.token_misspelling_dict)  # dmetaphone variants
            rows.append(row)

        df = pd.DataFrame(rows)
        df = df[["id", "_concat_all", "_concat_all_alternatives"]]

        # In-memory database: the full-text index lives only as long as this getter
        con = sqlite3.connect(':memory:', timeout=0.3)

        df.to_sql("df_right_processed", con, index=False)
        # FTS4 virtual table over the concatenated token strings. The first
        # column is named after the right id column so each match row returns
        # the right record's id as its first field.
        sql = """
        CREATE VIRTUAL TABLE fts_target
        USING fts4({} TEXT, _concat_all TEXT, _concat_all_alternatives TEXT);
        """.format(matcher.right_id_col)
        con.execute(sql)
        con.execute("INSERT INTO fts_target SELECT * FROM df_right_processed")

        self.con = con

        # TODO: Compute the min, max, average number of tokens in a record to help optimise the search

    def get_potential_match_ids_from_record(self, rec_left):

        """Retrieves lists of potential matches to a record

        Args:
            rec_left: The record for which we're trying to find a match

        Returns:
            A list of rec_potential_match records which represent the potential matches
            to the rec_left
        """

        # Tokens ordered rarest-first: rare tokens are the most discriminating
        tkn_po = self._tokens_in_df_right_prob_order(rec_left)

        # No point in running FTS using a token we know isn't in df_right

        tkn_ms_po = self._tokens_in_df_right_prob_order(rec_left, misspelling=True)

        # Start searching with all the terms, then drop them one at a time,
        # starting with the most unusual term.
        # NOTE(review): the misspelling token list (tkn_ms_po) is searched via
        # _tokens_to_matches with its default misspelling=False, i.e. against
        # the _concat_all column rather than _concat_all_alternatives —
        # confirm whether the flag should be threaded through here.
        token_lists = [tkn_po, tkn_ms_po]

        for token_list in token_lists:
            self._search_specific_to_general_single(token_list, rec_left)
            if not self._found_enough_matches(rec_left):
                self._search_specific_to_general_band(token_list, rec_left)
            if self._found_enough_matches(rec_left):
                break

        # If we cannot find a match, search random combinations
        if not self._found_good_match(rec_left):
            matches = self._search_random(tkn_po)
            self._add_matches_to_potential_matches(matches, rec_left)

    @staticmethod
    def _get_random_tokens(tokens):
        # Pick a uniformly-sized random non-empty subset, returned as a tuple
        # (tuples are hashable, so callers can dedupe attempts in a set)
        num_tokens = len(tokens)
        if num_tokens == 0:
            return ()
        n = random.randint(1, num_tokens)
        random_tokens = random.sample(tokens, n)
        return tuple(random_tokens)

    def _search_specific_to_general_single(self, token_list, rec_left):
        # Repeatedly drop the rarest remaining token: searches run from most
        # specific (all tokens) to most general (only the most common token)
        for i in range(len(token_list)):
            sub_tokens = token_list[i:]
            new_matches = self._tokens_to_matches(tuple(sub_tokens))

            self._add_matches_to_potential_matches(new_matches, rec_left)
            if self._found_enough_matches(rec_left):
                return
    def _search_specific_to_general_band(self, tokens, rec_left):
        """
        Search in blocks e.g. if tokens a b c d go [abcd] [abc] [bcd] [ab] [bc] [cd] [a] [b] [c] [d]
        """
        num_tokens = len(tokens)
        # Band size shrinks from all tokens down to single tokens; within a
        # band size, slide the window across every start position
        for band_size in range(num_tokens, 0,-1):
            take = num_tokens - band_size + 1
            for start_pos in range(0, take):
                end_pos = start_pos + band_size
                search_tokens = tokens[start_pos:end_pos]
                new_matches = self._tokens_to_matches(tuple(search_tokens))
                self._add_matches_to_potential_matches(new_matches, rec_left)
                if self._found_good_match(rec_left):
                    return
                if len(rec_left.potential_matches) > self.found_num_records_threshold:
                    return

    def _found_good_match(self, rec_left):
        # A 'good' match is one scoring above the configured threshold
        return rec_left.best_match_score > self.found_score_threshold

    def _found_enough_matches(self, rec_left):
        # Stop searching once we either have a good match or have already
        # accumulated plenty of candidate records to score
        if rec_left.best_match_score > self.found_score_threshold:
            return True
        if len(rec_left.potential_matches) > self.found_num_records_threshold:
            return True
        return False

    def _search_random(self, token_list):
        # Last resort: try up to search_intensity random token subsets,
        # stopping at the first subset that returns any FTS hits
        matches = []
        prev_random_tokens = set()
        for i in range(self.search_intensity):
            random_tokens = self._get_random_tokens(token_list)
            if random_tokens not in prev_random_tokens:
                prev_random_tokens.add(random_tokens)
                matches = self._tokens_to_matches(random_tokens)
                if len(matches) > 0:
                    break
        return matches

    def _add_matches_to_potential_matches(self, matches, rec_left):
        # Score each newly-found right id exactly once, tracking the best
        # score seen so the early-exit checks above can use it
        for match in matches:
            right_id = match[0]
            if right_id not in rec_left.potential_matches:
                scored_potential_match = self.matcher.scorer.score_match(rec_left.record_id, right_id)
                rec_left.potential_matches[right_id] = scored_potential_match
                if rec_left.best_match_score < scored_potential_match["match_score"]:
                    rec_left.best_match_score = scored_potential_match["match_score"]
175 | get_records_sql = """ 176 | SELECT * FROM fts_target WHERE {} MATCH '{}' limit {}; 177 | """ 178 | 179 | # This fails if the special tokens 'and' or 'or' are in fts string! See issue 35! 180 | tokens_to_escape = ["AND", "OR", "NEAR", "NOT"] 181 | 182 | def escape_token(t): 183 | # return t 184 | if t in tokens_to_escape: 185 | return '"' + t + '"' 186 | else: 187 | return t 188 | 189 | 190 | tokens = [escape_token(t) for t in tokens] 191 | 192 | fts_string = " ".join(tokens) 193 | 194 | 195 | if misspelling: 196 | table_name = "_concat_all_alternatives" 197 | else: 198 | table_name = "_concat_all" 199 | 200 | sql = get_records_sql.format(table_name, fts_string, self.return_records_limit) 201 | 202 | 203 | cur = self.con.cursor() 204 | cur.execute(sql) 205 | results = cur.fetchall() 206 | 207 | return results 208 | 209 | 210 | def _tokens_in_df_right_prob_order(self, rec_to_find_match_for, misspelling = False): 211 | # Problem here is that field names are different in left and right 212 | fields = rec_to_find_match_for.fields 213 | if misspelling: 214 | token_dict = rec_to_find_match_for.token_misspelling_dict 215 | else: 216 | token_dict = rec_to_find_match_for.clean_token_dict 217 | get_prob = self.matcher.scorer.get_prob 218 | 219 | tokens_list = [] 220 | for field, tokens in token_dict.items(): 221 | for t in tokens: 222 | translated_field = self.matcher.left_to_right_lookup[field] 223 | prob = get_prob(t,translated_field,"right",misspelling) 224 | tokens_list.append({"token": t, "prob": prob}) 225 | 226 | tokens_list = [t for t in tokens_list if t["prob"] is not None] 227 | tokens_list.sort(key=lambda x: x["prob"]) 228 | tokens_list = [t["token"] for t in tokens_list] 229 | return tokens_list 230 | -------------------------------------------------------------------------------- /fuzzymatcher/data_preprocessor_abc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import abc 4 | 5 | 
class DataPreprocessorABC:

    """
    A DataPreprocessor is responsible for ingesting df_left (the dataframe containing the records we
    want to find matches for) and df_right (the dataframe we want to search for potential matches)
    and applying preprocessing stages like normalisation to make matching easier.
    """

    __metaclass__ = abc.ABCMeta
    # NOTE(review): `__metaclass__` is Python 2 syntax; it has no effect on
    # Python 3, so these abstract methods are not actually enforced. The
    # default DataPreprocessor does not implement add_data, so switching to
    # abc.ABC would make it uninstantiable — confirm intent before changing.

    @abc.abstractmethod
    def add_data(self,
                 df_left,
                 df_right,
                 left_on,
                 right_on,
                 left_word_cols=None,
                 right_word_cols=None,
                 left_id_col=None,
                 right_id_col=None):

        """Adds data and parameters the DataPreprocessor needs to run

        This is similar to an __init__ method, except it is run after the object is instantiated.

        Returns:
            None
        """

    @abc.abstractmethod
    def preprocess(self):
        """Main method that runs the data preprocessing

        Creates two new attributes on the data preprocessor object:

        data_search_within:
            This is a list of dictionaries like this: {"id": record_id, "data:" normalised string}

        data_find_match_for:
            This is a list of dictionaries like this: {"id": record_id, "data:" normalised string}

        Returns:
            None
        """
left_cols = self.matcher.left_on 24 | right_cols = self.matcher.right_on 25 | 26 | # Name collisions mean that we want to rename the id columns 27 | if not self.matcher.left_id_col: 28 | self.add_id(self.matcher.df_left, "left") 29 | self.matcher.left_id_col = "__id_left" 30 | else: 31 | self.matcher.df_left["__id_left"] = self.matcher.df_left[self.matcher.left_id_col] 32 | 33 | if not self.matcher.right_id_col: 34 | self.add_id(self.matcher.df_right, "right") 35 | self.matcher.right_id_col = "__id_right" 36 | else: 37 | self.matcher.df_right["__id_right"] = self.matcher.df_right[self.matcher.right_id_col] 38 | 39 | 40 | @staticmethod 41 | def add_id(df, prefix): 42 | id_colname = "__id_" + prefix 43 | data = range(0, len(df)) 44 | data = ["{}_{}".format(i, prefix) for i in data] 45 | df.insert(0, id_colname, data) 46 | -------------------------------------------------------------------------------- /fuzzymatcher/matcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import pandas as pd 4 | from datetime import datetime 5 | from dateutil.relativedelta import relativedelta 6 | 7 | from fuzzymatcher.record import RecordToMatch, Record 8 | from fuzzymatcher.tokencomparison import TokenComparison 9 | from fuzzymatcher.data_preprocessor_default import DataPreprocessor 10 | from fuzzymatcher.data_getter_sqlite import DataGetter 11 | from fuzzymatcher.scorer_default import Scorer 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | class Matcher: 16 | """The Matcher coordinates data matching""" 17 | 18 | def __init__(self, 19 | data_preprocessor = DataPreprocessor(), 20 | data_getter = DataGetter(), 21 | scorer = Scorer(), 22 | token_comparison = TokenComparison(), 23 | top_n_matches = 5): 24 | self.token_comparison = token_comparison 25 | self.data_preprocessor = data_preprocessor 26 | self.data_getter = data_getter 27 | self.scorer = scorer 28 | self.top_n_matches = top_n_matches 
29 | 30 | def add_data(self, df_left, 31 | df_right, 32 | left_on, 33 | right_on, 34 | left_id_col = None, 35 | right_id_col = None): 36 | 37 | # Copy to prevent modifying the dataframes the user provides 38 | self.df_left = df_left.copy() 39 | self.df_right = df_right.copy() 40 | 41 | if type(left_on) == str: 42 | left_on = [left_on] 43 | 44 | if type(right_on) == str: 45 | right_on = [right_on] 46 | 47 | self.left_on = left_on 48 | self.right_on = right_on 49 | self.left_id_col = left_id_col 50 | self.right_id_col = right_id_col 51 | 52 | self.left_to_right_lookup = {l:r for (l,r) in zip(left_on, right_on)} 53 | 54 | self.data_preprocessor.register_matcher(self) 55 | 56 | def initiate_records(self): 57 | self.left_records = {} 58 | cols = self.left_on.copy() 59 | cols.append("__id_left") 60 | df = self.df_left[cols] 61 | for r in df.iterrows(): 62 | row = r[1] 63 | fields_dict = dict(row[self.left_on]) 64 | this_id = row["__id_left"] 65 | rec = RecordToMatch(fields_dict, this_id, self) 66 | self.left_records[this_id] = rec 67 | 68 | self.right_records = {} 69 | cols = self.right_on.copy() 70 | cols.append("__id_right") 71 | df = self.df_right[cols] 72 | for r in df.iterrows(): 73 | row = r[1] 74 | fields_dict = dict(row[self.right_on]) 75 | this_id = row["__id_right"] 76 | rec = Record(fields_dict, this_id, self) 77 | self.right_records[this_id] = rec 78 | 79 | def match_all(self): 80 | 81 | # Get a dataset with id, record only for left and right 82 | self.data_preprocessor.preprocess() 83 | 84 | self.initiate_records() 85 | 86 | # Scorer first because some data getters may need to score records on add_data 87 | self.scorer.add_data(self) 88 | 89 | self.data_getter.add_data(self) 90 | 91 | # Get a table that contains only the matches, scores and ids 92 | self.link_table = self._match_processed_data() 93 | 94 | def get_formatted_link_table(self): 95 | return self._add_original_cols_to_link_table(self.link_table) 96 | 97 | def get_left_join_table(self): 98 | df = 
    def _match_processed_data(self):
        """Find and score matches for every left record and build the link table.

        Returns:
            pandas DataFrame with one row per (left record, candidate match),
            including a null-match row for left records with no candidates.
        """

        # This will store all the records for the link table

        link_table_list = []

        num_left_records = len(self.left_records.keys())
        num_right_records = len(self.right_records.keys())
        log.debug("Matching {} left records against {} right records".format(num_left_records, num_right_records))
        start_time = datetime.now()

        counter = 0
        total = len(self.left_records.items())
        str_template = "Processed {:,.0f} records, {:.0f}% done in {} minutes and {} seconds"

        for key, this_record in self.left_records.items():

            # Emit a progress line every 1000 records (skipping the first)
            if (counter) % 1000 == 0 and counter != 0:
                diff = relativedelta(datetime.now(), start_time)
                log.debug(str_template.format(counter, (counter/total)*100, diff.minutes, diff.seconds))

            this_record.find_and_score_potential_matches()
            link_table_list.extend(this_record.get_link_table_rows())

            counter += 1

        # Final summary after the loop completes
        diff = relativedelta(datetime.now(), start_time)
        log.debug(str_template.format(counter, (counter/total)*100, diff.minutes, diff.seconds))

        return pd.DataFrame(link_table_list)
= "__id_left", right_on = "__id_left", how = "left", suffixes=('_link', '_left')) 149 | 150 | df = df.merge(self.df_right, left_on = "__id_right", right_on = "__id_right", how="left", suffixes=('_left', "_right")) 151 | 152 | match_cols_left = self.left_on[::-1].copy() 153 | match_cols_right = self.right_on[::-1].copy() 154 | col_order = ["__id_left", "__id_right", "__score", "__rank"] 155 | while len(match_cols_left) > 0 and len(match_cols_right) > 0: 156 | 157 | # Check whether suffixes have been added 158 | left_col = match_cols_left.pop() 159 | left_col = self._add_suffix_if_needed(left_col, df, "left") 160 | col_order.append(left_col) 161 | 162 | right_col = match_cols_right.pop() 163 | right_col = self._add_suffix_if_needed(right_col, df, "right") 164 | col_order.append(right_col) 165 | 166 | col_order.extend(match_cols_left) 167 | col_order.extend(match_cols_right) 168 | 169 | df = df[col_order] 170 | 171 | # Finally rename the id columns back to their original and improve names of score and rank 172 | rename_dict = {} 173 | if "match_rank" not in df.columns: 174 | rename_dict["__rank"] = "match_rank" 175 | 176 | if "match_score" not in df.columns: 177 | rename_dict["__score"] = "match_score" 178 | df = df.rename(columns = rename_dict) 179 | return df 180 | 181 | def _add_suffix_if_needed(self, col_name, df, left_or_right): 182 | 183 | all_cols = df.columns 184 | if left_or_right == "left": 185 | left_cols = self.df_left.columns 186 | 187 | if col_name in left_cols and col_name not in all_cols: 188 | return col_name + "_left" 189 | else: 190 | return col_name 191 | 192 | if left_or_right == "right": 193 | right_cols = self.df_right.columns 194 | if col_name in right_cols and col_name not in all_cols: 195 | return col_name + "_right" 196 | else: 197 | return col_name 198 | 199 | -------------------------------------------------------------------------------- /fuzzymatcher/record.py: 
class Record:
    """
    The 'record' objects represents a row of a dataset.
    A row is represented as a dictionary called 'field dict', whose keys are the column (field) names
    and whose values are the column values.

    The record object has methods to clean (homogenise) and tokenise these column values.
    The record object also has a dictionary similar to field dict that contains token misspellings
    """

    def __init__(self, field_dict, record_id, matcher):
        self.orig_field_dict = field_dict
        self.record_id = record_id
        self.matcher = matcher

        self.fields = list(field_dict.keys())
        self.clean_token_dict = Record.get_tokenised_field_dict(field_dict)
        self.clean_string = Record.get_concat_string(self.clean_token_dict)

        self.token_misspelling_dict = self.get_tokenised_misspelling_dict()

    def __repr__(self):
        return self.clean_string

    def get_tokenised_misspelling_dict(self):
        """Return {field: [dmetaphone tokens]} for every cleaned token of this record."""
        get_misspellings = self.matcher.token_comparison.get_misspellings

        misspellings_dict = {}
        for field, tokens in self.clean_token_dict.items():
            misspelling_tokens = []
            for t in tokens:
                misspelling_tokens.extend(get_misspellings(t))
            misspellings_dict[field] = misspelling_tokens
        return misspellings_dict

    @staticmethod
    def field_to_string(value):
        """Coerce any field value (numbers, NaN, dates...) to its str form."""
        return str(value)

    @staticmethod
    def get_tokenised_field_dict(field_dict):
        """Normalise each field value and split it into uppercase tokens.

        Upper-cases, turns apostrophes and other punctuation into spaces
        (so "O'Neil" -> ["O", "NEIL"]), collapses runs of whitespace, then
        splits on single spaces.
        """
        cleaned_token_dict = {}
        for key, value in field_dict.items():
            value = Record.field_to_string(value)
            value = value.upper()

            value = value.replace("'", " ")
            # Fix: raw strings — '\w'/'\s' in non-raw literals raise
            # invalid-escape-sequence warnings on modern Python.
            value = re.sub(r'[^\w\s]', ' ', value)
            value = re.sub(r'\s{2,100}', ' ', value)
            value = value.strip()

            cleaned_token_dict[key] = value.split(" ")
        return cleaned_token_dict

    @staticmethod
    def get_concat_string(token_dict):
        """Join every field's tokens into one space-separated string."""
        tokens = []
        for key, value in token_dict.items():
            tokens.extend(value)
        return " ".join(tokens)


class RecordToMatch(Record):
    """A left-hand record for which we want to find right-hand matches."""

    def __init__(self, *args, **kwargs):
        Record.__init__(self, *args, **kwargs)
        self.potential_matches = {}  # Keyed by right record id
        self.best_match_score = -float("inf")

    def find_and_score_potential_matches(self):
        # The data getter writes scored candidates into self.potential_matches
        self.matcher.data_getter.get_potential_match_ids_from_record(self)

    def get_link_table_rows(self):
        """Return link-table row dicts for this record, ranked by score.

        If no potential match was found, a single row with null right id and
        score is still emitted so the record appears in the link table.
        """
        rows = []

        for k, v in self.potential_matches.items():
            rows.append({
                "__id_left": self.record_id,
                "__id_right": v["record_right"].record_id,
                "__score": v["match_score"],
            })

        if not rows:  # No potential match: still want a row in the link table
            rows.append({"__id_left": self.record_id, "__id_right": None, "__score": None})

        # None scores only occur in the single-row no-match case, so this
        # sort never compares None against a number
        rows.sort(key=lambda r: r['__score'], reverse=True)

        for i, r in enumerate(rows):
            r["__rank"] = i + 1

        return rows
class ScorerABC:

    """
    A Scorer takes a record from df_left and a candidate record from df_right
    and produces a match probability / match score for the pair.
    """
    # Fix: the previous docstring was a copy-paste of the DataPreprocessor
    # description and did not describe a scorer at all.

    __metaclass__ = abc.ABCMeta
    # NOTE(review): `__metaclass__` is Python 2 syntax and has no effect on
    # Python 3, so abstractness is not enforced here.

    @abc.abstractmethod
    def add_data(self, matcher):
        pass

    # Fix: abstract instance methods previously omitted `self`
    @abc.abstractmethod
    def get_freq(self, token):
        pass

    @abc.abstractmethod
    def score_match(self, record_to_find_match, record_potential_match):
        pass
    def add_data(self, matcher):
        # Register the matcher, then precompute relative token frequencies
        # for both sides (normal tokens and dmetaphone 'misspelling' tokens)
        self.matcher = matcher
        self._generate_probs()

    def get_prob(self, token, field, left_right, misspelling=False):
        """
        Get probability given field and token

        Args:
            token: The (cleaned, uppercase) token to look up.
            field: Field name on the requested side.
            left_right: "left" or "right" — which dataset's frequencies to use.
            misspelling: If True, look up in the dmetaphone token tables.

        Returns:
            The token's relative frequency within that field, or None if the
            token (or field) never occurs there.
        """

        try:
            if not misspelling and left_right == "left":
                return self.left_field_token_probs_dict[field][token]

            if not misspelling and left_right == "right":
                return self.right_field_token_probs_dict[field][token]

            if misspelling and left_right == "left":
                return self.left_field_misspelling_probs_dict[field][token]

            if misspelling and left_right == "right":
                return self.right_field_misspelling_probs_dict[field][token]
        except KeyError:
            # Token (or field) absent from the frequency tables
            return None

    @lru_cache(maxsize=int(1e6))
    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the Scorer (and its matcher) alive for the cache's lifetime (ruff B019).
    def score_match(self, record_left_id, record_right_id):
        """Score a candidate (left, right) record pair.

        Multiplies the per-field probabilities together, then converts the
        combined probability into a match score.

        Returns:
            dict with keys "match_prob", "match_score" and "record_right".
        """

        record_left = self.matcher.left_records[record_left_id]
        record_right = self.matcher.right_records[record_right_id]
        # Need to find common tokens, and get their probabilities
        fields_left = record_left.fields

        prob = 1
        for f_left in fields_left:
            p = self._field_to_prob(f_left, record_left, record_right)
            prob = p * prob

        match_score = self.prob_to_score(prob)
        return {"match_prob" : prob, "match_score": match_score, "record_right": record_right}
    def _get_prob_matching(self, tokens, f_right, misspelling=False):
        """Multiply the right-side probabilities of the tokens both records share.

        Rarer shared tokens give a smaller product, i.e. stronger evidence of
        a genuine match once converted to a score.
        """
        prob = 1
        for t in tokens:
            p = self.get_prob(t,f_right,"right", misspelling)
            prob = p * prob
        return prob

    def _get_prob_unmatching(self, unmatching_tokens, record_tokens, field_right, field_left):
        """Return a penalty factor (1/prob) for tokens present on one side only.

        Tokens that appear on one side but not the other weaken the match,
        unless they look like a misspelling of a token on the other side.
        """
        # If the unmatching token is not a misspelling, then undo its probability
        prob = 1
        for umt in unmatching_tokens:
            if not self._is_misspelling_of_one(umt, record_tokens):
                p = self.get_prob(umt,field_right,"right")
                if p is None: # If this token never appears on the right, how often does it appear on the left
                    p = self.get_prob(umt,field_left,"left")
                prob = p * prob

        # Currently a no-op hook (see _adjust_prob_towards_one)
        prob = Scorer._adjust_prob_towards_one(prob)
        return 1/prob

    def _is_misspelling_of_one(self, token, token_list):
        """True if token is a plausible misspelling of any token in token_list."""
        for t in token_list:
            if self.matcher.token_comparison.is_mispelling(token, t):
                return True
        return False
125 | token_lists_by_field[f].extend(tokens) 126 | 127 | return token_lists_by_field 128 | 129 | def field_tokens_to_prob(self, field_tokens): 130 | ft = field_tokens 131 | for key, value in ft.items(): 132 | counts = Counter(value) 133 | count_sum = sum(counts.values()) 134 | counts = {k: v/count_sum for k,v in counts.items()} 135 | ft[key] = counts 136 | return ft 137 | 138 | def _generate_probs(self): 139 | left_field_tokens = self.get_token_lists_by_field(self.matcher.left_records, "clean_token_dict") 140 | self.left_field_token_probs_dict = self.field_tokens_to_prob(left_field_tokens) 141 | 142 | right_field_tokens = self.get_token_lists_by_field(self.matcher.right_records, "clean_token_dict") 143 | self.right_field_token_probs_dict = self.field_tokens_to_prob(right_field_tokens) 144 | 145 | left_field_tokens = self.get_token_lists_by_field(self.matcher.left_records, "token_misspelling_dict") 146 | self.left_field_misspelling_probs_dict = self.field_tokens_to_prob(left_field_tokens) 147 | 148 | right_field_tokens = self.get_token_lists_by_field(self.matcher.right_records, "token_misspelling_dict") 149 | self.right_field_misspelling_probs_dict = self.field_tokens_to_prob(right_field_tokens) 150 | 151 | @staticmethod 152 | def prob_to_score(prob): 153 | return -(log10(prob))/30 154 | 155 | @staticmethod 156 | def _adjust_prob_towards_one(initial_prob, amount = 2): 157 | return initial_prob 158 | -------------------------------------------------------------------------------- /fuzzymatcher/tokencomparison.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from metaphone import doublemetaphone 3 | from rapidfuzz import fuzz 4 | 5 | class TokenComparison: 6 | """ 7 | The tokencomparison object contains functions that check for other misspellings and 'close' matches 8 | """ 9 | 10 | def __init__(self, fuzz_ratio_threshold = 80, number_fuzz_threshold = 1.01): 11 | self.fuzz_ratio_threshold = 
    @lru_cache(maxsize=int(1e6))
    def get_misspellings(self, token):
        """
        Must return a list of misspellings
        If there are no misspellings, just return a list of length 0
        """
        # doublemetaphone yields up to two phonetic encodings; drop empties
        misspellings = doublemetaphone(token)
        misspellings = [t for t in misspellings if t != ""]
        return misspellings

    @lru_cache(maxsize=int(1e6))
    # NOTE(review): lru_cache on instance methods keeps `self` alive inside
    # the cache (ruff B019) — consider a module-level cache instead.
    def is_mispelling(self, token1, token2):
        """True if the two tokens plausibly denote the same word or number.

        Checks, in order: a shared dmetaphone encoding; a fuzzy string ratio
        above threshold for purely alphabetic tokens; relative closeness for
        tokens that parse as numbers.
        """
        mis_t1 = set(self.get_misspellings(token1))
        mis_t2 = set(self.get_misspellings(token2))
        common = mis_t1.intersection(mis_t2).difference({''}) # Difference in case '' included in tokens

        if len(common) > 0:
            return True

        # Misspellings only really make sense if the tokens are words not numbers
        if token1.isalpha() and token2.isalpha():
            if fuzz.ratio(token1, token2) > self.fuzz_ratio_threshold:
                return True

        try:
            t1f = float(token1)
            t2f = float(token2)
            # Ratio test; ZeroDivisionError (token "0") is treated as no match
            if max(t1f, t2f)/min(t1f, t2f) < self.number_fuzz_threshold:
                return True

        except (ValueError, ZeroDivisionError):
            pass

        return False
from metaphone import doublemetaphone
from rapidfuzz.fuzz import ratio

def tokens_to_dmetaphones(tokens):
    """Return the non-empty double-metaphone encodings of each token.

    Tokens whose primary encoding is empty contribute nothing; tokens with
    only a primary encoding contribute one entry; tokens with both
    encodings contribute both.
    """
    encoded = []
    for token in tokens:
        primary, secondary = doublemetaphone(token)
        if primary == '':
            continue
        encoded.append(primary)
        if secondary != '':
            encoded.append(secondary)
    return [e.strip() for e in encoded]

def add_dmetaphones_to_col(x):
    """Append the metaphone encodings of each word in x to the original string."""
    words = x.split(" ")
    return " ".join(words + tokens_to_dmetaphones(words))

def add_dmetaphone_to_concat_all(df):
    """In place, augment df['_concat_all'] with metaphone encodings of its words."""
    df["_concat_all"] = df["_concat_all"].apply(add_dmetaphones_to_col)

def convert_tokens_to_dmetaphones(x):
    """Replace the words of x with their metaphone encodings."""
    return " ".join(tokens_to_dmetaphones(x.split(" ")))

def convert_series_to_dmetaphones(series):
    """Apply convert_tokens_to_dmetaphones to every element of a pandas Series."""
    return series.apply(convert_tokens_to_dmetaphones)

def is_mispelling(token_left, token_right):
    """True if the tokens share a metaphone encoding or their fuzz ratio is >= 90."""
    left_encodings = set(doublemetaphone(token_left))
    right_encodings = set(doublemetaphone(token_right))

    # Discard '' so two un-encodable tokens do not spuriously match.
    if left_encodings.intersection(right_encodings).difference({''}):
        return True

    return ratio(token_left, token_right) >= 90
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """fuzzymatcher test package initialisation.""" 4 | -------------------------------------------------------------------------------- /tests/create_fake_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import sys\n", 20 | "sys.path.append('..')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from generate_test_data import create_test_data\n", 30 | "df_left, df_right = create_test_data(10000)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "df_left.to_csv(\"data/left_4.csv\", index=False)\n", 40 | "df_right.to_csv(\"data/right_4.csv\", index=False)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.9.12" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 2 72 | } 73 | 
-------------------------------------------------------------------------------- /tests/data/_cities.csv: -------------------------------------------------------------------------------- 1 | city,freq 2 | London,8615246 3 | Birmingham,1224136 4 | Glasgow,801198 5 | Leeds,761481 6 | Bristol,617280 7 | Liverpool,552267 8 | Manchester,520739 9 | Sheffield,518090 10 | Edinburgh,482005 11 | Cardiff,447287 12 | Leicester,443760 13 | Stoke-on-Trent,372775 14 | Bradford,349561 15 | Coventry,325949 16 | Nottingham,315862 17 | Kingston-upon-Hull,314018 18 | Belfast,295223 19 | Newcastle-upon-Tyne,282442 20 | Sunderland,275506 21 | Brighton,273369 22 | Derby,270468 23 | Plymouth,261384 24 | Wolverhampton,254726 25 | Southampton,253651 26 | Swansea,240332 27 | Salford,239019 28 | Portsmouth,238137 29 | Milton,229941 30 | Aberdeen,227130 31 | Reading,218705 32 | Northampton,215173 33 | Luton,211228 34 | Swindon,209156 35 | Warrington,202228 36 | Dudley,200603 37 | York,200018 38 | Bolton,194189 39 | Stockton-on-Tees,191610 40 | Preston,190687 41 | Bournemouth,187503 42 | Norwich,186682 43 | Middlesbrough,184773 44 | Peterborough,178869 45 | Southend-on-Sea,175547 46 | Walsall,174141 47 | Colchester,173074 48 | Mansfield,169987 49 | Telford,166641 50 | Ipswich,164331 51 | Huddersfield,162949 52 | -------------------------------------------------------------------------------- /tests/data/_first_names.csv: -------------------------------------------------------------------------------- 1 | name,freq,gender 2 | Oliver ,6623,m 3 | Harry ,5284,m 4 | George ,5263,m 5 | Jack ,4751,m 6 | Jacob ,4485,m 7 | Noah ,4305,m 8 | Charlie ,4190,m 9 | Muhammad ,3908,m 10 | Thomas ,3898,m 11 | Oscar ,3894,m 12 | William ,3819,m 13 | James ,3580,m 14 | Leo ,3563,m 15 | Alfie ,3555,m 16 | Henry ,3527,m 17 | Joshua ,3495,m 18 | Freddie ,3287,m 19 | Archie ,2791,m 20 | Ethan ,2722,m 21 | Isaac ,2662,m 22 | Alexander ,2567,m 23 | Joseph ,2506,m 24 | Edward ,2429,m 25 | Samuel ,2413,m 26 | Max ,2405,m 
27 | Logan ,2335,m 28 | Lucas ,2332,m 29 | Daniel ,2290,m 30 | Theo ,2279,m 31 | Arthur ,2270,m 32 | Mohammed ,2228,m 33 | Harrison ,2220,m 34 | Benjamin ,2153,m 35 | Mason ,2131,m 36 | Finley ,2022,m 37 | Sebastian ,1990,m 38 | Adam ,1815,m 39 | Dylan ,1784,m 40 | Zachary ,1670,m 41 | Riley ,1556,m 42 | Teddy ,1491,m 43 | Theodore ,1484,m 44 | David ,1461,m 45 | Elijah ,1412,m 46 | Jake ,1405,m 47 | Toby ,1400,m 48 | Louie ,1380,m 49 | Reuben ,1322,m 50 | Arlo ,1285,m 51 | Hugo ,1234,m 52 | Jaxon ,1161,m 53 | Luca ,1144,m 54 | Matthew ,1110,m 55 | Harvey ,1102,m 56 | Harley ,1083,m 57 | Reggie ,1083,m 58 | Tommy ,1066,m 59 | Jenson ,1064,m 60 | Luke ,1056,m 61 | Michael ,1056,m 62 | Jayden ,1050,m 63 | Jude ,1047,m 64 | Frankie ,1029,m 65 | Albert ,1028,m 66 | Stanley ,1019,m 67 | Elliot ,999,m 68 | Gabriel ,995,m 69 | Mohammad ,948,m 70 | Ollie ,923,m 71 | Ronnie ,921,m 72 | Louis ,911,m 73 | Charles ,910,m 74 | Blake ,893,m 75 | Elliott ,892,m 76 | Lewis ,884,m 77 | Frederick ,874,m 78 | Nathan ,872,m 79 | Tyler ,863,m 80 | Jackson ,858,m 81 | Rory ,856,m 82 | Ryan ,855,m 83 | Carter ,839,m 84 | Dexter ,831,m 85 | Alex ,813,m 86 | Austin ,812,m 87 | Caleb ,810,m 88 | Kai ,794,m 89 | Albie ,788,m 90 | Ellis ,783,m 91 | Bobby ,775,m 92 | Ezra ,763,m 93 | Leon ,737,m 94 | Roman ,737,m 95 | Jesse ,723,m 96 | Aaron ,704,m 97 | Ibrahim ,701,m 98 | Liam ,698,m 99 | Jasper ,683,m 100 | Felix ,666,m 101 | Finn ,650,m 102 | Olivia ,5017,f 103 | Amelia ,4777,f 104 | Emily ,3551,f 105 | Isla ,3476,f 106 | Ava ,3285,f 107 | Isabella ,2729,f 108 | Lily ,2722,f 109 | Jessica ,2703,f 110 | Ella ,2702,f 111 | Mia ,2662,f 112 | Sophia ,2636,f 113 | Charlotte ,2596,f 114 | Poppy ,2506,f 115 | Sophie ,2505,f 116 | Grace ,2498,f 117 | Evie ,2487,f 118 | Alice ,2264,f 119 | Scarlett ,2096,f 120 | Freya ,2079,f 121 | Florence ,2072,f 122 | Isabelle ,2066,f 123 | Daisy ,2020,f 124 | Chloe ,1980,f 125 | Phoebe ,1975,f 126 | Matilda ,1915,f 127 | Ruby ,1904,f 128 | Evelyn ,1877,f 129 | 
Sienna ,1815,f 130 | Sofia ,1683,f 131 | Eva ,1641,f 132 | Elsie ,1574,f 133 | Willow ,1536,f 134 | Ivy ,1509,f 135 | Millie ,1456,f 136 | Esme ,1416,f 137 | Rosie ,1403,f 138 | Imogen ,1400,f 139 | Elizabeth ,1384,f 140 | Maya ,1374,f 141 | Layla ,1358,f 142 | Emilia ,1356,f 143 | Lola ,1298,f 144 | Lucy ,1273,f 145 | Harper ,1256,f 146 | Eliza ,1226,f 147 | Erin ,1212,f 148 | Eleanor ,1147,f 149 | Ellie ,1142,f 150 | Harriet ,1132,f 151 | Thea ,1132,f 152 | Maisie ,1111,f 153 | Holly ,1098,f 154 | Emma ,1080,f 155 | Georgia ,1078,f 156 | Amber ,1066,f 157 | Molly ,1041,f 158 | Hannah ,1033,f 159 | Abigail ,1023,f 160 | Jasmine ,1018,f 161 | Lilly ,1014,f 162 | Annabelle ,1009,f 163 | Rose ,974,f 164 | Penelope ,972,f 165 | Amelie ,964,f 166 | Violet ,959,f 167 | Bella ,946,f 168 | Aria ,926,f 169 | Zara ,876,f 170 | Maria ,853,f 171 | Nancy ,842,f 172 | Darcie ,791,f 173 | Lottie ,782,f 174 | Anna ,779,f 175 | Summer ,760,f 176 | Martha ,756,f 177 | Heidi ,752,f 178 | Gracie ,744,f 179 | Luna ,715,f 180 | Maryam ,711,f 181 | Beatrice ,685,f 182 | Mila ,679,f 183 | Darcey ,662,f 184 | Megan ,658,f 185 | Iris ,648,f 186 | Lexi ,626,f 187 | Robyn ,622,f 188 | Aisha ,617,f 189 | Clara ,611,f 190 | Francesca ,611,f 191 | Sara ,606,f 192 | Victoria ,586,f 193 | Zoe ,584,f 194 | Julia ,578,f 195 | Arabella ,577,f 196 | Maddison ,576,f 197 | Sarah ,572,f 198 | Felicity ,570,f 199 | Darcy ,566,f 200 | Leah ,564,f 201 | Lydia ,563,f 202 | -------------------------------------------------------------------------------- /tests/data/_surnames.csv: -------------------------------------------------------------------------------- 1 | surname,freq, 2 | Smith,729862, 3 | Jones,578261, 4 | Taylor,458268, 5 | Williams,411385, 6 | Brown,380443, 7 | Davies,316982, 8 | Evans,231844, 9 | Wilson,227652, 10 | Thomas,220228, 11 | Roberts,219694, 12 | Johnson,214969, 13 | Lewis,198193, 14 | Walker,195372, 15 | Robinson,187889, 16 | Wood,186261, 17 | Thompson,183266, 18 | White,181979, 19 | 
Watson,181296, 20 | Jackson,173538, 21 | Wright,171438, 22 | Green,166423, 23 | Harris,161505, 24 | Cooper,161076, 25 | King,160918, 26 | Lee,159502, 27 | Martin,152842, 28 | Clarke,152502, 29 | James,151855, 30 | Morgan,150454, 31 | Hughes,147802, 32 | Edwards,147540, 33 | Hill,145723, 34 | Moore,144446, 35 | Clark,140662, 36 | Harrison,137103, 37 | Scott,134059, 38 | Young,131862, 39 | Morris,131499, 40 | Hall,129805, 41 | Ward,125643, 42 | Turner,125278, 43 | Carter,124343, 44 | Phillips,121845, 45 | Mitchell,121734, 46 | Patel,119855, 47 | Adams,116035, 48 | Campbell,115972, 49 | Anderson,115333, 50 | Allen,112703, 51 | Cook,111306, 52 | Bailey,111011, 53 | Parker,109943, 54 | Miller,109474, 55 | Davis,108041, 56 | Murphy,106245, 57 | Price,105993, 58 | Bell,105417, 59 | Baker,104639, 60 | Griffiths,104048, 61 | Kelly,102415, 62 | Simpson,101082, 63 | Marshall,97225, 64 | Collins,95210, 65 | Bennett,94402, 66 | Cox,92572, 67 | Richardson,92558, 68 | Fox,92124, 69 | Gray,90917, 70 | Rose,89001, 71 | Chapman,88136, 72 | Hunt,86792, 73 | Robertson,86269, 74 | Shaw,86049, 75 | Reynolds,85228, 76 | Lloyd,85021, 77 | Ellis,84914, 78 | Richards,83575, 79 | Russell,82898, 80 | Wilkinson,82570, 81 | Khan,80429, 82 | Graham,80026, 83 | Stewart,79031, 84 | Reid,78783, 85 | Murray,77127, 86 | Powell,76973, 87 | Palmer,75969, 88 | Holmes,75423, 89 | Rogers,74821, 90 | Stevens,73965, 91 | Walsh,73208, 92 | Hunter,72547, 93 | Thomson,71616, 94 | Matthews,70610, 95 | Ross,70274, 96 | Owen,69870, 97 | Mason,69708, 98 | Knight,69522, 99 | Kennedy,69299, 100 | Butler,69298, 101 | Saunders,69004, 102 | Cole,68854, 103 | Pearce,68707, 104 | Dean,68073, 105 | Foster,67675, 106 | Harvey,67231, 107 | Hudson,66488, 108 | Gibson,66246, 109 | Mills,65966, 110 | Berry,65022, 111 | Barnes,64291, 112 | Pearson,64156, 113 | Kaur,63969, 114 | Booth,63239, 115 | Dixon,63151, 116 | Grant,63130, 117 | Gordon,62747, 118 | Lane,62226, 119 | Harper,61606, 120 | Ali,61008, 121 | Hart,60683, 122 | 
Mcdonald,60431, 123 | Brooks,59633, 124 | Ryan,59545, 125 | Carr,59130, 126 | Macdonald,58450, 127 | Hamilton,58170, 128 | Johnston,58060, 129 | West,57780, 130 | Gill,57716, 131 | Dawson,57637, 132 | Armstrong,56917, 133 | Gardner,56314, 134 | Stone,55920, 135 | Andrews,55371, 136 | Williamson,55302, 137 | Barker,55092, 138 | George,54666, 139 | Fisher,54294, 140 | Cunningham,54280, 141 | Watts,54124, 142 | Webb,54036, 143 | Lawrence,53575, 144 | Bradley,52955, 145 | Jenkins,52608, 146 | Wells,52410, 147 | Chambers,52399, 148 | Spencer,52194, 149 | Poole,52038, 150 | Atkinson,51520,97 151 | Lawson,50, 152 | Lawson,50971, 153 | Day,50489, 154 | Woods,50230, 155 | Rees,50058, 156 | Fraser,49924, 157 | Black,49851, 158 | Fletcher,49725, 159 | Hussain,49663, 160 | Willis,49648, 161 | Marsh,49438, 162 | Ahmed,49385, 163 | Doyle,49093, 164 | Lowe,48795, 165 | Burns,48179, 166 | Hopkins,48145, 167 | Nicholson,47947, 168 | Parry,47611, 169 | Newman,47474, 170 | Jordan,47311, 171 | Henderson,46745, 172 | Howard,46513, 173 | Barrett,46507, 174 | Burton,46432, 175 | Riley,46323, 176 | Porter,45820, 177 | Byrne,44686, 178 | Houghton,44528, 179 | John,44411, 180 | Perry,44306, 181 | Baxter,44182, 182 | Ball,43967, 183 | Mccarthy,43841, 184 | Elliott,43810, 185 | Burke,43670, 186 | Gallagher,43183, 187 | Duncan,43107, 188 | Cooke,42991, 189 | Austin,42785, 190 | Read,42641, 191 | Wallace,42601, 192 | Hawkins,42543, 193 | Hayes,42475, 194 | Francis,42437, 195 | Sutton,42393, 196 | Davidson,42349, 197 | Sharp,42266, 198 | Holland,41869, 199 | Moss,40964, 200 | May,40836, 201 | Bates,40781, 202 | Morrison,40765, 203 | Bob,40761, 204 | Oliver,40678, 205 | Kemp,40614, 206 | Page,40554, 207 | Arnold,40053, 208 | Shah,39981, 209 | Stevenson,39857, 210 | Ford,39580, 211 | Potter,39355, 212 | Flynn,39193, 213 | Warren,39108, 214 | Kent,38924, 215 | Alexander,38849, 216 | Field,38753, 217 | Freeman,38498, 218 | Begum,38176, 219 | Rhodes,37989, 220 | O neill,37778, 221 | Middleton,37567, 
222 | Payne,37552, 223 | Stephenson,37457, 224 | Pritchard,37155, 225 | Gregory,36854, 226 | Bond,36816, 227 | Webster,36737, 228 | Dunn,36674, 229 | Donnelly,36450, 230 | Lucas,36021, 231 | Long,36011, 232 | Jarvis,35979, 233 | Cross,35736, 234 | Stephens,35599, 235 | Reed,35264, 236 | Coleman,35258, 237 | Nicholls,35101, 238 | Bull,35010, 239 | Bartlett,34909, 240 | O brien,34834, 241 | Curtis,34681, 242 | Bird,34628, 243 | Patterson,34527, 244 | Tucker,34442, 245 | Bryant,34091, 246 | Lynch,34075, 247 | Mackenzie,34001, 248 | Ferguson,33983, 249 | Cameron,33846, 250 | Lopez,33832, 251 | Haynes,33812, 252 | Bolton,33800, 253 | Hardy,33773, 254 | Heath,33647, 255 | Davey,33556, 256 | Rice,33363, 257 | Jacobs,32761, 258 | Parsons,32698, 259 | Ashton,32675, 260 | Robson,32669, 261 | French,32541, 262 | Farrell,32195, 263 | Walton,32149, 264 | Gilbert,32131, 265 | Mcintyre,32129, 266 | Newton,32089, 267 | Norman,32054, 268 | Higgins,31950, 269 | Hodgson,31895, 270 | Sutherland,31784, 271 | Kay,31520, 272 | Bishop,31479, 273 | Burgess,31473, 274 | Simmons,31386, 275 | Hutchinson,31369, 276 | Moran,31308, 277 | Frost,31298, 278 | Sharma,31288, 279 | Slater,31226, 280 | Greenwood,31203, 281 | Kirk,31170, 282 | Fernandez,31117, 283 | Garcia,31015, 284 | Atkins,31015, 285 | Daniel,30918, 286 | Beattie,30886, 287 | Maxwell,30847, 288 | Todd,30590, 289 | Charles,30449, 290 | Paul,30300, 291 | Crawford,30292, 292 | O connor,30271, 293 | Park,30076, 294 | Forrest,30021, 295 | Love,29973, 296 | Rowland,29927, 297 | Connolly,29874, 298 | Sheppard,29653, 299 | Harding,29560, 300 | Banks,29546, 301 | Rowe,29371, 302 | -------------------------------------------------------------------------------- /tests/data/las_ons.csv: -------------------------------------------------------------------------------- 1 | lad16cd,lad16nm 2 | E06000001,Hartlepool 3 | E06000002,Middlesbrough 4 | E06000003,Redcar and Cleveland 5 | E06000004,Stockton-on-Tees 6 | E06000005,Darlington 7 | 
E06000006,Halton 8 | E06000007,Warrington 9 | E06000008,Blackburn with Darwen 10 | E06000009,Blackpool 11 | E06000010,"Kingston upon Hull, City of" 12 | E06000011,East Riding of Yorkshire 13 | E06000012,North East Lincolnshire 14 | E06000013,North Lincolnshire 15 | E06000014,York 16 | E06000015,Derby 17 | E06000016,Leicester 18 | E06000017,Rutland 19 | E06000018,Nottingham 20 | E06000019,"Herefordshire, County of" 21 | E06000020,Telford and Wrekin 22 | E06000021,Stoke-on-Trent 23 | E06000022,Bath and North East Somerset 24 | E06000023,"Bristol, City of" 25 | E06000024,North Somerset 26 | E06000025,South Gloucestershire 27 | E06000026,Plymouth 28 | E06000027,Torbay 29 | E06000028,Bournemouth 30 | E06000029,Poole 31 | E06000030,Swindon 32 | E06000031,Peterborough 33 | E06000040,Windsor and Maidenhead 34 | E06000041,Wokingham 35 | E06000042,Milton Keynes 36 | E06000043,Brighton and Hove 37 | E06000044,Portsmouth 38 | E06000032,Luton 39 | E06000033,Southend-on-Sea 40 | E06000034,Thurrock 41 | E06000035,Medway 42 | E06000036,Bracknell Forest 43 | E06000037,West Berkshire 44 | E06000045,Southampton 45 | E06000046,Isle of Wight 46 | E06000047,County Durham 47 | E06000049,Cheshire East 48 | E06000050,Cheshire West and Chester 49 | E06000051,Shropshire 50 | E06000052,Cornwall 51 | E06000053,Isles of Scilly 52 | E06000054,Wiltshire 53 | E06000055,Bedford 54 | E06000056,Central Bedfordshire 55 | E06000057,Northumberland 56 | E07000004,Aylesbury Vale 57 | E07000005,Chiltern 58 | E07000006,South Bucks 59 | E07000007,Wycombe 60 | E07000090,Havant 61 | E07000091,New Forest 62 | E07000092,Rushmoor 63 | E07000093,Test Valley 64 | E07000094,Winchester 65 | E07000095,Broxbourne 66 | E07000096,Dacorum 67 | E07000098,Hertsmere 68 | E07000099,North Hertfordshire 69 | E07000102,Three Rivers 70 | E07000130,Charnwood 71 | E07000131,Harborough 72 | E07000132,Hinckley and Bosworth 73 | E06000038,Reading 74 | E06000039,Slough 75 | E07000008,Cambridge 76 | E07000009,East Cambridgeshire 77 | 
E07000010,Fenland 78 | E07000011,Huntingdonshire 79 | E07000012,South Cambridgeshire 80 | E07000026,Allerdale 81 | E07000027,Barrow-in-Furness 82 | E07000028,Carlisle 83 | E07000029,Copeland 84 | E07000030,Eden 85 | E07000031,South Lakeland 86 | E07000032,Amber Valley 87 | E07000033,Bolsover 88 | E07000034,Chesterfield 89 | E07000035,Derbyshire Dales 90 | E07000036,Erewash 91 | E07000037,High Peak 92 | E07000038,North East Derbyshire 93 | E07000039,South Derbyshire 94 | E07000040,East Devon 95 | E07000041,Exeter 96 | E07000042,Mid Devon 97 | E07000043,North Devon 98 | E07000044,South Hams 99 | E07000045,Teignbridge 100 | E07000046,Torridge 101 | E07000047,West Devon 102 | E07000048,Christchurch 103 | E07000049,East Dorset 104 | E07000050,North Dorset 105 | E07000051,Purbeck 106 | E07000052,West Dorset 107 | E07000053,Weymouth and Portland 108 | E07000061,Eastbourne 109 | E07000062,Hastings 110 | E07000063,Lewes 111 | E07000064,Rother 112 | E07000065,Wealden 113 | E07000066,Basildon 114 | E07000067,Braintree 115 | E07000068,Brentwood 116 | E07000069,Castle Point 117 | E07000070,Chelmsford 118 | E07000071,Colchester 119 | E07000072,Epping Forest 120 | E07000086,Eastleigh 121 | E07000087,Fareham 122 | E07000073,Harlow 123 | E07000074,Maldon 124 | E07000075,Rochford 125 | E07000076,Tendring 126 | E07000077,Uttlesford 127 | E07000078,Cheltenham 128 | E07000079,Cotswold 129 | E07000080,Forest of Dean 130 | E07000088,Gosport 131 | E07000089,Hart 132 | E07000081,Gloucester 133 | E07000082,Stroud 134 | E07000083,Tewkesbury 135 | E07000084,Basingstoke and Deane 136 | E07000085,East Hampshire 137 | E07000103,Watford 138 | E07000105,Ashford 139 | E07000106,Canterbury 140 | E07000107,Dartford 141 | E07000108,Dover 142 | E07000109,Gravesham 143 | E07000110,Maidstone 144 | E07000111,Sevenoaks 145 | E07000112,Shepway 146 | E07000113,Swale 147 | E07000114,Thanet 148 | E07000115,Tonbridge and Malling 149 | E07000116,Tunbridge Wells 150 | E07000117,Burnley 151 | E07000118,Chorley 152 
| E07000119,Fylde 153 | E07000120,Hyndburn 154 | E07000121,Lancaster 155 | E07000122,Pendle 156 | E07000123,Preston 157 | E07000124,Ribble Valley 158 | E07000125,Rossendale 159 | E07000126,South Ribble 160 | E07000127,West Lancashire 161 | E07000128,Wyre 162 | E07000129,Blaby 163 | E07000133,Melton 164 | E07000134,North West Leicestershire 165 | E07000135,Oadby and Wigston 166 | E07000136,Boston 167 | E07000137,East Lindsey 168 | E07000192,Cannock Chase 169 | E07000193,East Staffordshire 170 | E07000194,Lichfield 171 | E07000138,Lincoln 172 | E07000139,North Kesteven 173 | E07000140,South Holland 174 | E07000141,South Kesteven 175 | E07000142,West Lindsey 176 | E07000143,Breckland 177 | E07000144,Broadland 178 | E07000145,Great Yarmouth 179 | E07000146,King's Lynn and West Norfolk 180 | E07000147,North Norfolk 181 | E07000148,Norwich 182 | E07000149,South Norfolk 183 | E07000150,Corby 184 | E07000151,Daventry 185 | E07000152,East Northamptonshire 186 | E07000153,Kettering 187 | E07000154,Northampton 188 | E07000155,South Northamptonshire 189 | E07000156,Wellingborough 190 | E07000163,Craven 191 | E07000164,Hambleton 192 | E07000165,Harrogate 193 | E07000166,Richmondshire 194 | E07000167,Ryedale 195 | E07000168,Scarborough 196 | E07000169,Selby 197 | E07000170,Ashfield 198 | E07000171,Bassetlaw 199 | E07000172,Broxtowe 200 | E07000173,Gedling 201 | E07000174,Mansfield 202 | E07000175,Newark and Sherwood 203 | E07000176,Rushcliffe 204 | E07000177,Cherwell 205 | E07000195,Newcastle-under-Lyme 206 | E07000196,South Staffordshire 207 | E07000197,Stafford 208 | E07000178,Oxford 209 | E07000179,South Oxfordshire 210 | E07000180,Vale of White Horse 211 | E07000181,West Oxfordshire 212 | E07000187,Mendip 213 | E07000188,Sedgemoor 214 | E07000189,South Somerset 215 | E07000190,Taunton Deane 216 | E07000191,West Somerset 217 | E07000198,Staffordshire Moorlands 218 | E07000199,Tamworth 219 | E07000200,Babergh 220 | E07000201,Forest Heath 221 | E07000202,Ipswich 222 | 
E07000203,Mid Suffolk 223 | E07000204,St Edmundsbury 224 | E07000205,Suffolk Coastal 225 | E07000206,Waveney 226 | E07000207,Elmbridge 227 | E07000208,Epsom and Ewell 228 | E07000209,Guildford 229 | E07000210,Mole Valley 230 | E07000211,Reigate and Banstead 231 | E07000212,Runnymede 232 | E07000213,Spelthorne 233 | E07000214,Surrey Heath 234 | E07000215,Tandridge 235 | E07000216,Waverley 236 | E07000217,Woking 237 | E07000218,North Warwickshire 238 | E07000219,Nuneaton and Bedworth 239 | E07000220,Rugby 240 | E07000221,Stratford-on-Avon 241 | E07000222,Warwick 242 | E07000223,Adur 243 | E07000224,Arun 244 | E07000225,Chichester 245 | E07000226,Crawley 246 | E07000227,Horsham 247 | E07000228,Mid Sussex 248 | E07000229,Worthing 249 | E07000234,Bromsgrove 250 | E07000235,Malvern Hills 251 | E07000236,Redditch 252 | E07000242,East Hertfordshire 253 | E08000026,Coventry 254 | E08000027,Dudley 255 | E07000237,Worcester 256 | E07000238,Wychavon 257 | E07000239,Wyre Forest 258 | E07000240,St Albans 259 | E07000241,Welwyn Hatfield 260 | E07000243,Stevenage 261 | E08000001,Bolton 262 | E08000002,Bury 263 | E08000003,Manchester 264 | E08000004,Oldham 265 | E08000005,Rochdale 266 | E08000006,Salford 267 | E08000007,Stockport 268 | E08000008,Tameside 269 | E08000009,Trafford 270 | E08000010,Wigan 271 | E08000011,Knowsley 272 | E08000012,Liverpool 273 | E08000013,St. 
Helens 274 | E08000014,Sefton 275 | E08000015,Wirral 276 | E08000016,Barnsley 277 | E08000017,Doncaster 278 | E08000018,Rotherham 279 | E08000019,Sheffield 280 | E08000021,Newcastle upon Tyne 281 | E08000022,North Tyneside 282 | E08000023,South Tyneside 283 | E08000024,Sunderland 284 | E08000025,Birmingham 285 | E08000028,Sandwell 286 | E08000029,Solihull 287 | E08000030,Walsall 288 | E08000031,Wolverhampton 289 | E08000032,Bradford 290 | E08000033,Calderdale 291 | E08000034,Kirklees 292 | E08000035,Leeds 293 | E08000036,Wakefield 294 | E08000037,Gateshead 295 | E09000016,Havering 296 | E09000017,Hillingdon 297 | E09000018,Hounslow 298 | E09000020,Kensington and Chelsea 299 | E09000033,Westminster 300 | S12000005,Clackmannanshire 301 | S12000006,Dumfries and Galloway 302 | E09000021,Kingston upon Thames 303 | S12000008,East Ayrshire 304 | S12000010,East Lothian 305 | S12000011,East Renfrewshire 306 | S12000013,Na h-Eileanan Siar 307 | E09000022,Lambeth 308 | E09000023,Lewisham 309 | S12000014,Falkirk 310 | S12000015,Fife 311 | S12000017,Highland 312 | S12000018,Inverclyde 313 | S12000019,Midlothian 314 | S12000020,Moray 315 | S12000021,North Ayrshire 316 | S12000023,Orkney Islands 317 | E09000001,City of London 318 | E09000002,Barking and Dagenham 319 | E09000003,Barnet 320 | E09000004,Bexley 321 | E09000011,Greenwich 322 | E09000019,Islington 323 | E09000005,Brent 324 | E09000006,Bromley 325 | E09000007,Camden 326 | E09000008,Croydon 327 | E09000009,Ealing 328 | E09000010,Enfield 329 | E09000012,Hackney 330 | E09000013,Hammersmith and Fulham 331 | E09000014,Haringey 332 | E09000015,Harrow 333 | S12000024,Perth and Kinross 334 | S12000026,Scottish Borders 335 | S12000027,Shetland Islands 336 | S12000028,South Ayrshire 337 | S12000029,South Lanarkshire 338 | S12000030,Stirling 339 | S12000033,Aberdeen City 340 | S12000034,Aberdeenshire 341 | E09000024,Merton 342 | E09000025,Newham 343 | E09000026,Redbridge 344 | E09000027,Richmond upon Thames 345 | S12000035,Argyll 
and Bute 346 | S12000036,City of Edinburgh 347 | S12000038,Renfrewshire 348 | S12000039,West Dunbartonshire 349 | S12000040,West Lothian 350 | S12000041,Angus 351 | E09000028,Southwark 352 | W06000016,Rhondda Cynon Taf 353 | W06000018,Caerphilly 354 | W06000019,Blaenau Gwent 355 | W06000020,Torfaen 356 | W06000021,Monmouthshire 357 | W06000022,Newport 358 | W06000023,Powys 359 | E09000029,Sutton 360 | W06000024,Merthyr Tydfil 361 | E09000030,Tower Hamlets 362 | E09000031,Waltham Forest 363 | E09000032,Wandsworth 364 | S12000042,Dundee City 365 | S12000044,North Lanarkshire 366 | S12000045,East Dunbartonshire 367 | S12000046,Glasgow City 368 | W06000001,Isle of Anglesey 369 | W06000002,Gwynedd 370 | W06000003,Conwy 371 | W06000004,Denbighshire 372 | W06000005,Flintshire 373 | W06000006,Wrexham 374 | W06000008,Ceredigion 375 | W06000009,Pembrokeshire 376 | W06000010,Carmarthenshire 377 | W06000011,Swansea 378 | W06000012,Neath Port Talbot 379 | W06000013,Bridgend 380 | W06000014,Vale of Glamorgan 381 | W06000015,Cardiff 382 | -------------------------------------------------------------------------------- /tests/data/las_os.csv: -------------------------------------------------------------------------------- 1 | name,code 2 | Wycombe District,E07000007 3 | South Bucks District,E07000006 4 | Chiltern District,E07000005 5 | Aylesbury Vale District,E07000004 6 | Fenland District,E07000010 7 | South Cambridgeshire District,E07000012 8 | East Cambridgeshire District,E07000009 9 | Huntingdonshire District,E07000011 10 | Cambridge District (B),E07000008 11 | Copeland District (B),E07000029 12 | Carlisle District (B),E07000028 13 | South Lakeland District,E07000031 14 | Allerdale District (B),E07000026 15 | Eden District,E07000030 16 | Barrow-in-Furness District (B),E07000027 17 | High Peak District (B),E07000037 18 | South Derbyshire District,E07000039 19 | Erewash District (B),E07000036 20 | North East Derbyshire District,E07000038 21 | Amber Valley District (B),E07000032 
22 | Bolsover District,E07000033 23 | Derbyshire Dales District,E07000035 24 | Chesterfield District (B),E07000034 25 | North Devon District,E07000043 26 | East Devon District,E07000040 27 | Teignbridge District,E07000045 28 | West Devon District (B),E07000047 29 | Mid Devon District,E07000042 30 | Exeter District (B),E07000041 31 | Purbeck District,E07000051 32 | Christchurch District (B),E07000048 33 | West Dorset District,E07000052 34 | East Dorset District,E07000049 35 | North Dorset District,E07000050 36 | Weymouth and Portland District (B),E07000053 37 | Lewes District,E07000063 38 | Rother District,E07000064 39 | Wealden District,E07000065 40 | Eastbourne District (B),E07000061 41 | Hastings District (B),E07000062 42 | Brentwood District (B),E07000068 43 | Rochford District,E07000075 44 | Epping Forest District,E07000072 45 | Tendring District,E07000076 46 | Uttlesford District,E07000077 47 | Chelmsford District (B),E07000070 48 | Colchester District (B),E07000071 49 | Maldon District (B),E07000074 50 | Braintree District,E07000067 51 | Harlow District,E07000073 52 | Basildon District (B),E07000066 53 | Castle Point District (B),E07000069 54 | Forest of Dean District,E07000080 55 | Cotswold District,E07000079 56 | Stroud District,E07000082 57 | Tewkesbury District (B),E07000083 58 | Gloucester District (B),E07000081 59 | Cheltenham District (B),E07000078 60 | Basingstoke and Deane District (B),E07000084 61 | New Forest District,E07000091 62 | Eastleigh District (B),E07000086 63 | East Hampshire District,E07000085 64 | Winchester District (B),E07000094 65 | Test Valley District,E07000093 66 | Hart District,E07000089 67 | Gosport District (B),E07000088 68 | Fareham District (B),E07000087 69 | Havant District (B),E07000090 70 | Rushmoor District (B),E07000092 71 | Three Rivers District,E07000102 72 | Hertsmere District (B),E07000098 73 | Broxbourne District (B),E07000095 74 | Dacorum District (B),E07000096 75 | East Hertfordshire District,E07000242 76 | St. 
Albans District (B),E07000240 77 | Welwyn Hatfield District (B),E07000241 78 | North Hertfordshire District,E07000099 79 | Watford District (B),E07000103 80 | Stevenage District (B),E07000243 81 | Tunbridge Wells District (B),E07000116 82 | Shepway District,E07000112 83 | Sevenoaks District,E07000111 84 | Tonbridge and Malling District (B),E07000115 85 | Thanet District,E07000114 86 | Ashford District (B),E07000105 87 | Canterbury District (B),E07000106 88 | Dover District,E07000108 89 | Maidstone District (B),E07000110 90 | Swale District (B),E07000113 91 | Dartford District (B),E07000107 92 | Gravesham District (B),E07000109 93 | West Lancashire District (B),E07000127 94 | Lancaster District (B),E07000121 95 | Chorley District (B),E07000118 96 | South Ribble District (B),E07000126 97 | Rossendale District (B),E07000125 98 | Fylde District (B),E07000119 99 | Preston District (B),E07000123 100 | Wyre District (B),E07000128 101 | Pendle District (B),E07000122 102 | Ribble Valley District (B),E07000124 103 | Hyndburn District (B),E07000120 104 | Burnley District (B),E07000117 105 | Hinckley and Bosworth District (B),E07000132 106 | North West Leicestershire District,E07000134 107 | Melton District (B),E07000133 108 | Harborough District,E07000131 109 | Blaby District,E07000129 110 | Charnwood District (B),E07000130 111 | Oadby and Wigston District (B),E07000135 112 | West Lindsey District,E07000142 113 | South Kesteven District,E07000141 114 | South Holland District,E07000140 115 | Boston District (B),E07000136 116 | North Kesteven District,E07000139 117 | East Lindsey District,E07000137 118 | Lincoln District (B),E07000138 119 | Great Yarmouth District (B),E07000145 120 | South Norfolk District,E07000149 121 | King's Lynn and West Norfolk District (B),E07000146 122 | Breckland District,E07000143 123 | Norwich District (B),E07000148 124 | South Northamptonshire District,E07000155 125 | East Northamptonshire District,E07000152 126 | Daventry District,E07000151 127 | 
Wellingborough District (B),E07000156 128 | Kettering District (B),E07000153 129 | Northampton District (B),E07000154 130 | Corby District (B),E07000150 131 | Scarborough District (B),E07000168 132 | Selby District,E07000169 133 | Craven District,E07000163 134 | Richmondshire District,E07000166 135 | Harrogate District (B),E07000165 136 | Ryedale District,E07000167 137 | Hambleton District,E07000164 138 | Bassetlaw District,E07000171 139 | Rushcliffe District (B),E07000176 140 | Gedling District (B),E07000173 141 | Ashfield District,E07000170 142 | Newark and Sherwood District,E07000175 143 | Broxtowe District (B),E07000172 144 | Mansfield District,E07000174 145 | Vale of White Horse District,E07000180 146 | South Oxfordshire District,E07000179 147 | Cherwell District,E07000177 148 | West Oxfordshire District,E07000181 149 | Oxford District (B),E07000178 150 | West Somerset District,E07000191 151 | Mendip District,E07000187 152 | Taunton Deane District (B),E07000190 153 | South Somerset District,E07000189 154 | Sedgemoor District,E07000188 155 | Staffordshire Moorlands District,E07000198 156 | South Staffordshire District,E07000196 157 | Lichfield District,E07000194 158 | Newcastle-under-Lyme District (B),E07000195 159 | Stafford District (B),E07000197 160 | East Staffordshire District (B),E07000193 161 | Cannock Chase District,E07000192 162 | Tamworth District (B),E07000199 163 | Waveney District,E07000206 164 | Babergh District,E07000200 165 | Suffolk Coastal District,E07000205 166 | St. 
Edmundsbury District (B),E07000204 167 | Forest Heath District,E07000201 168 | Mid Suffolk District,E07000203 169 | Ipswich District (B),E07000202 170 | Waverley District (B),E07000216 171 | Tandridge District,E07000215 172 | Woking District (B),E07000217 173 | Surrey Heath District (B),E07000214 174 | Runnymede District (B),E07000212 175 | Guildford District (B),E07000209 176 | Reigate and Banstead District (B),E07000211 177 | Mole Valley District,E07000210 178 | Elmbridge District (B),E07000207 179 | Spelthorne District (B),E07000213 180 | Epsom and Ewell District (B),E07000208 181 | North Warwickshire District (B),E07000218 182 | Rugby District (B),E07000220 183 | Warwick District,E07000222 184 | Stratford-on-Avon District,E07000221 185 | Nuneaton and Bedworth District (B),E07000219 186 | Arun District,E07000224 187 | Adur District,E07000223 188 | Chichester District,E07000225 189 | Mid Sussex District,E07000228 190 | Horsham District,E07000227 191 | Worthing District (B),E07000229 192 | Crawley District (B),E07000226 193 | Malvern Hills District,E07000235 194 | Wyre Forest District,E07000239 195 | Wychavon District,E07000238 196 | Bromsgrove District,E07000234 197 | Worcester District (B),E07000237 198 | Redditch District (B),E07000236 199 | Kingston upon Thames London Boro,E09000021 200 | Croydon London Boro,E09000008 201 | Bromley London Boro,E09000006 202 | Hounslow London Boro,E09000018 203 | Ealing London Boro,E09000009 204 | Havering London Boro,E09000016 205 | Hillingdon London Boro,E09000017 206 | Harrow London Boro,E09000015 207 | Brent London Boro,E09000005 208 | Barnet London Boro,E09000003 209 | Lambeth London Boro,E09000022 210 | Southwark London Boro,E09000028 211 | Lewisham London Boro,E09000023 212 | Greenwich London Boro,E09000011 213 | Bexley London Boro,E09000004 214 | Enfield London Boro,E09000010 215 | Waltham Forest London Boro,E09000031 216 | Redbridge London Boro,E09000026 217 | Sutton London Boro,E09000029 218 | Richmond upon Thames 
London Boro,E09000027 219 | Merton London Boro,E09000024 220 | Wandsworth London Boro,E09000032 221 | Hammersmith and Fulham London Boro,E09000013 222 | Kensington and Chelsea London Boro,E09000020 223 | City of Westminster London Boro,E09000033 224 | Camden London Boro,E09000007 225 | Tower Hamlets London Boro,E09000030 226 | Islington London Boro,E09000019 227 | Hackney London Boro,E09000012 228 | Haringey London Boro,E09000014 229 | Newham London Boro,E09000025 230 | Barking and Dagenham London Boro,E09000002 231 | Kirklees District (B),E08000034 232 | Knowsley District (B),E08000011 233 | Leeds District (B),E08000035 234 | Liverpool District (B),E08000012 235 | Manchester District (B),E08000003 236 | Newcastle upon Tyne District (B),E08000021 237 | North Tyneside District (B),E08000022 238 | Oldham District (B),E08000004 239 | Rochdale District (B),E08000005 240 | Rotherham District (B),E08000018 241 | Salford District (B),E08000006 242 | Sandwell District (B),E08000028 243 | Sefton District (B),E08000014 244 | Sheffield District (B),E08000019 245 | City and County of the City of London,E09000001 246 | Barnsley District (B),E08000016 247 | Birmingham District (B),E08000025 248 | Bolton District (B),E08000001 249 | Bradford District (B),E08000032 250 | Bury District (B),E08000002 251 | Calderdale District (B),E08000033 252 | City of Wolverhampton District (B),E08000031 253 | Coventry District (B),E08000026 254 | Doncaster District (B),E08000017 255 | Dudley District (B),E08000027 256 | Gateshead District (B),E08000037 257 | Solihull District (B),E08000029 258 | South Tyneside District (B),E08000023 259 | Stockport District (B),E08000007 260 | St. 
Helens District (B),E08000013 261 | Sunderland District (B),E08000024 262 | Tameside District (B),E08000008 263 | Trafford District (B),E08000009 264 | Wakefield District (B),E08000036 265 | Walsall District (B),E08000030 266 | Wigan District (B),E08000010 267 | Wirral District (B),E08000015 268 | Abertawe - Swansea,W06000011 269 | Angus,S12000041 270 | Bath and North East Somerset,E06000022 271 | Bedford (B),E06000055 272 | Blackburn with Darwen (B),E06000008 273 | Blackpool (B),E06000009 274 | Blaenau Gwent - Blaenau Gwent,W06000019 275 | Bournemouth (B),E06000028 276 | Bracknell Forest (B),E06000036 277 | Bro Morgannwg - the Vale of Glamorgan,W06000014 278 | Caerffili - Caerphilly,W06000018 279 | Casnewydd - Newport,W06000022 280 | Castell-nedd Port Talbot - Neath Port Talbot,W06000012 281 | Central Bedfordshire,E06000056 282 | Cheshire East (B),E06000049 283 | Cheshire West and Chester (B),E06000050 284 | City of Bristol (B),E06000023 285 | City of Derby (B),E06000015 286 | City of Kingston upon Hull (B),E06000010 287 | City of Leicester (B),E06000016 288 | City of Nottingham (B),E06000018 289 | City of Peterborough (B),E06000031 290 | City of Southampton (B),E06000045 291 | City of Stoke-on-Trent (B),E06000021 292 | Clackmannanshire,S12000005 293 | Conwy - Conwy,W06000003 294 | County Durham,E06000047 295 | County of Herefordshire,E06000019 296 | Darlington (B),E06000005 297 | Dundee City,S12000042 298 | East Ayrshire,S12000008 299 | East Dunbartonshire,S12000045 300 | East Renfrewshire,S12000011 301 | East Riding of Yorkshire,E06000011 302 | Falkirk,S12000014 303 | Glasgow City,S12000046 304 | Halton (B),E06000006 305 | Hartlepool (B),E06000001 306 | Inverclyde,S12000018 307 | Luton (B),E06000032 308 | Medway (B),E06000035 309 | Merthyr Tudful - Merthyr Tydfil,W06000024 310 | Middlesbrough (B),E06000002 311 | Midlothian,S12000019 312 | Milton Keynes (B),E06000042 313 | North East Lincolnshire (B),E06000012 314 | North Lanarkshire,S12000044 315 | North 
Lincolnshire (B),E06000013 316 | Pen-y-bont ar Ogwr - Bridgend,W06000013 317 | Perth and Kinross,S12000024 318 | Poole (B),E06000029 319 | Powys - Powys,W06000023 320 | Reading (B),E06000038 321 | Redcar and Cleveland (B),E06000003 322 | Renfrewshire,S12000038 323 | Rhondda Cynon Taf - Rhondda Cynon Taf,W06000016 324 | Rutland,E06000017 325 | Scottish Borders,S12000026 326 | Shropshire,E06000051 327 | Sir Ddinbych - Denbighshire,W06000004 328 | Sir Gaerfyrddin - Carmarthenshire,W06000010 329 | Sir y Fflint - Flintshire,W06000005 330 | Slough (B),E06000039 331 | Southend-on-Sea (B),E06000033 332 | South Gloucestershire,E06000025 333 | South Lanarkshire,S12000029 334 | Stirling,S12000030 335 | Stockton-on-Tees (B),E06000004 336 | Swindon (B),E06000030 337 | Telford and Wrekin (B),E06000020 338 | The City of Brighton and Hove (B),E06000043 339 | Thurrock (B),E06000034 340 | Tor-faen - Torfaen,W06000020 341 | Torbay (B),E06000027 342 | Warrington (B),E06000007 343 | West Berkshire,E06000037 344 | West Dunbartonshire,S12000039 345 | West Lothian,S12000040 346 | Wiltshire,E06000054 347 | Windsor and Maidenhead (B),E06000040 348 | Wokingham (B),E06000041 349 | Wrecsam - Wrexham,W06000006 350 | York (B),E06000014 351 | Broadland District,E07000144 352 | North Norfolk District,E07000147 353 | South Hams District,E07000044 354 | Torridge District,E07000046 355 | Isle of Wight,E06000046 356 | Sir Ynys Mon - Isle of Anglesey,W06000001 357 | Gwynedd - Gwynedd,W06000002 358 | Caerdydd - Cardiff,W06000015 359 | Sir Ceredigion - Ceredigion,W06000008 360 | Sir Fynwy - Monmouthshire,W06000021 361 | Sir Benfro - Pembrokeshire,W06000009 362 | North Somerset,E06000024 363 | Highland,S12000017 364 | Moray,S12000020 365 | Orkney Islands,S12000023 366 | Na h-Eileanan an Iar,S12000013 367 | Argyll and Bute,S12000035 368 | Aberdeenshire,S12000034 369 | Fife,S12000015 370 | Aberdeen City,S12000033 371 | City of Edinburgh,S12000036 372 | East Lothian,S12000010 373 | Shetland Islands,S12000027 
374 | North Ayrshire,S12000021 375 | Dumfries and Galloway,S12000006 376 | City of Portsmouth (B),E06000044 377 | City of Plymouth (B),E06000026 378 | South Ayrshire,S12000028 379 | Northumberland,E06000057 380 | Cornwall,E06000052 381 | Isles of Scilly,E06000053 382 | -------------------------------------------------------------------------------- /tests/data/left_1.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,Will,James,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Jody,Liz,Brown,20/05/1960,another thing 5 | 4,David,James,Williams,01/01/2000,thing4 6 | 7 | -------------------------------------------------------------------------------- /tests/data/left_2.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,Alistair,Paul,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Alisdair,Paul,Jonson,20/05/1961,another thing 5 | 4,David,Paul,Williams,01/01/2000,final thing 6 | -------------------------------------------------------------------------------- /tests/data/left_3.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city,email 2 | Noah ,John,1979-05-29,London,kernandeztin@white-clarh.com 3 | William ,Leo,1996-01-29,Ipswich,simmonseric@estes.info 4 | Muhammad ,lRey,1987-04-13,Bradford,ngutierrez@schmidt.com 5 | iasJne ,Newton,1973-09-15,London,lhale@cooper-novak.info 6 | Brown,Jacob ,1994-10-11,London,xwilliams@mcmahon.inf 7 | Daniel ,Gardner,1988-04-08,Edinburgh,anthony27@english.com 8 | Samuel ,Harrison,2004-06-11,London,andersonamy@love.com 9 | John,Rosie ,2017-10-07,Dudley,irichmond@jones.net 10 | Elizabeth ,Holmes,2005-12-10,London,armstrongdavid@simpson.com 11 | Elizabeth ,Bennett,1977-01-04,London,christophersmith@vega-dickson.net 12 | Sophie 
,Scot,1976-10-05,Belfast,etephen96@munz-ross.biz 13 | Reuben ,Davis,1980-11-13,London,laura99@carey.biz 14 | Chloe ,Edwards,2001-03-01,Leeds,don1an6@stevens.inf 15 | Hugo ,Hughes,2006-07-30,Bristol,amandawaker@lee.cmo 16 | Esme ,Hunter,1980-05-06,London,christopher42@lynch.com 17 | Gabriel ,Warren,2012-05-28,Nottingham,aanor93@carenas.com 18 | Sophia ,Stevenson,2004-10-27,Bradford,wilcoxcatherine@norris-hill.net 19 | Jasper ,Thompson,2008-04-23,Glasgow,pamela40@wells-bond.com 20 | Isaac ,Smth,2004-02-19,Belfast,andersonannttee@west-artinez.org 21 | Muhammad ,Tucer,1989-11-20,Bradford,briai15@ood.bnz 22 | Lowe,Maria ,1976-01-29,Brighton,wberry@tucker.com 23 | Olivia ,Scott,1976-12-23,London,brian24@hunt-hall.org 24 | Muhammad ,Wood,1990-04-10,Manchester,dixonbenjamin@mann-hall.biz 25 | Elizabeth ,Reed,1972-10-05,Liverpool,kristy28@osborne-ochoa.net 26 | Henry ,nhopsoT,1982-03-11,Colchester,perezalicia@brites-cook.neg 27 | Matthew ,Lewis,2011-12-06,London,reneeblack@bean.com 28 | Benjman ,Heath,2008-11-26,Leicester,kara10@briggs.com 29 | Arthur ,Johnston,1982-08-22,Kingston-upon-Hull,sroman@blacf.ino 30 | kaJ ,Lynch,2015-02-15,Sheffield,alejandro23@west-lane.org 31 | Alexander ,Patel,1971-05-18,Edinhugb,mollyyoung@schroeder-allen.com 32 | Shah,Jessica ,1993-06-24,Coventry,rhonda75@case.net 33 | Harry ,Williams,1998-04-02,York,kdeleon@crawford-nicholson.com 34 | Thompson,Oscar ,1981-07-15,London,xeaver@milrel.com 35 | Oscar,Wilson,2013-09-27,Bristol,kyle91@cunningham-bell.com 36 | Henr ,Scott,1997-10-22,Birmingham,porterkataleen@bll-hhll.com 37 | ai ,Taylor,1987-10-24,Swindon,owenspatricia@farrell.com 38 | Eva ,Parker,1980-06-16,London,davidsonjoseph@montgomery-harris.com 39 | Felicity ,Bull,1984-10-01,Derby,simpsondaniellr@laeso.com 40 | Samuel ,Curtis,1980-01-02,London,loganhcristopherbutler.org 41 | Harvey ,Jarvis,2012-10-12,Leicester,christrne64@eilly.com 42 | Jack ,Jones,1992-01-20,Lees,natalie78@watkins.com 43 | Scarlett 
,Bnwr,1974-05-01,London,leeaaron@brewer-hill.net 44 | Nancy,Wright,1993-09-27,pIswih,vvaldez@ruiz.com 45 | Isla ,Coleman,1997-11-22,London,jameswilson@wiggins.net 46 | Dyisa,Greenwood,2015-01-26,Stockton-on-Tees,utrevino@ross-vargas.info 47 | Finley ,Sutherland,2001-02-10,London,brownashley@flores.com 48 | Henderson,Henry ,1999-10-27,London,muis17@medna.col 49 | Jase ,Wood,1980-11-16,Leeds,pinedastephanie@hall-walker.org 50 | Jesse ,Allen,1986-11-24,Lndoo,autumn07@young-boyer.net 51 | Harrison ,Wiosn,2010-11-28,London,fgolden@henderson.com 52 | eLi ,Rowe,1989-11-14,London,sgomez@jones.com 53 | Teeodorh,Lucas,1990-05-10,Leiecter,cynthiaanderson@welch.com 54 | Elmy ,Foster,2013-10-25,Liverpol,.ara37@yongtinfo 55 | Bella ,Griffiths,1996-08-22,Walsall,thompsonrobert@lee.biz 56 | Stanley ,White,1981-02-12,ork,michaela15@torres.info 57 | Chloe ,James,1984-11-23,Lodnn,rachel17@alexander.com 58 | reoGg ,Pweol,1995-09-18,Birmingham,andrea81@haney.com 59 | Noah ,Cle,2002-02-05,Stockton-on-Tees,daviskimberly@bennett-king.com 60 | Mitchell,Felix ,2000-10-21,Birmingham,moorejeffrey@riley.net 61 | Love,Ella ,1986-09-02,Warrington,jssicas3@thoma4.biz 62 | Leo ,Webb,2012-01-30,Sheffield,sharon50@alexander.com 63 | Sophi ,Stone,1985-06-10,London,qhawkins@pennington-hurley.biz 64 | Emlii ,Cooper,2001-05-27,rventCy,larrycampbell@doyle.biz 65 | Francesca ,Edwards,1987-09-12,Brighton,floydmegan@torres.com 66 | Knight,Adam ,1979-07-12,Derby,mcconnellstephen@walker-flores.net 67 | Phillips,iasy ,2009-12-08,Sheffield,grahamchad@tate.info 68 | Isabelle ,Doyle,2010-06-24,Lonno,michael84@munoz-matthews.com 69 | Robyn ,Ross,1998-06-20,Newcastle-upon-Tyne,wbrown.wall@co 70 | Lottie ,Smith,1982-12-03,London,robersonbrenda@sutton-phillips.com 71 | Arabella ,Gray,1978-11-01,Lnnoo,pmartinez@cox-williams.com 72 | Lewis ,Smith,1996-02-22,Londo,thawkins@hamilton-burch.com 73 | Daisy ,Bo,2009-02-16,Covntre,chelsea39@jackson.com 74 | Wi iaml,Walker,1985-10-11,Liverpool,tracysaiders@mntchellcollins.com 
75 | Brown,Daniel ,2002-02-21,Sunderland,qhouse@morton.org 76 | Esme ,Ward,1986-07-15,Norwich,christopherhughes@ryan-kramer.net 77 | Werra,Ncnay,1992-06-27,Coventry,doughertywayne@ramsey.com 78 | Scarlett ,Harvy,2008-01-30,Bristol,hamptonryan@wilcox-mendoza.biz 79 | Megan ,Campbell,2008-10-07,Liverpool,elizabeth68@farrell-hutchinson.net 80 | Beatrice ,Hawkins,2003-10-27,Middlesbrough,jermaineydung@oaton.biz 81 | George ,Stevenson,2002-01-20,Derby,david28@schultz.net 82 | Ella ,Potter,1971-12-16,London,nyan@srosg.com 83 | Ruby ,Oenill,1974-11-27,London,clarkcharles@williams.info 84 | Holly ,Edwards,2007-03-07,London,sfleming@dalton.com 85 | Mas o,Houghton,2005-09-05,London,simmonsscott@wilson.com 86 | Alice ,Payne,2004-07-26,London,jason24@brown.com 87 | Muhammad ,Wolsn,1982-09-01,Portsmouth,edward32@moreno-beasley.net 88 | Jack ,Chpaan,1995-01-17,Plymouth,justin21@smith-wong.com 89 | George ,Evans,1986-11-05,London,rcasey@hernandez.com 90 | Smth,L uso,1990-12-22,Glasow,zalvarado@sibmons.mi 91 | Olievr,Jarvis,2000-03-02,slaGgw,weisstheresa@baird.biz 92 | Oavil ,Watson,2006-01-06,Luton,uallen@king.biz 93 | Eva ,Young,1997-07-01,Wolverhampton,elizabeth48@cobb.com 94 | Bella ,Jenkins,1978-12-06,Lndon,kimberlywells@robinson-lam.net 95 | Eleanor ,Harding,2005-08-10,Liverpool,yflores@williams.com 96 | Alexander ,Stewart,1996-01-08,deeL,jason64@tanner.com 97 | Luna ,Baker,2015-07-11,London,steven05@meyer.biz 98 | Alexander ,Miller,1978-08-08,London,gabriel48@young-cooley.com 99 | Nancy ,Carter,1984-08-07,Leicester,diana42@sderaon.org 100 | Anna ,Atkinson,2012-01-02,Cardiff,aanderson@rhodes.com 101 | Jacob ,Brayel,1993-09-19,London,tomisdaraus@cobb.com 102 | -------------------------------------------------------------------------------- /tests/data/left_5_nas.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city 2 | Noah ,John,,London 3 | William ,Leo,1996-01-29,Ipswich 4 | Muhammad ,,1987-04-13,Bradford 5 | 
-------------------------------------------------------------------------------- /tests/data/left_token_escape.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,or,or and,and,20/05/1980,other data 3 | 2,or,or,or smith or,15/06/1990,more data 4 | 3,near,and,near,20/05/1960,another thing 5 | 6 | -------------------------------------------------------------------------------- /tests/data/right_1.csv: -------------------------------------------------------------------------------- 1 | id,name,middlename,surname,date,other 2 | 1,William,J,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Jodi,Elizabeth,Brown,20/05/1961,another thing 5 | 4,William,James,Johnston,21/05/1951,other data -------------------------------------------------------------------------------- /tests/data/right_2.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,Alistair,Paul,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Alasdair,Paul,Johnson,20/05/1960,another thing 5 | -------------------------------------------------------------------------------- /tests/data/right_3.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city,email 2 | Noah ,John,1979-07-02,London,hernandeztina@white-clark.com 3 | William ,Lowe,1996-01-29,Ipswich,simmonseric@estes.info 4 | Riley,Muhammad ,1987-04-13,Bradford,ngutierrez@schmidt.com 5 | Jasmine ,tewNo,1973-07-29,London,lhale@cooper-novak.info 6 | Jacob ,Brown,1994-07-26,London,xwilliams@mcmahon.info 7 | Daniel ,Gardner,1988-04-08,Edinburg,anthony27@english.com 8 | Hrrison,Samuel ,2004-07-30,London,andersonaym@love.om 9 | John,iose ,2017-10-07,Dudley,irichmond@jones.net 10 | Elizabeth ,Hlmes,2005-12-10,Lnodo,armstrongdavid@simpson.com 11 | Elizabeth 
,Bennett,1977-01-04,London,christophersmith@vega-dickson.net 12 | Scott,Sophie ,1976-10-05,Belfast,stephen96@munoz-rose.biz 13 | Reuben ,Davis,1980-11-13,London,laura99@carey.biz 14 | Chloe ,Edwards,2001-03-01,Leeds,donna16@stevens.info 15 | Hugo ,Hughes,2006-07-30,Bristol,amandawalker@lee.com 16 | Esme ,Hunter,1980-05-06,London,christopher42@lynch.com 17 | Gabriel ,Warren,2012-05-28,Nottingham,aaron93@cardenas.com 18 | Sophia ,Stevenson,2004-12-22,Bradford,wilcoxcatheline@norrs-hirl.net 19 | Jasper ,Thompson,2008-06-07,Glogsw,pamela40@wells-bond.com 20 | Isaac ,Smith,2004-02-19,lefast,andersonannette@west-martinez.org 21 | Muhammad ,Tucker,1990-01-05,Bradford,brian15@wood.biz 22 | Maria ,Lowe,1976-01-29,Brighton,wberry@tucker.com 23 | Olivia ,Scott,1976-12-23,nonoL,brian24-unthhall.org 24 | Muhamad ,Wood,1990-04-10,Manchester,dixonbejamin@mnna-hall.biz 25 | Eizabeht ,Reed,1972-10-05,Liverpool,kristy28@osborne-ochoa.net 26 | Henry ,Thompson,1982-03-11,Colcheter,perezalicia@bridges-cook.net 27 | Methaw ,Lewis,2011-12-06,Londo,reneeblack@bean.com 28 | Bniamjn ,Heath,2008-09-06,Leicester,kara10@briggs.com 29 | Arthur ,Jhnntos,1982-09-21,Kingston-upon-Hull,sroman@black.info 30 | Jack ,Lynch,2015-02-15,Sheffield,alejandro23@west-lane.rg 31 | Alexander ,Patel,1971-05-28,Edinburgh,mollyyoung@schroeder-allen.com 32 | Jssicae,ahh,1993-08-10,Coventry,rhond5a@case.net 33 | Harry ,Williams,1998-04-02,Yok,kdeleon@crawford-nicholson.com 34 | Oscr ,Thompson,1981-07-15,London,xweaer@millvr.com 35 | O ars,Wilson,2013-09-27,Bistol,kyle91@cunningham-bell.com 36 | Henry ,Scot,1997-10-22,Birmmnhai,porterkathleen@bell-hall.com 37 | Mia ,Taylor,1987-10-24,Soindw,owenspatricia@farrell.com 38 | Parker,vEa,1980-06-16,Lodon,davidsonjoseph@montgomery-harris.com 39 | Felicity ,Bull,1985-01-02,Derby,simpsondanielle@larson.com 40 | Samuel ,Curtis,1979-12-23,London,loganchristopher@butler.org 41 | Harvey ,Jaivs,2012-08-27,Leicester,ihritcne64@reilly.com 42 | Jck 
,Jones,1992-05-03,Leeds,natalie78@watkins.com 43 | Scarlett ,Bnow,1974-05-01,London,leeaaron@brewer-hill.net 44 | Nancy ,Wright,1993-09-18,Ipswich,vvaldez@ruiz.com 45 | Isla ,Coleman,1997-11-22,London,jameswilsonwiggins.net 46 | Greenwood,Daisy ,2014-10-20,etocton-on-TeSs,utrevino@ross-vargas.info 47 | Felny ,Sutherland,2001-02-10,London,browascley@flores.hom 48 | Henry,Henderson,1999-10-27,London,luis17@medina.com 49 | James,Wood,1981-03-04,Leeds,pinedastephanie@hall-walker.org 50 | Allen,Jesse ,1987-01-06,Lonon,autumn07@young-boyen.rt 51 | Harrison ,Wilson,2010-09-29,Lndoo,fgolden@henderson.com 52 | Lexi ,Rowe,1989-09-24,oonLn,sgomez@jones.com 53 | Theodore ,Lucas,1990-05-10,Leicester,cynthiaanderson@welch.com 54 | Emily ,Foster,2013-10-25,Liverpool,tara37@uyng.info 55 | Bella ,Griffihs,1996-08-22,Walsall,thompsonrobert@lee.biz 56 | Stanley ,Whet,1981-02-12,York,michaela15@torres.info 57 | Chloe ,James,1984-12-27,nodon,rachel17@alexander.com 58 | George ,Powell,1995-12-19,Birmingham,andrea81@haney.com 59 | Nah ,Cole,2002-02-05,Stockton-on-Tees,daviskimberly@bennett-king.com 60 | Felix ,Mitchell,2000-10-21,Brmiigham,moorejeffrey@riley.net 61 | Ella ,Love,1986-10-08,Warrington,jessica43@thomas.biz 62 | Leo ,Webb,2012-03-11,Sheffield,sharon50@alexander.com 63 | Sophia ,tSne,1985-06-10,London,qhawkins@pennington-hurley.biz 64 | Emilia ,Cooper,2001-08-19,Coventry,lerrycampbell@doyl.biz 65 | Edwards,Francesca ,1987-09-12,Bightor,floydmegan@torres.com 66 | Ada ,Knight,1979-04-24,Dyrb,mcconnellstfphen@walker-eloresnet 67 | Daisy ,Phillips,2010-02-17,Sheffield,grahamchad@tate.info 68 | Isabelle ,Doyle,2010-06-24,London,michael84@munoz-matthews.com 69 | boRyn,oRs,1998-06-20,NewcTtle-upon-syne,wbrown@wall.com 70 | Lottie ,Smith,1982-12-03,London,robersonbrenda@sutton-phillips.com 71 | Arabella ,Gray,1978-11-01,London,pmartinez@cox-williams.com 72 | Lewis ,Smith,1996-02-22,London,thaokins@hamilton-burh.cwm 73 | Daisy ,Bob,2009-02-16,Coventry,chelsea39@jackson.com 74 | 
William ,Walker,1985-10-11,Liverpool,tracysanders@mitchell-collins.com 75 | Daniel ,Brown,2002-01-28,Sdnuerlan,qhouse@morton.org 76 | Esme ,dar,1986-07-15,Norwich,christopherhughes@ryan-kramer.net 77 | Nancy ,raWen,1992-07-19,Coertny,doughertywayne@ramsey.com 78 | Scarlett ,Harvey,2008-01-30,Bristol,hamptonryan@wilcox-mendoza.biz 79 | Campbell,Megan ,2008-12-12,Livprool,elizabeth68@farrell-hutchinson.net 80 | Beatrice ,Hawkins,2003-10-27,Middlesbrough,jermaineyoung@dalton.biz 81 | George ,Stevenson,2002-01-20,Derby,david28@schultz.net 82 | Ella ,Potter,1971-12-16,London,nryan@gross.com 83 | Ruby ,O neill,1974-08-26,London,clarkcharles@williams.info 84 | Holly ,Edwards,2007-02-27,London,sfleming@dalton.com 85 | Mans ,Houghton,2005-09-05,London,simmonsscott@wilson.com 86 | Acile,Payne,2004-11-17,Lndon,jason24@brown.com 87 | Muhammad ,Wilson,1982-08-22,Portsmouth,edward32@moreno-beasley.net 88 | Jack ,Chapman,1995-02-15,Plymouth,justin21@smith-wong.com 89 | Evns,George ,1986-11-05,London,rcasey@hernandez.com 90 | Louis ,Smith,1990-12-22,Glasgow,zalvarado@simmons.biz 91 | Jarvis,Oliver ,2000-03-07,Glasgow,weisstheresa@baird.biz 92 | Olivia ,Watson,2006-01-06,Lnot,uallen@king.biz 93 | Eva ,Young,1997-05-24,Wolverhampton,elizabeo48@ctbb.com 94 | Bella ,Jenkins,1979-03-13,London,kimberlywells@robinson-tm.nel 95 | Eeanor ,Harding,2005-08-10,Loerpovl,yfloreslil@iams.com 96 | Alexander ,Stewart,1996-01-08,Leeds,jason64@tanner.com 97 | auL ,Bakr,2015-06-26,London,stven05m@eyer.biz 98 | Alexander ,Miller,1978-07-30,London,gabriel48@young-cooley.com 99 | Nancy ,Carter,1984-08-07,Leicester,diana42@anderson.org 100 | Anna ,Atkinson,2012-01-02,Cardiff,aanderson@rhodes.com 101 | Jacob ,Bradley,1993-12-23,London,thomasdarius@cobb.com 102 | -------------------------------------------------------------------------------- /tests/data/right_5_nas.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city 2 | Noah ,,,London 3 | 
William ,Leo,1996-01-29,Ipswich 4 | Muhammad ,lRey,1987-04-13,Bradford 5 | -------------------------------------------------------------------------------- /tests/data/right_token_escape.csv: -------------------------------------------------------------------------------- 1 | id,name,middlename,surname,date,other 2 | 1,or,or,or smith or,15/06/1990,more data 3 | 2,near,and,near,20/05/1960,another thing 4 | 3,or,or and,and,20/05/1980,other data -------------------------------------------------------------------------------- /tests/datagetter_performance.txt: -------------------------------------------------------------------------------- 1 | {"datetime": "2017-11-28T18:24:43.058249", "commit_hash": "c2e71ec", "datagetter_cartesian": 0.97, "datagetter_sqlite": 0.54, "test_type": "left_3"} 2 | {"datetime": "2017-11-28T21:55:46.633256", "commit_hash": "4115da7", "datagetter_cartesian": 0.96, "datagetter_sqlite": 0.87, "test_type": "left_3"} 3 | {"datetime": "2017-12-02T09:25:15.475843", "commit_hash": "2279321", "datagetter_cartesian": 0.96, "datagetter_sqlite": 0.87, "test_type": "left_3"} 4 | {"datetime": "2017-12-03T18:12:18.623974", "commit_hash": "aee05f3", "datagetter_cartesian": 0.87, "datagetter_sqlite": 0.93, "test_type": "left_3"} 5 | {"datetime": "2017-12-09T09:10:17.671481", "commit_hash": "800d0de", "datagetter_cartesian": 0.98, "datagetter_sqlite": 0.94, "test_type": "left_3"} 6 | {"datetime": "2017-12-09T09:15:08.065057", "commit_hash": "800d0de", "datagetter_cartesian": 0.98, "datagetter_sqlite": 0.94, "test_type": "left_3", "time_taken": 19.634991832004744} 7 | {"datetime": "2017-12-10T14:50:23.598907", "commit_hash": "42cd45e", "datagetter_cartesian": 0.92, "datagetter_sqlite": 0.89, "test_type": "left_3", "time_taken": 0.5589753400126938} 8 | {"datetime": "2017-12-10T14:52:42.900029", "commit_hash": "42cd45e", "datagetter_cartesian": 0.92, "datagetter_sqlite": 0.89, "test_type": "left_3", "time_taken": 0.5655303659732454} 9 | {"datetime": 
import pandas as pd
import numpy as np
import random
import string
import datetime


def get_random_by_freq(csv_path, field, num_elements):
    """Draw `num_elements` values of `field` from the csv at `csv_path`,
    weighted by the csv's `freq` column."""
    df = pd.read_csv(csv_path)
    elements = df[field]
    # Normalise raw frequencies into probabilities that sum to 1.
    probabilities = df["freq"] / df["freq"].sum()
    return np.random.choice(elements, num_elements, p=list(probabilities))


def get_fakes(fake_fn, num_elements):
    """Call the zero-argument factory `fake_fn` `num_elements` times and
    return the results as a list."""
    return [fake_fn() for _ in range(num_elements)]


def switch(my_string):
    """Return `my_string` with two randomly chosen positions swapped.

    The two positions may coincide, in which case the string is unchanged.
    Strings shorter than 2 characters are returned as-is (randrange on an
    empty range would raise).
    """
    if len(my_string) < 2:
        return my_string
    chars = list(my_string)
    pos1 = random.randrange(len(chars))
    pos2 = random.randrange(len(chars))
    chars[pos1], chars[pos2] = chars[pos2], chars[pos1]
    return "".join(chars)


def new_letter(my_string):
    """Insert one random lowercase ascii letter at a random position."""
    # Guard the empty string: randrange(0, 0) raises ValueError.
    pos1 = random.randrange(len(my_string)) if my_string else 0
    letter = random.choice(string.ascii_lowercase)
    return my_string[:pos1] + letter + my_string[pos1:]


def delete_letter(my_string):
    """Delete one randomly chosen character, never the first one.

    Strings of length <= 1 are returned unchanged (there is nothing after
    position 0 to delete, and randrange(1, 1) would raise).
    """
    if len(my_string) <= 1:
        return my_string
    pos1 = random.randrange(1, len(my_string))
    return my_string[:pos1] + my_string[pos1 + 1:]


def corrupt_string(my_string, num_switches=1, num_new_letters=0, num_deletes=1):
    """Simulate typos: apply `num_switches` transpositions, then
    `num_new_letters` insertions, then `num_deletes` deletions."""
    for _ in range(num_switches):
        my_string = switch(my_string)

    for _ in range(num_new_letters):
        my_string = new_letter(my_string)

    for _ in range(num_deletes):
        my_string = delete_letter(my_string)

    return my_string


def corrupt_dob(dob):
    """Shift an ISO `yyyy-mm-dd` date of birth by a random +/-100 days,
    returning it in the same format."""
    fmt = "%Y-%m-%d"
    date_1 = datetime.datetime.strptime(dob, fmt)
    end_date = date_1 + datetime.timedelta(days=random.randint(-100, 100))
    return end_date.strftime(fmt)


def create_test_data(num_elements=100):
    """Build a (df_left, df_right) pair of synthetic person records.

    Both frames start identical; each is then independently corrupted
    (typos, swapped name fields, shifted dobs) so they can exercise the
    fuzzy matcher. Reads the frequency csvs from the `data/` directory
    relative to the current working directory.
    """
    # faker is only used here; import lazily so the module stays importable
    # (and the pure-string helpers stay testable) without it.
    from faker import Faker
    fake = Faker()

    data = {}
    # Numeric prefixes pin the column order; they are stripped below.
    data["01first_name"] = get_random_by_freq("data/_first_names.csv", "name", num_elements)
    data["02surname"] = get_random_by_freq("data/_surnames.csv", "surname", num_elements)
    data["03dob"] = get_fakes(fake.date, num_elements)
    data["04city"] = get_random_by_freq("data/_cities.csv", "city", num_elements)
    data["05email"] = get_fakes(fake.company_email, num_elements)

    df_left = pd.DataFrame(data)
    df_left.columns = [c[2:] for c in df_left.columns]

    df_right = df_left.copy()

    for df in [df_left, df_right]:
        for index, row in df.iterrows():

            for col in ["first_name", "surname", "city", "email"]:
                if random.random() > 0.8:
                    df.loc[index, col] = corrupt_string(row[col])

            # Sometimes switch first name and surname
            if random.random() > 0.9:
                df.loc[index, "first_name"] = row["surname"]
                df.loc[index, "surname"] = row["first_name"]

            # Corrupt the dob
            if random.random() > 0.8:
                df.loc[index, "dob"] = corrupt_dob(row["dob"])
    return df_left, df_right
def get_commit_hash():
    """Return the short hash of the current git commit, or "" when git
    is unavailable.

    Uses an argv list with the default shell=False instead of a shell
    string, which avoids spawning a shell for a fixed command.
    """
    try:
        result = subprocess.run(
            ["git", "describe", "--always"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # git not installed / not on PATH; degrade to an empty hash so the
        # performance log line can still be written.
        return ""
    return result.stdout.decode("utf-8").replace("\n", "")
link_table_percentage_correct(link_table): 27 | lt = link_table.copy() 28 | lt = lt[lt["match_rank"] == 1] 29 | lt["__id_left"] = lt["__id_left"].str.replace("_left", "") 30 | lt["__id_right"] = lt["__id_right"].str.replace("_right", "") 31 | lt["link_correct"] = (lt["__id_left"] == lt["__id_right"]) 32 | 33 | return lt["link_correct"].sum()/len(lt) 34 | 35 | class DatagetterAccuracy(unittest.TestCase): 36 | """ 37 | These tests actually run accurancy analysis of the results 38 | They're not pass fail but they log how well the matcher is doing 39 | """ 40 | 41 | def test_data_1000(self): 42 | 43 | m = Matcher() 44 | 45 | df_left = pd.read_csv("tests/data/left_4.csv") 46 | df_right = pd.read_csv("tests/data/right_4.csv") 47 | 48 | on = ["first_name", "surname", "dob", "city"] 49 | 50 | m.add_data(df_left, df_right, on, on) 51 | 52 | start = timer() 53 | m.match_all() 54 | lt = m.get_formatted_link_table() 55 | end = timer() 56 | time_taken = end - start 57 | sqlite_perc = link_table_percentage_correct(lt) 58 | 59 | this_record = {} 60 | this_record["datetime"] = datetime.datetime.now().isoformat() 61 | this_record["commit_hash"] = get_commit_hash() 62 | this_record["datagetter_cartesian"] = "NA" 63 | this_record["datagetter_sqlite"] = sqlite_perc 64 | this_record["test_type"] = "left_4" 65 | this_record["time_taken"] = time_taken 66 | 67 | with open("tests/datagetter_performance.txt", "a") as myfile: 68 | myfile.writelines(json.dumps(this_record) + "\n") 69 | 70 | def test_data_100(self): 71 | dg = DataGetterCartesian() 72 | m = Matcher(data_getter = dg) 73 | 74 | df_left = pd.read_csv("tests/data/left_3.csv") 75 | df_right = pd.read_csv("tests/data/right_3.csv") 76 | 77 | on = ["first_name", "surname", "dob", "city"] 78 | 79 | m.add_data(df_left, df_right, on, on) 80 | 81 | start = timer() 82 | m.match_all() 83 | lt = m.get_formatted_link_table() 84 | end = timer() 85 | time_taken = end - start 86 | 87 | cartesian_perc = link_table_percentage_correct(lt) 88 | 89 | 
lt2 = link_table(df_left, df_right, on, on) 90 | sqlite_perc = link_table_percentage_correct(lt2) 91 | 92 | this_record = {} 93 | this_record["datetime"] = datetime.datetime.now().isoformat() 94 | this_record["commit_hash"] = get_commit_hash() 95 | this_record["datagetter_cartesian"] = cartesian_perc 96 | this_record["datagetter_sqlite"] = sqlite_perc 97 | this_record["test_type"] = "left_3" 98 | this_record["time_taken"] = time_taken 99 | 100 | with open("tests/datagetter_performance.txt", "a") as myfile: 101 | myfile.writelines(json.dumps(this_record) + "\n") 102 | 103 | def test_la_data(self): 104 | ons = pd.read_csv("tests/data/las_ons.csv") 105 | os = pd.read_csv("tests/data/las_os.csv") 106 | 107 | start = timer() 108 | df_joined = fuzzy_left_join(ons, os, left_on = ["lad16nm"], right_on = ["name"]) 109 | end = timer() 110 | time_taken = end - start 111 | 112 | rename = {"lad16cd": "ons_code", "code": "os_code", "lad16nm": "ons_name", "name": "os_name"} 113 | df_joined = df_joined.rename(columns=rename) 114 | col_order = ["best_match_score", "ons_name", "os_name", "ons_code", "os_code"] 115 | 116 | num_records = len(df_joined) 117 | correct_binary = (df_joined["ons_code"] == df_joined["os_code"]) 118 | perc_correct = correct_binary.sum()/num_records 119 | 120 | this_record = {} 121 | this_record["datetime"] = datetime.datetime.now().isoformat() 122 | this_record["commit_hash"] = get_commit_hash() 123 | this_record["perc_correct"] = perc_correct 124 | this_record["test_type"] = "local_authority" 125 | this_record["time_taken"] = time_taken 126 | 127 | with open("tests/realexample_performance.txt", "a") as myfile: 128 | myfile.writelines(json.dumps(this_record) + "\n") 129 | -------------------------------------------------------------------------------- /tests/test_colnames.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests 5 | """ 6 | 7 | import unittest 8 | from fuzzymatcher 
import link_table 9 | import pandas as pd 10 | 11 | ''' 12 | The data path does not seem to be correct, judging from what 13 | was created using 'create_fake_dataset.ipynb'. Correcting it to match 14 | true locations (data folder within tests folder). 15 | ''' 16 | # "tests/data/left_2.csv" (original left data path) 17 | # "tests/data/right_2.csv" (original right data path) 18 | left_1_path = "./data/left_1.csv" 19 | right_1_path = "./data/right_1.csv" 20 | left_2_path = "./data/left_2.csv" 21 | right_2_path = "./data/right_2.csv" 22 | 23 | class ColNameCollisions(unittest.TestCase): 24 | """ 25 | Test what happens when the user provides input data with 26 | some column names which are the same in each dataset 27 | """ 28 | 29 | 30 | 31 | 32 | def test_all_colnames_match(self): 33 | """ 34 | Adding two numbers should give the correct answer 35 | """ 36 | left = pd.read_csv(left_2_path) 37 | right = pd.read_csv(right_2_path) 38 | left_on = ["fname", "mname", "lname", "dob"] 39 | right_on = ["fname", "mname", "lname", "dob"] 40 | 41 | df = link_table(left, right, left_on, right_on) 42 | 43 | expected_columns = ['__id_left', 44 | '__id_right', 45 | 'match_score', 46 | 'match_rank', 47 | 'fname_left', 48 | 'fname_right', 49 | 'mname_left', 50 | 'mname_right', 51 | 'lname_left', 52 | 'lname_right', 53 | 'dob_left', 54 | 'dob_right'] 55 | 56 | actual_columns = list(df.columns) 57 | self.assertEqual(expected_columns, actual_columns) 58 | 59 | def test_all_colnames_match_with_id(self): 60 | """ 61 | Adding two numbers should give the correct answer 62 | """ 63 | left = pd.read_csv(left_2_path) 64 | right = pd.read_csv(right_2_path) 65 | left_on = ["fname", "mname", "lname", "dob"] 66 | right_on = ["fname", "mname", "lname", "dob"] 67 | 68 | df = link_table(left, right, left_on, right_on, left_id_col="id", right_id_col="id") 69 | 70 | expected_columns = ['__id_left', 71 | '__id_right', 72 | 'match_score', 73 | 'match_rank', 74 | 'fname_left', 75 | 'fname_right', 76 | 
'mname_left', 77 | 'mname_right', 78 | 'lname_left', 79 | 'lname_right', 80 | 'dob_left', 81 | 'dob_right'] 82 | 83 | actual_columns = list(df.columns) 84 | self.assertEqual(expected_columns, actual_columns) 85 | 86 | def test_some_colnames_match(self): 87 | """ 88 | Adding two numbers should give the correct answer 89 | """ 90 | left = pd.read_csv(left_1_path) 91 | left = left.rename(columns = {"fname": "name"}) 92 | right = pd.read_csv(right_1_path) 93 | left_on = ["name", "mname", "lname", "dob"] 94 | right_on = ["name", "middlename", "surname", "date"] 95 | 96 | df = link_table(left, right, left_on, right_on) 97 | 98 | expected_columns = ['__id_left', 99 | '__id_right', 100 | 'match_score', 101 | 'match_rank', 102 | 'name_left', 103 | 'name_right', 104 | 'mname', 105 | 'middlename', 106 | 'lname', 107 | 'surname', 108 | 'dob', 109 | 'date'] 110 | 111 | actual_columns = list(df.columns) 112 | self.assertEqual(expected_columns, actual_columns) 113 | 114 | 115 | if __name__ == '__main__': 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from fuzzymatcher import link_table 3 | import pandas as pd 4 | 5 | class TestNulls(unittest.TestCase): 6 | """ 7 | Test what happens when the user provides input data with 8 | null values in some of the cells 9 | """ 10 | 11 | def test_nulls_no_errors(self): 12 | """ 13 | Adding two numbers should give the correct answer 14 | """ 15 | df_left = pd.read_csv("tests/data/left_5_nas.csv") 16 | df_right = pd.read_csv("tests/data/right_5_nas.csv") 17 | 18 | on = ["first_name", "surname", "dob", "city"] 19 | 20 | flj = link_table(df_left, df_right, on, on) 21 | 22 | 23 | class TestNulls(unittest.TestCase): 24 | """ 25 | Test what happens when the user provides input data with 26 | fts4 match expression keyworks like AND, OR, NEAR 27 | """ 28 | 29 | def 
test_nulls_no_errors(self): 30 | """ 31 | 32 | """ 33 | 34 | 35 | df_left = pd.read_csv("tests/data/left_token_escape.csv") 36 | df_right = pd.read_csv("tests/data/right_token_escape.csv") 37 | 38 | # Columns to match on from df_left 39 | left_on = ["fname", "mname", "lname"] 40 | 41 | # Columns to match on from df_right 42 | right_on = ["name", "middlename", "surname"] 43 | 44 | on = ["first_name", "surname", ] 45 | 46 | flj = link_table(df_left, df_right, left_on, right_on, 47 | left_id_col="id", right_id_col="id") 48 | --------------------------------------------------------------------------------