├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── LICENSE ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst ├── readme.rst └── source │ ├── fuzzymatcher.rst │ └── modules.rst ├── examples.ipynb ├── fuzzymatcher ├── __init__.py ├── data_getter_abc.py ├── data_getter_cartesian.py ├── data_getter_sqlite.py ├── data_preprocessor_abc.py ├── data_preprocessor_default.py ├── matcher.py ├── record.py ├── scorer_abc.py ├── scorer_default.py ├── tokencomparison.py └── utils.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── create_fake_dataset.ipynb ├── data ├── _cities.csv ├── _first_names.csv ├── _surnames.csv ├── las_ons.csv ├── las_os.csv ├── left_1.csv ├── left_2.csv ├── left_3.csv ├── left_4.csv ├── left_5_nas.csv ├── left_token_escape.csv ├── right_1.csv ├── right_2.csv ├── right_3.csv ├── right_4.csv ├── right_5_nas.csv └── right_token_escape.csv ├── datagetter_performance.txt ├── generate_test_data.py ├── realexample_performance.txt ├── test_accuracy.py ├── test_colnames.py └── test_misc.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.5, 3.6, 3.7] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | # - name: Lint with flake8 25 | # run: | 26 | # pip install flake8 27 | # # stop the build if there are Python syntax errors or undefined names 28 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide 30 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with unittest 32 | run: | 33 | pip install coverage 34 | coverage run -m unittest discover 35 | coverage xml -o codecov_report.xml 36 | - uses: codecov/codecov-action@v1.0.2 37 | with: 38 | token: ${{secrets.CODECOV_TOKEN}} #required 39 | file: ./codecov_report.xml #optional 40 | flags: unittests #optional 41 | name: codecov-umbrella #optional 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Robin Linacre 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://badge.fury.io/py/fuzzymatcher.svg 2 | :target: https://badge.fury.io/py/fuzzymatcher 3 | 4 | .. image:: https://codecov.io/gh/RobinL/fuzzymatcher/branch/dev/graph/badge.svg 5 | :target: https://codecov.io/gh/RobinL/fuzzymatcher 6 | 7 | 8 | fuzzymatcher 9 | ====================================== 10 | 11 | **Note: fuzzymatcher is no longer actively maintained. Please see** `splink `_ **for a more accurate, scalable and performant solution** 12 | 13 | A Python package that allows the user to fuzzy match two pandas dataframes based on one or more common fields. 14 | 15 | Fuzzymatches uses ``sqlite3``'s Full Text Search to find potential matches. 16 | 17 | It then uses `probabilistic record linkage `_ to score matches. 18 | 19 | Finally it outputs a list of the matches it has found and associated score. 20 | 21 | 22 | Installation 23 | ------------ 24 | 25 | ``pip install fuzzymatcher`` 26 | 27 | Note that you will need a build of sqlite which includes FTS4. This seems to be widely included by default, but otherwise `see here `_. 28 | 29 | Usage 30 | ----- 31 | 32 | See `examples.ipynb `_ for examples of usage and the output. 33 | 34 | You can run these examples interactively `here `_. 
35 | 36 | Simple example 37 | -------------- 38 | 39 | Suppose you have a table called ``df_left`` which looks like this: 40 | 41 | ==== ============= 42 | id ons_name 43 | ==== ============= 44 | 0 Darlington 45 | 1 Monmouthshire 46 | 2 Havering 47 | 3 Knowsley 48 | 4 Charnwood 49 | ... etc. 50 | ==== ============= 51 | 52 | And you want to link it to a table ``df_right`` that looks like this: 53 | 54 | ==== ========================= 55 | id os_name 56 | ==== ========================= 57 | 0 Darlington (B) 58 | 1 Havering London Boro 59 | 2 Sir Fynwy - Monmouthshire 60 | 3 Knowsley District (B) 61 | 4 Charnwood District (B) 62 | ... etc. 63 | ==== ========================= 64 | 65 | You can write: 66 | 67 | .. code:: python 68 | 69 | import fuzzymatcher 70 | fuzzymatcher.fuzzy_left_join(df_left, df_right, left_on = "ons_name", right_on = "os_name") 71 | 72 | And you'll get: 73 | 74 | ================== ============= ========================= 75 | best_match_score ons_name os_name 76 | ================== ============= ========================= 77 | 0.178449 Darlington Darlington (B) 78 | 0.133371 Monmouthshire Sir Fynwy - Monmouthshire 79 | 0.102473 Havering Havering London Boro 80 | 0.155775 Knowsley Knowsley District (B) 81 | 0.155775 Charnwood Charnwood District (B) 82 | ... etc. etc. 83 | ================== ============= ========================= 84 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = fuzzymatcher 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # fuzzymatcher documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Nov 15 15:39:19 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('.')) 23 | sys.path.insert(0, os.path.abspath('../')) 24 | 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 
35 | extensions = ['sphinx.ext.autodoc', 36 | 'sphinx.ext.githubpages', 37 | 'sphinx.ext.napoleon'] 38 | 39 | napoleon_google_docstring = True 40 | napoleon_numpy_docstring = False 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix(es) of source filenames. 46 | # You can specify multiple suffix as a list of string: 47 | # 48 | # source_suffix = ['.rst', '.md'] 49 | source_suffix = '.rst' 50 | 51 | # The master toctree document. 52 | master_doc = 'index' 53 | 54 | # General information about the project. 55 | project = 'fuzzymatcher' 56 | copyright = '2017, Robin Linacre' 57 | author = 'Robin Linacre' 58 | 59 | # The version info for the project you're documenting, acts as replacement for 60 | # |version| and |release|, also used in various other places throughout the 61 | # built documents. 62 | # 63 | # The short X.Y version. 64 | version = '0.1' 65 | # The full version, including alpha/beta/rc tags. 66 | release = '0.1' 67 | 68 | # The language for content autogenerated by Sphinx. Refer to documentation 69 | # for a list of supported languages. 70 | # 71 | # This is also used if you do content translation via gettext catalogs. 72 | # Usually you set "language" from the command line for these cases. 73 | language = None 74 | 75 | # List of patterns, relative to source directory, that match files and 76 | # directories to ignore when looking for source files. 77 | # This patterns also effect to html_static_path and html_extra_path 78 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 79 | 80 | # The name of the Pygments (syntax highlighting) style to use. 81 | pygments_style = 'sphinx' 82 | 83 | # If true, `todo` and `todoList` produce output, else they produce nothing. 84 | todo_include_todos = False 85 | 86 | 87 | # -- Options for HTML output ---------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 90 | # a list of builtin themes. 91 | # 92 | html_theme = 'sphinx_rtd_theme' 93 | 94 | # Theme options are theme-specific and customize the look and feel of a theme 95 | # further. For a list of options available for each theme, see the 96 | # documentation. 97 | # 98 | # html_theme_options = {} 99 | 100 | # Add any paths that contain custom static files (such as style sheets) here, 101 | # relative to this directory. They are copied after the builtin static files, 102 | # so a file named "default.css" will overwrite the builtin "default.css". 103 | html_static_path = ['_static'] 104 | 105 | 106 | # -- Options for HTMLHelp output ------------------------------------------ 107 | 108 | # Output file base name for HTML help builder. 109 | htmlhelp_basename = 'fuzzymatcherdoc' 110 | 111 | 112 | # -- Options for LaTeX output --------------------------------------------- 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'fuzzymatcher.tex', 'fuzzymatcher Documentation', 137 | 'Robin Linacre', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output --------------------------------------- 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 
145 | man_pages = [ 146 | (master_doc, 'fuzzymatcher', 'fuzzymatcher Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'fuzzymatcher', 'fuzzymatcher Documentation', 158 | author, 'fuzzymatcher', 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. fuzzymatcher documentation master file, created by 2 | sphinx-quickstart on Wed Nov 15 15:39:19 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to fuzzymatcher's documentation! 7 | ======================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | readme 14 | 15 | Another title goes here 16 | ======================= 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst -------------------------------------------------------------------------------- /docs/source/fuzzymatcher.rst: -------------------------------------------------------------------------------- 1 | fuzzymatcher package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | fuzzymatcher\.data\_getter\_abc module 8 | -------------------------------------- 9 | 10 | .. 
automodule:: fuzzymatcher.data_getter_abc 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | fuzzymatcher\.data\_getter\_sqlite module 16 | ----------------------------------------- 17 | 18 | .. automodule:: fuzzymatcher.data_getter_sqlite 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | fuzzymatcher\.data\_preprocessor\_abc module 24 | -------------------------------------------- 25 | 26 | .. automodule:: fuzzymatcher.data_preprocessor_abc 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | fuzzymatcher\.data\_preprocessor\_default module 32 | ------------------------------------------------ 33 | 34 | .. automodule:: fuzzymatcher.data_preprocessor_default 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | fuzzymatcher\.matcher module 40 | ---------------------------- 41 | 42 | .. automodule:: fuzzymatcher.matcher 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | fuzzymatcher\.record module 48 | --------------------------- 49 | 50 | .. automodule:: fuzzymatcher.record 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | fuzzymatcher\.scorer\_abc module 56 | -------------------------------- 57 | 58 | .. automodule:: fuzzymatcher.scorer_abc 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | fuzzymatcher\.scorer\_default module 64 | ------------------------------------ 65 | 66 | .. automodule:: fuzzymatcher.scorer_default 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | 72 | Module contents 73 | --------------- 74 | 75 | .. automodule:: fuzzymatcher 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | fuzzymatcher 2 | ============ 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | expected_usage 8 | fuzzymatcher 9 | setup 10 | tests 11 | try 12 | -------------------------------------------------------------------------------- /examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `fuzzymatcher` examples" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Basic usage - `link_table`\n", 15 | "\n", 16 | "In the most basic usage, the user provides `fuzzymatcher` with two pandas dataframes, indicating which columns to join on.\n", 17 | "\n", 18 | "The central output of `fuzzymatcher` is the `link_table`.\n", 19 | "\n", 20 | "For each record in the left table, the link table includes one or more possible matching records from the right table.\n", 21 | "\n", 22 | "The user can then inspect the link table and decide which matches to retain, e.g. by choosing a score threshold ( `match_score > chosen_threshold` ) or just choosing the best match ( `match_rank == 1` )" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import logging\n", 32 | "logging.basicConfig(level=logging.DEBUG)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import fuzzymatcher\n", 42 | "import pandas as pd\n", 43 | "\n", 44 | "df_left = pd.read_csv(\"tests/data/left_1.csv\")\n", 45 | "df_left" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df_right = pd.read_csv(\"tests/data/right_1.csv\")\n", 55 | "df_right" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# Columns to match on from 
df_left\n", 65 | "left_on = [\"fname\", \"mname\", \"lname\", \"dob\"]\n", 66 | "\n", 67 | "# Columns to match on from df_right\n", 68 | "right_on = [\"name\", \"middlename\", \"surname\", \"date\"]\n", 69 | "\n", 70 | "# Note that if left_id_col or right_id_col are admitted a unique id will be autogenerated\n", 71 | "fuzzymatcher.link_table(df_left, df_right, left_on, right_on, left_id_col = \"id\", right_id_col = \"id\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Basic usage - `fuzzy_left_join`\n", 79 | "\n", 80 | "A second option is to use `fuzzy_left_join`, which automatically links the two dataframes based on the highest-scoring match." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import fuzzymatcher\n", 90 | "import pandas as pd\n", 91 | "\n", 92 | "df_left = pd.read_csv(\"tests/data/left_1.csv\")\n", 93 | "df_right = pd.read_csv(\"tests/data/right_1.csv\")\n", 94 | "left_on = [\"fname\", \"lname\", \"dob\"]\n", 95 | "right_on = [\"name\", \"surname\", \"date\"]\n", 96 | "\n", 97 | "fuzzymatcher.fuzzy_left_join(df_left, df_right, left_on, right_on)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Basic example with real data\n", 105 | "### Matching the names of Local Authorities provided by Office for National Statistics with the names provided by Ordnance Survey\n", 106 | "\n", 107 | "We would usually join this data on the Local Authority District (LAD) Codes (e.g. E06000001 = Hartlepool), but sometimes these are unavailable. In this example, we fuzzy match on the name, but provide the LAD code to demonstate it has worked." 
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "ons = pd.read_csv(\"tests/data/las_ons.csv\")\n", 117 | "os = pd.read_csv(\"tests/data/las_os.csv\")\n", 118 | "\n", 119 | "df_joined = fuzzymatcher.fuzzy_left_join(ons, os, left_on = \"lad16nm\", right_on = \"name\")\n", 120 | "rename = {\"lad16cd\": \"ons_code\", \"code\": \"os_code\", \"lad16nm\": \"ons_name\", \"name\": \"os_name\"}\n", 121 | "df_joined = df_joined.rename(columns=rename)\n", 122 | "col_order = [\"best_match_score\", \"ons_name\", \"os_name\", \"ons_code\", \"os_code\"]\n", 123 | "df_joined[col_order].sample(5)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "We can get a sense of match quality by measuring how often the fuzzy matcher got it right:" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "num_records = len(df_joined)\n", 140 | "correct_binary = (df_joined[\"ons_code\"] == df_joined[\"os_code\"])\n", 141 | "perc_correct = correct_binary.sum()/num_records\n", 142 | "\n", 143 | "\"The percentage of codes which were correctly matched was {:,.1f}%\".format(perc_correct*100)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "# Advanced usage - configuring the matcher\n", 151 | "\n", 152 | "`fuzzymatcher` uses a number of components, each one of which can be re-written or adapted by the user:\n", 153 | "\n", 154 | "* **`data_preprocessor`**: Responsible for normalising strings, removing punctuation etc.\n", 155 | "* **`datagetter`**: Responsible for finding a list of possible matches for each df_left record in df_right\n", 156 | "* **`scorer`**: Responsible for computing a match score, given a record from df_left and df_right respectively\n", 157 | "\n", 158 | "The main `link_table` and 
`fuzzy_left_join` convenience functions use these components under the hood. See [here](https://github.com/RobinL/fuzzymatcher/blob/master/fuzzymatcher/__init__.py) for how this work.\n", 159 | "\n", 160 | "This section provides a few examples of how an advanced user can compose these components to create a custom matcher" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Example 1: Replacing the default sqlite datagetter with the cartesian datagetter" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from fuzzymatcher.data_getter_cartesian import DataGetterCartesian\n", 177 | "from fuzzymatcher.matcher import Matcher\n", 178 | "\n", 179 | "dg = DataGetterCartesian()\n", 180 | "\n", 181 | "m = Matcher(data_getter = dg)\n", 182 | "\n", 183 | "df_left = pd.read_csv(\"tests/data/left_3.csv\")\n", 184 | "df_right = pd.read_csv(\"tests/data/right_3.csv\")\n", 185 | "\n", 186 | "on = [\"first_name\", \"surname\", \"dob\", \"city\"]\n", 187 | "\n", 188 | "m.add_data(df_left, df_right, on, on)\n", 189 | "\n", 190 | "m.match_all()\n", 191 | "lt = m.get_formatted_link_table()\n", 192 | "print(\"Length of Cartesian join table: {:,.0f}\".format(len(lt))) # Note, because df_left and df_right are 100 records each, this table is 10,000 records long\n", 193 | "lt.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "The Cartesian matcher considers more potential matches, but its performance is considerably worse" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "# Performance\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "df_left = pd.read_csv(\"tests/data/left_4.csv\")\n", 217 | "# df_left = df_left[:1000]\n", 
218 | "df_right = pd.read_csv(\"tests/data/right_4.csv\")\n", 219 | "# df_right = df_right[:1000]\n", 220 | "on = [\"first_name\", \"surname\", \"dob\", \"city\"]\n", 221 | "\n", 222 | "lt = fuzzymatcher.link_table(df_left, df_right, on, on)\n", 223 | "lt.head(5)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def link_table_percentage_correct(link_table):\n", 233 | " \"\"\"\n", 234 | " In this test dataset, we know what the link should be\n", 235 | " Therefore we can compute a measure of performance\n", 236 | " \"\"\"\n", 237 | " lt = link_table.copy()\n", 238 | " lt = lt[lt[\"match_rank\"] == 1]\n", 239 | " lt[\"__id_left\"] = lt[\"__id_left\"].str.replace(\"_left\", \"\")\n", 240 | " lt[\"__id_right\"] = lt[\"__id_right\"].str.replace(\"_right\", \"\")\n", 241 | " lt[\"link_correct\"] = (lt[\"__id_left\"] == lt[\"__id_right\"])\n", 242 | "\n", 243 | " return lt[\"link_correct\"].sum()/len(lt)\n", 244 | "\n", 245 | "\"Percent matches correct: {:,.1f}%\".format(link_table_percentage_correct(lt)*100)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "### Note that in this particular case we can improve the match rate by including initials and allowing inversion of first name and surname \n", 253 | "\n", 254 | "(Within a field, the matcher pays no attention to token order)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "df_left[\"full_name\"] = df_left[\"first_name\"] + \" \" + df_left[\"surname\"]\n", 264 | "df_right[\"full_name\"] = df_right[\"first_name\"] + \" \" + df_right[\"surname\"]\n", 265 | "df_left[\"initials\"] = df_left[\"first_name\"].str[0] + df_left[\"surname\"].str[0]\n", 266 | "df_right[\"initials\"] = df_right[\"first_name\"].str[0] + df_right[\"surname\"].str[0]\n", 267 | "\n", 268 | "on = 
[\"full_name\", \"initials\", \"dob\", \"city\"]\n", 269 | "\n", 270 | "lt = fuzzymatcher.link_table(df_left, df_right, on, on)\n", 271 | "lt.head(5)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "\"Percent matches correct: {:,.1f}%\".format(link_table_percentage_correct(lt)*100)" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.6.1" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 2 305 | } 306 | -------------------------------------------------------------------------------- /fuzzymatcher/__init__.py: -------------------------------------------------------------------------------- 1 | from fuzzymatcher.data_preprocessor_default import DataPreprocessor 2 | from fuzzymatcher.data_getter_sqlite import DataGetter 3 | from fuzzymatcher.scorer_default import Scorer 4 | 5 | from fuzzymatcher.matcher import Matcher 6 | 7 | import pandas as pd 8 | import importlib 9 | 10 | 11 | def link_table(df_left, 12 | df_right, 13 | left_on, 14 | right_on, 15 | left_id_col = None, 16 | right_id_col = None): 17 | 18 | dp = DataPreprocessor() 19 | dg = DataGetter() 20 | s = Scorer() 21 | 22 | m = Matcher(dp, dg, s) 23 | m.add_data(df_left, df_right, left_on, right_on, left_id_col, right_id_col) 24 | m.match_all() 25 | 26 | return m.get_formatted_link_table() 27 | 28 | def fuzzy_left_join(df_left, 29 | df_right, 30 | left_on, 31 | right_on, 32 | left_id_col = None, 33 | right_id_col = None): 34 | 35 | dp = DataPreprocessor() 36 | dg = DataGetter() 37 | s = Scorer() 38 | 39 | m 
= Matcher(dp, dg, s) 40 | m.add_data(df_left, df_right, left_on, right_on, left_id_col, right_id_col) 41 | m.match_all() 42 | 43 | return m.get_left_join_table() -------------------------------------------------------------------------------- /fuzzymatcher/data_getter_abc.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | class DataGetterABC: 4 | 5 | __metaclass__ = abc.ABCMeta 6 | 7 | """ 8 | A DataGetter handles the retrieval of data from df_right (the dataframe in which to search for matches) 9 | It retrieves a list of potential match ids 10 | """ 11 | 12 | @abc.abstractmethod 13 | def add_data(self, df_search_within): 14 | 15 | """Adds the data in 'df_search_within'. 16 | 17 | Args: 18 | df_search_within: The search space i.e. the whole dataset we search within 19 | to find potential matches 20 | 21 | Returns: 22 | None 23 | """ 24 | 25 | 26 | 27 | @abc.abstractmethod 28 | def get_potential_match_ids_from_record(self, rec_find_match_for): 29 | 30 | """Retrieves lists of potential matches to a record 31 | 32 | Args: 33 | rec_find_match_for: The record for which we're trying to find a match 34 | 35 | Returns: 36 | A list of rec_potential_match records which represent the potential matches 37 | to the rec_find_match_for 38 | 39 | """ 40 | -------------------------------------------------------------------------------- /fuzzymatcher/data_getter_cartesian.py: -------------------------------------------------------------------------------- 1 | import random 2 | import sqlite3 3 | 4 | from fuzzymatcher.record import Record 5 | from fuzzymatcher.data_getter_abc import DataGetterABC 6 | 7 | class DataGetterCartesian(DataGetterABC): 8 | 9 | """ 10 | The DataGetter class handles the retrieval of record_ids from 'df_right' 11 | 12 | This Cartesian datagetter is the simplest, most thorough, but least efficient implementation 13 | where every record_id in 'df_right' is returned, compared and scored against 'df_left', 
class DataGetterCartesian(DataGetterABC):
    """Exhaustive DataGetter over 'df_right'.

    Every right record is compared and scored against each left record:
    simple and thorough, but O(n*m), so only suited to small datasets.
    """

    def add_data(self, matcher):
        """Register the matcher so this datagetter can reach its records and scorer.

        Args:
            matcher: The matcher object

        Returns:
            None
        """
        self.matcher = matcher

    def get_potential_match_ids_from_record(self, rec_left):
        """Score every right record as a potential match for rec_left.

        Args:
            rec_left: The record for which we're trying to find a match

        Returns:
            None; scored candidates are written into rec_left.potential_matches
        """
        score = self.matcher.scorer.score_match
        for right_id in self.matcher.right_records:
            rec_left.potential_matches[right_id] = score(rec_left.record_id, right_id)
    def add_data(self, matcher):

        """Adds the data in 'matcher.df_search_within' to a sqlite database
        and create a connection to the database to be used by the data getter
        Also registers the match object on the datagetter.

        Args:
            matcher. The matcher object

        Returns:
            None
        """

        self.matcher = matcher

        # Turn right_records into strings and add to db
        rows = []
        for key, record in matcher.right_records.items():
            row = {}
            row["id"] = record.record_id
            row["_concat_all"] = record.clean_string  # all cleaned tokens joined with spaces
            row["_concat_all_alternatives"] = record.get_concat_string(record.token_misspelling_dict)  # dmetaphone variants
            rows.append(row)

        df = pd.DataFrame(rows)
        df = df[["id", "_concat_all", "_concat_all_alternatives"]]

        # In-memory database: the full-text index lives only as long as this getter
        con = sqlite3.connect(':memory:', timeout=0.3)

        df.to_sql("df_right_processed", con, index=False)
        # FTS4 virtual table over the concatenated token strings. The first
        # column is named after the right id column so each match row returns
        # the right record's id as its first field.
        sql = """
        CREATE VIRTUAL TABLE fts_target
        USING fts4({} TEXT, _concat_all TEXT, _concat_all_alternatives TEXT);
        """.format(matcher.right_id_col)
        con.execute(sql)
        con.execute("INSERT INTO fts_target SELECT * FROM df_right_processed")

        self.con = con

        # TODO: Compute the min, max, average number of tokens in a record to help optimise the search

    def get_potential_match_ids_from_record(self, rec_left):

        """Retrieves lists of potential matches to a record

        Args:
            rec_left: The record for which we're trying to find a match

        Returns:
            A list of rec_potential_match records which represent the potential matches
            to the rec_left
        """

        # Tokens ordered rarest-first: rare tokens are the most discriminating
        tkn_po = self._tokens_in_df_right_prob_order(rec_left)

        # No point in running FTS using a token we know isn't in df_right

        tkn_ms_po = self._tokens_in_df_right_prob_order(rec_left, misspelling=True)

        # Start searching with all the terms, then drop them one at a time,
        # starting with the most unusual term.
        # NOTE(review): the misspelling token list (tkn_ms_po) is searched via
        # _tokens_to_matches with its default misspelling=False, i.e. against
        # the _concat_all column rather than _concat_all_alternatives —
        # confirm whether the flag should be threaded through here.
        token_lists = [tkn_po, tkn_ms_po]

        for token_list in token_lists:
            self._search_specific_to_general_single(token_list, rec_left)
            if not self._found_enough_matches(rec_left):
                self._search_specific_to_general_band(token_list, rec_left)
            if self._found_enough_matches(rec_left):
                break

        # If we cannot find a match, search random combinations
        if not self._found_good_match(rec_left):
            matches = self._search_random(tkn_po)
            self._add_matches_to_potential_matches(matches, rec_left)

    @staticmethod
    def _get_random_tokens(tokens):
        # Pick a uniformly-sized random non-empty subset, returned as a tuple
        # (tuples are hashable, so callers can dedupe attempts in a set)
        num_tokens = len(tokens)
        if num_tokens == 0:
            return ()
        n = random.randint(1, num_tokens)
        random_tokens = random.sample(tokens, n)
        return tuple(random_tokens)

    def _search_specific_to_general_single(self, token_list, rec_left):
        # Repeatedly drop the rarest remaining token: searches run from most
        # specific (all tokens) to most general (only the most common token)
        for i in range(len(token_list)):
            sub_tokens = token_list[i:]
            new_matches = self._tokens_to_matches(tuple(sub_tokens))

            self._add_matches_to_potential_matches(new_matches, rec_left)
            if self._found_enough_matches(rec_left):
                return
    def _search_specific_to_general_band(self, tokens, rec_left):
        """
        Search in blocks e.g. if tokens a b c d go [abcd] [abc] [bcd] [ab] [bc] [cd] [a] [b] [c] [d]
        """
        num_tokens = len(tokens)
        # Band size shrinks from all tokens down to single tokens; within a
        # band size, slide the window across every start position
        for band_size in range(num_tokens, 0,-1):
            take = num_tokens - band_size + 1
            for start_pos in range(0, take):
                end_pos = start_pos + band_size
                search_tokens = tokens[start_pos:end_pos]
                new_matches = self._tokens_to_matches(tuple(search_tokens))
                self._add_matches_to_potential_matches(new_matches, rec_left)
                if self._found_good_match(rec_left):
                    return
                if len(rec_left.potential_matches) > self.found_num_records_threshold:
                    return

    def _found_good_match(self, rec_left):
        # A 'good' match is one scoring above the configured threshold
        return rec_left.best_match_score > self.found_score_threshold

    def _found_enough_matches(self, rec_left):
        # Stop searching once we either have a good match or have already
        # accumulated plenty of candidate records to score
        if rec_left.best_match_score > self.found_score_threshold:
            return True
        if len(rec_left.potential_matches) > self.found_num_records_threshold:
            return True
        return False

    def _search_random(self, token_list):
        # Last resort: try up to search_intensity random token subsets,
        # stopping at the first subset that returns any FTS hits
        matches = []
        prev_random_tokens = set()
        for i in range(self.search_intensity):
            random_tokens = self._get_random_tokens(token_list)
            if random_tokens not in prev_random_tokens:
                prev_random_tokens.add(random_tokens)
                matches = self._tokens_to_matches(random_tokens)
                if len(matches) > 0:
                    break
        return matches

    def _add_matches_to_potential_matches(self, matches, rec_left):
        # Score each newly-found right id exactly once, tracking the best
        # score seen so the early-exit checks above can use it
        for match in matches:
            right_id = match[0]
            if right_id not in rec_left.potential_matches:
                scored_potential_match = self.matcher.scorer.score_match(rec_left.record_id, right_id)
                rec_left.potential_matches[right_id] = scored_potential_match
                if rec_left.best_match_score < scored_potential_match["match_score"]:
                    rec_left.best_match_score = scored_potential_match["match_score"]
175 | get_records_sql = """ 176 | SELECT * FROM fts_target WHERE {} MATCH '{}' limit {}; 177 | """ 178 | 179 | # This fails if the special tokens 'and' or 'or' are in fts string! See issue 35! 180 | tokens_to_escape = ["AND", "OR", "NEAR", "NOT"] 181 | 182 | def escape_token(t): 183 | # return t 184 | if t in tokens_to_escape: 185 | return '"' + t + '"' 186 | else: 187 | return t 188 | 189 | 190 | tokens = [escape_token(t) for t in tokens] 191 | 192 | fts_string = " ".join(tokens) 193 | 194 | 195 | if misspelling: 196 | table_name = "_concat_all_alternatives" 197 | else: 198 | table_name = "_concat_all" 199 | 200 | sql = get_records_sql.format(table_name, fts_string, self.return_records_limit) 201 | 202 | 203 | cur = self.con.cursor() 204 | cur.execute(sql) 205 | results = cur.fetchall() 206 | 207 | return results 208 | 209 | 210 | def _tokens_in_df_right_prob_order(self, rec_to_find_match_for, misspelling = False): 211 | # Problem here is that field names are different in left and right 212 | fields = rec_to_find_match_for.fields 213 | if misspelling: 214 | token_dict = rec_to_find_match_for.token_misspelling_dict 215 | else: 216 | token_dict = rec_to_find_match_for.clean_token_dict 217 | get_prob = self.matcher.scorer.get_prob 218 | 219 | tokens_list = [] 220 | for field, tokens in token_dict.items(): 221 | for t in tokens: 222 | translated_field = self.matcher.left_to_right_lookup[field] 223 | prob = get_prob(t,translated_field,"right",misspelling) 224 | tokens_list.append({"token": t, "prob": prob}) 225 | 226 | tokens_list = [t for t in tokens_list if t["prob"] is not None] 227 | tokens_list.sort(key=lambda x: x["prob"]) 228 | tokens_list = [t["token"] for t in tokens_list] 229 | return tokens_list 230 | -------------------------------------------------------------------------------- /fuzzymatcher/data_preprocessor_abc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import abc 4 | 5 | 
class DataPreprocessorABC:

    """
    A DataPreprocessor is responsible for ingesting df_left (the dataframe containing the records we
    want to find matches for) and df_right (the dataframe we want to search for potential matches)
    and applying preprocessing stages like normalisation to make matching easier.
    """

    __metaclass__ = abc.ABCMeta
    # NOTE(review): `__metaclass__` is Python 2 syntax; it has no effect on
    # Python 3, so these abstract methods are not actually enforced. The
    # default DataPreprocessor does not implement add_data, so switching to
    # abc.ABC would make it uninstantiable — confirm intent before changing.

    @abc.abstractmethod
    def add_data(self,
                 df_left,
                 df_right,
                 left_on,
                 right_on,
                 left_word_cols=None,
                 right_word_cols=None,
                 left_id_col=None,
                 right_id_col=None):

        """Adds data and parameters the DataPreprocessor needs to run

        This is similar to an __init__ method, except it is run after the object is instantiated.

        Returns:
            None
        """

    @abc.abstractmethod
    def preprocess(self):
        """Main method that runs the data preprocessing

        Creates two new attributes on the data preprocessor object:

        data_search_within:
            This is a list of dictionaries like this: {"id": record_id, "data:" normalised string}

        data_find_match_for:
            This is a list of dictionaries like this: {"id": record_id, "data:" normalised string}

        Returns:
            None
        """
left_cols = self.matcher.left_on 24 | right_cols = self.matcher.right_on 25 | 26 | # Name collisions mean that we want to rename the id columns 27 | if not self.matcher.left_id_col: 28 | self.add_id(self.matcher.df_left, "left") 29 | self.matcher.left_id_col = "__id_left" 30 | else: 31 | self.matcher.df_left["__id_left"] = self.matcher.df_left[self.matcher.left_id_col] 32 | 33 | if not self.matcher.right_id_col: 34 | self.add_id(self.matcher.df_right, "right") 35 | self.matcher.right_id_col = "__id_right" 36 | else: 37 | self.matcher.df_right["__id_right"] = self.matcher.df_right[self.matcher.right_id_col] 38 | 39 | 40 | @staticmethod 41 | def add_id(df, prefix): 42 | id_colname = "__id_" + prefix 43 | data = range(0, len(df)) 44 | data = ["{}_{}".format(i, prefix) for i in data] 45 | df.insert(0, id_colname, data) 46 | -------------------------------------------------------------------------------- /fuzzymatcher/matcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import pandas as pd 4 | from datetime import datetime 5 | from dateutil.relativedelta import relativedelta 6 | 7 | from fuzzymatcher.record import RecordToMatch, Record 8 | from fuzzymatcher.tokencomparison import TokenComparison 9 | from fuzzymatcher.data_preprocessor_default import DataPreprocessor 10 | from fuzzymatcher.data_getter_sqlite import DataGetter 11 | from fuzzymatcher.scorer_default import Scorer 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | class Matcher: 16 | """The Matcher coordinates data matching""" 17 | 18 | def __init__(self, 19 | data_preprocessor = DataPreprocessor(), 20 | data_getter = DataGetter(), 21 | scorer = Scorer(), 22 | token_comparison = TokenComparison(), 23 | top_n_matches = 5): 24 | self.token_comparison = token_comparison 25 | self.data_preprocessor = data_preprocessor 26 | self.data_getter = data_getter 27 | self.scorer = scorer 28 | self.top_n_matches = top_n_matches 
29 | 30 | def add_data(self, df_left, 31 | df_right, 32 | left_on, 33 | right_on, 34 | left_id_col = None, 35 | right_id_col = None): 36 | 37 | # Copy to prevent modifying the dataframes the user provides 38 | self.df_left = df_left.copy() 39 | self.df_right = df_right.copy() 40 | 41 | if type(left_on) == str: 42 | left_on = [left_on] 43 | 44 | if type(right_on) == str: 45 | right_on = [right_on] 46 | 47 | self.left_on = left_on 48 | self.right_on = right_on 49 | self.left_id_col = left_id_col 50 | self.right_id_col = right_id_col 51 | 52 | self.left_to_right_lookup = {l:r for (l,r) in zip(left_on, right_on)} 53 | 54 | self.data_preprocessor.register_matcher(self) 55 | 56 | def initiate_records(self): 57 | self.left_records = {} 58 | cols = self.left_on.copy() 59 | cols.append("__id_left") 60 | df = self.df_left[cols] 61 | for r in df.iterrows(): 62 | row = r[1] 63 | fields_dict = dict(row[self.left_on]) 64 | this_id = row["__id_left"] 65 | rec = RecordToMatch(fields_dict, this_id, self) 66 | self.left_records[this_id] = rec 67 | 68 | self.right_records = {} 69 | cols = self.right_on.copy() 70 | cols.append("__id_right") 71 | df = self.df_right[cols] 72 | for r in df.iterrows(): 73 | row = r[1] 74 | fields_dict = dict(row[self.right_on]) 75 | this_id = row["__id_right"] 76 | rec = Record(fields_dict, this_id, self) 77 | self.right_records[this_id] = rec 78 | 79 | def match_all(self): 80 | 81 | # Get a dataset with id, record only for left and right 82 | self.data_preprocessor.preprocess() 83 | 84 | self.initiate_records() 85 | 86 | # Scorer first because some data getters may need to score records on add_data 87 | self.scorer.add_data(self) 88 | 89 | self.data_getter.add_data(self) 90 | 91 | # Get a table that contains only the matches, scores and ids 92 | self.link_table = self._match_processed_data() 93 | 94 | def get_formatted_link_table(self): 95 | return self._add_original_cols_to_link_table(self.link_table) 96 | 97 | def get_left_join_table(self): 98 | df = 
    def _match_processed_data(self):
        """Find and score matches for every left record and build the link table.

        Returns:
            pandas DataFrame with one row per (left record, candidate match),
            including a null-match row for left records with no candidates.
        """

        # This will store all the records for the link table

        link_table_list = []

        num_left_records = len(self.left_records.keys())
        num_right_records = len(self.right_records.keys())
        log.debug("Matching {} left records against {} right records".format(num_left_records, num_right_records))
        start_time = datetime.now()

        counter = 0
        total = len(self.left_records.items())
        str_template = "Processed {:,.0f} records, {:.0f}% done in {} minutes and {} seconds"

        for key, this_record in self.left_records.items():

            # Emit a progress line every 1000 records (skipping the first)
            if (counter) % 1000 == 0 and counter != 0:
                diff = relativedelta(datetime.now(), start_time)
                log.debug(str_template.format(counter, (counter/total)*100, diff.minutes, diff.seconds))

            this_record.find_and_score_potential_matches()
            link_table_list.extend(this_record.get_link_table_rows())

            counter += 1

        # Final summary after the loop completes
        diff = relativedelta(datetime.now(), start_time)
        log.debug(str_template.format(counter, (counter/total)*100, diff.minutes, diff.seconds))

        return pd.DataFrame(link_table_list)
= "__id_left", right_on = "__id_left", how = "left", suffixes=('_link', '_left')) 149 | 150 | df = df.merge(self.df_right, left_on = "__id_right", right_on = "__id_right", how="left", suffixes=('_left', "_right")) 151 | 152 | match_cols_left = self.left_on[::-1].copy() 153 | match_cols_right = self.right_on[::-1].copy() 154 | col_order = ["__id_left", "__id_right", "__score", "__rank"] 155 | while len(match_cols_left) > 0 and len(match_cols_right) > 0: 156 | 157 | # Check whether suffixes have been added 158 | left_col = match_cols_left.pop() 159 | left_col = self._add_suffix_if_needed(left_col, df, "left") 160 | col_order.append(left_col) 161 | 162 | right_col = match_cols_right.pop() 163 | right_col = self._add_suffix_if_needed(right_col, df, "right") 164 | col_order.append(right_col) 165 | 166 | col_order.extend(match_cols_left) 167 | col_order.extend(match_cols_right) 168 | 169 | df = df[col_order] 170 | 171 | # Finally rename the id columns back to their original and improve names of score and rank 172 | rename_dict = {} 173 | if "match_rank" not in df.columns: 174 | rename_dict["__rank"] = "match_rank" 175 | 176 | if "match_score" not in df.columns: 177 | rename_dict["__score"] = "match_score" 178 | df = df.rename(columns = rename_dict) 179 | return df 180 | 181 | def _add_suffix_if_needed(self, col_name, df, left_or_right): 182 | 183 | all_cols = df.columns 184 | if left_or_right == "left": 185 | left_cols = self.df_left.columns 186 | 187 | if col_name in left_cols and col_name not in all_cols: 188 | return col_name + "_left" 189 | else: 190 | return col_name 191 | 192 | if left_or_right == "right": 193 | right_cols = self.df_right.columns 194 | if col_name in right_cols and col_name not in all_cols: 195 | return col_name + "_right" 196 | else: 197 | return col_name 198 | 199 | -------------------------------------------------------------------------------- /fuzzymatcher/record.py: 
class Record:
    """
    The 'record' objects represents a row of a dataset.
    A row is represented as a dictionary called 'field dict', whose keys are the column (field) names
    and whose values are the column values.

    The record object has methods to clean (homogenise) and tokenise these column values.
    The record object also has a dictionary similar to field dict that contains token misspellings
    """

    def __init__(self, field_dict, record_id, matcher):
        self.orig_field_dict = field_dict
        self.record_id = record_id
        self.matcher = matcher

        self.fields = list(field_dict.keys())
        self.clean_token_dict = Record.get_tokenised_field_dict(field_dict)
        self.clean_string = Record.get_concat_string(self.clean_token_dict)

        self.token_misspelling_dict = self.get_tokenised_misspelling_dict()

    def __repr__(self):
        return self.clean_string

    def get_tokenised_misspelling_dict(self):
        """Return {field: [dmetaphone tokens]} for every cleaned token of this record."""
        get_misspellings = self.matcher.token_comparison.get_misspellings

        misspellings_dict = {}
        for field, tokens in self.clean_token_dict.items():
            misspelling_tokens = []
            for t in tokens:
                misspelling_tokens.extend(get_misspellings(t))
            misspellings_dict[field] = misspelling_tokens
        return misspellings_dict

    @staticmethod
    def field_to_string(value):
        """Coerce any field value (numbers, NaN, dates...) to its str form."""
        return str(value)

    @staticmethod
    def get_tokenised_field_dict(field_dict):
        """Normalise each field value and split it into uppercase tokens.

        Upper-cases, turns apostrophes and other punctuation into spaces
        (so "O'Neil" -> ["O", "NEIL"]), collapses runs of whitespace, then
        splits on single spaces.
        """
        cleaned_token_dict = {}
        for key, value in field_dict.items():
            value = Record.field_to_string(value)
            value = value.upper()

            value = value.replace("'", " ")
            # Fix: raw strings — '\w'/'\s' in non-raw literals raise
            # invalid-escape-sequence warnings on modern Python.
            value = re.sub(r'[^\w\s]', ' ', value)
            value = re.sub(r'\s{2,100}', ' ', value)
            value = value.strip()

            cleaned_token_dict[key] = value.split(" ")
        return cleaned_token_dict

    @staticmethod
    def get_concat_string(token_dict):
        """Join every field's tokens into one space-separated string."""
        tokens = []
        for key, value in token_dict.items():
            tokens.extend(value)
        return " ".join(tokens)


class RecordToMatch(Record):
    """A left-hand record for which we want to find right-hand matches."""

    def __init__(self, *args, **kwargs):
        Record.__init__(self, *args, **kwargs)
        self.potential_matches = {}  # Keyed by right record id
        self.best_match_score = -float("inf")

    def find_and_score_potential_matches(self):
        # The data getter writes scored candidates into self.potential_matches
        self.matcher.data_getter.get_potential_match_ids_from_record(self)

    def get_link_table_rows(self):
        """Return link-table row dicts for this record, ranked by score.

        If no potential match was found, a single row with null right id and
        score is still emitted so the record appears in the link table.
        """
        rows = []

        for k, v in self.potential_matches.items():
            rows.append({
                "__id_left": self.record_id,
                "__id_right": v["record_right"].record_id,
                "__score": v["match_score"],
            })

        if not rows:  # No potential match: still want a row in the link table
            rows.append({"__id_left": self.record_id, "__id_right": None, "__score": None})

        # None scores only occur in the single-row no-match case, so this
        # sort never compares None against a number
        rows.sort(key=lambda r: r['__score'], reverse=True)

        for i, r in enumerate(rows):
            r["__rank"] = i + 1

        return rows
class ScorerABC:

    """
    A Scorer takes a record from df_left and a candidate record from df_right
    and produces a match probability / match score for the pair.
    """
    # Fix: the previous docstring was a copy-paste of the DataPreprocessor
    # description and did not describe a scorer at all.

    __metaclass__ = abc.ABCMeta
    # NOTE(review): `__metaclass__` is Python 2 syntax and has no effect on
    # Python 3, so abstractness is not enforced here.

    @abc.abstractmethod
    def add_data(self, matcher):
        pass

    # Fix: abstract instance methods previously omitted `self`
    @abc.abstractmethod
    def get_freq(self, token):
        pass

    @abc.abstractmethod
    def score_match(self, record_to_find_match, record_potential_match):
        pass
    def add_data(self, matcher):
        # Register the matcher, then precompute relative token frequencies
        # for both sides (normal tokens and dmetaphone 'misspelling' tokens)
        self.matcher = matcher
        self._generate_probs()

    def get_prob(self, token, field, left_right, misspelling=False):
        """
        Get probability given field and token

        Args:
            token: The (cleaned, uppercase) token to look up.
            field: Field name on the requested side.
            left_right: "left" or "right" — which dataset's frequencies to use.
            misspelling: If True, look up in the dmetaphone token tables.

        Returns:
            The token's relative frequency within that field, or None if the
            token (or field) never occurs there.
        """

        try:
            if not misspelling and left_right == "left":
                return self.left_field_token_probs_dict[field][token]

            if not misspelling and left_right == "right":
                return self.right_field_token_probs_dict[field][token]

            if misspelling and left_right == "left":
                return self.left_field_misspelling_probs_dict[field][token]

            if misspelling and left_right == "right":
                return self.right_field_misspelling_probs_dict[field][token]
        except KeyError:
            # Token (or field) absent from the frequency tables
            return None

    @lru_cache(maxsize=int(1e6))
    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the Scorer (and its matcher) alive for the cache's lifetime (ruff B019).
    def score_match(self, record_left_id, record_right_id):
        """Score a candidate (left, right) record pair.

        Multiplies the per-field probabilities together, then converts the
        combined probability into a match score.

        Returns:
            dict with keys "match_prob", "match_score" and "record_right".
        """

        record_left = self.matcher.left_records[record_left_id]
        record_right = self.matcher.right_records[record_right_id]
        # Need to find common tokens, and get their probabilities
        fields_left = record_left.fields

        prob = 1
        for f_left in fields_left:
            p = self._field_to_prob(f_left, record_left, record_right)
            prob = p * prob

        match_score = self.prob_to_score(prob)
        return {"match_prob" : prob, "match_score": match_score, "record_right": record_right}
    def _get_prob_matching(self, tokens, f_right, misspelling=False):
        """Multiply the right-side probabilities of the tokens both records share.

        Rarer shared tokens give a smaller product, i.e. stronger evidence of
        a genuine match once converted to a score.
        """
        prob = 1
        for t in tokens:
            p = self.get_prob(t,f_right,"right", misspelling)
            prob = p * prob
        return prob

    def _get_prob_unmatching(self, unmatching_tokens, record_tokens, field_right, field_left):
        """Return a penalty factor (1/prob) for tokens present on one side only.

        Tokens that appear on one side but not the other weaken the match,
        unless they look like a misspelling of a token on the other side.
        """
        # If the unmatching token is not a misspelling, then undo its probability
        prob = 1
        for umt in unmatching_tokens:
            if not self._is_misspelling_of_one(umt, record_tokens):
                p = self.get_prob(umt,field_right,"right")
                if p is None: # If this token never appears on the right, how often does it appear on the left
                    p = self.get_prob(umt,field_left,"left")
                prob = p * prob

        # Currently a no-op hook (see _adjust_prob_towards_one)
        prob = Scorer._adjust_prob_towards_one(prob)
        return 1/prob

    def _is_misspelling_of_one(self, token, token_list):
        """True if token is a plausible misspelling of any token in token_list."""
        for t in token_list:
            if self.matcher.token_comparison.is_mispelling(token, t):
                return True
        return False
125 | token_lists_by_field[f].extend(tokens) 126 | 127 | return token_lists_by_field 128 | 129 | def field_tokens_to_prob(self, field_tokens): 130 | ft = field_tokens 131 | for key, value in ft.items(): 132 | counts = Counter(value) 133 | count_sum = sum(counts.values()) 134 | counts = {k: v/count_sum for k,v in counts.items()} 135 | ft[key] = counts 136 | return ft 137 | 138 | def _generate_probs(self): 139 | left_field_tokens = self.get_token_lists_by_field(self.matcher.left_records, "clean_token_dict") 140 | self.left_field_token_probs_dict = self.field_tokens_to_prob(left_field_tokens) 141 | 142 | right_field_tokens = self.get_token_lists_by_field(self.matcher.right_records, "clean_token_dict") 143 | self.right_field_token_probs_dict = self.field_tokens_to_prob(right_field_tokens) 144 | 145 | left_field_tokens = self.get_token_lists_by_field(self.matcher.left_records, "token_misspelling_dict") 146 | self.left_field_misspelling_probs_dict = self.field_tokens_to_prob(left_field_tokens) 147 | 148 | right_field_tokens = self.get_token_lists_by_field(self.matcher.right_records, "token_misspelling_dict") 149 | self.right_field_misspelling_probs_dict = self.field_tokens_to_prob(right_field_tokens) 150 | 151 | @staticmethod 152 | def prob_to_score(prob): 153 | return -(log10(prob))/30 154 | 155 | @staticmethod 156 | def _adjust_prob_towards_one(initial_prob, amount = 2): 157 | return initial_prob 158 | -------------------------------------------------------------------------------- /fuzzymatcher/tokencomparison.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from metaphone import doublemetaphone 3 | from rapidfuzz import fuzz 4 | 5 | class TokenComparison: 6 | """ 7 | The tokencomparison object contains functions that check for other misspellings and 'close' matches 8 | """ 9 | 10 | def __init__(self, fuzz_ratio_threshold = 80, number_fuzz_threshold = 1.01): 11 | self.fuzz_ratio_threshold = 
    @lru_cache(maxsize=int(1e6))
    def get_misspellings(self, token):
        """
        Must return a list of misspellings
        If there are no misspellings, just return a list of length 0
        """
        # doublemetaphone yields up to two phonetic encodings; drop empties
        misspellings = doublemetaphone(token)
        misspellings = [t for t in misspellings if t != ""]
        return misspellings

    @lru_cache(maxsize=int(1e6))
    # NOTE(review): lru_cache on instance methods keeps `self` alive inside
    # the cache (ruff B019) — consider a module-level cache instead.
    def is_mispelling(self, token1, token2):
        """True if the two tokens plausibly denote the same word or number.

        Checks, in order: a shared dmetaphone encoding; a fuzzy string ratio
        above threshold for purely alphabetic tokens; relative closeness for
        tokens that parse as numbers.
        """
        mis_t1 = set(self.get_misspellings(token1))
        mis_t2 = set(self.get_misspellings(token2))
        common = mis_t1.intersection(mis_t2).difference({''}) # Difference in case '' included in tokens

        if len(common) > 0:
            return True

        # Misspellings only really make sense if the tokens are words not numbers
        if token1.isalpha() and token2.isalpha():
            if fuzz.ratio(token1, token2) > self.fuzz_ratio_threshold:
                return True

        try:
            t1f = float(token1)
            t2f = float(token2)
            # Ratio test; ZeroDivisionError (token "0") is treated as no match
            if max(t1f, t2f)/min(t1f, t2f) < self.number_fuzz_threshold:
                return True

        except (ValueError, ZeroDivisionError):
            pass

        return False
from metaphone import doublemetaphone
from rapidfuzz.fuzz import ratio

def tokens_to_dmetaphones(tokens):
    """Return the non-empty double-metaphone encodings of each token.

    Tokens whose primary encoding is empty contribute nothing; tokens with
    only a primary encoding contribute one entry; tokens with both
    encodings contribute both.
    """
    encoded = []
    for token in tokens:
        primary, secondary = doublemetaphone(token)
        if primary == '':
            continue
        encoded.append(primary)
        if secondary != '':
            encoded.append(secondary)
    return [e.strip() for e in encoded]

def add_dmetaphones_to_col(x):
    """Append the metaphone encodings of each word in x to the original string."""
    words = x.split(" ")
    return " ".join(words + tokens_to_dmetaphones(words))

def add_dmetaphone_to_concat_all(df):
    """In place, augment df['_concat_all'] with metaphone encodings of its words."""
    df["_concat_all"] = df["_concat_all"].apply(add_dmetaphones_to_col)

def convert_tokens_to_dmetaphones(x):
    """Replace the words of x with their metaphone encodings."""
    return " ".join(tokens_to_dmetaphones(x.split(" ")))

def convert_series_to_dmetaphones(series):
    """Apply convert_tokens_to_dmetaphones to every element of a pandas Series."""
    return series.apply(convert_tokens_to_dmetaphones)

def is_mispelling(token_left, token_right):
    """True if the tokens share a metaphone encoding or their fuzz ratio is >= 90."""
    left_encodings = set(doublemetaphone(token_left))
    right_encodings = set(doublemetaphone(token_right))

    # Discard '' so two un-encodable tokens do not spuriously match.
    if left_encodings.intersection(right_encodings).difference({''}):
        return True

    return ratio(token_left, token_right) >= 90
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """fuzzymatcher test package initialisation.""" 4 | -------------------------------------------------------------------------------- /tests/create_fake_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import sys\n", 20 | "sys.path.append('..')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from generate_test_data import create_test_data\n", 30 | "df_left, df_right = create_test_data(10000)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "df_left.to_csv(\"data/left_4.csv\", index=False)\n", 40 | "df_right.to_csv(\"data/right_4.csv\", index=False)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.9.12" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 2 72 | } 73 | 
-------------------------------------------------------------------------------- /tests/data/_cities.csv: -------------------------------------------------------------------------------- 1 | city,freq 2 | London,8615246 3 | Birmingham,1224136 4 | Glasgow,801198 5 | Leeds,761481 6 | Bristol,617280 7 | Liverpool,552267 8 | Manchester,520739 9 | Sheffield,518090 10 | Edinburgh,482005 11 | Cardiff,447287 12 | Leicester,443760 13 | Stoke-on-Trent,372775 14 | Bradford,349561 15 | Coventry,325949 16 | Nottingham,315862 17 | Kingston-upon-Hull,314018 18 | Belfast,295223 19 | Newcastle-upon-Tyne,282442 20 | Sunderland,275506 21 | Brighton,273369 22 | Derby,270468 23 | Plymouth,261384 24 | Wolverhampton,254726 25 | Southampton,253651 26 | Swansea,240332 27 | Salford,239019 28 | Portsmouth,238137 29 | Milton,229941 30 | Aberdeen,227130 31 | Reading,218705 32 | Northampton,215173 33 | Luton,211228 34 | Swindon,209156 35 | Warrington,202228 36 | Dudley,200603 37 | York,200018 38 | Bolton,194189 39 | Stockton-on-Tees,191610 40 | Preston,190687 41 | Bournemouth,187503 42 | Norwich,186682 43 | Middlesbrough,184773 44 | Peterborough,178869 45 | Southend-on-Sea,175547 46 | Walsall,174141 47 | Colchester,173074 48 | Mansfield,169987 49 | Telford,166641 50 | Ipswich,164331 51 | Huddersfield,162949 52 | -------------------------------------------------------------------------------- /tests/data/_first_names.csv: -------------------------------------------------------------------------------- 1 | name,freq,gender 2 | Oliver ,6623,m 3 | Harry ,5284,m 4 | George ,5263,m 5 | Jack ,4751,m 6 | Jacob ,4485,m 7 | Noah ,4305,m 8 | Charlie ,4190,m 9 | Muhammad ,3908,m 10 | Thomas ,3898,m 11 | Oscar ,3894,m 12 | William ,3819,m 13 | James ,3580,m 14 | Leo ,3563,m 15 | Alfie ,3555,m 16 | Henry ,3527,m 17 | Joshua ,3495,m 18 | Freddie ,3287,m 19 | Archie ,2791,m 20 | Ethan ,2722,m 21 | Isaac ,2662,m 22 | Alexander ,2567,m 23 | Joseph ,2506,m 24 | Edward ,2429,m 25 | Samuel ,2413,m 26 | Max ,2405,m 
27 | Logan ,2335,m 28 | Lucas ,2332,m 29 | Daniel ,2290,m 30 | Theo ,2279,m 31 | Arthur ,2270,m 32 | Mohammed ,2228,m 33 | Harrison ,2220,m 34 | Benjamin ,2153,m 35 | Mason ,2131,m 36 | Finley ,2022,m 37 | Sebastian ,1990,m 38 | Adam ,1815,m 39 | Dylan ,1784,m 40 | Zachary ,1670,m 41 | Riley ,1556,m 42 | Teddy ,1491,m 43 | Theodore ,1484,m 44 | David ,1461,m 45 | Elijah ,1412,m 46 | Jake ,1405,m 47 | Toby ,1400,m 48 | Louie ,1380,m 49 | Reuben ,1322,m 50 | Arlo ,1285,m 51 | Hugo ,1234,m 52 | Jaxon ,1161,m 53 | Luca ,1144,m 54 | Matthew ,1110,m 55 | Harvey ,1102,m 56 | Harley ,1083,m 57 | Reggie ,1083,m 58 | Tommy ,1066,m 59 | Jenson ,1064,m 60 | Luke ,1056,m 61 | Michael ,1056,m 62 | Jayden ,1050,m 63 | Jude ,1047,m 64 | Frankie ,1029,m 65 | Albert ,1028,m 66 | Stanley ,1019,m 67 | Elliot ,999,m 68 | Gabriel ,995,m 69 | Mohammad ,948,m 70 | Ollie ,923,m 71 | Ronnie ,921,m 72 | Louis ,911,m 73 | Charles ,910,m 74 | Blake ,893,m 75 | Elliott ,892,m 76 | Lewis ,884,m 77 | Frederick ,874,m 78 | Nathan ,872,m 79 | Tyler ,863,m 80 | Jackson ,858,m 81 | Rory ,856,m 82 | Ryan ,855,m 83 | Carter ,839,m 84 | Dexter ,831,m 85 | Alex ,813,m 86 | Austin ,812,m 87 | Caleb ,810,m 88 | Kai ,794,m 89 | Albie ,788,m 90 | Ellis ,783,m 91 | Bobby ,775,m 92 | Ezra ,763,m 93 | Leon ,737,m 94 | Roman ,737,m 95 | Jesse ,723,m 96 | Aaron ,704,m 97 | Ibrahim ,701,m 98 | Liam ,698,m 99 | Jasper ,683,m 100 | Felix ,666,m 101 | Finn ,650,m 102 | Olivia ,5017,f 103 | Amelia ,4777,f 104 | Emily ,3551,f 105 | Isla ,3476,f 106 | Ava ,3285,f 107 | Isabella ,2729,f 108 | Lily ,2722,f 109 | Jessica ,2703,f 110 | Ella ,2702,f 111 | Mia ,2662,f 112 | Sophia ,2636,f 113 | Charlotte ,2596,f 114 | Poppy ,2506,f 115 | Sophie ,2505,f 116 | Grace ,2498,f 117 | Evie ,2487,f 118 | Alice ,2264,f 119 | Scarlett ,2096,f 120 | Freya ,2079,f 121 | Florence ,2072,f 122 | Isabelle ,2066,f 123 | Daisy ,2020,f 124 | Chloe ,1980,f 125 | Phoebe ,1975,f 126 | Matilda ,1915,f 127 | Ruby ,1904,f 128 | Evelyn ,1877,f 129 | 
Sienna ,1815,f 130 | Sofia ,1683,f 131 | Eva ,1641,f 132 | Elsie ,1574,f 133 | Willow ,1536,f 134 | Ivy ,1509,f 135 | Millie ,1456,f 136 | Esme ,1416,f 137 | Rosie ,1403,f 138 | Imogen ,1400,f 139 | Elizabeth ,1384,f 140 | Maya ,1374,f 141 | Layla ,1358,f 142 | Emilia ,1356,f 143 | Lola ,1298,f 144 | Lucy ,1273,f 145 | Harper ,1256,f 146 | Eliza ,1226,f 147 | Erin ,1212,f 148 | Eleanor ,1147,f 149 | Ellie ,1142,f 150 | Harriet ,1132,f 151 | Thea ,1132,f 152 | Maisie ,1111,f 153 | Holly ,1098,f 154 | Emma ,1080,f 155 | Georgia ,1078,f 156 | Amber ,1066,f 157 | Molly ,1041,f 158 | Hannah ,1033,f 159 | Abigail ,1023,f 160 | Jasmine ,1018,f 161 | Lilly ,1014,f 162 | Annabelle ,1009,f 163 | Rose ,974,f 164 | Penelope ,972,f 165 | Amelie ,964,f 166 | Violet ,959,f 167 | Bella ,946,f 168 | Aria ,926,f 169 | Zara ,876,f 170 | Maria ,853,f 171 | Nancy ,842,f 172 | Darcie ,791,f 173 | Lottie ,782,f 174 | Anna ,779,f 175 | Summer ,760,f 176 | Martha ,756,f 177 | Heidi ,752,f 178 | Gracie ,744,f 179 | Luna ,715,f 180 | Maryam ,711,f 181 | Beatrice ,685,f 182 | Mila ,679,f 183 | Darcey ,662,f 184 | Megan ,658,f 185 | Iris ,648,f 186 | Lexi ,626,f 187 | Robyn ,622,f 188 | Aisha ,617,f 189 | Clara ,611,f 190 | Francesca ,611,f 191 | Sara ,606,f 192 | Victoria ,586,f 193 | Zoe ,584,f 194 | Julia ,578,f 195 | Arabella ,577,f 196 | Maddison ,576,f 197 | Sarah ,572,f 198 | Felicity ,570,f 199 | Darcy ,566,f 200 | Leah ,564,f 201 | Lydia ,563,f 202 | -------------------------------------------------------------------------------- /tests/data/_surnames.csv: -------------------------------------------------------------------------------- 1 | surname,freq, 2 | Smith,729862, 3 | Jones,578261, 4 | Taylor,458268, 5 | Williams,411385, 6 | Brown,380443, 7 | Davies,316982, 8 | Evans,231844, 9 | Wilson,227652, 10 | Thomas,220228, 11 | Roberts,219694, 12 | Johnson,214969, 13 | Lewis,198193, 14 | Walker,195372, 15 | Robinson,187889, 16 | Wood,186261, 17 | Thompson,183266, 18 | White,181979, 19 | 
Watson,181296, 20 | Jackson,173538, 21 | Wright,171438, 22 | Green,166423, 23 | Harris,161505, 24 | Cooper,161076, 25 | King,160918, 26 | Lee,159502, 27 | Martin,152842, 28 | Clarke,152502, 29 | James,151855, 30 | Morgan,150454, 31 | Hughes,147802, 32 | Edwards,147540, 33 | Hill,145723, 34 | Moore,144446, 35 | Clark,140662, 36 | Harrison,137103, 37 | Scott,134059, 38 | Young,131862, 39 | Morris,131499, 40 | Hall,129805, 41 | Ward,125643, 42 | Turner,125278, 43 | Carter,124343, 44 | Phillips,121845, 45 | Mitchell,121734, 46 | Patel,119855, 47 | Adams,116035, 48 | Campbell,115972, 49 | Anderson,115333, 50 | Allen,112703, 51 | Cook,111306, 52 | Bailey,111011, 53 | Parker,109943, 54 | Miller,109474, 55 | Davis,108041, 56 | Murphy,106245, 57 | Price,105993, 58 | Bell,105417, 59 | Baker,104639, 60 | Griffiths,104048, 61 | Kelly,102415, 62 | Simpson,101082, 63 | Marshall,97225, 64 | Collins,95210, 65 | Bennett,94402, 66 | Cox,92572, 67 | Richardson,92558, 68 | Fox,92124, 69 | Gray,90917, 70 | Rose,89001, 71 | Chapman,88136, 72 | Hunt,86792, 73 | Robertson,86269, 74 | Shaw,86049, 75 | Reynolds,85228, 76 | Lloyd,85021, 77 | Ellis,84914, 78 | Richards,83575, 79 | Russell,82898, 80 | Wilkinson,82570, 81 | Khan,80429, 82 | Graham,80026, 83 | Stewart,79031, 84 | Reid,78783, 85 | Murray,77127, 86 | Powell,76973, 87 | Palmer,75969, 88 | Holmes,75423, 89 | Rogers,74821, 90 | Stevens,73965, 91 | Walsh,73208, 92 | Hunter,72547, 93 | Thomson,71616, 94 | Matthews,70610, 95 | Ross,70274, 96 | Owen,69870, 97 | Mason,69708, 98 | Knight,69522, 99 | Kennedy,69299, 100 | Butler,69298, 101 | Saunders,69004, 102 | Cole,68854, 103 | Pearce,68707, 104 | Dean,68073, 105 | Foster,67675, 106 | Harvey,67231, 107 | Hudson,66488, 108 | Gibson,66246, 109 | Mills,65966, 110 | Berry,65022, 111 | Barnes,64291, 112 | Pearson,64156, 113 | Kaur,63969, 114 | Booth,63239, 115 | Dixon,63151, 116 | Grant,63130, 117 | Gordon,62747, 118 | Lane,62226, 119 | Harper,61606, 120 | Ali,61008, 121 | Hart,60683, 122 | 
Mcdonald,60431, 123 | Brooks,59633, 124 | Ryan,59545, 125 | Carr,59130, 126 | Macdonald,58450, 127 | Hamilton,58170, 128 | Johnston,58060, 129 | West,57780, 130 | Gill,57716, 131 | Dawson,57637, 132 | Armstrong,56917, 133 | Gardner,56314, 134 | Stone,55920, 135 | Andrews,55371, 136 | Williamson,55302, 137 | Barker,55092, 138 | George,54666, 139 | Fisher,54294, 140 | Cunningham,54280, 141 | Watts,54124, 142 | Webb,54036, 143 | Lawrence,53575, 144 | Bradley,52955, 145 | Jenkins,52608, 146 | Wells,52410, 147 | Chambers,52399, 148 | Spencer,52194, 149 | Poole,52038, 150 | Atkinson,51520,97 151 | Lawson,50, 152 | Lawson,50971, 153 | Day,50489, 154 | Woods,50230, 155 | Rees,50058, 156 | Fraser,49924, 157 | Black,49851, 158 | Fletcher,49725, 159 | Hussain,49663, 160 | Willis,49648, 161 | Marsh,49438, 162 | Ahmed,49385, 163 | Doyle,49093, 164 | Lowe,48795, 165 | Burns,48179, 166 | Hopkins,48145, 167 | Nicholson,47947, 168 | Parry,47611, 169 | Newman,47474, 170 | Jordan,47311, 171 | Henderson,46745, 172 | Howard,46513, 173 | Barrett,46507, 174 | Burton,46432, 175 | Riley,46323, 176 | Porter,45820, 177 | Byrne,44686, 178 | Houghton,44528, 179 | John,44411, 180 | Perry,44306, 181 | Baxter,44182, 182 | Ball,43967, 183 | Mccarthy,43841, 184 | Elliott,43810, 185 | Burke,43670, 186 | Gallagher,43183, 187 | Duncan,43107, 188 | Cooke,42991, 189 | Austin,42785, 190 | Read,42641, 191 | Wallace,42601, 192 | Hawkins,42543, 193 | Hayes,42475, 194 | Francis,42437, 195 | Sutton,42393, 196 | Davidson,42349, 197 | Sharp,42266, 198 | Holland,41869, 199 | Moss,40964, 200 | May,40836, 201 | Bates,40781, 202 | Morrison,40765, 203 | Bob,40761, 204 | Oliver,40678, 205 | Kemp,40614, 206 | Page,40554, 207 | Arnold,40053, 208 | Shah,39981, 209 | Stevenson,39857, 210 | Ford,39580, 211 | Potter,39355, 212 | Flynn,39193, 213 | Warren,39108, 214 | Kent,38924, 215 | Alexander,38849, 216 | Field,38753, 217 | Freeman,38498, 218 | Begum,38176, 219 | Rhodes,37989, 220 | O neill,37778, 221 | Middleton,37567, 
222 | Payne,37552, 223 | Stephenson,37457, 224 | Pritchard,37155, 225 | Gregory,36854, 226 | Bond,36816, 227 | Webster,36737, 228 | Dunn,36674, 229 | Donnelly,36450, 230 | Lucas,36021, 231 | Long,36011, 232 | Jarvis,35979, 233 | Cross,35736, 234 | Stephens,35599, 235 | Reed,35264, 236 | Coleman,35258, 237 | Nicholls,35101, 238 | Bull,35010, 239 | Bartlett,34909, 240 | O brien,34834, 241 | Curtis,34681, 242 | Bird,34628, 243 | Patterson,34527, 244 | Tucker,34442, 245 | Bryant,34091, 246 | Lynch,34075, 247 | Mackenzie,34001, 248 | Ferguson,33983, 249 | Cameron,33846, 250 | Lopez,33832, 251 | Haynes,33812, 252 | Bolton,33800, 253 | Hardy,33773, 254 | Heath,33647, 255 | Davey,33556, 256 | Rice,33363, 257 | Jacobs,32761, 258 | Parsons,32698, 259 | Ashton,32675, 260 | Robson,32669, 261 | French,32541, 262 | Farrell,32195, 263 | Walton,32149, 264 | Gilbert,32131, 265 | Mcintyre,32129, 266 | Newton,32089, 267 | Norman,32054, 268 | Higgins,31950, 269 | Hodgson,31895, 270 | Sutherland,31784, 271 | Kay,31520, 272 | Bishop,31479, 273 | Burgess,31473, 274 | Simmons,31386, 275 | Hutchinson,31369, 276 | Moran,31308, 277 | Frost,31298, 278 | Sharma,31288, 279 | Slater,31226, 280 | Greenwood,31203, 281 | Kirk,31170, 282 | Fernandez,31117, 283 | Garcia,31015, 284 | Atkins,31015, 285 | Daniel,30918, 286 | Beattie,30886, 287 | Maxwell,30847, 288 | Todd,30590, 289 | Charles,30449, 290 | Paul,30300, 291 | Crawford,30292, 292 | O connor,30271, 293 | Park,30076, 294 | Forrest,30021, 295 | Love,29973, 296 | Rowland,29927, 297 | Connolly,29874, 298 | Sheppard,29653, 299 | Harding,29560, 300 | Banks,29546, 301 | Rowe,29371, 302 | -------------------------------------------------------------------------------- /tests/data/las_ons.csv: -------------------------------------------------------------------------------- 1 | lad16cd,lad16nm 2 | E06000001,Hartlepool 3 | E06000002,Middlesbrough 4 | E06000003,Redcar and Cleveland 5 | E06000004,Stockton-on-Tees 6 | E06000005,Darlington 7 | 
E06000006,Halton 8 | E06000007,Warrington 9 | E06000008,Blackburn with Darwen 10 | E06000009,Blackpool 11 | E06000010,"Kingston upon Hull, City of" 12 | E06000011,East Riding of Yorkshire 13 | E06000012,North East Lincolnshire 14 | E06000013,North Lincolnshire 15 | E06000014,York 16 | E06000015,Derby 17 | E06000016,Leicester 18 | E06000017,Rutland 19 | E06000018,Nottingham 20 | E06000019,"Herefordshire, County of" 21 | E06000020,Telford and Wrekin 22 | E06000021,Stoke-on-Trent 23 | E06000022,Bath and North East Somerset 24 | E06000023,"Bristol, City of" 25 | E06000024,North Somerset 26 | E06000025,South Gloucestershire 27 | E06000026,Plymouth 28 | E06000027,Torbay 29 | E06000028,Bournemouth 30 | E06000029,Poole 31 | E06000030,Swindon 32 | E06000031,Peterborough 33 | E06000040,Windsor and Maidenhead 34 | E06000041,Wokingham 35 | E06000042,Milton Keynes 36 | E06000043,Brighton and Hove 37 | E06000044,Portsmouth 38 | E06000032,Luton 39 | E06000033,Southend-on-Sea 40 | E06000034,Thurrock 41 | E06000035,Medway 42 | E06000036,Bracknell Forest 43 | E06000037,West Berkshire 44 | E06000045,Southampton 45 | E06000046,Isle of Wight 46 | E06000047,County Durham 47 | E06000049,Cheshire East 48 | E06000050,Cheshire West and Chester 49 | E06000051,Shropshire 50 | E06000052,Cornwall 51 | E06000053,Isles of Scilly 52 | E06000054,Wiltshire 53 | E06000055,Bedford 54 | E06000056,Central Bedfordshire 55 | E06000057,Northumberland 56 | E07000004,Aylesbury Vale 57 | E07000005,Chiltern 58 | E07000006,South Bucks 59 | E07000007,Wycombe 60 | E07000090,Havant 61 | E07000091,New Forest 62 | E07000092,Rushmoor 63 | E07000093,Test Valley 64 | E07000094,Winchester 65 | E07000095,Broxbourne 66 | E07000096,Dacorum 67 | E07000098,Hertsmere 68 | E07000099,North Hertfordshire 69 | E07000102,Three Rivers 70 | E07000130,Charnwood 71 | E07000131,Harborough 72 | E07000132,Hinckley and Bosworth 73 | E06000038,Reading 74 | E06000039,Slough 75 | E07000008,Cambridge 76 | E07000009,East Cambridgeshire 77 | 
E07000010,Fenland 78 | E07000011,Huntingdonshire 79 | E07000012,South Cambridgeshire 80 | E07000026,Allerdale 81 | E07000027,Barrow-in-Furness 82 | E07000028,Carlisle 83 | E07000029,Copeland 84 | E07000030,Eden 85 | E07000031,South Lakeland 86 | E07000032,Amber Valley 87 | E07000033,Bolsover 88 | E07000034,Chesterfield 89 | E07000035,Derbyshire Dales 90 | E07000036,Erewash 91 | E07000037,High Peak 92 | E07000038,North East Derbyshire 93 | E07000039,South Derbyshire 94 | E07000040,East Devon 95 | E07000041,Exeter 96 | E07000042,Mid Devon 97 | E07000043,North Devon 98 | E07000044,South Hams 99 | E07000045,Teignbridge 100 | E07000046,Torridge 101 | E07000047,West Devon 102 | E07000048,Christchurch 103 | E07000049,East Dorset 104 | E07000050,North Dorset 105 | E07000051,Purbeck 106 | E07000052,West Dorset 107 | E07000053,Weymouth and Portland 108 | E07000061,Eastbourne 109 | E07000062,Hastings 110 | E07000063,Lewes 111 | E07000064,Rother 112 | E07000065,Wealden 113 | E07000066,Basildon 114 | E07000067,Braintree 115 | E07000068,Brentwood 116 | E07000069,Castle Point 117 | E07000070,Chelmsford 118 | E07000071,Colchester 119 | E07000072,Epping Forest 120 | E07000086,Eastleigh 121 | E07000087,Fareham 122 | E07000073,Harlow 123 | E07000074,Maldon 124 | E07000075,Rochford 125 | E07000076,Tendring 126 | E07000077,Uttlesford 127 | E07000078,Cheltenham 128 | E07000079,Cotswold 129 | E07000080,Forest of Dean 130 | E07000088,Gosport 131 | E07000089,Hart 132 | E07000081,Gloucester 133 | E07000082,Stroud 134 | E07000083,Tewkesbury 135 | E07000084,Basingstoke and Deane 136 | E07000085,East Hampshire 137 | E07000103,Watford 138 | E07000105,Ashford 139 | E07000106,Canterbury 140 | E07000107,Dartford 141 | E07000108,Dover 142 | E07000109,Gravesham 143 | E07000110,Maidstone 144 | E07000111,Sevenoaks 145 | E07000112,Shepway 146 | E07000113,Swale 147 | E07000114,Thanet 148 | E07000115,Tonbridge and Malling 149 | E07000116,Tunbridge Wells 150 | E07000117,Burnley 151 | E07000118,Chorley 152 
| E07000119,Fylde 153 | E07000120,Hyndburn 154 | E07000121,Lancaster 155 | E07000122,Pendle 156 | E07000123,Preston 157 | E07000124,Ribble Valley 158 | E07000125,Rossendale 159 | E07000126,South Ribble 160 | E07000127,West Lancashire 161 | E07000128,Wyre 162 | E07000129,Blaby 163 | E07000133,Melton 164 | E07000134,North West Leicestershire 165 | E07000135,Oadby and Wigston 166 | E07000136,Boston 167 | E07000137,East Lindsey 168 | E07000192,Cannock Chase 169 | E07000193,East Staffordshire 170 | E07000194,Lichfield 171 | E07000138,Lincoln 172 | E07000139,North Kesteven 173 | E07000140,South Holland 174 | E07000141,South Kesteven 175 | E07000142,West Lindsey 176 | E07000143,Breckland 177 | E07000144,Broadland 178 | E07000145,Great Yarmouth 179 | E07000146,King's Lynn and West Norfolk 180 | E07000147,North Norfolk 181 | E07000148,Norwich 182 | E07000149,South Norfolk 183 | E07000150,Corby 184 | E07000151,Daventry 185 | E07000152,East Northamptonshire 186 | E07000153,Kettering 187 | E07000154,Northampton 188 | E07000155,South Northamptonshire 189 | E07000156,Wellingborough 190 | E07000163,Craven 191 | E07000164,Hambleton 192 | E07000165,Harrogate 193 | E07000166,Richmondshire 194 | E07000167,Ryedale 195 | E07000168,Scarborough 196 | E07000169,Selby 197 | E07000170,Ashfield 198 | E07000171,Bassetlaw 199 | E07000172,Broxtowe 200 | E07000173,Gedling 201 | E07000174,Mansfield 202 | E07000175,Newark and Sherwood 203 | E07000176,Rushcliffe 204 | E07000177,Cherwell 205 | E07000195,Newcastle-under-Lyme 206 | E07000196,South Staffordshire 207 | E07000197,Stafford 208 | E07000178,Oxford 209 | E07000179,South Oxfordshire 210 | E07000180,Vale of White Horse 211 | E07000181,West Oxfordshire 212 | E07000187,Mendip 213 | E07000188,Sedgemoor 214 | E07000189,South Somerset 215 | E07000190,Taunton Deane 216 | E07000191,West Somerset 217 | E07000198,Staffordshire Moorlands 218 | E07000199,Tamworth 219 | E07000200,Babergh 220 | E07000201,Forest Heath 221 | E07000202,Ipswich 222 | 
E07000203,Mid Suffolk 223 | E07000204,St Edmundsbury 224 | E07000205,Suffolk Coastal 225 | E07000206,Waveney 226 | E07000207,Elmbridge 227 | E07000208,Epsom and Ewell 228 | E07000209,Guildford 229 | E07000210,Mole Valley 230 | E07000211,Reigate and Banstead 231 | E07000212,Runnymede 232 | E07000213,Spelthorne 233 | E07000214,Surrey Heath 234 | E07000215,Tandridge 235 | E07000216,Waverley 236 | E07000217,Woking 237 | E07000218,North Warwickshire 238 | E07000219,Nuneaton and Bedworth 239 | E07000220,Rugby 240 | E07000221,Stratford-on-Avon 241 | E07000222,Warwick 242 | E07000223,Adur 243 | E07000224,Arun 244 | E07000225,Chichester 245 | E07000226,Crawley 246 | E07000227,Horsham 247 | E07000228,Mid Sussex 248 | E07000229,Worthing 249 | E07000234,Bromsgrove 250 | E07000235,Malvern Hills 251 | E07000236,Redditch 252 | E07000242,East Hertfordshire 253 | E08000026,Coventry 254 | E08000027,Dudley 255 | E07000237,Worcester 256 | E07000238,Wychavon 257 | E07000239,Wyre Forest 258 | E07000240,St Albans 259 | E07000241,Welwyn Hatfield 260 | E07000243,Stevenage 261 | E08000001,Bolton 262 | E08000002,Bury 263 | E08000003,Manchester 264 | E08000004,Oldham 265 | E08000005,Rochdale 266 | E08000006,Salford 267 | E08000007,Stockport 268 | E08000008,Tameside 269 | E08000009,Trafford 270 | E08000010,Wigan 271 | E08000011,Knowsley 272 | E08000012,Liverpool 273 | E08000013,St. 
Helens 274 | E08000014,Sefton 275 | E08000015,Wirral 276 | E08000016,Barnsley 277 | E08000017,Doncaster 278 | E08000018,Rotherham 279 | E08000019,Sheffield 280 | E08000021,Newcastle upon Tyne 281 | E08000022,North Tyneside 282 | E08000023,South Tyneside 283 | E08000024,Sunderland 284 | E08000025,Birmingham 285 | E08000028,Sandwell 286 | E08000029,Solihull 287 | E08000030,Walsall 288 | E08000031,Wolverhampton 289 | E08000032,Bradford 290 | E08000033,Calderdale 291 | E08000034,Kirklees 292 | E08000035,Leeds 293 | E08000036,Wakefield 294 | E08000037,Gateshead 295 | E09000016,Havering 296 | E09000017,Hillingdon 297 | E09000018,Hounslow 298 | E09000020,Kensington and Chelsea 299 | E09000033,Westminster 300 | S12000005,Clackmannanshire 301 | S12000006,Dumfries and Galloway 302 | E09000021,Kingston upon Thames 303 | S12000008,East Ayrshire 304 | S12000010,East Lothian 305 | S12000011,East Renfrewshire 306 | S12000013,Na h-Eileanan Siar 307 | E09000022,Lambeth 308 | E09000023,Lewisham 309 | S12000014,Falkirk 310 | S12000015,Fife 311 | S12000017,Highland 312 | S12000018,Inverclyde 313 | S12000019,Midlothian 314 | S12000020,Moray 315 | S12000021,North Ayrshire 316 | S12000023,Orkney Islands 317 | E09000001,City of London 318 | E09000002,Barking and Dagenham 319 | E09000003,Barnet 320 | E09000004,Bexley 321 | E09000011,Greenwich 322 | E09000019,Islington 323 | E09000005,Brent 324 | E09000006,Bromley 325 | E09000007,Camden 326 | E09000008,Croydon 327 | E09000009,Ealing 328 | E09000010,Enfield 329 | E09000012,Hackney 330 | E09000013,Hammersmith and Fulham 331 | E09000014,Haringey 332 | E09000015,Harrow 333 | S12000024,Perth and Kinross 334 | S12000026,Scottish Borders 335 | S12000027,Shetland Islands 336 | S12000028,South Ayrshire 337 | S12000029,South Lanarkshire 338 | S12000030,Stirling 339 | S12000033,Aberdeen City 340 | S12000034,Aberdeenshire 341 | E09000024,Merton 342 | E09000025,Newham 343 | E09000026,Redbridge 344 | E09000027,Richmond upon Thames 345 | S12000035,Argyll 
and Bute 346 | S12000036,City of Edinburgh 347 | S12000038,Renfrewshire 348 | S12000039,West Dunbartonshire 349 | S12000040,West Lothian 350 | S12000041,Angus 351 | E09000028,Southwark 352 | W06000016,Rhondda Cynon Taf 353 | W06000018,Caerphilly 354 | W06000019,Blaenau Gwent 355 | W06000020,Torfaen 356 | W06000021,Monmouthshire 357 | W06000022,Newport 358 | W06000023,Powys 359 | E09000029,Sutton 360 | W06000024,Merthyr Tydfil 361 | E09000030,Tower Hamlets 362 | E09000031,Waltham Forest 363 | E09000032,Wandsworth 364 | S12000042,Dundee City 365 | S12000044,North Lanarkshire 366 | S12000045,East Dunbartonshire 367 | S12000046,Glasgow City 368 | W06000001,Isle of Anglesey 369 | W06000002,Gwynedd 370 | W06000003,Conwy 371 | W06000004,Denbighshire 372 | W06000005,Flintshire 373 | W06000006,Wrexham 374 | W06000008,Ceredigion 375 | W06000009,Pembrokeshire 376 | W06000010,Carmarthenshire 377 | W06000011,Swansea 378 | W06000012,Neath Port Talbot 379 | W06000013,Bridgend 380 | W06000014,Vale of Glamorgan 381 | W06000015,Cardiff 382 | -------------------------------------------------------------------------------- /tests/data/las_os.csv: -------------------------------------------------------------------------------- 1 | name,code 2 | Wycombe District,E07000007 3 | South Bucks District,E07000006 4 | Chiltern District,E07000005 5 | Aylesbury Vale District,E07000004 6 | Fenland District,E07000010 7 | South Cambridgeshire District,E07000012 8 | East Cambridgeshire District,E07000009 9 | Huntingdonshire District,E07000011 10 | Cambridge District (B),E07000008 11 | Copeland District (B),E07000029 12 | Carlisle District (B),E07000028 13 | South Lakeland District,E07000031 14 | Allerdale District (B),E07000026 15 | Eden District,E07000030 16 | Barrow-in-Furness District (B),E07000027 17 | High Peak District (B),E07000037 18 | South Derbyshire District,E07000039 19 | Erewash District (B),E07000036 20 | North East Derbyshire District,E07000038 21 | Amber Valley District (B),E07000032 
22 | Bolsover District,E07000033 23 | Derbyshire Dales District,E07000035 24 | Chesterfield District (B),E07000034 25 | North Devon District,E07000043 26 | East Devon District,E07000040 27 | Teignbridge District,E07000045 28 | West Devon District (B),E07000047 29 | Mid Devon District,E07000042 30 | Exeter District (B),E07000041 31 | Purbeck District,E07000051 32 | Christchurch District (B),E07000048 33 | West Dorset District,E07000052 34 | East Dorset District,E07000049 35 | North Dorset District,E07000050 36 | Weymouth and Portland District (B),E07000053 37 | Lewes District,E07000063 38 | Rother District,E07000064 39 | Wealden District,E07000065 40 | Eastbourne District (B),E07000061 41 | Hastings District (B),E07000062 42 | Brentwood District (B),E07000068 43 | Rochford District,E07000075 44 | Epping Forest District,E07000072 45 | Tendring District,E07000076 46 | Uttlesford District,E07000077 47 | Chelmsford District (B),E07000070 48 | Colchester District (B),E07000071 49 | Maldon District (B),E07000074 50 | Braintree District,E07000067 51 | Harlow District,E07000073 52 | Basildon District (B),E07000066 53 | Castle Point District (B),E07000069 54 | Forest of Dean District,E07000080 55 | Cotswold District,E07000079 56 | Stroud District,E07000082 57 | Tewkesbury District (B),E07000083 58 | Gloucester District (B),E07000081 59 | Cheltenham District (B),E07000078 60 | Basingstoke and Deane District (B),E07000084 61 | New Forest District,E07000091 62 | Eastleigh District (B),E07000086 63 | East Hampshire District,E07000085 64 | Winchester District (B),E07000094 65 | Test Valley District,E07000093 66 | Hart District,E07000089 67 | Gosport District (B),E07000088 68 | Fareham District (B),E07000087 69 | Havant District (B),E07000090 70 | Rushmoor District (B),E07000092 71 | Three Rivers District,E07000102 72 | Hertsmere District (B),E07000098 73 | Broxbourne District (B),E07000095 74 | Dacorum District (B),E07000096 75 | East Hertfordshire District,E07000242 76 | St. 
Albans District (B),E07000240 77 | Welwyn Hatfield District (B),E07000241 78 | North Hertfordshire District,E07000099 79 | Watford District (B),E07000103 80 | Stevenage District (B),E07000243 81 | Tunbridge Wells District (B),E07000116 82 | Shepway District,E07000112 83 | Sevenoaks District,E07000111 84 | Tonbridge and Malling District (B),E07000115 85 | Thanet District,E07000114 86 | Ashford District (B),E07000105 87 | Canterbury District (B),E07000106 88 | Dover District,E07000108 89 | Maidstone District (B),E07000110 90 | Swale District (B),E07000113 91 | Dartford District (B),E07000107 92 | Gravesham District (B),E07000109 93 | West Lancashire District (B),E07000127 94 | Lancaster District (B),E07000121 95 | Chorley District (B),E07000118 96 | South Ribble District (B),E07000126 97 | Rossendale District (B),E07000125 98 | Fylde District (B),E07000119 99 | Preston District (B),E07000123 100 | Wyre District (B),E07000128 101 | Pendle District (B),E07000122 102 | Ribble Valley District (B),E07000124 103 | Hyndburn District (B),E07000120 104 | Burnley District (B),E07000117 105 | Hinckley and Bosworth District (B),E07000132 106 | North West Leicestershire District,E07000134 107 | Melton District (B),E07000133 108 | Harborough District,E07000131 109 | Blaby District,E07000129 110 | Charnwood District (B),E07000130 111 | Oadby and Wigston District (B),E07000135 112 | West Lindsey District,E07000142 113 | South Kesteven District,E07000141 114 | South Holland District,E07000140 115 | Boston District (B),E07000136 116 | North Kesteven District,E07000139 117 | East Lindsey District,E07000137 118 | Lincoln District (B),E07000138 119 | Great Yarmouth District (B),E07000145 120 | South Norfolk District,E07000149 121 | King's Lynn and West Norfolk District (B),E07000146 122 | Breckland District,E07000143 123 | Norwich District (B),E07000148 124 | South Northamptonshire District,E07000155 125 | East Northamptonshire District,E07000152 126 | Daventry District,E07000151 127 | 
Wellingborough District (B),E07000156 128 | Kettering District (B),E07000153 129 | Northampton District (B),E07000154 130 | Corby District (B),E07000150 131 | Scarborough District (B),E07000168 132 | Selby District,E07000169 133 | Craven District,E07000163 134 | Richmondshire District,E07000166 135 | Harrogate District (B),E07000165 136 | Ryedale District,E07000167 137 | Hambleton District,E07000164 138 | Bassetlaw District,E07000171 139 | Rushcliffe District (B),E07000176 140 | Gedling District (B),E07000173 141 | Ashfield District,E07000170 142 | Newark and Sherwood District,E07000175 143 | Broxtowe District (B),E07000172 144 | Mansfield District,E07000174 145 | Vale of White Horse District,E07000180 146 | South Oxfordshire District,E07000179 147 | Cherwell District,E07000177 148 | West Oxfordshire District,E07000181 149 | Oxford District (B),E07000178 150 | West Somerset District,E07000191 151 | Mendip District,E07000187 152 | Taunton Deane District (B),E07000190 153 | South Somerset District,E07000189 154 | Sedgemoor District,E07000188 155 | Staffordshire Moorlands District,E07000198 156 | South Staffordshire District,E07000196 157 | Lichfield District,E07000194 158 | Newcastle-under-Lyme District (B),E07000195 159 | Stafford District (B),E07000197 160 | East Staffordshire District (B),E07000193 161 | Cannock Chase District,E07000192 162 | Tamworth District (B),E07000199 163 | Waveney District,E07000206 164 | Babergh District,E07000200 165 | Suffolk Coastal District,E07000205 166 | St. 
Edmundsbury District (B),E07000204 167 | Forest Heath District,E07000201 168 | Mid Suffolk District,E07000203 169 | Ipswich District (B),E07000202 170 | Waverley District (B),E07000216 171 | Tandridge District,E07000215 172 | Woking District (B),E07000217 173 | Surrey Heath District (B),E07000214 174 | Runnymede District (B),E07000212 175 | Guildford District (B),E07000209 176 | Reigate and Banstead District (B),E07000211 177 | Mole Valley District,E07000210 178 | Elmbridge District (B),E07000207 179 | Spelthorne District (B),E07000213 180 | Epsom and Ewell District (B),E07000208 181 | North Warwickshire District (B),E07000218 182 | Rugby District (B),E07000220 183 | Warwick District,E07000222 184 | Stratford-on-Avon District,E07000221 185 | Nuneaton and Bedworth District (B),E07000219 186 | Arun District,E07000224 187 | Adur District,E07000223 188 | Chichester District,E07000225 189 | Mid Sussex District,E07000228 190 | Horsham District,E07000227 191 | Worthing District (B),E07000229 192 | Crawley District (B),E07000226 193 | Malvern Hills District,E07000235 194 | Wyre Forest District,E07000239 195 | Wychavon District,E07000238 196 | Bromsgrove District,E07000234 197 | Worcester District (B),E07000237 198 | Redditch District (B),E07000236 199 | Kingston upon Thames London Boro,E09000021 200 | Croydon London Boro,E09000008 201 | Bromley London Boro,E09000006 202 | Hounslow London Boro,E09000018 203 | Ealing London Boro,E09000009 204 | Havering London Boro,E09000016 205 | Hillingdon London Boro,E09000017 206 | Harrow London Boro,E09000015 207 | Brent London Boro,E09000005 208 | Barnet London Boro,E09000003 209 | Lambeth London Boro,E09000022 210 | Southwark London Boro,E09000028 211 | Lewisham London Boro,E09000023 212 | Greenwich London Boro,E09000011 213 | Bexley London Boro,E09000004 214 | Enfield London Boro,E09000010 215 | Waltham Forest London Boro,E09000031 216 | Redbridge London Boro,E09000026 217 | Sutton London Boro,E09000029 218 | Richmond upon Thames 
London Boro,E09000027 219 | Merton London Boro,E09000024 220 | Wandsworth London Boro,E09000032 221 | Hammersmith and Fulham London Boro,E09000013 222 | Kensington and Chelsea London Boro,E09000020 223 | City of Westminster London Boro,E09000033 224 | Camden London Boro,E09000007 225 | Tower Hamlets London Boro,E09000030 226 | Islington London Boro,E09000019 227 | Hackney London Boro,E09000012 228 | Haringey London Boro,E09000014 229 | Newham London Boro,E09000025 230 | Barking and Dagenham London Boro,E09000002 231 | Kirklees District (B),E08000034 232 | Knowsley District (B),E08000011 233 | Leeds District (B),E08000035 234 | Liverpool District (B),E08000012 235 | Manchester District (B),E08000003 236 | Newcastle upon Tyne District (B),E08000021 237 | North Tyneside District (B),E08000022 238 | Oldham District (B),E08000004 239 | Rochdale District (B),E08000005 240 | Rotherham District (B),E08000018 241 | Salford District (B),E08000006 242 | Sandwell District (B),E08000028 243 | Sefton District (B),E08000014 244 | Sheffield District (B),E08000019 245 | City and County of the City of London,E09000001 246 | Barnsley District (B),E08000016 247 | Birmingham District (B),E08000025 248 | Bolton District (B),E08000001 249 | Bradford District (B),E08000032 250 | Bury District (B),E08000002 251 | Calderdale District (B),E08000033 252 | City of Wolverhampton District (B),E08000031 253 | Coventry District (B),E08000026 254 | Doncaster District (B),E08000017 255 | Dudley District (B),E08000027 256 | Gateshead District (B),E08000037 257 | Solihull District (B),E08000029 258 | South Tyneside District (B),E08000023 259 | Stockport District (B),E08000007 260 | St. 
Helens District (B),E08000013 261 | Sunderland District (B),E08000024 262 | Tameside District (B),E08000008 263 | Trafford District (B),E08000009 264 | Wakefield District (B),E08000036 265 | Walsall District (B),E08000030 266 | Wigan District (B),E08000010 267 | Wirral District (B),E08000015 268 | Abertawe - Swansea,W06000011 269 | Angus,S12000041 270 | Bath and North East Somerset,E06000022 271 | Bedford (B),E06000055 272 | Blackburn with Darwen (B),E06000008 273 | Blackpool (B),E06000009 274 | Blaenau Gwent - Blaenau Gwent,W06000019 275 | Bournemouth (B),E06000028 276 | Bracknell Forest (B),E06000036 277 | Bro Morgannwg - the Vale of Glamorgan,W06000014 278 | Caerffili - Caerphilly,W06000018 279 | Casnewydd - Newport,W06000022 280 | Castell-nedd Port Talbot - Neath Port Talbot,W06000012 281 | Central Bedfordshire,E06000056 282 | Cheshire East (B),E06000049 283 | Cheshire West and Chester (B),E06000050 284 | City of Bristol (B),E06000023 285 | City of Derby (B),E06000015 286 | City of Kingston upon Hull (B),E06000010 287 | City of Leicester (B),E06000016 288 | City of Nottingham (B),E06000018 289 | City of Peterborough (B),E06000031 290 | City of Southampton (B),E06000045 291 | City of Stoke-on-Trent (B),E06000021 292 | Clackmannanshire,S12000005 293 | Conwy - Conwy,W06000003 294 | County Durham,E06000047 295 | County of Herefordshire,E06000019 296 | Darlington (B),E06000005 297 | Dundee City,S12000042 298 | East Ayrshire,S12000008 299 | East Dunbartonshire,S12000045 300 | East Renfrewshire,S12000011 301 | East Riding of Yorkshire,E06000011 302 | Falkirk,S12000014 303 | Glasgow City,S12000046 304 | Halton (B),E06000006 305 | Hartlepool (B),E06000001 306 | Inverclyde,S12000018 307 | Luton (B),E06000032 308 | Medway (B),E06000035 309 | Merthyr Tudful - Merthyr Tydfil,W06000024 310 | Middlesbrough (B),E06000002 311 | Midlothian,S12000019 312 | Milton Keynes (B),E06000042 313 | North East Lincolnshire (B),E06000012 314 | North Lanarkshire,S12000044 315 | North 
Lincolnshire (B),E06000013 316 | Pen-y-bont ar Ogwr - Bridgend,W06000013 317 | Perth and Kinross,S12000024 318 | Poole (B),E06000029 319 | Powys - Powys,W06000023 320 | Reading (B),E06000038 321 | Redcar and Cleveland (B),E06000003 322 | Renfrewshire,S12000038 323 | Rhondda Cynon Taf - Rhondda Cynon Taf,W06000016 324 | Rutland,E06000017 325 | Scottish Borders,S12000026 326 | Shropshire,E06000051 327 | Sir Ddinbych - Denbighshire,W06000004 328 | Sir Gaerfyrddin - Carmarthenshire,W06000010 329 | Sir y Fflint - Flintshire,W06000005 330 | Slough (B),E06000039 331 | Southend-on-Sea (B),E06000033 332 | South Gloucestershire,E06000025 333 | South Lanarkshire,S12000029 334 | Stirling,S12000030 335 | Stockton-on-Tees (B),E06000004 336 | Swindon (B),E06000030 337 | Telford and Wrekin (B),E06000020 338 | The City of Brighton and Hove (B),E06000043 339 | Thurrock (B),E06000034 340 | Tor-faen - Torfaen,W06000020 341 | Torbay (B),E06000027 342 | Warrington (B),E06000007 343 | West Berkshire,E06000037 344 | West Dunbartonshire,S12000039 345 | West Lothian,S12000040 346 | Wiltshire,E06000054 347 | Windsor and Maidenhead (B),E06000040 348 | Wokingham (B),E06000041 349 | Wrecsam - Wrexham,W06000006 350 | York (B),E06000014 351 | Broadland District,E07000144 352 | North Norfolk District,E07000147 353 | South Hams District,E07000044 354 | Torridge District,E07000046 355 | Isle of Wight,E06000046 356 | Sir Ynys Mon - Isle of Anglesey,W06000001 357 | Gwynedd - Gwynedd,W06000002 358 | Caerdydd - Cardiff,W06000015 359 | Sir Ceredigion - Ceredigion,W06000008 360 | Sir Fynwy - Monmouthshire,W06000021 361 | Sir Benfro - Pembrokeshire,W06000009 362 | North Somerset,E06000024 363 | Highland,S12000017 364 | Moray,S12000020 365 | Orkney Islands,S12000023 366 | Na h-Eileanan an Iar,S12000013 367 | Argyll and Bute,S12000035 368 | Aberdeenshire,S12000034 369 | Fife,S12000015 370 | Aberdeen City,S12000033 371 | City of Edinburgh,S12000036 372 | East Lothian,S12000010 373 | Shetland Islands,S12000027 
374 | North Ayrshire,S12000021 375 | Dumfries and Galloway,S12000006 376 | City of Portsmouth (B),E06000044 377 | City of Plymouth (B),E06000026 378 | South Ayrshire,S12000028 379 | Northumberland,E06000057 380 | Cornwall,E06000052 381 | Isles of Scilly,E06000053 382 | -------------------------------------------------------------------------------- /tests/data/left_1.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,Will,James,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Jody,Liz,Brown,20/05/1960,another thing 5 | 4,David,James,Williams,01/01/2000,thing4 6 | 7 | -------------------------------------------------------------------------------- /tests/data/left_2.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,Alistair,Paul,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Alisdair,Paul,Jonson,20/05/1961,another thing 5 | 4,David,Paul,Williams,01/01/2000,final thing 6 | -------------------------------------------------------------------------------- /tests/data/left_3.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city,email 2 | Noah ,John,1979-05-29,London,kernandeztin@white-clarh.com 3 | William ,Leo,1996-01-29,Ipswich,simmonseric@estes.info 4 | Muhammad ,lRey,1987-04-13,Bradford,ngutierrez@schmidt.com 5 | iasJne ,Newton,1973-09-15,London,lhale@cooper-novak.info 6 | Brown,Jacob ,1994-10-11,London,xwilliams@mcmahon.inf 7 | Daniel ,Gardner,1988-04-08,Edinburgh,anthony27@english.com 8 | Samuel ,Harrison,2004-06-11,London,andersonamy@love.com 9 | John,Rosie ,2017-10-07,Dudley,irichmond@jones.net 10 | Elizabeth ,Holmes,2005-12-10,London,armstrongdavid@simpson.com 11 | Elizabeth ,Bennett,1977-01-04,London,christophersmith@vega-dickson.net 12 | Sophie 
,Scot,1976-10-05,Belfast,etephen96@munz-ross.biz 13 | Reuben ,Davis,1980-11-13,London,laura99@carey.biz 14 | Chloe ,Edwards,2001-03-01,Leeds,don1an6@stevens.inf 15 | Hugo ,Hughes,2006-07-30,Bristol,amandawaker@lee.cmo 16 | Esme ,Hunter,1980-05-06,London,christopher42@lynch.com 17 | Gabriel ,Warren,2012-05-28,Nottingham,aanor93@carenas.com 18 | Sophia ,Stevenson,2004-10-27,Bradford,wilcoxcatherine@norris-hill.net 19 | Jasper ,Thompson,2008-04-23,Glasgow,pamela40@wells-bond.com 20 | Isaac ,Smth,2004-02-19,Belfast,andersonannttee@west-artinez.org 21 | Muhammad ,Tucer,1989-11-20,Bradford,briai15@ood.bnz 22 | Lowe,Maria ,1976-01-29,Brighton,wberry@tucker.com 23 | Olivia ,Scott,1976-12-23,London,brian24@hunt-hall.org 24 | Muhammad ,Wood,1990-04-10,Manchester,dixonbenjamin@mann-hall.biz 25 | Elizabeth ,Reed,1972-10-05,Liverpool,kristy28@osborne-ochoa.net 26 | Henry ,nhopsoT,1982-03-11,Colchester,perezalicia@brites-cook.neg 27 | Matthew ,Lewis,2011-12-06,London,reneeblack@bean.com 28 | Benjman ,Heath,2008-11-26,Leicester,kara10@briggs.com 29 | Arthur ,Johnston,1982-08-22,Kingston-upon-Hull,sroman@blacf.ino 30 | kaJ ,Lynch,2015-02-15,Sheffield,alejandro23@west-lane.org 31 | Alexander ,Patel,1971-05-18,Edinhugb,mollyyoung@schroeder-allen.com 32 | Shah,Jessica ,1993-06-24,Coventry,rhonda75@case.net 33 | Harry ,Williams,1998-04-02,York,kdeleon@crawford-nicholson.com 34 | Thompson,Oscar ,1981-07-15,London,xeaver@milrel.com 35 | Oscar,Wilson,2013-09-27,Bristol,kyle91@cunningham-bell.com 36 | Henr ,Scott,1997-10-22,Birmingham,porterkataleen@bll-hhll.com 37 | ai ,Taylor,1987-10-24,Swindon,owenspatricia@farrell.com 38 | Eva ,Parker,1980-06-16,London,davidsonjoseph@montgomery-harris.com 39 | Felicity ,Bull,1984-10-01,Derby,simpsondaniellr@laeso.com 40 | Samuel ,Curtis,1980-01-02,London,loganhcristopherbutler.org 41 | Harvey ,Jarvis,2012-10-12,Leicester,christrne64@eilly.com 42 | Jack ,Jones,1992-01-20,Lees,natalie78@watkins.com 43 | Scarlett 
,Bnwr,1974-05-01,London,leeaaron@brewer-hill.net 44 | Nancy,Wright,1993-09-27,pIswih,vvaldez@ruiz.com 45 | Isla ,Coleman,1997-11-22,London,jameswilson@wiggins.net 46 | Dyisa,Greenwood,2015-01-26,Stockton-on-Tees,utrevino@ross-vargas.info 47 | Finley ,Sutherland,2001-02-10,London,brownashley@flores.com 48 | Henderson,Henry ,1999-10-27,London,muis17@medna.col 49 | Jase ,Wood,1980-11-16,Leeds,pinedastephanie@hall-walker.org 50 | Jesse ,Allen,1986-11-24,Lndoo,autumn07@young-boyer.net 51 | Harrison ,Wiosn,2010-11-28,London,fgolden@henderson.com 52 | eLi ,Rowe,1989-11-14,London,sgomez@jones.com 53 | Teeodorh,Lucas,1990-05-10,Leiecter,cynthiaanderson@welch.com 54 | Elmy ,Foster,2013-10-25,Liverpol,.ara37@yongtinfo 55 | Bella ,Griffiths,1996-08-22,Walsall,thompsonrobert@lee.biz 56 | Stanley ,White,1981-02-12,ork,michaela15@torres.info 57 | Chloe ,James,1984-11-23,Lodnn,rachel17@alexander.com 58 | reoGg ,Pweol,1995-09-18,Birmingham,andrea81@haney.com 59 | Noah ,Cle,2002-02-05,Stockton-on-Tees,daviskimberly@bennett-king.com 60 | Mitchell,Felix ,2000-10-21,Birmingham,moorejeffrey@riley.net 61 | Love,Ella ,1986-09-02,Warrington,jssicas3@thoma4.biz 62 | Leo ,Webb,2012-01-30,Sheffield,sharon50@alexander.com 63 | Sophi ,Stone,1985-06-10,London,qhawkins@pennington-hurley.biz 64 | Emlii ,Cooper,2001-05-27,rventCy,larrycampbell@doyle.biz 65 | Francesca ,Edwards,1987-09-12,Brighton,floydmegan@torres.com 66 | Knight,Adam ,1979-07-12,Derby,mcconnellstephen@walker-flores.net 67 | Phillips,iasy ,2009-12-08,Sheffield,grahamchad@tate.info 68 | Isabelle ,Doyle,2010-06-24,Lonno,michael84@munoz-matthews.com 69 | Robyn ,Ross,1998-06-20,Newcastle-upon-Tyne,wbrown.wall@co 70 | Lottie ,Smith,1982-12-03,London,robersonbrenda@sutton-phillips.com 71 | Arabella ,Gray,1978-11-01,Lnnoo,pmartinez@cox-williams.com 72 | Lewis ,Smith,1996-02-22,Londo,thawkins@hamilton-burch.com 73 | Daisy ,Bo,2009-02-16,Covntre,chelsea39@jackson.com 74 | Wi iaml,Walker,1985-10-11,Liverpool,tracysaiders@mntchellcollins.com 
75 | Brown,Daniel ,2002-02-21,Sunderland,qhouse@morton.org 76 | Esme ,Ward,1986-07-15,Norwich,christopherhughes@ryan-kramer.net 77 | Werra,Ncnay,1992-06-27,Coventry,doughertywayne@ramsey.com 78 | Scarlett ,Harvy,2008-01-30,Bristol,hamptonryan@wilcox-mendoza.biz 79 | Megan ,Campbell,2008-10-07,Liverpool,elizabeth68@farrell-hutchinson.net 80 | Beatrice ,Hawkins,2003-10-27,Middlesbrough,jermaineydung@oaton.biz 81 | George ,Stevenson,2002-01-20,Derby,david28@schultz.net 82 | Ella ,Potter,1971-12-16,London,nyan@srosg.com 83 | Ruby ,Oenill,1974-11-27,London,clarkcharles@williams.info 84 | Holly ,Edwards,2007-03-07,London,sfleming@dalton.com 85 | Mas o,Houghton,2005-09-05,London,simmonsscott@wilson.com 86 | Alice ,Payne,2004-07-26,London,jason24@brown.com 87 | Muhammad ,Wolsn,1982-09-01,Portsmouth,edward32@moreno-beasley.net 88 | Jack ,Chpaan,1995-01-17,Plymouth,justin21@smith-wong.com 89 | George ,Evans,1986-11-05,London,rcasey@hernandez.com 90 | Smth,L uso,1990-12-22,Glasow,zalvarado@sibmons.mi 91 | Olievr,Jarvis,2000-03-02,slaGgw,weisstheresa@baird.biz 92 | Oavil ,Watson,2006-01-06,Luton,uallen@king.biz 93 | Eva ,Young,1997-07-01,Wolverhampton,elizabeth48@cobb.com 94 | Bella ,Jenkins,1978-12-06,Lndon,kimberlywells@robinson-lam.net 95 | Eleanor ,Harding,2005-08-10,Liverpool,yflores@williams.com 96 | Alexander ,Stewart,1996-01-08,deeL,jason64@tanner.com 97 | Luna ,Baker,2015-07-11,London,steven05@meyer.biz 98 | Alexander ,Miller,1978-08-08,London,gabriel48@young-cooley.com 99 | Nancy ,Carter,1984-08-07,Leicester,diana42@sderaon.org 100 | Anna ,Atkinson,2012-01-02,Cardiff,aanderson@rhodes.com 101 | Jacob ,Brayel,1993-09-19,London,tomisdaraus@cobb.com 102 | -------------------------------------------------------------------------------- /tests/data/left_5_nas.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city 2 | Noah ,John,,London 3 | William ,Leo,1996-01-29,Ipswich 4 | Muhammad ,,1987-04-13,Bradford 5 | 
-------------------------------------------------------------------------------- /tests/data/left_token_escape.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,or,or and,and,20/05/1980,other data 3 | 2,or,or,or smith or,15/06/1990,more data 4 | 3,near,and,near,20/05/1960,another thing 5 | 6 | -------------------------------------------------------------------------------- /tests/data/right_1.csv: -------------------------------------------------------------------------------- 1 | id,name,middlename,surname,date,other 2 | 1,William,J,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Jodi,Elizabeth,Brown,20/05/1961,another thing 5 | 4,William,James,Johnston,21/05/1951,other data -------------------------------------------------------------------------------- /tests/data/right_2.csv: -------------------------------------------------------------------------------- 1 | id,fname,mname,lname,dob,another_field 2 | 1,Alistair,Paul,Johnston,20/05/1980,other data 3 | 2,James,Paul,Smith,15/06/1990,more data 4 | 3,Alasdair,Paul,Johnson,20/05/1960,another thing 5 | -------------------------------------------------------------------------------- /tests/data/right_3.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city,email 2 | Noah ,John,1979-07-02,London,hernandeztina@white-clark.com 3 | William ,Lowe,1996-01-29,Ipswich,simmonseric@estes.info 4 | Riley,Muhammad ,1987-04-13,Bradford,ngutierrez@schmidt.com 5 | Jasmine ,tewNo,1973-07-29,London,lhale@cooper-novak.info 6 | Jacob ,Brown,1994-07-26,London,xwilliams@mcmahon.info 7 | Daniel ,Gardner,1988-04-08,Edinburg,anthony27@english.com 8 | Hrrison,Samuel ,2004-07-30,London,andersonaym@love.om 9 | John,iose ,2017-10-07,Dudley,irichmond@jones.net 10 | Elizabeth ,Hlmes,2005-12-10,Lnodo,armstrongdavid@simpson.com 11 | Elizabeth 
,Bennett,1977-01-04,London,christophersmith@vega-dickson.net 12 | Scott,Sophie ,1976-10-05,Belfast,stephen96@munoz-rose.biz 13 | Reuben ,Davis,1980-11-13,London,laura99@carey.biz 14 | Chloe ,Edwards,2001-03-01,Leeds,donna16@stevens.info 15 | Hugo ,Hughes,2006-07-30,Bristol,amandawalker@lee.com 16 | Esme ,Hunter,1980-05-06,London,christopher42@lynch.com 17 | Gabriel ,Warren,2012-05-28,Nottingham,aaron93@cardenas.com 18 | Sophia ,Stevenson,2004-12-22,Bradford,wilcoxcatheline@norrs-hirl.net 19 | Jasper ,Thompson,2008-06-07,Glogsw,pamela40@wells-bond.com 20 | Isaac ,Smith,2004-02-19,lefast,andersonannette@west-martinez.org 21 | Muhammad ,Tucker,1990-01-05,Bradford,brian15@wood.biz 22 | Maria ,Lowe,1976-01-29,Brighton,wberry@tucker.com 23 | Olivia ,Scott,1976-12-23,nonoL,brian24-unthhall.org 24 | Muhamad ,Wood,1990-04-10,Manchester,dixonbejamin@mnna-hall.biz 25 | Eizabeht ,Reed,1972-10-05,Liverpool,kristy28@osborne-ochoa.net 26 | Henry ,Thompson,1982-03-11,Colcheter,perezalicia@bridges-cook.net 27 | Methaw ,Lewis,2011-12-06,Londo,reneeblack@bean.com 28 | Bniamjn ,Heath,2008-09-06,Leicester,kara10@briggs.com 29 | Arthur ,Jhnntos,1982-09-21,Kingston-upon-Hull,sroman@black.info 30 | Jack ,Lynch,2015-02-15,Sheffield,alejandro23@west-lane.rg 31 | Alexander ,Patel,1971-05-28,Edinburgh,mollyyoung@schroeder-allen.com 32 | Jssicae,ahh,1993-08-10,Coventry,rhond5a@case.net 33 | Harry ,Williams,1998-04-02,Yok,kdeleon@crawford-nicholson.com 34 | Oscr ,Thompson,1981-07-15,London,xweaer@millvr.com 35 | O ars,Wilson,2013-09-27,Bistol,kyle91@cunningham-bell.com 36 | Henry ,Scot,1997-10-22,Birmmnhai,porterkathleen@bell-hall.com 37 | Mia ,Taylor,1987-10-24,Soindw,owenspatricia@farrell.com 38 | Parker,vEa,1980-06-16,Lodon,davidsonjoseph@montgomery-harris.com 39 | Felicity ,Bull,1985-01-02,Derby,simpsondanielle@larson.com 40 | Samuel ,Curtis,1979-12-23,London,loganchristopher@butler.org 41 | Harvey ,Jaivs,2012-08-27,Leicester,ihritcne64@reilly.com 42 | Jck 
,Jones,1992-05-03,Leeds,natalie78@watkins.com 43 | Scarlett ,Bnow,1974-05-01,London,leeaaron@brewer-hill.net 44 | Nancy ,Wright,1993-09-18,Ipswich,vvaldez@ruiz.com 45 | Isla ,Coleman,1997-11-22,London,jameswilsonwiggins.net 46 | Greenwood,Daisy ,2014-10-20,etocton-on-TeSs,utrevino@ross-vargas.info 47 | Felny ,Sutherland,2001-02-10,London,browascley@flores.hom 48 | Henry,Henderson,1999-10-27,London,luis17@medina.com 49 | James,Wood,1981-03-04,Leeds,pinedastephanie@hall-walker.org 50 | Allen,Jesse ,1987-01-06,Lonon,autumn07@young-boyen.rt 51 | Harrison ,Wilson,2010-09-29,Lndoo,fgolden@henderson.com 52 | Lexi ,Rowe,1989-09-24,oonLn,sgomez@jones.com 53 | Theodore ,Lucas,1990-05-10,Leicester,cynthiaanderson@welch.com 54 | Emily ,Foster,2013-10-25,Liverpool,tara37@uyng.info 55 | Bella ,Griffihs,1996-08-22,Walsall,thompsonrobert@lee.biz 56 | Stanley ,Whet,1981-02-12,York,michaela15@torres.info 57 | Chloe ,James,1984-12-27,nodon,rachel17@alexander.com 58 | George ,Powell,1995-12-19,Birmingham,andrea81@haney.com 59 | Nah ,Cole,2002-02-05,Stockton-on-Tees,daviskimberly@bennett-king.com 60 | Felix ,Mitchell,2000-10-21,Brmiigham,moorejeffrey@riley.net 61 | Ella ,Love,1986-10-08,Warrington,jessica43@thomas.biz 62 | Leo ,Webb,2012-03-11,Sheffield,sharon50@alexander.com 63 | Sophia ,tSne,1985-06-10,London,qhawkins@pennington-hurley.biz 64 | Emilia ,Cooper,2001-08-19,Coventry,lerrycampbell@doyl.biz 65 | Edwards,Francesca ,1987-09-12,Bightor,floydmegan@torres.com 66 | Ada ,Knight,1979-04-24,Dyrb,mcconnellstfphen@walker-eloresnet 67 | Daisy ,Phillips,2010-02-17,Sheffield,grahamchad@tate.info 68 | Isabelle ,Doyle,2010-06-24,London,michael84@munoz-matthews.com 69 | boRyn,oRs,1998-06-20,NewcTtle-upon-syne,wbrown@wall.com 70 | Lottie ,Smith,1982-12-03,London,robersonbrenda@sutton-phillips.com 71 | Arabella ,Gray,1978-11-01,London,pmartinez@cox-williams.com 72 | Lewis ,Smith,1996-02-22,London,thaokins@hamilton-burh.cwm 73 | Daisy ,Bob,2009-02-16,Coventry,chelsea39@jackson.com 74 | 
William ,Walker,1985-10-11,Liverpool,tracysanders@mitchell-collins.com 75 | Daniel ,Brown,2002-01-28,Sdnuerlan,qhouse@morton.org 76 | Esme ,dar,1986-07-15,Norwich,christopherhughes@ryan-kramer.net 77 | Nancy ,raWen,1992-07-19,Coertny,doughertywayne@ramsey.com 78 | Scarlett ,Harvey,2008-01-30,Bristol,hamptonryan@wilcox-mendoza.biz 79 | Campbell,Megan ,2008-12-12,Livprool,elizabeth68@farrell-hutchinson.net 80 | Beatrice ,Hawkins,2003-10-27,Middlesbrough,jermaineyoung@dalton.biz 81 | George ,Stevenson,2002-01-20,Derby,david28@schultz.net 82 | Ella ,Potter,1971-12-16,London,nryan@gross.com 83 | Ruby ,O neill,1974-08-26,London,clarkcharles@williams.info 84 | Holly ,Edwards,2007-02-27,London,sfleming@dalton.com 85 | Mans ,Houghton,2005-09-05,London,simmonsscott@wilson.com 86 | Acile,Payne,2004-11-17,Lndon,jason24@brown.com 87 | Muhammad ,Wilson,1982-08-22,Portsmouth,edward32@moreno-beasley.net 88 | Jack ,Chapman,1995-02-15,Plymouth,justin21@smith-wong.com 89 | Evns,George ,1986-11-05,London,rcasey@hernandez.com 90 | Louis ,Smith,1990-12-22,Glasgow,zalvarado@simmons.biz 91 | Jarvis,Oliver ,2000-03-07,Glasgow,weisstheresa@baird.biz 92 | Olivia ,Watson,2006-01-06,Lnot,uallen@king.biz 93 | Eva ,Young,1997-05-24,Wolverhampton,elizabeo48@ctbb.com 94 | Bella ,Jenkins,1979-03-13,London,kimberlywells@robinson-tm.nel 95 | Eeanor ,Harding,2005-08-10,Loerpovl,yfloreslil@iams.com 96 | Alexander ,Stewart,1996-01-08,Leeds,jason64@tanner.com 97 | auL ,Bakr,2015-06-26,London,stven05m@eyer.biz 98 | Alexander ,Miller,1978-07-30,London,gabriel48@young-cooley.com 99 | Nancy ,Carter,1984-08-07,Leicester,diana42@anderson.org 100 | Anna ,Atkinson,2012-01-02,Cardiff,aanderson@rhodes.com 101 | Jacob ,Bradley,1993-12-23,London,thomasdarius@cobb.com 102 | -------------------------------------------------------------------------------- /tests/data/right_5_nas.csv: -------------------------------------------------------------------------------- 1 | first_name,surname,dob,city 2 | Noah ,,,London 3 | 
William ,Leo,1996-01-29,Ipswich 4 | Muhammad ,lRey,1987-04-13,Bradford 5 | -------------------------------------------------------------------------------- /tests/data/right_token_escape.csv: -------------------------------------------------------------------------------- 1 | id,name,middlename,surname,date,other 2 | 1,or,or,or smith or,15/06/1990,more data 3 | 2,near,and,near,20/05/1960,another thing 4 | 3,or,or and,and,20/05/1980,other data -------------------------------------------------------------------------------- /tests/datagetter_performance.txt: -------------------------------------------------------------------------------- 1 | {"datetime": "2017-11-28T18:24:43.058249", "commit_hash": "c2e71ec", "datagetter_cartesian": 0.97, "datagetter_sqlite": 0.54, "test_type": "left_3"} 2 | {"datetime": "2017-11-28T21:55:46.633256", "commit_hash": "4115da7", "datagetter_cartesian": 0.96, "datagetter_sqlite": 0.87, "test_type": "left_3"} 3 | {"datetime": "2017-12-02T09:25:15.475843", "commit_hash": "2279321", "datagetter_cartesian": 0.96, "datagetter_sqlite": 0.87, "test_type": "left_3"} 4 | {"datetime": "2017-12-03T18:12:18.623974", "commit_hash": "aee05f3", "datagetter_cartesian": 0.87, "datagetter_sqlite": 0.93, "test_type": "left_3"} 5 | {"datetime": "2017-12-09T09:10:17.671481", "commit_hash": "800d0de", "datagetter_cartesian": 0.98, "datagetter_sqlite": 0.94, "test_type": "left_3"} 6 | {"datetime": "2017-12-09T09:15:08.065057", "commit_hash": "800d0de", "datagetter_cartesian": 0.98, "datagetter_sqlite": 0.94, "test_type": "left_3", "time_taken": 19.634991832004744} 7 | {"datetime": "2017-12-10T14:50:23.598907", "commit_hash": "42cd45e", "datagetter_cartesian": 0.92, "datagetter_sqlite": 0.89, "test_type": "left_3", "time_taken": 0.5589753400126938} 8 | {"datetime": "2017-12-10T14:52:42.900029", "commit_hash": "42cd45e", "datagetter_cartesian": 0.92, "datagetter_sqlite": 0.89, "test_type": "left_3", "time_taken": 0.5655303659732454} 9 | {"datetime": 
import pandas as pd
import numpy as np
import random
import string
import datetime


def get_random_by_freq(csv_path, field, num_elements):
    """Draw `num_elements` values of `field` from the csv at `csv_path`,
    weighted by the csv's `freq` column."""
    df = pd.read_csv(csv_path)
    elements = df[field]
    # Normalise raw frequencies into probabilities that sum to 1.
    probabilities = df["freq"] / df["freq"].sum()
    return np.random.choice(elements, num_elements, p=list(probabilities))


def get_fakes(fake_fn, num_elements):
    """Call the zero-argument factory `fake_fn` `num_elements` times and
    return the results as a list."""
    return [fake_fn() for _ in range(num_elements)]


def switch(my_string):
    """Return `my_string` with two randomly chosen positions swapped.

    The two positions may coincide, in which case the string is unchanged.
    Strings shorter than 2 characters are returned as-is (randrange on an
    empty range would raise).
    """
    if len(my_string) < 2:
        return my_string
    chars = list(my_string)
    pos1 = random.randrange(len(chars))
    pos2 = random.randrange(len(chars))
    chars[pos1], chars[pos2] = chars[pos2], chars[pos1]
    return "".join(chars)


def new_letter(my_string):
    """Insert one random lowercase ascii letter at a random position."""
    # Guard the empty string: randrange(0, 0) raises ValueError.
    pos1 = random.randrange(len(my_string)) if my_string else 0
    letter = random.choice(string.ascii_lowercase)
    return my_string[:pos1] + letter + my_string[pos1:]


def delete_letter(my_string):
    """Delete one randomly chosen character, never the first one.

    Strings of length <= 1 are returned unchanged (there is nothing after
    position 0 to delete, and randrange(1, 1) would raise).
    """
    if len(my_string) <= 1:
        return my_string
    pos1 = random.randrange(1, len(my_string))
    return my_string[:pos1] + my_string[pos1 + 1:]


def corrupt_string(my_string, num_switches=1, num_new_letters=0, num_deletes=1):
    """Simulate typos: apply `num_switches` transpositions, then
    `num_new_letters` insertions, then `num_deletes` deletions."""
    for _ in range(num_switches):
        my_string = switch(my_string)

    for _ in range(num_new_letters):
        my_string = new_letter(my_string)

    for _ in range(num_deletes):
        my_string = delete_letter(my_string)

    return my_string


def corrupt_dob(dob):
    """Shift an ISO `yyyy-mm-dd` date of birth by a random +/-100 days,
    returning it in the same format."""
    fmt = "%Y-%m-%d"
    date_1 = datetime.datetime.strptime(dob, fmt)
    end_date = date_1 + datetime.timedelta(days=random.randint(-100, 100))
    return end_date.strftime(fmt)


def create_test_data(num_elements=100):
    """Build a (df_left, df_right) pair of synthetic person records.

    Both frames start identical; each is then independently corrupted
    (typos, swapped name fields, shifted dobs) so they can exercise the
    fuzzy matcher. Reads the frequency csvs from the `data/` directory
    relative to the current working directory.
    """
    # faker is only used here; import lazily so the module stays importable
    # (and the pure-string helpers stay testable) without it.
    from faker import Faker
    fake = Faker()

    data = {}
    # Numeric prefixes pin the column order; they are stripped below.
    data["01first_name"] = get_random_by_freq("data/_first_names.csv", "name", num_elements)
    data["02surname"] = get_random_by_freq("data/_surnames.csv", "surname", num_elements)
    data["03dob"] = get_fakes(fake.date, num_elements)
    data["04city"] = get_random_by_freq("data/_cities.csv", "city", num_elements)
    data["05email"] = get_fakes(fake.company_email, num_elements)

    df_left = pd.DataFrame(data)
    df_left.columns = [c[2:] for c in df_left.columns]

    df_right = df_left.copy()

    for df in [df_left, df_right]:
        for index, row in df.iterrows():

            for col in ["first_name", "surname", "city", "email"]:
                if random.random() > 0.8:
                    df.loc[index, col] = corrupt_string(row[col])

            # Sometimes switch first name and surname
            if random.random() > 0.9:
                df.loc[index, "first_name"] = row["surname"]
                df.loc[index, "surname"] = row["first_name"]

            # Corrupt the dob
            if random.random() > 0.8:
                df.loc[index, "dob"] = corrupt_dob(row["dob"])
    return df_left, df_right
def get_commit_hash():
    """Return the short hash of the current git commit, or "" when git
    is unavailable.

    Uses an argv list with the default shell=False instead of a shell
    string, which avoids spawning a shell for a fixed command.
    """
    try:
        result = subprocess.run(
            ["git", "describe", "--always"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # git not installed / not on PATH; degrade to an empty hash so the
        # performance log line can still be written.
        return ""
    return result.stdout.decode("utf-8").replace("\n", "")
link_table_percentage_correct(link_table): 27 | lt = link_table.copy() 28 | lt = lt[lt["match_rank"] == 1] 29 | lt["__id_left"] = lt["__id_left"].str.replace("_left", "") 30 | lt["__id_right"] = lt["__id_right"].str.replace("_right", "") 31 | lt["link_correct"] = (lt["__id_left"] == lt["__id_right"]) 32 | 33 | return lt["link_correct"].sum()/len(lt) 34 | 35 | class DatagetterAccuracy(unittest.TestCase): 36 | """ 37 | These tests actually run accurancy analysis of the results 38 | They're not pass fail but they log how well the matcher is doing 39 | """ 40 | 41 | def test_data_1000(self): 42 | 43 | m = Matcher() 44 | 45 | df_left = pd.read_csv("tests/data/left_4.csv") 46 | df_right = pd.read_csv("tests/data/right_4.csv") 47 | 48 | on = ["first_name", "surname", "dob", "city"] 49 | 50 | m.add_data(df_left, df_right, on, on) 51 | 52 | start = timer() 53 | m.match_all() 54 | lt = m.get_formatted_link_table() 55 | end = timer() 56 | time_taken = end - start 57 | sqlite_perc = link_table_percentage_correct(lt) 58 | 59 | this_record = {} 60 | this_record["datetime"] = datetime.datetime.now().isoformat() 61 | this_record["commit_hash"] = get_commit_hash() 62 | this_record["datagetter_cartesian"] = "NA" 63 | this_record["datagetter_sqlite"] = sqlite_perc 64 | this_record["test_type"] = "left_4" 65 | this_record["time_taken"] = time_taken 66 | 67 | with open("tests/datagetter_performance.txt", "a") as myfile: 68 | myfile.writelines(json.dumps(this_record) + "\n") 69 | 70 | def test_data_100(self): 71 | dg = DataGetterCartesian() 72 | m = Matcher(data_getter = dg) 73 | 74 | df_left = pd.read_csv("tests/data/left_3.csv") 75 | df_right = pd.read_csv("tests/data/right_3.csv") 76 | 77 | on = ["first_name", "surname", "dob", "city"] 78 | 79 | m.add_data(df_left, df_right, on, on) 80 | 81 | start = timer() 82 | m.match_all() 83 | lt = m.get_formatted_link_table() 84 | end = timer() 85 | time_taken = end - start 86 | 87 | cartesian_perc = link_table_percentage_correct(lt) 88 | 89 | 
lt2 = link_table(df_left, df_right, on, on) 90 | sqlite_perc = link_table_percentage_correct(lt2) 91 | 92 | this_record = {} 93 | this_record["datetime"] = datetime.datetime.now().isoformat() 94 | this_record["commit_hash"] = get_commit_hash() 95 | this_record["datagetter_cartesian"] = cartesian_perc 96 | this_record["datagetter_sqlite"] = sqlite_perc 97 | this_record["test_type"] = "left_3" 98 | this_record["time_taken"] = time_taken 99 | 100 | with open("tests/datagetter_performance.txt", "a") as myfile: 101 | myfile.writelines(json.dumps(this_record) + "\n") 102 | 103 | def test_la_data(self): 104 | ons = pd.read_csv("tests/data/las_ons.csv") 105 | os = pd.read_csv("tests/data/las_os.csv") 106 | 107 | start = timer() 108 | df_joined = fuzzy_left_join(ons, os, left_on = ["lad16nm"], right_on = ["name"]) 109 | end = timer() 110 | time_taken = end - start 111 | 112 | rename = {"lad16cd": "ons_code", "code": "os_code", "lad16nm": "ons_name", "name": "os_name"} 113 | df_joined = df_joined.rename(columns=rename) 114 | col_order = ["best_match_score", "ons_name", "os_name", "ons_code", "os_code"] 115 | 116 | num_records = len(df_joined) 117 | correct_binary = (df_joined["ons_code"] == df_joined["os_code"]) 118 | perc_correct = correct_binary.sum()/num_records 119 | 120 | this_record = {} 121 | this_record["datetime"] = datetime.datetime.now().isoformat() 122 | this_record["commit_hash"] = get_commit_hash() 123 | this_record["perc_correct"] = perc_correct 124 | this_record["test_type"] = "local_authority" 125 | this_record["time_taken"] = time_taken 126 | 127 | with open("tests/realexample_performance.txt", "a") as myfile: 128 | myfile.writelines(json.dumps(this_record) + "\n") 129 | -------------------------------------------------------------------------------- /tests/test_colnames.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests 5 | """ 6 | 7 | import unittest 8 | from fuzzymatcher 
import link_table 9 | import pandas as pd 10 | 11 | ''' 12 | The data path does not seem to be correct, judging from what 13 | was created using 'create_fake_dataset.ipynb'. Correcting it to match 14 | true locations (data folder within tests folder). 15 | ''' 16 | # "tests/data/left_2.csv" (original left data path) 17 | # "tests/data/right_2.csv" (original right data path) 18 | left_1_path = "./data/left_1.csv" 19 | right_1_path = "./data/right_1.csv" 20 | left_2_path = "./data/left_2.csv" 21 | right_2_path = "./data/right_2.csv" 22 | 23 | class ColNameCollisions(unittest.TestCase): 24 | """ 25 | Test what happens when the user provides input data with 26 | some column names which are the same in each dataset 27 | """ 28 | 29 | 30 | 31 | 32 | def test_all_colnames_match(self): 33 | """ 34 | Adding two numbers should give the correct answer 35 | """ 36 | left = pd.read_csv(left_2_path) 37 | right = pd.read_csv(right_2_path) 38 | left_on = ["fname", "mname", "lname", "dob"] 39 | right_on = ["fname", "mname", "lname", "dob"] 40 | 41 | df = link_table(left, right, left_on, right_on) 42 | 43 | expected_columns = ['__id_left', 44 | '__id_right', 45 | 'match_score', 46 | 'match_rank', 47 | 'fname_left', 48 | 'fname_right', 49 | 'mname_left', 50 | 'mname_right', 51 | 'lname_left', 52 | 'lname_right', 53 | 'dob_left', 54 | 'dob_right'] 55 | 56 | actual_columns = list(df.columns) 57 | self.assertEqual(expected_columns, actual_columns) 58 | 59 | def test_all_colnames_match_with_id(self): 60 | """ 61 | Adding two numbers should give the correct answer 62 | """ 63 | left = pd.read_csv(left_2_path) 64 | right = pd.read_csv(right_2_path) 65 | left_on = ["fname", "mname", "lname", "dob"] 66 | right_on = ["fname", "mname", "lname", "dob"] 67 | 68 | df = link_table(left, right, left_on, right_on, left_id_col="id", right_id_col="id") 69 | 70 | expected_columns = ['__id_left', 71 | '__id_right', 72 | 'match_score', 73 | 'match_rank', 74 | 'fname_left', 75 | 'fname_right', 76 | 
'mname_left', 77 | 'mname_right', 78 | 'lname_left', 79 | 'lname_right', 80 | 'dob_left', 81 | 'dob_right'] 82 | 83 | actual_columns = list(df.columns) 84 | self.assertEqual(expected_columns, actual_columns) 85 | 86 | def test_some_colnames_match(self): 87 | """ 88 | Adding two numbers should give the correct answer 89 | """ 90 | left = pd.read_csv(left_1_path) 91 | left = left.rename(columns = {"fname": "name"}) 92 | right = pd.read_csv(right_1_path) 93 | left_on = ["name", "mname", "lname", "dob"] 94 | right_on = ["name", "middlename", "surname", "date"] 95 | 96 | df = link_table(left, right, left_on, right_on) 97 | 98 | expected_columns = ['__id_left', 99 | '__id_right', 100 | 'match_score', 101 | 'match_rank', 102 | 'name_left', 103 | 'name_right', 104 | 'mname', 105 | 'middlename', 106 | 'lname', 107 | 'surname', 108 | 'dob', 109 | 'date'] 110 | 111 | actual_columns = list(df.columns) 112 | self.assertEqual(expected_columns, actual_columns) 113 | 114 | 115 | if __name__ == '__main__': 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from fuzzymatcher import link_table 3 | import pandas as pd 4 | 5 | class TestNulls(unittest.TestCase): 6 | """ 7 | Test what happens when the user provides input data with 8 | null values in some of the cells 9 | """ 10 | 11 | def test_nulls_no_errors(self): 12 | """ 13 | Adding two numbers should give the correct answer 14 | """ 15 | df_left = pd.read_csv("tests/data/left_5_nas.csv") 16 | df_right = pd.read_csv("tests/data/right_5_nas.csv") 17 | 18 | on = ["first_name", "surname", "dob", "city"] 19 | 20 | flj = link_table(df_left, df_right, on, on) 21 | 22 | 23 | class TestNulls(unittest.TestCase): 24 | """ 25 | Test what happens when the user provides input data with 26 | fts4 match expression keyworks like AND, OR, NEAR 27 | """ 28 | 29 | def 
test_nulls_no_errors(self): 30 | """ 31 | 32 | """ 33 | 34 | 35 | df_left = pd.read_csv("tests/data/left_token_escape.csv") 36 | df_right = pd.read_csv("tests/data/right_token_escape.csv") 37 | 38 | # Columns to match on from df_left 39 | left_on = ["fname", "mname", "lname"] 40 | 41 | # Columns to match on from df_right 42 | right_on = ["name", "middlename", "surname"] 43 | 44 | on = ["first_name", "surname", ] 45 | 46 | flj = link_table(df_left, df_right, left_on, right_on, 47 | left_id_col="id", right_id_col="id") 48 | --------------------------------------------------------------------------------