├── string_grouper
│   ├── test
│   │   ├── __init__.py
│   │   └── test_string_grouper.py
│   └── __init__.py
├── string_grouper_utils
│   ├── test
│   │   ├── __init__.py
│   │   └── test_string_grouper_utils.py
│   ├── __init__.py
│   └── string_grouper_utils.py
├── images
│   ├── Fuzzy_vs_Exact.png
│   ├── BlockMatrix_1_1.png
│   ├── BlockMatrix_1_2.png
│   ├── BlockMatrix_2_2.png
│   ├── ScaledRuntimeContourPlot.png
│   ├── ScaledTimePerComparison.png
│   └── BlockNumberSpaceExploration1.png
├── .gitignore
├── docs
│   ├── references.md
│   ├── references
│   │   ├── compute_pairwise_similarities.md
│   │   ├── group_similar_strings.md
│   │   ├── match_strings.md
│   │   ├── match_most_similar.md
│   │   ├── options_kwargs.md
│   │   └── sg_class.md
│   ├── performance.md
│   ├── index.md
│   └── examples.md
├── tutorials
│   ├── accounts.csv
│   ├── tutorial_1.md
│   ├── zero_similarity.md
│   └── group_representatives.md
├── .github
│   └── workflows
│       └── test.yml
├── pyproject.toml
├── mkdocs.yml
├── LICENSE
├── setup.py
├── CHANGELOG.md
└── README.md
/string_grouper/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/string_grouper_utils/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/Fuzzy_vs_Exact.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/Fuzzy_vs_Exact.png
--------------------------------------------------------------------------------
/images/BlockMatrix_1_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockMatrix_1_1.png
--------------------------------------------------------------------------------
/images/BlockMatrix_1_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockMatrix_1_2.png
--------------------------------------------------------------------------------
/images/BlockMatrix_2_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockMatrix_2_2.png
--------------------------------------------------------------------------------
/images/ScaledRuntimeContourPlot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/ScaledRuntimeContourPlot.png
--------------------------------------------------------------------------------
/images/ScaledTimePerComparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/ScaledTimePerComparison.png
--------------------------------------------------------------------------------
/images/BlockNumberSpaceExploration1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockNumberSpaceExploration1.png
--------------------------------------------------------------------------------
/string_grouper_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .string_grouper_utils import new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness, \
2 | new_group_rep_by_highest_weight
3 |
--------------------------------------------------------------------------------
/string_grouper/__init__.py:
--------------------------------------------------------------------------------
1 | from .string_grouper import compute_pairwise_similarities, group_similar_strings, match_most_similar, match_strings, \
2 | StringGrouperConfig, StringGrouper
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
3 | __pycache__
4 | */__pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | .ipynb_checkpoints
9 | *.ipynb
10 |
11 | dist/
12 | build/
13 | *.egg-info/
14 |
15 | .DS_Store
16 |
17 | site/
18 |
19 |
20 | tests.txt
21 |
--------------------------------------------------------------------------------
/docs/references.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: References
3 | ---
4 |
5 |
6 | All functions are built using the class **`StringGrouper`**. This class can be used through pre-defined functions, for example the four high-level functions, as well as through a more interactive approach where matches can be added or removed if needed by calling the **`StringGrouper`** class directly.
7 |
8 |
--------------------------------------------------------------------------------
/tutorials/accounts.csv:
--------------------------------------------------------------------------------
1 | id,name
2 | AA012345X,mega enterprises corp.
3 | BB016741P,mega enterprises corporation
4 | CC052345T,mega corp.
5 | AA098762D,hyper startup inc.
6 | BB099931J,hyper-startup inc.
7 | CC082744L,hyper startup incorporated
8 | HH072982K,hyper hyper inc.
9 | AA903844B,slow and steady inc.
10 | BB904941H,slow and steady incorporated
11 | CC903844B,slow steady inc.
12 | AA777431C,abc enterprises inc.
13 | BB760431Y,a.b.c. enterprises incorporated
14 | BB750431M,a.b.c. enterprises inc.
15 | ZZ123456H,one and only inc.
--------------------------------------------------------------------------------
/docs/references/compute_pairwise_similarities.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: compute_pairwise_similarities
3 | ---
4 |
5 |
6 | ## Arguments
7 |
8 | ```python
9 | compute_pairwise_similarities(string_series_1: pd.Series,
10 | string_series_2: pd.Series,
11 | **kwargs) -> pd.Series
12 | ```
13 |
14 |
15 | ## Result
16 |
17 | Returns a `Series` of cosine-similarity scores with the same length and index as `string_series_1`. Each score is the cosine similarity between the pair of strings at the same position (row) in the two input `Series`, `string_series_1` and `string_series_2`. This can be seen as an element-wise comparison between the two input `Series`.
18 |
19 |
20 |
21 |
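22 | A minimal usage sketch (the example `Series` below are illustrative):
23 |
24 | ```python
25 | import pandas as pd
26 | from string_grouper import compute_pairwise_similarities
27 |
28 | left = pd.Series(['mega enterprises corp.', 'hyper startup inc.'])
29 | right = pd.Series(['mega enterprises corporation', 'hyper hyper inc.'])
30 |
31 | # One cosine-similarity score per row (an element-wise comparison):
32 | similarities = compute_pairwise_similarities(left, right)
33 | ```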
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Run tests
2 | on:
3 | pull_request:
4 | push:
5 | branches:
6 | - master
7 |
8 | jobs:
9 | test:
10 | runs-on: ${{ matrix.os }}
11 | strategy:
12 | matrix:
13 | python-version: [3.9, 3.11, 3.12.3]
14 | os: [ubuntu-latest, windows-latest]
15 |
16 | steps:
17 | - uses: actions/checkout@v2
18 |
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 |
24 | - name: Install dev-package
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install poetry
28 | poetry install
29 | python -m pip install -e .
30 |
31 | - name: Run tests
32 | run: python -m unittest
33 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "string_grouper"
3 | version = "0.7.1"
4 | description = "String grouper contains functions to do string matching using TF-IDF and the cosine similarity."
5 | authors = [
6 | {name = "Chris van den Berg"},
7 | {name = "ParticularMiner"},
8 | ]
9 |
10 | maintainers = [
11 | {name = "Chris van den Berg"},
12 | {name = "Guillaume Pressiat"},
13 | ]
14 |
15 |
16 | license = "MIT License"
17 | readme = "README.md"
18 |
19 | packages = [
20 | { include = "string_grouper" },
21 | { include = "string_grouper_utils" },
22 | ]
23 |
24 | [tool.poetry.dependencies]
25 | python = "^3.9"
26 | pandas = "^2.0"
27 | scipy = ">=1.4.1"
28 | scikit-learn = "^1.4.0"
29 | numpy = "^1.26.0"
30 | sparse_dot_topn = ">=1.1.0"
31 | loguru = ">0.7.0"
32 |
33 | [build-system]
34 | requires = ["poetry-core"]
35 | build-backend = "poetry.core.masonry.api"
36 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: String Grouper
2 |
3 | repo_url: https://github.com/bergvca/string_grouper
4 |
5 |
6 | theme:
7 | name: material
8 | features:
9 | - navigation.tabs
10 | # - navigation.tabs.sticky
11 |
12 | nav:
13 | - Home:
14 | - index.md
15 | - References:
16 | - references/match_strings.md
17 | - references/match_most_similar.md
18 | - references/group_similar_strings.md
19 | - references/compute_pairwise_similarities.md
20 | - references/options_kwargs.md
21 | - references/sg_class.md
22 | - Examples:
23 | - examples.md
24 | - Performance:
25 | - performance.md
26 |
27 |
28 | markdown_extensions:
29 | - toc:
30 | toc_depth: 3
31 | - pymdownx.highlight:
32 | anchor_linenums: true
33 | line_spans: __span
34 | pygments_lang_class: true
35 | - pymdownx.inlinehilite
36 | - pymdownx.snippets
37 | - pymdownx.superfences
38 | - admonition
39 | - pymdownx.details
41 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Chris van den Berg
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import pathlib
3 |
4 | # The directory containing this file
5 | HERE = pathlib.Path(__file__).parent
6 |
7 | # The text of the README file
8 | README = (HERE / "README.md").read_text()
9 |
10 | setup(
11 | name='string_grouper',
12 | version='0.7.0',
13 | packages=['string_grouper', 'string_grouper_utils'],
14 | license='MIT License',
15 | description='String grouper contains functions to do string matching using TF-IDF and the cosine similarity. '
16 | 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html',
17 | author='Chris van den Berg',
18 | long_description=README,
19 | long_description_content_type="text/markdown",
20 | author_email='fake_email@gmail.com',
21 | url='https://github.com/Bergvca/string_grouper',
22 | zip_safe=False,
23 | python_requires='>=3.9',
24 | install_requires=['pandas>=2.0',
25 |                   'scipy>=1.4.1',
26 |                   'scikit-learn>=1.4.0',
27 |                   'numpy>=1.26.0, <2.0',
28 |                   'sparse_dot_topn>=1.1.0',
29 |                   'loguru>=0.7',
30 |                   ]
31 | )
32 |
33 |
--------------------------------------------------------------------------------
/docs/references/group_similar_strings.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: group_similar_strings
3 | ---
4 |
5 |
6 | ## Arguments
7 |
8 | ```python
9 | group_similar_strings(strings_to_group: pd.Series,
10 |                       string_ids: Optional[pd.Series],
11 |                       **kwargs) -> Union[pd.DataFrame, pd.Series]
12 | ```
16 |
17 |
18 | ## Result
19 |
20 | Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the group-representatives are chosen.)
21 |
22 | If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings. If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`.
23 |
24 | For example, an input Series with values: `['foooo', 'foooob', 'bar']` will return `['foooo', 'foooo', 'bar']`. Here `'foooo'` and `'foooob'` are grouped together into group `'foooo'` because they are found to be similar. Another example can be found [below](#dedup).
25 |
26 | If `ignore_index=False`, the output is a `DataFrame` containing the above output `Series` as one of its columns with the same name. The remaining column(s) correspond to the index (or index-levels) of `strings_to_group` and contain the index-labels of the group-representatives as values. These columns have the same names as their counterparts prefixed by the string `'group_rep_'`.
27 |
28 | If `string_ids` is also given, then the IDs from `string_ids` corresponding to the group-representatives are also returned in an additional column (with the same name as `string_ids` prefixed as described above). If `string_ids` has no name, it is assumed to have the name `'id'` before being prefixed.
29 |
30 |
31 |
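32 | A minimal usage sketch (the example `Series` is illustrative):
33 |
34 | ```python
35 | import pandas as pd
36 | from string_grouper import group_similar_strings
37 |
38 | names = pd.Series(['foooo', 'foooob', 'bar'])
39 |
40 | # With ignore_index=True only the group-representative per string is returned:
41 | group_similar_strings(names, ignore_index=True)
42 | # 0    foooo
43 | # 1    foooo
44 | # 2      bar
45 | # Name: group_rep, dtype: object
46 | ```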
--------------------------------------------------------------------------------
/docs/references/match_strings.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: match_strings
3 | ---
4 |
5 |
6 | ## Arguments
7 |
8 | ```python
9 | match_strings(master: pd.Series,
10 | duplicates: Optional[pd.Series],
11 | master_id: Optional[pd.Series],
12 | duplicates_id: Optional[pd.Series],
13 | **kwargs) -> pd.DataFrame
14 | ```
15 |
16 | ## Result
17 |
18 | Returns a `DataFrame` containing similarity-scores of all matching pairs of highly similar strings from `master` (and `duplicates` if given). Each matching pair in the output appears in its own row/record consisting of
19 |
20 |
21 | 1. its "left" part: a string (with/without its index-label) from `master`,
22 | 2. its similarity score, and
23 | 3. its "right" part: a string (with/without its index-label) from `duplicates` (or `master` if `duplicates` is not given),
24 |
25 | in that order. Thus the column-names of the output are a collection of three groups:
26 |
27 | 1. The name of `master` and the name(s) of its index (or index-levels) all prefixed by the string `'left_'`,
28 | 2. `'similarity'` whose column has the similarity-scores as values, and
29 | 3. The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`.
30 |
31 |
32 | Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
33 |
34 | If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above. Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above.
35 |
36 | In other words, if only parameter `master` is given, the function will return pairs of highly similar strings within `master`. This can be seen as a self-join where both `'left_'` and `'right_'` prefixed columns come from `master`. If both parameters `master` and `duplicates` are given, it will return pairs of highly similar strings between `master` and `duplicates`. This can be seen as an inner-join where `'left_'` and `'right_'` prefixed columns come from `master` and `duplicates` respectively.
37 |
38 | The function also supports optionally inputting IDs (`master_id` and `duplicates_id`) corresponding to the strings being matched. In which case, the output includes two additional columns whose names are the names of these optional `Series` prefixed by `'left_'` and `'right_'` accordingly, and containing the IDs corresponding to the strings in the output. If any of these `Series` has no name, then it assumes the name `'id'` and is then prefixed as described above.
39 |
40 |
41 |
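42 | A minimal usage sketch of matching between two `Series` (the data is illustrative):
43 |
44 | ```python
45 | import pandas as pd
46 | from string_grouper import match_strings
47 |
48 | master = pd.Series(['foooo', 'bar', 'baz'])
49 | duplicates = pd.Series(['foooob', 'bar', 'new'])
50 |
51 | # One row per matching pair; since both Series are unnamed, the columns
52 | # are 'left_side', 'similarity' and 'right_side' (index-columns are
53 | # omitted here because ignore_index=True):
54 | matches = match_strings(master, duplicates, ignore_index=True)
55 | ```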
--------------------------------------------------------------------------------
/docs/references/match_most_similar.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: match_most_similar
3 | ---
4 |
5 |
6 | ## Arguments
7 |
8 | ```python
9 | match_most_similar(master: pd.Series,
10 | duplicates: Optional[pd.Series],
11 | master_id: Optional[pd.Series],
12 | duplicates_id: Optional[pd.Series],
13 | **kwargs) -> Union[pd.DataFrame, pd.Series]
14 | ```
15 |
16 | ## Result
17 |
18 | If `ignore_index=True`, returns a `Series` of strings, where for each string in `duplicates` the most similar string in `master` is returned. If there are no similar strings in `master` for a given string in `duplicates` (because there is no potential match where the cosine similarity is above the threshold \[default: 0.8\]) then the original string in `duplicates` is returned. The output `Series` thus has the same length and index as `duplicates`.
19 |
20 | For example, if an input `Series` with the values `['foooo', 'bar', 'baz']` is passed as the argument `master`, and `['foooob', 'bar', 'new']` as the values of the argument `duplicates`, the function will return a `Series` with the values `['foooo', 'bar', 'new']`.
21 |
22 | The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`. If `master` has no name, it is assumed to have the name `'master'` before being prefixed.
23 |
24 | If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.)
25 |
26 | Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`.
27 |
28 | If both parameters `master_id` and `duplicates_id` are also given, then a `DataFrame` is always returned with the same column(s) as described above, but with an additional column containing those IDs from these input `Series` corresponding to the output strings. This column's name is the same as that of `master_id` prefixed in the same way as described above. If `master_id` has no name, it is assumed to have the name `'master_id'` before being prefixed.
29 |
30 |
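31 | The example above as a code sketch:
32 |
33 | ```python
34 | import pandas as pd
35 | from string_grouper import match_most_similar
36 |
37 | master = pd.Series(['foooo', 'bar', 'baz'])
38 | duplicates = pd.Series(['foooob', 'bar', 'new'])
39 |
40 | # For each string in duplicates, the most similar string in master is
41 | # returned; 'new' has no match above the threshold, so it is returned as-is:
42 | match_most_similar(master, duplicates, ignore_index=True)
43 | # 0    foooo
44 | # 1      bar
45 | # 2      new
46 | # Name: most_similar_master, dtype: object
47 | ```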
--------------------------------------------------------------------------------
/docs/references/options_kwargs.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Options / **kwargs
3 | ---
4 |
5 | All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used:
6 |
7 | ## Tokenization settings
8 |
9 | * **`ngram_size`**: The number of characters in each n-gram. Default is `3`.
10 | * **`regex`**: The regex string used to clean-up the input string. Default is `r"[,-./]|\s"`.
11 | * **`ignore_case`**: Determines whether or not letter case in strings should be ignored. Defaults to `True`.
12 | * **`normalize_to_ascii`**: Determines whether or not unicode-to-ASCII normalization is done. Defaults to `True`.
13 |
14 | ## Match and output settings
15 |
16 | * **`max_n_matches`**: The maximum number of matching strings in `master` allowed per string in `duplicates`. Default is 20.
17 | * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match.
18 | Defaults to `0.8`.
19 | * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md).)
20 | * **`ignore_index`**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
21 | * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.)
22 |
23 | ## Performance settings
24 |
25 | * **`number_of_processes`**: The number of processes used by the cosine-similarity calculation.
26 | Defaults to the number of cores on the machine minus 1.
27 | * **`n_blocks`**: This parameter is a tuple of two `int`s provided to help boost performance, if possible, of processing large DataFrames (see [Subsection Performance](#perf)), by splitting the DataFrames into `n_blocks[0]` blocks for the left operand (of the underlying matrix multiplication) and into `n_blocks[1]` blocks for the right operand before performing the string-comparisons block-wise. Defaults to `None`, in which case automatic splitting occurs if an `OverflowError` would otherwise occur.
28 |
29 | ## Other settings
30 |
31 | * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float64`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.)
32 | * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation.
33 |
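34 | A sketch of how these options are passed (the values below are purely illustrative):
35 |
36 | ```python
37 | import pandas as pd
38 | from string_grouper import match_strings
39 |
40 | strings = pd.Series(['mega enterprises corp.', 'mega corp.', 'hyper startup inc.'])
41 |
42 | matches = match_strings(strings,
43 |                         ngram_size=4,        # 4-character n-grams instead of the default 3
44 |                         min_similarity=0.7,  # lower the matching threshold from the default 0.8
45 |                         ignore_index=True)   # omit index-columns from the output
46 | ```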
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 |
8 | ## [0.7.1] - 2025-01-23
9 |
10 | ### Changed
11 | * Code-wise, nothing changed; however, the version number was cleaned up in `pyproject.toml`.
12 | * Cleaned up documentation and README.md. Most documentation has moved from the README.md to
13 |   [https://bergvca.github.io/string_grouper](https://bergvca.github.io/string_grouper/).
14 |
15 |
16 | ## [0.7.0] - 2025-01-23
17 |
18 | ### Changed
19 |
20 | * The `sparse_dot_topn_for_blocks` and `topn` dependencies were removed and replaced by the official `sparse_dot_topn` library from ING Bank. This is a big change: it may impact old code using `string_grouper`.
21 | * `None` is now the default value for `n_blocks`; an optimal number of blocks is guessed, based on empirical observation, to split the data into smaller chunks (based on input data size).
22 | * `sparse_dot_topn` now integrates a [block/chunk strategy](https://github.com/ing-bank/sparse_dot_topn?tab=readme-ov-file#distributing-the-top-n-multiplication-of-two-large-o10m-sparse-matrices-over-a-cluster). This strategy is used in `string_grouper`.
23 |
24 |
25 | ### Added
26 |
27 | * A new parameter `normalize_to_ascii` to normalize unicode characters to ASCII ones
28 | * The `loguru` dependency was introduced to print messages to the user
29 |
30 |
31 | ## [0.6.1] - 2021-10-19
32 |
33 | * Added `"guesstimate"` as the default value for `n_blocks`. This guesses an optimal number of blocks
34 |   based on empirical observation.
35 |
36 |
37 | ## [0.6.0] - 2021-09-21
38 |
39 | ### Added
40 |
41 | * matrix-blocking/splitting as a performance-enhancer (see [README.md](https://github.com/Bergvca/string_grouper/tree/master/#performance) for details)
42 | * new keyword arguments `force_symmetries` and `n_blocks` (see [README.md](https://github.com/Bergvca/string_grouper/tree/master/#kwargs) for details)
43 | * new dependency on packages `topn` and `sparse_dot_topn_for_blocks` to help with the matrix-blocking
44 | * capability to reuse a previously initialized StringGrouper (that is, the corpus can now persist across high-level function calls like `match_strings()`. See [README.md](https://github.com/Bergvca/string_grouper/tree/master/#corpus) for details.)
45 |
46 |
47 | ## [0.5.0] - 2021-06-11
48 |
49 | ### Added
50 |
51 | * Added new keyword argument **`tfidf_matrix_dtype`** (the datatype for the tf-idf values of the matrix components). Allowed values are `numpy.float32` and `numpy.float64` (used by the required external package `sparse_dot_topn` version 0.3.1). Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.)
52 |
53 | ### Changed
54 |
55 | * Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1
56 | * Changed the default datatype for cosine similarities from numpy.float64 to numpy.float32 to boost computational performance at the expense of numerical precision.
57 | * Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if `duplicates` is not given).
58 | * Changed warning issued when the condition \[`include_zeroes=True` and `min_similarity` ≤ 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met to an exception.
59 |
60 | ### Removed
61 |
62 | * Removed the keyword argument `suppress_warning`
63 |
64 | ## [0.4.0] - 2021-04-11
65 |
66 | ### Added
67 |
68 | * Added group representative functionality - by default the centroid is used. From [@ParticularMiner](https://github.com/ParticularMiner)
69 | * Added string_grouper_utils package with additional group-representative functionality:
70 | * new_group_rep_by_earliest_timestamp
71 | * new_group_rep_by_completeness
72 | * new_group_rep_by_highest_weight
73 |
74 | From [@ParticularMiner](https://github.com/ParticularMiner)
75 | * Original indices are now added by default to output of `group_similar_strings`, `match_most_similar` and `match_strings`.
76 | From [@ParticularMiner](https://github.com/ParticularMiner)
77 | * `compute_pairwise_similarities` function From [@ParticularMiner](https://github.com/ParticularMiner)
78 |
79 | ### Changed
80 |
81 | * Default group representative is now the centroid. Used to be the first string in the series belonging to a group.
82 | From [@ParticularMiner](https://github.com/ParticularMiner)
83 | * Output of `match_most_similar` and `match_strings` is now a `pandas.DataFrame` object instead of a `pandas.Series`
84 | by default. From [@ParticularMiner](https://github.com/ParticularMiner)
85 | * Fixed a bug which occurs when min_similarity=0. From [@ParticularMiner](https://github.com/ParticularMiner)
--------------------------------------------------------------------------------
/docs/performance.md:
--------------------------------------------------------------------------------
1 | ## Performance
2 |
3 |
4 | Semilogx plots of run-times of `match_strings()` vs the number of blocks (`n_blocks[1]`) into which the right matrix-operand of the dataset (663 000 strings from sec__edgar_company_info.csv) was split before performing the string comparison. As shown in the legend, each plot corresponds to the number `n_blocks[0]` of blocks into which the left matrix-operand was split.
5 | 
6 |
7 | String comparison, as implemented by `string_grouper`, is essentially matrix
8 | multiplication. A pandas Series of strings is converted (tokenized) into a
9 | matrix. Then that matrix is multiplied by itself (or another) transposed.
10 |
11 | Here is an illustration of multiplication of two matrices ***D*** and ***M***T:
12 | 
13 |
14 | It turns out that when the matrix (or Series) is very large, the computer
15 | proceeds quite slowly with the multiplication (apparently due to the RAM being
16 | too full). Some computers give up with an `OverflowError`.
17 |
18 | To circumvent this issue, `string_grouper` now allows the division of the Series
19 | into smaller chunks (or blocks) and multiplies the chunks one pair at a time
20 | instead to get the same result:
21 |
22 | 
23 |
24 | But surprise ... the run-time of the process is sometimes drastically reduced
25 | as a result. For example, the speed-up of the following call is about 500%
26 | (here, the Series is divided into 200 blocks on the right operand, that is,
27 | 1 block on the left × 200 on the right) compared to the same call with no
28 | splitting \[`n_blocks=(1, 1)`, the default, which is what previous versions
29 | (0.5.0 and earlier) of `string_grouper` did\]:
30 |
31 | ```python
33 | # A DataFrame of 663 000 records:
33 | companies = pd.read_csv('data/sec__edgar_company_info.csv')
34 |
35 | # The following call is more than 6 times faster than earlier versions of
36 | # match_strings() (that is, when n_blocks=(1, 1))!
37 | match_strings(companies['Company Name'], n_blocks=(1, 200))
38 | ```
39 |
40 | Further exploration of the block number space ([see plot above](#Semilogx)) has revealed that for any fixed
41 | number of right blocks, the run-time gets longer the larger the number of left
42 | blocks specified. For this reason, it is recommended *not* to split the left matrix.
43 |
44 | 
45 |
46 | In general,
47 |
48 | ***total runtime*** = `n_blocks[0]` × `n_blocks[1]` × ***mean runtime per block-pair***
49 |
50 | = ***Left Operand Size*** × ***Right Operand Size*** × ***mean runtime per block-pair*** / (***Left Block Size*** × ***Right Block Size***)
51 |
52 | since `n_blocks[0]` = ***Left Operand Size*** / ***Left Block Size*** and `n_blocks[1]` = ***Right Operand Size*** / ***Right Block Size***.
53 |
54 | So for given left and right operands, minimizing the ***total runtime*** is the same as minimizing the
55 |
56 | ***runtime per string-pair comparison*** ≝ ***mean runtime per block-pair*** / (***Left Block Size*** × ***Right Block Size***)
57 |
58 |
59 | [Below is a log-log-log contour plot](#ContourPlot) of the ***runtime per string-pair comparison*** scaled by its value
60 | at ***Left Block Size*** = ***Right Block Size*** = 5000. Here, ***Block Size***
61 | is the number of strings in that block, and ***mean runtime per block-pair*** is the time taken for the following call to run:
62 | ```python
63 | # note the parameter order!
64 | match_strings(right_Series, left_Series, n_blocks=(1, 1))
65 | ```
66 | where `left_Series` and `right_Series`, corresponding to ***Left Block*** and ***Right Block*** respectively, are random subsets of the Series `companies['Company Name']` from the
67 | [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file.
68 |
69 | 
70 |
71 | It can be seen that when `right_Series` is roughly the size of 80 000 (denoted by the
72 | white dashed line in the contour plot above), the runtime per string-pair comparison is at
73 | its lowest for any fixed `left_Series` size. Above ***Right Block Size*** = 80 000, the
74 | matrix-multiplication routine begins to feel the limits of the computer's
75 | available memory space and thus its performance deteriorates, as evidenced by the increase
76 | in runtime per string-pair comparison there (above the white dashed line). This knowledge
77 | could serve as a guide for estimating the optimum block numbers —
78 | namely those that divide the Series into blocks of size roughly equal to
79 | 80 000 for the right operand (or `right_Series`).
80 |
81 | So what are the optimum block number values for *any* given Series? That is
82 | anyone's guess, and may likely depend on the data itself. Furthermore, as hinted above,
83 | the answer may vary from computer to computer.
84 |
85 | We however encourage the user to make judicious use of the `n_blocks`
86 | parameter to boost performance of `string_grouper` whenever possible.
87 |
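88 | As a rough illustration of the 80 000 guideline above (a sketch only: `guess_n_blocks`
89 | is not part of the library, and the best numbers remain data- and machine-dependent):
90 |
91 | ```python
92 | import math
93 |
94 | def guess_n_blocks(right_len: int, target_block_size: int = 80_000) -> tuple:
95 |     # Keep the left operand whole; split only the right operand into
96 |     # blocks of roughly 80 000 strings each:
97 |     return 1, max(1, math.ceil(right_len / target_block_size))
98 |
99 | matches = match_strings(companies['Company Name'],
100 |                         n_blocks=guess_n_blocks(len(companies)))
101 | ```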
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: String Grouper
3 | ---
4 |
5 | **`string_grouper`** is a library that makes finding groups of similar strings within a single, or multiple, lists of strings easy — and fast. **`string_grouper`** uses **tf-idf** to calculate [**cosine similarities**](http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/) within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html).
6 |
7 | ## Install
8 |
9 | ```bash
10 | pip install string-grouper
11 | ```
12 |
13 | or see the releases [here](https://github.com/bergvca/string_grouper/releases)
14 |
15 | ## First usage
16 |
17 | ```python
18 | import pandas as pd
19 | from string_grouper import match_strings
20 |
21 | #https://github.com/ngshya/pfsm/blob/master/data/sec_edgar_company_info.csv
22 | company_names = './data/sec_edgar_company_info.csv'
23 | # We only look at the first 50k as an example:
24 | companies = pd.read_csv(company_names)[0:50000]
25 | # Create all matches:
26 | matches = match_strings(companies['Company Name'])
27 | # Look at only the non-exact matches:
28 | matches[matches['left_Company Name'] != matches['right_Company Name']].head()
29 | ```
30 |
42 |
43 | ## With Polars
44 |
45 | At the moment, Polars is not yet supported natively.
46 |
47 | But you can easily convert from one to the other:
48 |
49 | ```python
50 | import polars as pl
51 | from string_grouper import match_strings
52 |
53 | company_names = 'https://raw.githubusercontent.com/ngshya/pfsm/refs/heads/master/data/sec_edgar_company_info.csv'
54 | # We only look at the first 50k as an example:
55 | companies = pl.read_csv(company_names).slice(0,50000).to_pandas()
56 | # Create all matches:
57 | matches = pl.from_pandas(match_strings(companies['Company Name']))
58 | # Look at only the non-exact matches:
59 | matches.filter(pl.col('left_Company Name') != pl.col('right_Company Name')).head()
60 | ```
61 |
62 | ## High Level Functions
63 | In the rest of this document the names, `Series` and `DataFrame`, refer to the familiar `pandas` object types.
64 |
65 | As shown above, the library may be used together with `pandas`, and contains four high level functions (`match_strings`, `match_most_similar`, `group_similar_strings`, and `compute_pairwise_similarities`) that can be used directly, and one class (`StringGrouper`) that allows for a more interactive approach.
66 |
67 | The permitted calling patterns of the four functions, and their return types, are:
68 |
69 | | Function | Parameters | `pandas` Return Type |
70 | | -------------: |:-------------|:-----:|
71 | | `match_strings`| `(master, **kwargs)`| `DataFrame` |
72 | | `match_strings`| `(master, duplicates, **kwargs)`| `DataFrame` |
73 | | `match_strings`| `(master, master_id=id_series, **kwargs)`| `DataFrame` |
74 | | `match_strings`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` |
75 | | `match_most_similar`| `(master, duplicates, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)|
76 | | `match_most_similar`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` |
77 | | `group_similar_strings`| `(strings_to_group, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)|
78 | | `group_similar_strings`| `(strings_to_group, strings_id, **kwargs)`| `DataFrame` |
79 | | `compute_pairwise_similarities`| `(string_series_1, string_series_2, **kwargs)`| `Series` |
80 |
81 |
82 |
83 | ## Generic Parameters
84 |
85 | |Name | Description |
86 | |:--- | :--- |
87 | |**`master`** | A `Series` of strings to be matched with themselves (or with those in `duplicates`). |
88 | |**`duplicates`** | A `Series` of strings to be matched with those of `master`. |
89 | |**`master_id`** (or `id_series`) | A `Series` of IDs corresponding to the strings in `master`. |
90 | |**`duplicates_id`** | A `Series` of IDs corresponding to the strings in `duplicates`. |
91 | |**`strings_to_group`** | A `Series` of strings to be grouped. |
92 | |**`strings_id`** | A `Series` of IDs corresponding to the strings in `strings_to_group`. |
93 | |**`string_series_1(_2)`** | A `Series` of strings each of which is to be compared with its corresponding string in `string_series_2(_1)`. |
94 | |**`**kwargs`** | Keyword arguments (see [below](#kwargs)).|
95 |
96 |
97 | ## StringGrouper Class
98 |
99 | The above-mentioned functions are all built using the [StringGrouper](references/sg_class.md) class. This class can also be used
100 | directly for a more interactive approach. Furthermore, each of the high-level functions listed above has a `StringGrouper`
101 | method counterpart of the same name and parameters. Calling such a method of any instance of `StringGrouper` will not
102 | rebuild the instance's underlying corpus to make string-comparisons but rather use it to perform the string-comparisons.
103 | The input Series to the method (`master`, `duplicates`, and so on) will thus be encoded,
104 | or transformed, into tf-idf matrices, using this corpus. See [StringGrouper](references/sg_class.md) for further
105 | details.
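106 |
107 | A short sketch of this behaviour (`master` and `new_master` here are illustrative `Series` of strings):
108 |
109 | ```python
110 | from string_grouper import StringGrouper
111 |
112 | # Build the corpus once from master:
113 | sg = StringGrouper(master)
114 | # Reuses the corpus built above; it is not rebuilt from new_master:
115 | matches = sg.match_strings(new_master)
116 | ```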
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # String Grouper
2 |
3 | [](https://pypi.org/project/string-grouper)
4 | [](https://github.com/Bergvca/string_grouper)
5 | [](https://github.com/Bergvca/string_grouper)
6 | [](https://codecov.io/gh/Bergvca/string_grouper)
7 | [](https://pepy.tech/projects/string-grouper)
8 |
9 |
10 | *(Image: the graph-structure of one of the groups of similar strings found by `string_grouper`, described below.)*
11 |
15 | The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`).
16 |
17 | The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity.
18 |
19 | The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score.
20 |
21 | ———
22 |
23 | This image was designed using the graph-visualization software Gephi 0.9.2 with data generated by `string_grouper` operating on the [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file.
24 |
25 | ---
26 |
27 |
28 | **`string_grouper`** is a library that makes finding groups of similar strings within a single, or multiple, lists of
29 | strings easy — and _fast_. **`string_grouper`** uses **tf-idf** to calculate [**cosine similarities**](https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a)
30 | within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html).
31 |
32 |
33 | ## Installing
34 |
35 | `pip install string-grouper`
36 |
37 | ## Speed
38 |
39 | **`string_grouper`** leverages the blazingly fast [sparse_dot_topn](https://github.com/ing-bank/sparse_dot_topn) library
40 | to calculate cosine similarities.
41 |
42 | ```python
43 | import datetime, pandas as pd
44 | from string_grouper import match_strings
45 | names = pd.read_csv('sec__edgar_company_info.csv')
46 | s = datetime.datetime.now()
47 | matches = match_strings(names['Company Name'], number_of_processes=4)
48 | print(datetime.datetime.now() - s)  # elapsed wall-clock time
49 | ```
50 | Results in:
51 |
52 | `00:05:34.65` on an Intel i7-6500U CPU @ 2.50GHz, where `len(names)` = 663 000.
53 |
54 | *in other words*,
55 | the library is able to perform fuzzy matching of 663 000 names in _five and a half minutes_
56 | on a 2015 consumer CPU using 4 cores.
57 |
58 | ## Simple Match
59 |
60 | ```python
61 | import pandas as pd
62 | from string_grouper import match_strings
63 |
64 | company_names = 'sec__edgar_company_info.csv'
65 | companies = pd.read_csv(company_names)
66 | # Create all matches:
67 | matches = match_strings(companies['Company Name'])
68 | # Look at only the non-exact matches:
69 | matches[matches['left_Company Name'] != matches['right_Company Name']].head()
70 | ```
71 |
72 | | | left_index | left_Company Name | similarity | right_Company Name | right_index |
73 | |----:|-------------:|:------------------------------------------------------------|-------------:|:----------------------------------------|--------------:|
74 | | 15 | 14 | 0210, LLC | 0.870291 | 90210 LLC | 4211 |
75 | | 167 | 165 | 1 800 MUTUALS ADVISOR SERIES | 0.931615 | 1 800 MUTUALS ADVISORS SERIES | 166 |
76 | | 168 | 166 | 1 800 MUTUALS ADVISORS SERIES | 0.931615 | 1 800 MUTUALS ADVISOR SERIES | 165 |
77 | | 172 | 168 | 1 800 RADIATOR FRANCHISE INC | 1 | 1-800-RADIATOR FRANCHISE INC. | 201 |
78 | | 178 | 173 | 1 FINANCIAL MARKETPLACE SECURITIES LLC /BD | 0.949364 | 1 FINANCIAL MARKETPLACE SECURITIES, LLC | 174 |
79 |
80 |
81 | ## Group Similar Strings and Find the Most Common
82 |
83 | ```python
84 | companies[["group-id", "name_deduped"]] = group_similar_strings(companies['Company Name'])
85 | companies.groupby('name_deduped')['Line Number'].count().sort_values(ascending=False).head(10)
86 | ```
87 | | name_deduped | Line Number |
88 | |:---------------------------------------------------|--------------:|
89 | | ADVISORS DISCIPLINED TRUST | 1747 |
90 | | NUVEEN TAX EXEMPT UNIT TRUST SERIES 1 | 916 |
91 | | GUGGENHEIM DEFINED PORTFOLIOS, SERIES 1200 | 652 |
92 | | U S TECHNOLOGIES INC | 632 |
93 | | CAPITAL MANAGEMENT LLC | 628 |
94 | | CLAYMORE SECURITIES DEFINED PORTFOLIOS, SERIES 200 | 611 |
95 | | E ACQUISITION CORP | 561 |
96 | | CAPITAL PARTNERS LP | 561 |
97 | | FIRST TRUST COMBINED SERIES 1 | 560 |
98 | | PRINCIPAL LIFE INCOME FUNDINGS TRUST 20 | 544 |
99 |
100 | ## Documentation
101 |
102 | The documentation can be found [here](https://bergvca.github.io/string_grouper/)
103 |
--------------------------------------------------------------------------------
/string_grouper_utils/string_grouper_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from typing import List, Optional, Union
3 | from dateutil.parser import parse
4 | from dateutil.tz import UTC
5 | from numbers import Number
6 | from datetime import datetime
7 | import re
8 | import pydoc
9 |
10 |
11 | def new_group_rep_by_earliest_timestamp(grouped_data: pd.DataFrame,
12 | group_col: Union[str, int],
13 | record_id_col: Union[str, int],
14 | timestamps: Union[pd.Series, str, int],
15 | record_name_col: Optional[Union[str, int]] = None,
16 | parserinfo=None,
17 | **kwargs) -> Union[pd.DataFrame, pd.Series]:
18 | """
19 | Selects the oldest string in each group as group-representative.
20 | :param grouped_data: The grouped DataFrame
21 | :param group_col: The name or positional index of the column in grouped_data containing the groups
22 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs
23 | (This will appear in the output)
24 | :param timestamps: pandas.Series or the column name (str) or column positional index (int) in grouped_data
25 | This contains the timestamps of the strings to be grouped.
26 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with
27 | all groups' members' names. (This will appear in the output.)
28 | :param parserinfo: (See below.)
29 | :param **kwargs: (See below.)
30 | parserinfo and kwargs are the same arguments as those you would pass to dateutil.parser.parse. They help in
31 | interpreting the string inputs which are to be parsed into datetime datatypes.
32 |
33 | FYI, the dateutil.parser.parse documentation for these arguments follows:
34 | """
35 | if isinstance(timestamps, pd.Series):
36 | if len(grouped_data) != len(timestamps):
37 | raise Exception('grouped_data and timestamps must have the same length.')
38 | else:
39 | timestamps = get_column(timestamps, grouped_data)
40 | weights = parse_timestamps(timestamps, parserinfo, **kwargs)
41 | return group_rep_transform('idxmin', weights, grouped_data, group_col, record_id_col, record_name_col)
42 |
43 |
44 | def new_group_rep_by_completeness(grouped_data: pd.DataFrame,
45 | group_col: Union[str, int],
46 | record_id_col: Union[str, int],
47 | record_name_col: Optional[Union[str, int]] = None,
48 | tested_cols: Optional[Union[pd.DataFrame, List[Union[str, int]]]] = None
49 | ) -> Union[pd.DataFrame, pd.Series]:
50 | """
51 | Selects the string in the group with the most filled-in row/record as group-representative.
52 | :param grouped_data: The grouped DataFrame
53 | :param group_col: The name or positional index of the column in grouped_data containing the groups
54 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs
55 | (This will appear in the output)
56 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with
57 | all groups' members' names. (This will appear in the output.)
58 | :param tested_cols: (Optional) pandas.DataFrame or list of column names/indices of grouped_data whose
59 | filled-in statuses are used to determine the new group-representative.
60 | If it is None then the entire grouped_data itself is used.
62 | """
63 | if isinstance(tested_cols, pd.DataFrame):
64 | if len(grouped_data) != len(tested_cols):
65 | raise Exception('grouped_data and tested_cols must have the same length.')
66 | elif tested_cols is not None:
67 | tested_cols = get_column(tested_cols, grouped_data)
68 | else:
69 | tested_cols = grouped_data
70 |
71 | def is_notnull_and_not_empty(x):
72 | if x == '' or pd.isnull(x):
73 | return 0
74 | else:
75 | return 1
76 |
77 | weights = tested_cols.applymap(is_notnull_and_not_empty).sum(axis=1)
78 | return group_rep_transform('idxmax', weights, grouped_data, group_col, record_id_col, record_name_col)
79 |
80 |
81 | def new_group_rep_by_highest_weight(grouped_data: pd.DataFrame,
82 | group_col: Union[str, int],
83 | record_id_col: Union[str, int],
84 | weights: Union[pd.Series, str, int],
85 | record_name_col: Optional[Union[str, int]] = None,
86 | ) -> Union[pd.DataFrame, pd.Series]:
87 | """
88 | Selects the string in the group with the largest weight as group-representative.
89 | :param grouped_data: The grouped DataFrame
90 | :param group_col: The name or positional index of the column in grouped_data containing the groups
91 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs
92 | (This will appear in the output)
93 | :param weights: pandas.Series or the column name (str) or column positional index (int) in grouped_data
94 | containing the user-defined weights of the strings to be grouped
95 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with
96 | all groups' members' names. (This will appear in the output.)
97 | """
98 | if isinstance(weights, pd.Series):
99 | if len(grouped_data) != len(weights):
100 | raise Exception('grouped_data and weights must have the same length.')
101 | else:
102 | weights = get_column(weights, grouped_data)
103 | return group_rep_transform('idxmax', weights, grouped_data, group_col, record_id_col, record_name_col)
104 |
105 |
106 | def group_rep_transform(method: str,
107 | weights: pd.Series,
108 | grouped_data,
109 | group_col,
110 | record_id_col,
111 | record_name_col) -> Union[pd.Series, pd.DataFrame]:
112 | stashed_index = grouped_data.index
113 | group_of_master_id = get_column(group_col, grouped_data).reset_index(drop=True)
114 | group_of_master_id = group_of_master_id.rename('raw_group_id').reset_index().rename(columns={'index': 'weight'})
115 | group_of_master_id['weight'] = weights.reset_index(drop=True)
116 | group_of_master_id['group_rep'] = \
117 | group_of_master_id.groupby('raw_group_id', sort=False)['weight'].transform(method)
118 | record_id_col = get_column(record_id_col, grouped_data)
119 | new_rep = record_id_col.iloc[group_of_master_id.group_rep].reset_index(drop=True).rename(None)
120 | if record_name_col is None:
121 | output = new_rep
122 | else:
123 | record_name_col = get_column(record_name_col, grouped_data)
124 | new_rep_name = record_name_col.iloc[group_of_master_id.group_rep].reset_index(drop=True).rename(None)
125 | output = pd.concat([new_rep, new_rep_name], axis=1)
126 | output.index = stashed_index
127 | return output
128 |
129 |
130 | def get_column(col: Union[str, int, List[Union[str, int]]], data: pd.DataFrame):
131 | if isinstance(col, str):
132 | return data.loc[:, col]
133 | elif isinstance(col, int):
134 | return data.iloc[:, col]
135 | elif isinstance(col, List):
136 | return pd.concat([get_column(m, data) for m in col], axis=1)
137 |
138 |
139 | def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Series:
140 | error_msg = "timestamps must be a Series of date-like or datetime-like strings"
141 | error_msg += " or datetime datatype or pandas Timestamp datatype or numbers"
142 | if is_series_of_type(str, timestamps):
143 | # if any of the strings is not datetime-like raise an exception
144 | if timestamps.to_frame().applymap(is_date).squeeze().all():
145 | # convert strings to numpy datetime64
146 | return timestamps.transform(lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC))
147 | elif is_series_of_type(type(pd.Timestamp('15-1-2000')), timestamps):
148 | # convert pandas Timestamps to numpy datetime64
149 | return timestamps.transform(lambda x: x.to_numpy())
150 | elif is_series_of_type(datetime, timestamps):
151 | # convert python datetimes to numpy datetime64
152 | return timestamps.transform(lambda x: x.astimezone(UTC))
153 | elif is_series_of_type(Number, timestamps):
154 | return timestamps
155 | raise Exception(error_msg)
156 |
157 |
158 | def is_date(string, parserinfo=None, **kwargs):
159 | """
160 | Return whether the string can be interpreted as a date.
161 | :param string: str, string to check for date
162 | :param parserinfo: (See below.)
163 | :param **kwargs: (See below.)
164 | parserinfo and kwargs are the same arguments as those you would pass to dateutil.parser.parse. They help in
165 | interpreting the string inputs which are to be parsed into datetime datatypes.
166 | """
167 | try:
168 | parse(string, parserinfo, **kwargs)
169 | return True
170 | except ValueError:
171 | return False
172 |
173 |
174 | def is_series_of_type(what: type, series_to_test: pd.Series) -> bool:
175 | if series_to_test.to_frame().applymap(
176 | lambda x: not isinstance(x, what)
177 | ).squeeze().any():
178 | return False
179 | return True
180 |
181 |
182 | # The following lines modify and append the kwargs portion of the docstring of dateutil.parser.parse to
183 | # the docstring of new_group_rep_by_earliest_timestamp:
184 | parse_docstring_kwargs = re.search(':param parserinfo:.*?:return:', pydoc.render_doc(parse), flags=re.DOTALL).group(0)
185 | parse_docstring_kwargs = re.sub(
186 | '``timestr``',
187 | 'the strings containing the date/time-stamps',
188 | parse_docstring_kwargs
189 | )
190 | new_group_rep_by_earliest_timestamp.__doc__ = new_group_rep_by_earliest_timestamp.__doc__ + \
191 | parse_docstring_kwargs[:-9]
192 |
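193 |
194 | # Usage sketch (illustrative only, not part of the library):
195 | #
196 | #   import pandas as pd
197 | #   grouped = pd.DataFrame({'group_id': [0, 0, 1],
198 | #                           'record_id': ['a', 'b', 'c'],
199 | #                           'weight': [1.0, 3.5, 2.0]})
200 | #   # Returns, per row, the record_id of its group's heaviest member:
201 | #   # ['b', 'b', 'c']
202 | #   new_group_rep_by_highest_weight(grouped, 'group_id', 'record_id', 'weight')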
--------------------------------------------------------------------------------
/docs/references/sg_class.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: String Grouper Class
3 | ---
4 |
5 |
6 | ## Concept
7 |
8 | All functions are built using the class **`StringGrouper`**. This class can be used through pre-defined functions, for example the four high-level functions, as well as through a more interactive approach where matches can be added or removed if needed by calling the **`StringGrouper`** class directly.
9 |
10 |
11 | The four functions mentioned above all create a `StringGrouper` object behind the scenes and call different functions on it. The `StringGrouper` class keeps track of all pairs of similar strings and creates the groups out of these. Since matches are often not perfect, a common workflow is to create matches first and then inspect and adjust them manually, as shown in Example 2 below.
12 |
13 | ## Example 1 - reuse the same tf-idf corpus without rebuilding
14 |
15 | ```python
16 | # Build a corpus using strings in the pandas Series master:
17 | sg = StringGrouper(master)
18 | # The following method-calls will compare strings first in
19 | # pandas Series new_master_1 and next in new_master_2
20 | # using the corpus already built above without rebuilding or
21 | # changing it in any way:
22 | matches1 = sg.match_strings(new_master_1)
23 | matches2 = sg.match_strings(new_master_2)
24 | ```
25 |
26 | ## Example 2 - add and remove matches
27 |
28 | 1. Create matches
29 | 2. Manually inspect the results
30 | 3. Add and remove matches where necessary
31 | 4. Create groups of similar strings
32 |
33 | The `StringGrouper` class allows for this without having to re-calculate the cosine similarity matrix. See below for an example.
34 |
35 |
36 | ```python
37 | import pandas as pd
38 | from string_grouper import StringGrouper
39 | 
40 | company_names = './data/sec_edgar_company_info.csv'
41 | companies = pd.read_csv(company_names)
42 | ```
40 |
41 | 1. Create matches
42 |
43 |
44 | ```python
45 | # Create a new StringGrouper
46 | string_grouper = StringGrouper(companies['Company Name'], ignore_index=True)
47 | # Check if the ngram function does what we expect:
48 | string_grouper.n_grams('McDonalds')
49 | ```
50 |
51 | ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']
52 |
53 | ```python
54 | string_grouper.n_grams('ÀbracâDABRÀ')
55 | ```
56 |
57 | ['abr', 'bra', 'rac', 'aca', 'cad', 'ada', 'dab', 'abr', 'bra']
58 |
59 | ```python
60 | # Now fit the StringGrouper - this will take a while since we are calculating cosine similarities on 600k strings
61 | string_grouper = string_grouper.fit()
62 | ```
63 |
64 | ```python
65 | # Add the grouped strings
66 | companies['deduplicated_name'] = string_grouper.get_groups()
67 | ```
68 |
69 | Suppose we know that PWC HOLDING CORP and PRICEWATERHOUSECOOPERS LLP are the same company. StringGrouper will not match these since they are not similar enough.
70 |
71 |
72 | ```python
73 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')]
74 | ```
75 |
76 | 
77 | |        | Line Number | Company Name | Company CIK Key | deduplicated_name |
78 | |--------|-------------|--------------|-----------------|-------------------|
79 | | 478441 | 478442 | PRICEWATERHOUSECOOPERS LLP /TA | 1064284 | PRICEWATERHOUSECOOPERS LLP /TA |
80 | | 478442 | 478443 | PRICEWATERHOUSECOOPERS LLP | 1186612 | PRICEWATERHOUSECOOPERS LLP /TA |
81 | | 478443 | 478444 | PRICEWATERHOUSECOOPERS SECURITIES LLC | 1018444 | PRICEWATERHOUSECOOPERS LLP /TA |
114 |
115 | ```python
116 | companies[companies.deduplicated_name.str.contains('PWC')]
117 | ```
118 | 
119 | |        | Line Number | Company Name | Company CIK Key | deduplicated_name |
120 | |--------|-------------|--------------|-----------------|-------------------|
121 | | 485535 | 485536 | PWC CAPITAL INC. | 1690640 | PWC CAPITAL INC. |
122 | | 485536 | 485537 | PWC HOLDING CORP | 1456450 | PWC HOLDING CORP |
123 | | 485537 | 485538 | PWC INVESTORS, LLC | 1480311 | PWC INVESTORS, LLC |
124 | | 485538 | 485539 | PWC REAL ESTATE VALUE FUND I LLC | 1668928 | PWC REAL ESTATE VALUE FUND I LLC |
125 | | 485539 | 485540 | PWC SECURITIES CORP /BD | 1023989 | PWC SECURITIES CORP /BD |
126 | | 485540 | 485541 | PWC SECURITIES CORPORATION | 1023989 | PWC SECURITIES CORPORATION |
127 | | 485541 | 485542 | PWCC LTD | 1172241 | PWCC LTD |
128 | | 485542 | 485543 | PWCG BROKERAGE, INC. | 67301 | PWCG BROKERAGE, INC. |
192 |
193 | We can add this match using the `add_match` method:
194 |
195 |
196 | ```python
197 | string_grouper = string_grouper.add_match('PRICEWATERHOUSECOOPERS LLP', 'PWC HOLDING CORP')
198 | companies['deduplicated_name'] = string_grouper.get_groups()
199 | # Now let's check again:
200 |
201 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')]
202 | ```
203 |
204 | 
205 | |        | Line Number | Company Name | Company CIK Key | deduplicated_name |
206 | |--------|-------------|--------------|-----------------|-------------------|
207 | | 478441 | 478442 | PRICEWATERHOUSECOOPERS LLP /TA | 1064284 | PRICEWATERHOUSECOOPERS LLP /TA |
208 | | 478442 | 478443 | PRICEWATERHOUSECOOPERS LLP | 1186612 | PRICEWATERHOUSECOOPERS LLP /TA |
209 | | 478443 | 478444 | PRICEWATERHOUSECOOPERS SECURITIES LLC | 1018444 | PRICEWATERHOUSECOOPERS LLP /TA |
210 | | 485536 | 485537 | PWC HOLDING CORP | 1456450 | PRICEWATERHOUSECOOPERS LLP /TA |
249 |
250 | This can also be used to merge two groups:
251 |
252 |
253 | ```python
254 | string_grouper = string_grouper.add_match('PRICEWATERHOUSECOOPERS LLP', 'ZUCKER MICHAEL')
255 | companies['deduplicated_name'] = string_grouper.get_groups()
256 |
257 | # Now let's check again:
258 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')]
259 | ```
260 | 
261 | |        | Line Number | Company Name | Company CIK Key | deduplicated_name |
262 | |--------|-------------|--------------|-----------------|-------------------|
263 | | 478441 | 478442 | PRICEWATERHOUSECOOPERS LLP /TA | 1064284 | PRICEWATERHOUSECOOPERS LLP /TA |
264 | | 478442 | 478443 | PRICEWATERHOUSECOOPERS LLP | 1186612 | PRICEWATERHOUSECOOPERS LLP /TA |
265 | | 478443 | 478444 | PRICEWATERHOUSECOOPERS SECURITIES LLC | 1018444 | PRICEWATERHOUSECOOPERS LLP /TA |
266 | | 485536 | 485537 | PWC HOLDING CORP | 1456450 | PRICEWATERHOUSECOOPERS LLP /TA |
267 | | 662585 | 662586 | ZUCKER MICHAEL | 1629018 | PRICEWATERHOUSECOOPERS LLP /TA |
268 | | 662604 | 662605 | ZUCKERMAN MICHAEL | 1303321 | PRICEWATERHOUSECOOPERS LLP /TA |
269 | | 662605 | 662606 | ZUCKERMAN MICHAEL | 1496366 | PRICEWATERHOUSECOOPERS LLP /TA |
327 |
328 | We can likewise remove matches, using the `remove_match` method:
329 |
330 |
331 | ```python
332 | string_grouper = string_grouper.remove_match('PRICEWATERHOUSECOOPERS LLP', 'ZUCKER MICHAEL')
333 | companies['deduplicated_name'] = string_grouper.get_groups()
334 |
335 | # Now let's check again:
336 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')]
337 | ```
338 | 
339 | |        | Line Number | Company Name | Company CIK Key | deduplicated_name |
340 | |--------|-------------|--------------|-----------------|-------------------|
341 | | 478441 | 478442 | PRICEWATERHOUSECOOPERS LLP /TA | 1064284 | PRICEWATERHOUSECOOPERS LLP /TA |
342 | | 478442 | 478443 | PRICEWATERHOUSECOOPERS LLP | 1186612 | PRICEWATERHOUSECOOPERS LLP /TA |
343 | | 478443 | 478444 | PRICEWATERHOUSECOOPERS SECURITIES LLC | 1018444 | PRICEWATERHOUSECOOPERS LLP /TA |
344 | | 485536 | 485537 | PWC HOLDING CORP | 1456450 | PRICEWATERHOUSECOOPERS LLP /TA |
384 |
--------------------------------------------------------------------------------
/tutorials/tutorial_1.md:
--------------------------------------------------------------------------------
1 | # Finding Duplicates With IDs In String Grouper
2 |
3 | ## Introduction
4 |
5 | A common requirement in data clean-up is the scenario where a data set (database, pandas DataFrame) has multiple database records for the same entity and duplicates need to be found. This example will not cover the task of merging or removing duplicate records — what it will do is use String Grouper to find duplicate records using the match_strings function and the optional IDs functionality.
6 |
7 | For the example we will use [this](accounts.csv) simple data set. The number of rows is not important; the 'name' column contains typical variations in spelling.
8 |
9 | ```
10 | id,name
11 | AA012345X,mega enterprises corp.
12 | BB016741P,mega enterprises corporation
13 | CC052345T,mega corp.
14 | AA098762D,hyper startup inc.
15 | BB099931J,hyper-startup inc.
16 | CC082744L,hyper startup incorporated
17 | HH072982K,hyper hyper inc.
18 | AA903844B,slow and steady inc.
19 | BB904941H,slow and steady incorporated
20 | CC903844B,slow steady inc.
21 | AA777431C,abc enterprises inc.
22 | BB760431Y,a.b.c. enterprises incorporated
23 | BB750431M,a.b.c. enterprises inc.
24 | ZZ123456H,one and only inc.
25 | ```
26 |
27 | ## Example
28 |
29 | The steps below will process the above sample file using String Grouper to search for matches in the values in the 'name' column. The results shown in the tables at each step are based on the sample data above.
30 |
31 | ### Setup
32 |
33 | ```python
34 | import pandas as pd
35 | from string_grouper import match_strings
36 | ```
37 |
38 | ### Import Data
39 |
40 | ***Tip:*** Assuming the data set comes from an external database, for optimum performance export only the ID column and the text column on which matching will be done, and convert the text column (**not the ID column**) to lower case.
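41 | 
42 | For example, if the text column were named 'name' as in this tutorial, the lower-casing step could be done in pandas after import:
43 | 
44 | ```python
45 | accounts['name'] = accounts['name'].str.lower()
46 | ```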
41 |
42 | #### Import the sample data.
43 |
44 | ```python
45 | accounts = pd.read_csv('string_grouper/tutorials/accounts.csv')
46 | # Show dataframe
47 | accounts
48 | ```
49 |
50 | #### Result (only the first three rows shown):
51 |
52 | 
53 | |     | id | name |
54 | |-----|----|------|
55 | | 0 | AA012345X | mega enterprises corp. |
56 | | 1 | BB016741P | mega enterprises corporation |
57 | | 2 | CC052345T | mega corp. |
58 | | ... | ... | ... |
86 |
87 | ### Find matches, assign to new pandas variable
88 |
89 | Next, use the `match_strings` function and pass the 'name' column as the argument to the `master` parameter, and the 'id' column as the argument to the `master_id` parameter.
90 |
91 | **N.B.** In production with a real data set, the following command can take a number of minutes, and ***no update/progress indicator is shown***. Runtime depends on the size of the data set and on the performance of the computer: memory and hard disk speed are factors as well as the CPU. String Grouper uses pandas which, in turn, uses NumPy, so matching is done not by computationally intensive looping but by [array mathematics](https://realpython.com/numpy-array-programming/); even so, large data sets may take some time to process.
92 |
93 | ```python
94 | matches = match_strings(accounts['name'], master_id = accounts['id'], ignore_index=True)
95 | matches
96 | ```
97 | This will return a pandas DataFrame as below. The values (company) we will focus on in this example will be those that have variations in the name of the fictitious company, 'Hyper Startup Inc.'.
98 |
99 | 
100 | |     | left_id | left_name | similarity | right_name | right_id |
101 | |-----|---------|-----------|------------|------------|----------|
102 | | ... | ... | ... | ... | ... | ... |
103 | | 3 | AA098762D | hyper startup inc. | 1.00 | hyper-startup inc. | BB099931J |
104 | | 4 | AA098762D | hyper startup inc. | 1.00 | hyper startup inc. | AA098762D |
105 | | 5 | BB099931J | hyper-startup inc. | 1.00 | hyper-startup inc. | BB099931J |
106 | | 6 | BB099931J | hyper-startup inc. | 1.00 | hyper startup inc. | AA098762D |
107 | | 7 | CC082744L | hyper startup incorporated | 1.00 | hyper startup incorporated | CC082744L |
108 | | ... | ... | ... | ... | ... | ... |
173 |
174 | In a pattern-matching process, each value in the column being matched is checked against *every other value* in the column.
175 | 
176 | Processing this using typical Python looping code would mean, in the case of a 100,000-row data set, a total of 100,000² = 10 billion iterations. Processing that number of iterations might require replacing the CPU of the computer after each investigation! Well, maybe not ... but you *would* have time for a few cups of coffee. String Grouper works in a totally different way.
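177 | 
178 | To make the contrast concrete, here is a sketch of what such naive looping might look like (purely illustrative; this is *not* how String Grouper works, and it should not be run on a large data set):
179 | 
180 | ```python
181 | from difflib import SequenceMatcher
182 | 
183 | def naive_all_pairs(names, threshold=0.8):
184 |     # O(n**2): for n = 100,000 names this is 10,000,000,000 ratio computations
185 |     pairs = []
186 |     for i, left in enumerate(names):
187 |         for j, right in enumerate(names):
188 |             score = SequenceMatcher(None, left, right).ratio()
189 |             if score >= threshold:
190 |                 pairs.append((i, j, score))
191 |     return pairs
192 | ```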
177 |
178 | In the resultant DataFrame above, we see that the IDs AA098762D and BB099931J each appear in a group of two values: once where a close match is found, and once where the record matches itself. The third ID, CC082744L, is returned only once, even though it is pretty clearly a variation of our fictitious company 'Hyper Startup Inc.'
179 |
180 |
181 | ### Using the 'Minimum Similarity' keyword argument
182 |
183 | String Grouper has a number of configuration options (see the **kwargs** in README.md). The option of interest in the above case is `min_similarity`.
184 |
185 | The default minimum similarity is 0.8. It can be seen that more matches may be found by reducing the minimum similarity from 0.8 to, for example, 0.7.
186 |
187 | ```python
188 | matches = match_strings(accounts['name'], master_id = accounts['id'], ignore_index = True, min_similarity = 0.7)
189 | ```
190 |
191 | ***Tip:*** If the data set being matched is large, and you wish to experiment with the minimum similarity option, it may be helpful to import only a limited data set during testing, and increase to the full data set when ready. The number of rows imported can be specified in this way:
192 |
193 | ```python
194 | # We only look at the first 50k as an example
195 | accounts = pd.read_csv('/path/to/folder/huge_file.csv')[0:50000]
196 | ```
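197 | 
198 | Equivalently, pandas can be told to read only the first rows in the first place, which avoids loading the whole file into memory before slicing:
199 | 
200 | ```python
201 | accounts = pd.read_csv('/path/to/folder/huge_file.csv', nrows=50000)
202 | ```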
197 |
198 | Back to our example ... changing the option to `min_similarity = 0.7` returns this:
199 |
200 | 
201 | |     | left_id | left_name | similarity | right_name | right_id |
202 | |-----|---------|-----------|------------|------------|----------|
203 | | ... | ... | ... | ... | ... | ... |
204 | | 5 | AA098762D | hyper startup inc. | 1.00 | hyper-startup inc. | BB099931J |
205 | | 6 | AA098762D | hyper startup inc. | 1.00 | hyper startup inc. | AA098762D |
206 | | 7 | AA098762D | hyper startup inc. | 0.78 | hyper startup incorporated | CC082744L |
207 | | 8 | BB099931J | hyper-startup inc. | 1.00 | hyper-startup inc. | BB099931J |
208 | | 9 | BB099931J | hyper-startup inc. | 1.00 | hyper startup inc. | AA098762D |
209 | | 10 | BB099931J | hyper-startup inc. | 0.78 | hyper startup incorporated | CC082744L |
210 | | 11 | CC082744L | hyper startup incorporated | 1.00 | hyper startup incorporated | CC082744L |
211 | | 12 | CC082744L | hyper startup incorporated | 0.78 | hyper-startup inc. | BB099931J |
212 | | 13 | CC082744L | hyper startup incorporated | 0.78 | hyper startup inc. | AA098762D |
213 | | 14 | HH072982K | hyper hyper inc. | 1.00 | hyper hyper inc. | HH072982K |
214 | | ... | ... | ... | ... | ... | ... |
312 |
313 | Now we see that the IDs AA098762D, BB099931J and CC082744L have further matches: each 'name' value now matches two other rows (IDs). However, setting the minimum similarity to 0.7 has still not matched 'hyper hyper inc.' (ID HH072982K), even though a person would judge that 'name' to be a match. The minimum similarity setting can be adjusted up and down until it appears that most duplicates are being matched; once satisfied, we can proceed.
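314 | 
315 | One simple way to explore this trade-off (a small helper loop reusing the same call as above; it is not a built-in String Grouper feature) is to compare the number of cross-ID matches at a few thresholds:
316 | 
317 | ```python
318 | for threshold in (0.9, 0.8, 0.7, 0.6):
319 |     m = match_strings(accounts['name'], master_id=accounts['id'],
320 |                       ignore_index=True, min_similarity=threshold)
321 |     # Count only matches between different records:
322 |     print(threshold, len(m[m.left_id != m.right_id]))
323 | ```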
314 |
315 | ### Removing identical rows
316 |
317 | Once we are happy with the level of matching, we can remove the rows where the IDs are the same. Having the original (database) IDs for the rows means that we can precisely remove identical rows — that is, we are not removing matches based on similar values, but on the exact (database) IDs:
318 |
319 | ```python
320 | dupes = matches[matches.left_id != matches.right_id]
321 | dupes
322 | ```
323 | And we see the following for the company name we have been following:
324 |
325 | 
326 | |     | left_id | left_name | similarity | right_name | right_id |
327 | |-----|---------|-----------|------------|------------|----------|
328 | | ... | ... | ... | ... | ... | ... |
329 | | 5 | AA098762D | hyper startup inc. | 1.00 | hyper-startup inc. | BB099931J |
330 | | 7 | AA098762D | hyper startup inc. | 0.78 | hyper startup incorporated | CC082744L |
331 | | 9 | BB099931J | hyper-startup inc. | 1.00 | hyper startup inc. | AA098762D |
332 | | 10 | BB099931J | hyper-startup inc. | 0.78 | hyper startup incorporated | CC082744L |
333 | | 12 | CC082744L | hyper startup incorporated | 0.78 | hyper-startup inc. | BB099931J |
334 | | 13 | CC082744L | hyper startup incorporated | 0.78 | hyper startup inc. | AA098762D |
335 | | ... | ... | ... | ... | ... | ... |
406 |
407 | ***N.B.*** *pandas index number 14 has gone because its left and right IDs were identical.*
408 |
409 | ### Reduce data to unique rows having duplicate IDs
410 |
411 | Finally, we reduce the data to a pandas Series, ready for export, with one row for each record that has any duplicates.
412 |
413 | ```python
414 | company_dupes = pd.DataFrame(dupes.left_id.unique()).squeeze().rename('company_id')
415 | company_dupes
416 | ```
417 |
418 | This gives the following result:
419 |
420 | ```
421 | 0 AA012345X
422 | 1 BB016741P
423 | 2 AA098762D
424 | 3 BB099931J
425 | 4 CC082744L
426 | 5 AA903844B
427 | 6 BB904941H
428 | 7 AA777431C
429 | 8 BB760431Y
430 | 9 BB750431M
431 | Name: company_id, dtype: object
432 | ```
433 |
434 | How this result is processed further, as with any database clean-up, is beyond the scope of this tutorial. A first step, however, could be:
435 |
436 | 1. Import the list of database IDs into the relevant database as a temporary table
437 | 1. Do an inner-join with the original table the data was exported from and sort ascending by the 'name' column
438 |
439 | This will return filtered rows with the 'name' field in adjacent rows showing similar matched strings.
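440 | 
441 | If the original table is also available as a pandas DataFrame, a rough equivalent of that join (a sketch reusing the `accounts` and `company_dupes` objects from this tutorial) would be:
442 | 
443 | ```python
444 | # Keep only the records flagged as having duplicates, then sort so that
445 | # similar names end up on adjacent rows:
446 | flagged = (
447 |     accounts.merge(company_dupes, left_on='id', right_on='company_id')
448 |             .sort_values('name')
449 | )
450 | ```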
440 |
--------------------------------------------------------------------------------
/docs/examples.md:
--------------------------------------------------------------------------------
1 |
2 | In this section we cover a few use cases for string_grouper. We will use the same data set of company names as in: [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html).
3 |
4 | ### Find all matches within a single data set
5 |
6 |
7 | ```python
8 | import pandas as pd
9 | import numpy as np
10 | from string_grouper import match_strings, match_most_similar, \
11 | group_similar_strings, compute_pairwise_similarities, \
12 | StringGrouper
13 | ```
14 |
15 |
16 | ```python
17 | company_names = './data/sec_edgar_company_info.csv'
18 | # We only look at the first 50k as an example:
19 | companies = pd.read_csv(company_names)[0:50000]
20 | # Create all matches:
21 | matches = match_strings(companies['Company Name'])
22 | # Look at only the non-exact matches:
23 | matches[matches['left_Company Name'] != matches['right_Company Name']].head()
24 | ```
25 |
26 | 
27 | |     | left_index | left_Company Name | similarity | right_Company Name | right_index |
28 | |-----|------------|-------------------|------------|--------------------|-------------|
29 | | 15 | 14 | 0210, LLC | 0.870291 | 90210 LLC | 4211 |
30 | | 167 | 165 | 1 800 MUTUALS ADVISOR SERIES | 0.931615 | 1 800 MUTUALS ADVISORS SERIES | 166 |
31 | | 168 | 166 | 1 800 MUTUALS ADVISORS SERIES | 0.931615 | 1 800 MUTUALS ADVISOR SERIES | 165 |
32 | | 172 | 168 | 1 800 RADIATOR FRANCHISE INC | 1.000000 | 1-800-RADIATOR FRANCHISE INC. | 201 |
33 | | 178 | 173 | 1 FINANCIAL MARKETPLACE SECURITIES LLC ... | 0.949364 | 1 FINANCIAL MARKETPLACE SECURITIES, LLC | 174 |
84 |
85 | ### Find all matches between two data sets
86 | The `match_strings` function finds similar items between two data sets as well. This can be seen as an inner join between two data sets:
87 |
88 |
89 | ```python
90 | # Create a small set of artificial company names:
91 | duplicates = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar', 'B4UTRADE COM CORP'])
92 | # Create all matches:
93 | matches = match_strings(companies['Company Name'], duplicates)
94 | matches
95 | ```
96 |
97 | 
98 | |   | left_index | left_Company Name | similarity | right_side | right_index |
99 | |---|------------|-------------------|------------|------------|-------------|
100 | | 0 | 12 | 012 SMILE.COMMUNICATIONS LTD | 0.944092 | 012 SMILE.COMMUNICATIONS | 1 |
101 | | 1 | 49777 | B.A.S. MEDIA GROUP | 0.854383 | S MEDIA GROUP | 0 |
102 | | 2 | 49855 | B4UTRADE COM CORP | 1.000000 | B4UTRADE COM CORP | 3 |
103 | | 3 | 49856 | B4UTRADE COM INC | 0.810217 | B4UTRADE COM CORP | 3 |
104 | | 4 | 49857 | B4UTRADE CORP | 0.878276 | B4UTRADE COM CORP | 3 |
155 |
156 | Out of the four company names in `duplicates`, three companies are found in the original company data set. One company is found three times.
157 |
158 | ### Finding duplicates in a DataFrame (e.g. a database extract) where row IDs are supplied
159 | 
160 | A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records in which a name field has slightly different spelling, for example "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter of the `match_strings` function, such duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps through the process with an example data set is available.
161 |
162 |
163 | ### For a second data set, find only the most similar match
164 |
165 | In the example above, it's possible that multiple matches are found for a single string. Sometimes we just want a string to match with a single most similar string. If there are no similar strings found, the original string should be returned:
166 |
167 |
168 | ```python
169 | # Create a small set of artificial company names:
170 | new_companies = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar', 'B4UTRADE COM CORP'],\
171 | name='New Company')
172 | # Find the most similar match for each name:
173 | matches = match_most_similar(companies['Company Name'], new_companies, ignore_index=True)
174 | # Display the results:
175 | pd.concat([new_companies, matches], axis=1)
176 | ```
177 |
178 | 
179 | |   | New Company | most_similar_Company Name |
180 | |---|-------------|---------------------------|
181 | | 0 | S MEDIA GROUP | B.A.S. MEDIA GROUP |
182 | | 1 | 012 SMILE.COMMUNICATIONS | 012 SMILE.COMMUNICATIONS LTD |
183 | | 2 | foo bar | foo bar |
184 | | 3 | B4UTRADE COM CORP | B4UTRADE COM CORP |
214 |
215 | ### Deduplicate a single data set and show items with most duplicates
216 |
217 | The `group_similar_strings` function groups strings that are similar, using a single-linkage clustering algorithm. That is, if item A is similar to item B, and item B is similar to item C, then A, B and C are all grouped together, even if the similarity between A and C is below the threshold (see the illustration below).
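218 | 
219 | A minimal sketch of that transitivity (made-up strings; whether adjacent pairs actually match depends on the n-grams they share and on `min_similarity`):
220 | 
221 | ```python
222 | chain = pd.Series(['alpha beta', 'beta gamma', 'gamma delta'])
223 | # Adjacent strings share a word (and hence n-grams); the two ends share none.
224 | # With a sufficiently low threshold, single linkage chains all three into one group:
225 | group_similar_strings(chain, min_similarity=0.3)
226 | ```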
218 |
219 | ```python
220 | # Add the grouped strings:
221 | companies['deduplicated_name'] = group_similar_strings(companies['Company Name'],
222 | ignore_index=True)
223 | # Show items with most duplicates:
224 | companies.groupby('deduplicated_name')['Line Number'].count().sort_values(ascending=False).head(10)
225 | ```
226 |
227 | 
228 | ```
229 | deduplicated_name
230 | ADVISORS DISCIPLINED TRUST                                      1824
231 | AGL LIFE ASSURANCE CO SEPARATE ACCOUNT                           183
232 | ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-FUNDS, LLC          116
233 | AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 2001-1                   87
234 | ACE SECURITIES CORP. HOME EQUITY LOAN TRUST, SERIES 2006-HE2      57
235 | ASSET-BACKED PASS-THROUGH CERTIFICATES SERIES 2004-W1             40
236 | ALLSTATE LIFE GLOBAL FUNDING TRUST 2005-3                         39
237 | ALLY AUTO RECEIVABLES TRUST 2014-1                                33
238 | ANDERSON ROBERT E /                                               28
239 | ADVENT INTERNATIONAL GPE VIII LIMITED PARTNERSHIP                 28
240 | Name: Line Number, dtype: int64
241 | ```
242 |
243 |
244 | The `group_similar_strings` function also works with IDs: imagine a `DataFrame` (`customers_df`) with the following content:
245 | ```python
246 | # Create a small set of artificial customer names:
247 | customers_df = pd.DataFrame(
248 | [
249 | ('BB016741P', 'Mega Enterprises Corporation'),
250 | ('CC082744L', 'Hyper Startup Incorporated'),
251 | ('AA098762D', 'Hyper Startup Inc.'),
252 | ('BB099931J', 'Hyper-Startup Inc.'),
253 | ('HH072982K', 'Hyper Hyper Inc.')
254 | ],
255 | columns=('Customer ID', 'Customer Name')
256 | ).set_index('Customer ID')
257 | # Display the data:
258 | customers_df
259 | ```
260 | 
261 | | Customer ID | Customer Name |
262 | |-------------|---------------|
263 | | BB016741P | Mega Enterprises Corporation |
264 | | CC082744L | Hyper Startup Incorporated |
265 | | AA098762D | Hyper Startup Inc. |
266 | | BB099931J | Hyper-Startup Inc. |
267 | | HH072982K | Hyper Hyper Inc. |
297 |
298 | The output of `group_similar_strings` can be directly used as a mapping table:
299 | ```python
300 | # Group customers with similar names:
301 | customers_df[["group-id", "name_deduped"]] = \
302 | group_similar_strings(customers_df["Customer Name"])
303 | # Display the mapping table:
304 | customers_df
305 | ```
306 |
307 | 
308 | | Customer ID | Customer Name | group-id | name_deduped |
309 | |-------------|---------------|----------|--------------|
310 | | BB016741P | Mega Enterprises Corporation | BB016741P | Mega Enterprises Corporation |
311 | | CC082744L | Hyper Startup Incorporated | CC082744L | Hyper Startup Incorporated |
312 | | AA098762D | Hyper Startup Inc. | AA098762D | Hyper Startup Inc. |
313 | | BB099931J | Hyper-Startup Inc. | AA098762D | Hyper Startup Inc. |
314 | | HH072982K | Hyper Hyper Inc. | HH072982K | Hyper Hyper Inc. |
357 |
358 | Note that `customers_df` initially had only one column, "Customer Name"; after the `group_similar_strings` call it acquired two more columns, "group-id" (containing index values) and "name_deduped", through a "[setting with enlargement](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#setting-with-enlargement)" (a `pandas` feature).
359 |
360 | ### Simply compute the cosine similarities of pairs of strings
361 |
362 | Sometimes we have pairs of strings that have already been matched but whose similarity scores need to be computed. For this purpose we provide the function `compute_pairwise_similarities`:
363 |
364 | ```python
365 | # Create a small DataFrame of pairs of strings:
366 | pair_s = pd.DataFrame(
367 | [
368 | ('Mega Enterprises Corporation', 'Mega Enterprises Corporation'),
369 | ('Hyper Startup Inc.', 'Hyper Startup Incorporated'),
370 | ('Hyper Startup Inc.', 'Hyper Startup Inc.'),
371 | ('Hyper Startup Inc.', 'Hyper-Startup Inc.'),
372 | ('Hyper Hyper Inc.', 'Hyper Hyper Inc.'),
373 | ('Mega Enterprises Corporation', 'Mega Enterprises Corp.')
374 | ],
375 | columns=('left', 'right')
376 | )
377 | # Display the data:
378 | pair_s
379 | ```
380 |
381 | 
382 | |   | left | right |
383 | |---|------|-------|
384 | | 0 | Mega Enterprises Corporation | Mega Enterprises Corporation |
385 | | 1 | Hyper Startup Inc. | Hyper Startup Incorporated |
386 | | 2 | Hyper Startup Inc. | Hyper Startup Inc. |
387 | | 3 | Hyper Startup Inc. | Hyper-Startup Inc. |
388 | | 4 | Hyper Hyper Inc. | Hyper Hyper Inc. |
389 | | 5 | Mega Enterprises Corporation | Mega Enterprises Corp. |
430 |
431 | ```python
432 | # Compute their cosine similarities and display them:
433 | pair_s['similarity'] = compute_pairwise_similarities(pair_s['left'], pair_s['right'])
434 | pair_s
435 | ```
436 |
437 | 
438 | |   | left | right | similarity |
439 | |---|------|-------|------------|
440 | | 0 | Mega Enterprises Corporation | Mega Enterprises Corporation | 1.000000 |
441 | | 1 | Hyper Startup Inc. | Hyper Startup Incorporated | 0.633620 |
442 | | 2 | Hyper Startup Inc. | Hyper Startup Inc. | 1.000000 |
443 | | 3 | Hyper Startup Inc. | Hyper-Startup Inc. | 1.000000 |
444 | | 4 | Hyper Hyper Inc. | Hyper Hyper Inc. | 1.000000 |
445 | | 5 | Mega Enterprises Corporation | Mega Enterprises Corp. | 0.826463 |
--------------------------------------------------------------------------------
/string_grouper_utils/test/test_string_grouper_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas as pd
3 | from dateutil.parser import parse
4 | from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, \
5 | new_group_rep_by_completeness, new_group_rep_by_highest_weight
6 |
7 |
8 | class SimpleExample(object):
9 | def __init__(self):
10 | self.customers_df = pd.DataFrame(
11 | [
12 | ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2,
13 | '2014-12-30 10:55:00-02:00', 'EE059082Q', 'Mega Enterprises Corp.'),
14 | ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5, '2017-01-01 20:23:15-05:00',
15 | 'BB099931J', 'Hyper-Startup Inc.'),
16 | ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3,
17 | '2020-10-20 15:29:30+02:00', 'BB099931J', 'Hyper-Startup Inc.'),
18 | ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1,
19 | '2013-07-01 03:34:45-05:00', 'BB099931J', 'Hyper-Startup Inc.'),
20 | ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9, '2005-09-11 11:56:00-07:00',
21 | 'HH072982K', 'Hyper Hyper Inc.'),
22 | ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0,
23 | '1998-04-14 09:21:11+00:00', 'EE059082Q', 'Mega Enterprises Corp.')
24 | ],
25 | columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight', 'timestamp',
26 | 'group ID', 'group name')
27 | )
28 | # new_group_rep_by_earliest_timestamp(customers_df, 'group ID', 'Customer ID', 'timestamp')
29 | self.expected_result_TS = pd.Series(
30 | [
31 | 'EE059082Q',
32 | 'BB099931J',
33 | 'BB099931J',
34 | 'BB099931J',
35 | 'HH072982K',
36 | 'EE059082Q',
37 | ]
38 | )
39 | # new_group_rep_by_earliest_timestamp(customers_df, 'group ID', 'Customer ID', 'timestamp', 'Customer Name')
40 | self.expected_result_T = pd.DataFrame(
41 | [
42 | ('EE059082Q', 'Mega Enterprises Corp.'),
43 | ('BB099931J', 'Hyper-Startup Inc.'),
44 | ('BB099931J', 'Hyper-Startup Inc.'),
45 | ('BB099931J', 'Hyper-Startup Inc.'),
46 | ('HH072982K', 'Hyper Hyper Inc.'),
47 | ('EE059082Q', 'Mega Enterprises Corp.')
48 | ]
49 | )
50 | # new_group_rep_by_earliest_timestamp(customers_df, 'group ID', 'Customer ID', 'weight', 'Customer Name')
51 | self.expected_result_TW = pd.DataFrame(
52 | [
53 | ('BB016741P', 'Mega Enterprises Corporation'),
54 | ('BB099931J', 'Hyper-Startup Inc.'),
55 | ('BB099931J', 'Hyper-Startup Inc.'),
56 | ('BB099931J', 'Hyper-Startup Inc.'),
57 | ('HH072982K', 'Hyper Hyper Inc.'),
58 | ('BB016741P', 'Mega Enterprises Corporation')
59 | ]
60 | )
61 | # new_group_rep_by_highest_weight(customers_df, 'group ID', 'Customer ID', 'weight', 'Customer Name')
62 | self.expected_result_W = pd.DataFrame(
63 | [
64 | ('EE059082Q', 'Mega Enterprises Corp.'),
65 | ('CC082744L', 'Hyper Startup Incorporated'),
66 | ('CC082744L', 'Hyper Startup Incorporated'),
67 | ('CC082744L', 'Hyper Startup Incorporated'),
68 | ('HH072982K', 'Hyper Hyper Inc.'),
69 | ('EE059082Q', 'Mega Enterprises Corp.')
70 | ]
71 | )
72 |         # new_group_rep_by_completeness(customers_df, 'group ID', 'Customer ID', 'Customer Name')
73 | self.expected_result_C = pd.DataFrame(
74 | [
75 | ('BB016741P', 'Mega Enterprises Corporation'),
76 | ('AA098762D', 'Hyper Startup Inc.'),
77 | ('AA098762D', 'Hyper Startup Inc.'),
78 | ('AA098762D', 'Hyper Startup Inc.'),
79 | ('HH072982K', 'Hyper Hyper Inc.'),
80 | ('BB016741P', 'Mega Enterprises Corporation')
81 | ]
82 | )
83 |
84 |
85 | class StringGrouperUtilTest(unittest.TestCase):
86 | def test_group_rep_by_timestamp_return_series(self):
87 | """Should return a pd.Series object with the same length as the grouped_data. The series object will contain
88 | a list of groups whose group-representatives have the earliest timestamp of the group"""
89 | simple_example = SimpleExample()
90 | customers_df = simple_example.customers_df
91 | pd.testing.assert_series_equal(
92 | simple_example.expected_result_TS,
93 | new_group_rep_by_earliest_timestamp(
94 | customers_df,
95 | 'group ID',
96 | 'Customer ID',
97 | 'timestamp'
98 | )
99 | )
100 |
101 | def test_group_rep_by_timestamp_return_dataframe(self):
102 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
103 | a list of groups whose group-representatives have the earliest timestamp of the group"""
104 | simple_example = SimpleExample()
105 | customers_df = simple_example.customers_df
106 | pd.testing.assert_frame_equal(
107 | simple_example.expected_result_T,
108 | new_group_rep_by_earliest_timestamp(
109 | customers_df,
110 | 'group ID',
111 | 'Customer ID',
112 | 'timestamp',
113 | 'Customer Name'
114 | )
115 | )
116 |
117 | def test_group_rep_by_timestamp_series_input(self):
118 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
119 | a list of groups whose group-representatives have the earliest timestamp of the group"""
120 | simple_example = SimpleExample()
121 | customers_df = simple_example.customers_df
122 | pd.testing.assert_frame_equal(
123 | simple_example.expected_result_T,
124 | new_group_rep_by_earliest_timestamp(
125 | customers_df,
126 | 'group ID',
127 | 'Customer ID',
128 | customers_df['timestamp'],
129 | 'Customer Name'
130 | )
131 | )
132 |
133 | def test_group_rep_by_timestamp_input_series_length(self):
134 | """Should raise an exception when timestamps series length is not the same as the length of grouped_data"""
135 | simple_example = SimpleExample()
136 | customers_df = simple_example.customers_df
137 | with self.assertRaises(Exception):
138 | _ = new_group_rep_by_earliest_timestamp(
139 | customers_df,
140 | 'group ID',
141 | 'Customer ID',
142 | customers_df['timestamp'].iloc[:-2],
143 | 'Customer Name'
144 | )
145 |
146 | def test_group_rep_by_timestamp_bad_input_timestamp_strings(self):
147 | """Should raise an exception when timestamps series of strings is not datetime-like"""
148 | simple_example = SimpleExample()
149 | customers_df = simple_example.customers_df
150 | with self.assertRaises(Exception):
151 | _ = new_group_rep_by_earliest_timestamp(
152 | customers_df,
153 | 'group ID',
154 | 'Customer ID',
155 | customers_df['Customer ID'],
156 | 'Customer Name'
157 | )
158 |
159 | def test_group_rep_by_timestamp_pandas_timestamps(self):
160 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
161 | a list of groups whose group-representatives have the earliest timestamp of the group"""
162 | simple_example = SimpleExample()
163 | customers_df = simple_example.customers_df
164 | customers_df2 = customers_df.copy()
165 | customers_df2['timestamp'] = customers_df2['timestamp'].transform(lambda t: pd.Timestamp(t))
166 | pd.testing.assert_frame_equal(
167 | simple_example.expected_result_T,
168 | new_group_rep_by_earliest_timestamp(
169 | customers_df2,
170 | 'group ID',
171 | 'Customer ID',
172 | customers_df2['timestamp'],
173 | 'Customer Name'
174 | )
175 | )
176 |
177 | def test_group_rep_by_timestamp_dateutil_timestamps(self):
178 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
179 | a list of groups whose group-representatives have the earliest timestamp of the group"""
180 | simple_example = SimpleExample()
181 | customers_df = simple_example.customers_df
182 | customers_df2 = customers_df.copy()
183 | customers_df2['timestamp'] = customers_df2['timestamp'].transform(lambda t: parse(t))
184 | pd.testing.assert_frame_equal(
185 | simple_example.expected_result_T,
186 | new_group_rep_by_earliest_timestamp(
187 | customers_df2,
188 | 'group ID',
189 | 'Customer ID',
190 | customers_df2['timestamp'],
191 | 'Customer Name'
192 | )
193 | )
194 |
195 | def test_group_rep_by_timestamp_bad_nonstring_timestamps(self):
196 | """Should raise an exception when not all provided timestamps are datetime-like or number-like"""
197 | simple_example = SimpleExample()
198 | customers_df = simple_example.customers_df
199 | customers_df2 = customers_df.copy()
200 | customers_df2.at[0, 'timestamp'] = 1.0
201 | with self.assertRaises(Exception):
202 | _ = new_group_rep_by_earliest_timestamp(
203 | customers_df2,
204 | 'group ID',
205 | 'Customer ID',
206 | customers_df2['timestamp'],
207 | 'Customer Name'
208 | )
209 |
210 | def test_group_rep_by_timestamp_input_numbers(self):
211 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
212 | a list of groups whose group-representatives have the earliest timestamp of the group"""
213 | simple_example = SimpleExample()
214 | customers_df = simple_example.customers_df
215 | pd.testing.assert_frame_equal(
216 | simple_example.expected_result_TW,
217 | new_group_rep_by_earliest_timestamp(
218 | customers_df,
219 | 'group ID',
220 | 'Customer ID',
221 | customers_df['weight'],
222 | 'Customer Name'
223 | )
224 | )
225 |
226 | def test_group_rep_by_weight(self):
227 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
228 | a list of groups whose group-representatives have the highest weight of the group"""
229 | simple_example = SimpleExample()
230 | customers_df = simple_example.customers_df
231 | pd.testing.assert_frame_equal(
232 | simple_example.expected_result_W,
233 | new_group_rep_by_highest_weight(
234 | customers_df,
235 | 'group ID',
236 | 'Customer ID',
237 | 'weight',
238 | 'Customer Name'
239 | )
240 | )
241 |
242 | def test_group_rep_by_weight_input_series(self):
243 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
244 | a list of groups whose group-representatives have the highest weight of the group"""
245 | simple_example = SimpleExample()
246 | customers_df = simple_example.customers_df
247 | pd.testing.assert_frame_equal(
248 | simple_example.expected_result_W,
249 | new_group_rep_by_highest_weight(
250 | customers_df,
251 | 'group ID',
252 | 'Customer ID',
253 | customers_df['weight'],
254 | 'Customer Name'
255 | )
256 | )
257 |
258 | def test_group_rep_by_weight_input_series_length(self):
259 | """Should raise an exception when weights series length is not the same as the length of grouped_data"""
260 | simple_example = SimpleExample()
261 | customers_df = simple_example.customers_df
262 | with self.assertRaises(Exception):
263 | _ = new_group_rep_by_highest_weight(
264 | customers_df,
265 | 'group ID',
266 | 'Customer ID',
267 | customers_df['weight'].iloc[:-2],
268 | 'Customer Name'
269 | )
270 |
271 | def test_group_rep_by_completeness_column_list(self):
272 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
273 | a list of groups whose group-representatives have the most filled-in records of the group"""
274 | simple_example = SimpleExample()
275 | customers_df = simple_example.customers_df
276 | pd.testing.assert_frame_equal(
277 | simple_example.expected_result_C,
278 | new_group_rep_by_completeness(
279 | customers_df,
280 | 'group ID',
281 | 'Customer ID',
282 | 'Customer Name',
283 | [1, 2, 3, 4]
284 | )
285 | )
286 |
287 | def test_group_rep_by_completeness_no_columns(self):
288 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
289 | a list of groups whose group-representatives have the most filled-in records of the group"""
290 | simple_example = SimpleExample()
291 | customers_df = simple_example.customers_df
292 | pd.testing.assert_frame_equal(
293 | simple_example.expected_result_C,
294 | new_group_rep_by_completeness(
295 | customers_df,
296 | 'group ID',
297 | 'Customer ID',
298 | 'Customer Name'
299 | )
300 | )
301 |
302 | def test_group_rep_by_completeness_input_dataframe(self):
303 | """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain
304 | a list of groups whose group-representatives have the most filled-in records of the group"""
305 | simple_example = SimpleExample()
306 | customers_df = simple_example.customers_df
307 | pd.testing.assert_frame_equal(
308 | simple_example.expected_result_C,
309 | new_group_rep_by_completeness(
310 | customers_df,
311 | 'group ID',
312 | 'Customer ID',
313 | 'Customer Name',
314 | customers_df
315 | )
316 | )
317 |
318 | def test_group_rep_by_completeness_input_dataframe_length(self):
319 | """Should raise an exception when tested_cols length is not the same as the length of grouped_data"""
320 | simple_example = SimpleExample()
321 | customers_df = simple_example.customers_df
322 | with self.assertRaises(Exception):
323 | _ = new_group_rep_by_completeness(
324 | customers_df,
325 | 'group ID',
326 | 'Customer ID',
327 | 'Customer Name',
328 | customers_df.iloc[:-2, :]
329 | )
330 |
331 |
332 | if __name__ == '__main__':
333 | unittest.main()
334 |
--------------------------------------------------------------------------------
/tutorials/zero_similarity.md:
--------------------------------------------------------------------------------
1 | # When min_similarity ≤ 0 and include_zeroes = [True | False]
2 |
3 |
4 | ```python
5 | import pandas as pd
6 | import numpy as np
7 | from string_grouper import StringGrouper
8 | ```
9 |
10 |
11 | ```python
12 | companies_df = pd.read_csv('data/sec_edgar_company_info.csv')[0:50000]
13 | ```
14 |
15 |
16 | ```python
17 | master = companies_df['Company Name']
18 | master_id = companies_df['Line Number']
19 | duplicates = pd.Series(["ADVISORS DISCIPLINED TRUST", "ADVISORS DISCIPLINED TRUST '18"])
20 | duplicates_id = pd.Series([3, 5])
21 | ```
22 |
23 | #### When IDs are passed as arguments:
24 | By default, zero-similarity matches are found and output when `min_similarity = 0`:
25 |
26 |
27 | ```python
28 | string_grouper = StringGrouper(
29 | master = master,
30 | duplicates=duplicates,
31 | master_id=master_id,
32 | duplicates_id=duplicates_id,
33 | ignore_index=True,
34 | min_similarity = 0,
35 | max_n_matches = 10000,
36 | regex = "[,-./#]"
37 | ).fit()
38 | string_grouper.get_matches()
39 | ```
40 |
41 | 
42 | |   | left_Company Name | left_Line Number | similarity | right_id | right_side |
43 | |---|-------------------|------------------|------------|----------|------------|
44 | | 0 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.091157 | 3 | ADVISORS DISCIPLINED TRUST |
45 | | 1 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.063861 | 5 | ADVISORS DISCIPLINED TRUST '18 |
46 | | 2 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.015313 | 3 | ADVISORS DISCIPLINED TRUST |
47 | | 3 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.010728 | 5 | ADVISORS DISCIPLINED TRUST '18 |
48 | | 4 | 05 DIXIE UNION/UNDER FIRE LLC | 22 | 0.025397 | 3 | ADVISORS DISCIPLINED TRUST |
49 | | ... | ... | ... | ... | ... | ... |
50 | | 99995 | ALLDREDGE WILLIAM T | 21746 | 0.000000 | 3 | ADVISORS DISCIPLINED TRUST |
51 | | 99996 | ALLEN SAMUEL R | 22183 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
52 | | 99997 | ATSP INNOVATIONS, LLC | 45273 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
53 | | 99998 | ATLAS IDF, LP | 44877 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
54 | | 99999 | AU LEO Y | 45535 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
55 | 
56 | *100000 rows × 5 columns*
57 | 
152 | #### `StringGrouper` also includes option `include_zeroes`:
153 |
154 |
155 | ```python
156 | string_grouper = StringGrouper(
157 | master = master,
158 | duplicates=duplicates,
159 | master_id=master_id,
160 | duplicates_id=duplicates_id,
161 | ignore_index=True,
162 | min_similarity = 0,
163 | max_n_matches = 10000,
164 | regex = "[,-./#]",
165 | include_zeroes = False
166 | ).fit()
167 | string_grouper.get_matches()
168 | ```
169 |
170 | 
171 | |   | left_Company Name | left_Line Number | similarity | right_id | right_side |
172 | |---|-------------------|------------------|------------|----------|------------|
173 | | 0 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.091157 | 3 | ADVISORS DISCIPLINED TRUST |
174 | | 1 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.063861 | 5 | ADVISORS DISCIPLINED TRUST '18 |
175 | | 2 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.015313 | 3 | ADVISORS DISCIPLINED TRUST |
176 | | 3 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.010728 | 5 | ADVISORS DISCIPLINED TRUST '18 |
177 | | 4 | 05 DIXIE UNION/UNDER FIRE LLC | 22 | 0.025397 | 3 | ADVISORS DISCIPLINED TRUST |
178 | | ... | ... | ... | ... | ... | ... |
179 | | 28754 | BAAPLIFE3-2015, LLC | 49976 | 0.021830 | 5 | ADVISORS DISCIPLINED TRUST '18 |
180 | | 28755 | BAAPLIFE4-2016, LLC | 49977 | 0.030983 | 3 | ADVISORS DISCIPLINED TRUST |
181 | | 28756 | BAAPLIFE4-2016, LLC | 49977 | 0.021706 | 5 | ADVISORS DISCIPLINED TRUST '18 |
182 | | 28757 | BABA JOE DIAMOND VENTURES US INC. | 49989 | 0.027064 | 3 | ADVISORS DISCIPLINED TRUST |
183 | | 28758 | BABA JOE DIAMOND VENTURES US INC. | 49989 | 0.018960 | 5 | ADVISORS DISCIPLINED TRUST '18 |
184 | 
185 | *28759 rows × 5 columns*
186 | 
281 | #### `get_matches` option `include_zeroes` can override `StringGrouper` default:
282 |
283 |
284 | ```python
285 | string_grouper.get_matches(include_zeroes=True)
286 | ```
287 | 
288 | |   | left_Company Name | left_Line Number | similarity | right_id | right_side |
289 | |---|-------------------|------------------|------------|----------|------------|
290 | | 0 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.091157 | 3 | ADVISORS DISCIPLINED TRUST |
291 | | 1 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.063861 | 5 | ADVISORS DISCIPLINED TRUST '18 |
292 | | 2 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.015313 | 3 | ADVISORS DISCIPLINED TRUST |
293 | | 3 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.010728 | 5 | ADVISORS DISCIPLINED TRUST '18 |
294 | | 4 | 05 DIXIE UNION/UNDER FIRE LLC | 22 | 0.025397 | 3 | ADVISORS DISCIPLINED TRUST |
295 | | ... | ... | ... | ... | ... | ... |
296 | | 99995 | ALLDREDGE WILLIAM T | 21746 | 0.000000 | 3 | ADVISORS DISCIPLINED TRUST |
297 | | 99996 | ALLEN SAMUEL R | 22183 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
298 | | 99997 | ATSP INNOVATIONS, LLC | 45273 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
299 | | 99998 | ATLAS IDF, LP | 44877 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
300 | | 99999 | AU LEO Y | 45535 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
301 | 
302 | *100000 rows × 5 columns*
303 | 
399 | #### When no IDs are passed as arguments and indexes are not set:
400 | Default indexes are output:
401 |
402 |
403 | ```python
404 | string_grouper = StringGrouper(
405 | master = master,
406 | duplicates=duplicates,
407 | min_similarity = 0,
408 | max_n_matches = 10000,
409 | regex = "[,-./#]"
410 | ).fit()
411 | string_grouper.get_matches()
412 | ```
413 |
414 | 
415 | |   | left_index | left_Company Name | similarity | right_side | right_index |
416 | |---|------------|-------------------|------------|------------|-------------|
417 | | 0 | 2 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.091157 | ADVISORS DISCIPLINED TRUST | 0 |
418 | | 1 | 2 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.063861 | ADVISORS DISCIPLINED TRUST '18 | 1 |
419 | | 2 | 20 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.015313 | ADVISORS DISCIPLINED TRUST | 0 |
420 | | 3 | 20 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.010728 | ADVISORS DISCIPLINED TRUST '18 | 1 |
421 | | 4 | 21 | 05 DIXIE UNION/UNDER FIRE LLC | 0.025397 | ADVISORS DISCIPLINED TRUST | 0 |
422 | | ... | ... | ... | ... | ... | ... |
423 | | 99995 | 21745 | ALLDREDGE WILLIAM T | 0.000000 | ADVISORS DISCIPLINED TRUST | 0 |
424 | | 99996 | 22182 | ALLEN SAMUEL R | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
425 | | 99997 | 45272 | ATSP INNOVATIONS, LLC | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
426 | | 99998 | 44876 | ATLAS IDF, LP | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
427 | | 99999 | 45534 | AU LEO Y | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
428 | 
429 | *100000 rows × 5 columns*
430 | 
525 | #### When no IDs are passed as arguments but indexes are set:
526 | Indexes are output:
527 |
528 |
529 | ```python
530 | master.index = pd.Index(master_id)
531 | duplicates.index = pd.Index(duplicates_id)
532 | string_grouper = StringGrouper(
533 | master = master,
534 | duplicates=duplicates,
535 | min_similarity = 0,
536 | max_n_matches = 10000,
537 | regex = "[,-./#]"
538 | ).fit()
539 | string_grouper.get_matches()
540 | ```
541 |
542 | 
543 | |   | left_Line Number | left_Company Name | similarity | right_side | right_index |
544 | |---|------------------|-------------------|------------|------------|-------------|
545 | | 0 | 3 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.091157 | ADVISORS DISCIPLINED TRUST | 3 |
546 | | 1 | 3 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.063861 | ADVISORS DISCIPLINED TRUST '18 | 5 |
547 | | 2 | 21 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.015313 | ADVISORS DISCIPLINED TRUST | 3 |
548 | | 3 | 21 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.010728 | ADVISORS DISCIPLINED TRUST '18 | 5 |
549 | | 4 | 22 | 05 DIXIE UNION/UNDER FIRE LLC | 0.025397 | ADVISORS DISCIPLINED TRUST | 3 |
550 | | ... | ... | ... | ... | ... | ... |
551 | | 99995 | 21746 | ALLDREDGE WILLIAM T | 0.000000 | ADVISORS DISCIPLINED TRUST | 3 |
552 | | 99996 | 22183 | ALLEN SAMUEL R | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
553 | | 99997 | 45273 | ATSP INNOVATIONS, LLC | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
554 | | 99998 | 44877 | ATLAS IDF, LP | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
555 | | 99999 | 45535 | AU LEO Y | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
556 | 
557 | *100000 rows × 5 columns*
558 | 
--------------------------------------------------------------------------------
/tutorials/group_representatives.md:
--------------------------------------------------------------------------------
1 | # Group Representatives
2 | ------
3 |
4 |
5 | ```python
6 | import pandas as pd
7 | from string_grouper import group_similar_strings
8 | ```
9 |
10 | We have already seen that string_grouper has a function group_similar_strings() that partitions a Series of strings into groups based on their degree of mutual similarity. To represent each group, group_similar_strings() chooses one member of the group. The default choice is the so-called ***centroid*** of the group.
11 |
12 | The **centroid** of a group of similar strings is that string in the group which has the highest ***similarity aggregate***.
13 |
14 | The **similarity aggregate** of a string is the sum of all the cosine similarities between it and the strings that it matches.
15 |
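16 | As a small worked sketch with made-up similarity values, suppose a group contains strings `s1`, `s2` and `s3`:
17 | 
18 | ```python
19 | # Made-up pairwise cosine similarities within one group:
20 | sims = {('s1', 's2'): 0.9, ('s1', 's3'): 0.8, ('s2', 's3'): 0.7}
21 | 
22 | # The similarity aggregate of each member is the sum of its
23 | # similarities to the other members it matches:
24 | aggregate = {
25 |     s: sum(v for pair, v in sims.items() if s in pair)
26 |     for s in ('s1', 's2', 's3')
27 | }
28 | # aggregate == {'s1': 1.7, 's2': 1.6, 's3': 1.5}, so 's1' is the centroid:
29 | max(aggregate, key=aggregate.get)
30 | ```
31 | 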
16 | This choice can also be specified by setting the following keyword argument of group_similar_strings:
17 | `group_rep='centroid'`.
18 |
19 | group_similar_strings() has an alternative choice of group representative which is specified by setting `group_rep='first'`. This choice is merely the first member of the group according to its index (that is, its position in the order of appearance of members in the group). Though somewhat arbitrary, this choice is the fastest and can be used for large datasets whenever the choice of group-representative is not important.
20 |
21 | | Available choices for `group_rep` |
22 | |:---:|
23 | |`group_rep='first'`|
24 | |**`group_rep='centroid'`** (default)|
24 |
25 | But the user may not be satisfied with the only two choices that group_similar_strings() makes available. For example, they might prefer the earliest recorded string in the group to represent the group (if timestamp metadata is available). Fortunately, three other choices are available in an auxiliary module named `string_grouper_utils`, which is included in the package and can be imported whenever necessary:
26 |
27 |
28 | ```python
29 | from string_grouper_utils import new_group_rep_by_highest_weight, \
30 | new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness
31 | ```
32 |
33 | string_grouper_utils provides three high-level functions `new_group_rep_by_highest_weight()`, `new_group_rep_by_earliest_timestamp()`, and `new_group_rep_by_completeness()`. These functions change the group-representatives of data that have already been grouped (by group_similar_strings(), for example).
34 |
35 | Let us create a DataFrame with some artificial timestamped records:
36 |
37 |
38 | ```python
39 | customers_df = pd.DataFrame(
40 | [
41 | ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2, '2014-12-30 10:55:00-02:00'),
42 | ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5, '2017-01-01 20:23:15-05:00'),
43 | ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3, '2020-10-20 15:29:30+02:00'),
44 | ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1, '2013-07-01 03:34:45-05:00'),
45 | ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9, '2005-09-11 11:56:00-07:00'),
46 | ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0, '1998-04-14 09:21:11+00:00')
47 | ],
48 | columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight', 'timestamp')
49 | ).set_index('Customer ID')
50 | ```
51 |
52 | **NB.** These 'timestamps' are not actual `pandas Timestamp` datatypes --- they are strings. If we like, we could convert them to `pandas Timestamp` or `datetime` datatypes (from the Python module `datetime`), but this is not necessary: string_grouper_utils can deal with these strings just as they are and automatically _parses_ them into (localized) `pandas Timestamp` datatypes internally for comparison, as we shall soon see.
53 |
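If we did want an explicit conversion, a one-liner such as the following sketch would do (the new column name `parsed timestamp` is purely illustrative; `utc=True` lets pandas reconcile the mixed UTC offsets in this column):

```python
# Optional -- string_grouper_utils performs equivalent parsing internally:
customers_df['parsed timestamp'] = pd.to_datetime(customers_df['timestamp'], utc=True)
```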
54 | Also, in this example we have used the most general form of timestamp: each string has a date together with time-of-day and timezone information. This is not always necessary; if desired, each string may contain only date information, for example.
55 |
56 | Let us display the DataFrame:
57 |
58 |
59 | ```python
60 | customers_df
61 | ```
62 |
63 |
64 | | Customer ID | Customer Name | Address | Tel | Description | weight | timestamp |
65 | | --- | --- | --- | --- | --- | --- | --- |
66 | | BB016741P | Mega Enterprises Corporation | Address0 | Tel0 | Description0 | 0.2 | 2014-12-30 10:55:00-02:00 |
67 | | CC082744L | Hyper Startup Incorporated |  | Tel1 |  | 0.5 | 2017-01-01 20:23:15-05:00 |
68 | | AA098762D | Hyper Startup Inc. | Address2 | Tel2 | Description2 | 0.3 | 2020-10-20 15:29:30+02:00 |
69 | | BB099931J | Hyper-Startup Inc. | Address3 | Tel3 | Description3 | 0.1 | 2013-07-01 03:34:45-05:00 |
70 | | HH072982K | Hyper Hyper Inc. | Address4 |  | Description4 | 0.9 | 2005-09-11 11:56:00-07:00 |
71 | | EE059082Q | Mega Enterprises Corp. | Address5 | Tel5 | Description5 | 1.0 | 1998-04-14 09:21:11+00:00 |
72 |
149 | ## group_similar_strings()
150 |
151 | With the following command, we can create a mapping table with the groupings that group_similar_strings() finds. Here the keyword argument `group_rep` is not explicitly set. It therefore takes on the default value `'centroid'`.
152 |
153 |
154 | ```python
155 | customers_df[['group rep ID', 'group rep']] = \
156 | group_similar_strings(
157 | customers_df['Customer Name'],
158 | min_similarity=0.6)
159 | ```
160 |
161 | Let's display the mapping table:
162 |
163 |
164 | ```python
165 | customers_df
166 | ```
167 |
168 | | Customer ID | Customer Name | Address | Tel | Description | weight | timestamp | group rep ID | group rep |
169 | | --- | --- | --- | --- | --- | --- | --- | --- | --- |
170 | | BB016741P | Mega Enterprises Corporation | Address0 | Tel0 | Description0 | 0.2 | 2014-12-30 10:55:00-02:00 | BB016741P | Mega Enterprises Corporation |
171 | | CC082744L | Hyper Startup Incorporated |  | Tel1 |  | 0.5 | 2017-01-01 20:23:15-05:00 | AA098762D | Hyper Startup Inc. |
172 | | AA098762D | Hyper Startup Inc. | Address2 | Tel2 | Description2 | 0.3 | 2020-10-20 15:29:30+02:00 | AA098762D | Hyper Startup Inc. |
173 | | BB099931J | Hyper-Startup Inc. | Address3 | Tel3 | Description3 | 0.1 | 2013-07-01 03:34:45-05:00 | AA098762D | Hyper Startup Inc. |
174 | | HH072982K | Hyper Hyper Inc. | Address4 |  | Description4 | 0.9 | 2005-09-11 11:56:00-07:00 | HH072982K | Hyper Hyper Inc. |
175 | | EE059082Q | Mega Enterprises Corp. | Address5 | Tel5 | Description5 | 1.0 | 1998-04-14 09:21:11+00:00 | BB016741P | Mega Enterprises Corporation |
176 |
270 | Let's try this again, this time with `group_rep='first'`:
271 |
272 |
273 | ```python
274 | customers_df[['group rep ID', 'group rep']] = \
275 | group_similar_strings(
276 | customers_df['Customer Name'],
277 | group_rep='first',
278 | min_similarity=0.6)
279 | ```
280 |
281 | Displaying the new mapping table shows the differences from the result above:
282 |
283 |
284 | ```python
285 | customers_df
286 | ```
287 |
288 | | Customer ID | Customer Name | Address | Tel | Description | weight | timestamp | group rep ID | group rep |
289 | | --- | --- | --- | --- | --- | --- | --- | --- | --- |
290 | | BB016741P | Mega Enterprises Corporation | Address0 | Tel0 | Description0 | 0.2 | 2014-12-30 10:55:00-02:00 | BB016741P | Mega Enterprises Corporation |
291 | | CC082744L | Hyper Startup Incorporated |  | Tel1 |  | 0.5 | 2017-01-01 20:23:15-05:00 | CC082744L | Hyper Startup Incorporated |
292 | | AA098762D | Hyper Startup Inc. | Address2 | Tel2 | Description2 | 0.3 | 2020-10-20 15:29:30+02:00 | CC082744L | Hyper Startup Incorporated |
293 | | BB099931J | Hyper-Startup Inc. | Address3 | Tel3 | Description3 | 0.1 | 2013-07-01 03:34:45-05:00 | CC082744L | Hyper Startup Incorporated |
294 | | HH072982K | Hyper Hyper Inc. | Address4 |  | Description4 | 0.9 | 2005-09-11 11:56:00-07:00 | HH072982K | Hyper Hyper Inc. |
295 | | EE059082Q | Mega Enterprises Corp. | Address5 | Tel5 | Description5 | 1.0 | 1998-04-14 09:21:11+00:00 | BB016741P | Mega Enterprises Corporation |
296 |
390 | Note that these are the same groups as before! Only the group representatives (the group names) have changed.
391 |
392 | ## new_group_rep_by_earliest_timestamp()
393 |
394 | As mentioned above, there are still more choices of group-representatives available. Let's use the `new_group_rep_by_earliest_timestamp()` function:
395 |
396 |
397 | ```python
398 | customers_df.reset_index(inplace=True)
399 | customers_df[['group rep ID', 'group rep']] = \
400 | new_group_rep_by_earliest_timestamp(
401 | grouped_data=customers_df,
402 | group_col='group rep ID',
403 | record_id_col='Customer ID',
404 | record_name_col='Customer Name',
405 | timestamps='timestamp',
406 | dayfirst=False
407 | )
408 | ```
409 |
410 | Notice that this time ***the function operates on already grouped data*** (such as the mapping table output by group_similar_strings() above). Thus ***the column of the input grouped data containing the groups*** (here either 'group rep ID' or 'group rep') ***must be specified as the argument group_col***, in addition to ***the column containing the group members*** (here either 'Customer ID' or 'Customer Name') ***as the argument record_id_col***.
411 |
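For instance, the same call could equivalently identify the groups and their members by name rather than by ID (a sketch; the variable name `new_reps` is arbitrary):

```python
new_reps = new_group_rep_by_earliest_timestamp(
    grouped_data=customers_df,
    group_col='group rep',          # the groups, identified by name
    record_id_col='Customer Name',  # the group members, identified by name
    timestamps='timestamp'
)
```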
412 | The argument record_name_col is optional; if specified, it will appear in the output alongside the new group-representatives chosen from record_id_col.
413 |
414 | The keyword argument `dayfirst` used here is the same one accepted by the parser.parse() function of the Python module dateutil. It specifies whether to interpret the first value in an ambiguous 3-integer date (e.g. 01/05/09) as the day (`True`) or the month (`False`). If the keyword argument `yearfirst` is set to `True`, `dayfirst` distinguishes between YDM and YMD.
415 |
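For example, calling dateutil's parser directly (the same parser to which new_group_rep_by_earliest_timestamp() forwards these arguments):

```python
from dateutil.parser import parse

parse('01/05/09', dayfirst=True)   # -> datetime.datetime(2009, 5, 1, 0, 0)
parse('01/05/09', dayfirst=False)  # -> datetime.datetime(2009, 1, 5, 0, 0)
```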
416 | The other possible keyword arguments that can be used are detailed in the docstring (help) of new_group_rep_by_earliest_timestamp():
417 |
418 |
419 | ```python
420 | help(new_group_rep_by_earliest_timestamp)
421 | ```
422 |
423 | Help on function new_group_rep_by_earliest_timestamp in module string_grouper_utils.string_grouper_utils:
424 |
425 | new_group_rep_by_earliest_timestamp(grouped_data: pandas.core.frame.DataFrame, group_col: Union[str, int], record_id_col: Union[str, int], timestamps: Union[pandas.core.series.Series, str, int], record_name_col: Union[str, int, NoneType] = None, parserinfo=None, **kwargs) -> Union[pandas.core.frame.DataFrame, pandas.core.series.Series]
426 | Selects the oldest string in each group as group-representative.
427 | :param grouped_data: The grouped DataFrame
428 | :param group_col: The name or positional index of the column in grouped_data containing the groups
429 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs
430 | (This will appear in the output)
431 | :param timestamps: pandas.Series or the column name (str) or column positional index (int) in grouped_data
432 | This contains the timestamps of the strings to be grouped.
433 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with
434 | all groups' members' names. (This will appear in the output.)
435 | :param parserinfo: (See below.)
436 | :param **kwargs: (See below.)
437 | parserinfo and kwargs are the same arguments as those you would pass to dateutil.parser.parse. They help in
438 | interpreting the string inputs which are to be parsed into datetime datatypes.
439 |
440 | FYI, the dateutil.parser.parse documentation for these arguments follows:
441 | :param parserinfo:
442 | A :class:`parserinfo` object containing parameters for the parser.
443 | If ``None``, the default arguments to the :class:`parserinfo`
444 | constructor are used.
445 |
446 | The ``**kwargs`` parameter takes the following keyword arguments:
447 |
448 | :param default:
449 | The default datetime object, if this is a datetime object and not
450 | ``None``, elements specified in the strings containing the date/time-stamps replace elements in the
451 | default object.
452 |
453 | :param ignoretz:
454 | If set ``True``, time zones in parsed strings are ignored and a naive
455 | :class:`datetime` object is returned.
456 |
457 | :param tzinfos:
458 | Additional time zone names / aliases which may be present in the
459 | string. This argument maps time zone names (and optionally offsets
460 | from those time zones) to time zones. This parameter can be a
461 | dictionary with timezone aliases mapping time zone names to time
462 | zones or a function taking two parameters (``tzname`` and
463 | ``tzoffset``) and returning a time zone.
464 |
465 | The timezones to which the names are mapped can be an integer
466 | offset from UTC in seconds or a :class:`tzinfo` object.
467 |
468 | .. doctest::
469 | :options: +NORMALIZE_WHITESPACE
470 |
471 | >>> from dateutil.parser import parse
472 | >>> from dateutil.tz import gettz
473 | >>> tzinfos = {"BRST": -7200, "CST": gettz("America/Chicago")}
474 | >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
475 | datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -7200))
476 | >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
477 | datetime.datetime(2012, 1, 19, 17, 21,
478 | tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))
479 |
480 | This parameter is ignored if ``ignoretz`` is set.
481 |
482 | :param dayfirst:
483 | Whether to interpret the first value in an ambiguous 3-integer date
484 | (e.g. 01/05/09) as the day (``True``) or month (``False``). If
485 | ``yearfirst`` is set to ``True``, this distinguishes between YDM and
486 | YMD. If set to ``None``, this value is retrieved from the current
487 | :class:`parserinfo` object (which itself defaults to ``False``).
488 |
489 | :param yearfirst:
490 | Whether to interpret the first value in an ambiguous 3-integer date
491 | (e.g. 01/05/09) as the year. If ``True``, the first number is taken to
492 | be the year, otherwise the last number is taken to be the year. If
493 | this is set to ``None``, the value is retrieved from the current
494 | :class:`parserinfo` object (which itself defaults to ``False``).
495 |
496 | :param fuzzy:
497 | Whether to allow fuzzy parsing, allowing for string like "Today is
498 | January 1, 2047 at 8:21:00AM".
499 |
500 | :param fuzzy_with_tokens:
501 | If ``True``, ``fuzzy`` is automatically set to True, and the parser
502 | will return a tuple where the first element is the parsed
503 | :class:`datetime.datetime` datetimestamp and the second element is
504 | a tuple containing the portions of the string which were ignored:
505 |
506 | .. doctest::
507 |
508 | >>> from dateutil.parser import parse
509 | >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
510 | (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))
511 |
512 |
513 |
514 |
515 | ```python
516 | customers_df
517 | ```
518 |
519 | |   | Customer ID | Customer Name | Address | Tel | Description | weight | timestamp | group rep ID | group rep |
520 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
521 | | 0 | BB016741P | Mega Enterprises Corporation | Address0 | Tel0 | Description0 | 0.2 | 2014-12-30 10:55:00-02:00 | EE059082Q | Mega Enterprises Corp. |
522 | | 1 | CC082744L | Hyper Startup Incorporated |  | Tel1 |  | 0.5 | 2017-01-01 20:23:15-05:00 | BB099931J | Hyper-Startup Inc. |
523 | | 2 | AA098762D | Hyper Startup Inc. | Address2 | Tel2 | Description2 | 0.3 | 2020-10-20 15:29:30+02:00 | BB099931J | Hyper-Startup Inc. |
524 | | 3 | BB099931J | Hyper-Startup Inc. | Address3 | Tel3 | Description3 | 0.1 | 2013-07-01 03:34:45-05:00 | BB099931J | Hyper-Startup Inc. |
525 | | 4 | HH072982K | Hyper Hyper Inc. | Address4 |  | Description4 | 0.9 | 2005-09-11 11:56:00-07:00 | HH072982K | Hyper Hyper Inc. |
526 | | 5 | EE059082Q | Mega Enterprises Corp. | Address5 | Tel5 | Description5 | 1.0 | 1998-04-14 09:21:11+00:00 | EE059082Q | Mega Enterprises Corp. |
527 |
617 | Here the group-member with the earliest timestamp has been chosen as group-representative for each group. Notice that even though the timestamps were input as strings, the function is able to treat them as if they were datetime (or pandas Timestamp) datatypes.
618 |
619 | ## new_group_rep_by_highest_weight() and new_group_rep_by_completeness()
620 |
621 | The other two utility functions `new_group_rep_by_highest_weight()` and `new_group_rep_by_completeness()` operate in a similar way to new_group_rep_by_earliest_timestamp():
622 |
623 | 1. new_group_rep_by_highest_weight() chooses the group-member with the highest weight as group-representative for each group. The weight of each member is assigned as desired by the user and provided as an argument to the function; the weights can also be a specified column of the input grouped data (mapping table), as in the call below.
624 |
625 | 2. new_group_rep_by_completeness() chooses the group member with the most filled-in fields in its row as group-representative for each group.
626 |
627 |
628 | ```python
629 | customers_df[['group rep ID', 'group rep']] = \
630 | new_group_rep_by_highest_weight(
631 | grouped_data=customers_df,
632 | group_col='group rep ID',
633 | record_id_col='Customer ID',
634 | weights='weight',
635 | record_name_col='Customer Name'
636 | )
637 | ```
638 |
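The weights need not be stored in grouped_data beforehand. The following sketch assumes (by analogy with the timestamps argument of new_group_rep_by_earliest_timestamp()) that a pandas Series aligned with grouped_data is also accepted; note that scaling all weights by the same factor leaves the per-group ranking, and hence the chosen representatives, unchanged:

```python
# Assumption: a pandas Series is accepted for `weights` (as for `timestamps`).
external_weights = 2 * customers_df['weight']  # any user-defined weighting
reps_by_external_weight = new_group_rep_by_highest_weight(
    grouped_data=customers_df,
    group_col='group rep ID',
    record_id_col='Customer ID',
    weights=external_weights,
    record_name_col='Customer Name'
)
```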
639 |
640 | ```python
641 | customers_df
642 | ```
643 |
644 | |   | Customer ID | Customer Name | Address | Tel | Description | weight | timestamp | group rep ID | group rep |
645 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
646 | | 0 | BB016741P | Mega Enterprises Corporation | Address0 | Tel0 | Description0 | 0.2 | 2014-12-30 10:55:00-02:00 | EE059082Q | Mega Enterprises Corp. |
647 | | 1 | CC082744L | Hyper Startup Incorporated |  | Tel1 |  | 0.5 | 2017-01-01 20:23:15-05:00 | CC082744L | Hyper Startup Incorporated |
648 | | 2 | AA098762D | Hyper Startup Inc. | Address2 | Tel2 | Description2 | 0.3 | 2020-10-20 15:29:30+02:00 | CC082744L | Hyper Startup Incorporated |
649 | | 3 | BB099931J | Hyper-Startup Inc. | Address3 | Tel3 | Description3 | 0.1 | 2013-07-01 03:34:45-05:00 | CC082744L | Hyper Startup Incorporated |
650 | | 4 | HH072982K | Hyper Hyper Inc. | Address4 |  | Description4 | 0.9 | 2005-09-11 11:56:00-07:00 | HH072982K | Hyper Hyper Inc. |
651 | | 5 | EE059082Q | Mega Enterprises Corp. | Address5 | Tel5 | Description5 | 1.0 | 1998-04-14 09:21:11+00:00 | EE059082Q | Mega Enterprises Corp. |
741 |
742 |
743 | ```python
744 | customers_df[['group rep ID', 'group rep']] = \
745 | new_group_rep_by_completeness(
746 | grouped_data=customers_df,
747 | group_col='group rep ID',
748 | record_id_col='Customer ID',
749 | record_name_col='Customer Name',
750 | tested_cols=['Address', 'Tel', 'Description']
751 | )
752 | ```
753 |
754 | **N.B.** If the argument tested_cols is not given, new_group_rep_by_completeness() will test the filled-in status of all the fields of grouped_data for each group member.
755 |
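In other words, a call like the following sketch scores every column (the variable name `reps_all_fields` is arbitrary). In this particular example it would presumably choose the same representatives, since the remaining fields ('Customer ID', 'weight', 'timestamp', etc.) are never empty and therefore add the same count to every member:

```python
# No tested_cols: every field of grouped_data counts toward completeness.
reps_all_fields = new_group_rep_by_completeness(
    grouped_data=customers_df,
    group_col='group rep ID',
    record_id_col='Customer ID',
    record_name_col='Customer Name'
)
```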
756 |
757 | ```python
758 | customers_df
759 | ```
760 |
761 | |   | Customer ID | Customer Name | Address | Tel | Description | weight | timestamp | group rep ID | group rep |
762 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
763 | | 0 | BB016741P | Mega Enterprises Corporation | Address0 | Tel0 | Description0 | 0.2 | 2014-12-30 10:55:00-02:00 | BB016741P | Mega Enterprises Corporation |
764 | | 1 | CC082744L | Hyper Startup Incorporated |  | Tel1 |  | 0.5 | 2017-01-01 20:23:15-05:00 | AA098762D | Hyper Startup Inc. |
765 | | 2 | AA098762D | Hyper Startup Inc. | Address2 | Tel2 | Description2 | 0.3 | 2020-10-20 15:29:30+02:00 | AA098762D | Hyper Startup Inc. |
766 | | 3 | BB099931J | Hyper-Startup Inc. | Address3 | Tel3 | Description3 | 0.1 | 2013-07-01 03:34:45-05:00 | AA098762D | Hyper Startup Inc. |
767 | | 4 | HH072982K | Hyper Hyper Inc. | Address4 |  | Description4 | 0.9 | 2005-09-11 11:56:00-07:00 | HH072982K | Hyper Hyper Inc. |
768 | | 5 | EE059082Q | Mega Enterprises Corp. | Address5 | Tel5 | Description5 | 1.0 | 1998-04-14 09:21:11+00:00 | BB016741P | Mega Enterprises Corporation |
858 |
859 |
863 |
--------------------------------------------------------------------------------
/string_grouper/test/test_string_grouper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas as pd
3 | import numpy as np
4 | from scipy.sparse import csr_matrix
5 | from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
6 | DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
7 | StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
8 | match_most_similar, group_similar_strings, match_strings, \
9 | compute_pairwise_similarities
10 | from unittest.mock import patch, Mock
11 |
12 |
13 | def mock_symmetrize_matrix(x: csr_matrix) -> csr_matrix:  # identity pass-through used when patching _fix_diagonal
14 | return x
15 |
16 |
17 | class SimpleExample(object):
18 | def __init__(self):
19 | self.customers_df = pd.DataFrame(
20 | [
21 | ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2),
22 | ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5),
23 | ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3),
24 | ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1),
25 | ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9),
26 | ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0)
27 | ],
28 | columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight')
29 | )
30 | self.customers_df2 = pd.DataFrame(
31 | [
32 | ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2),
33 | ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5),
34 | ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3),
35 | ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1),
36 | ('DD012339M', 'HyperStartup Inc.', 'Address4', 'Tel4', 'Description4', 0.1),
37 | ('HH072982K', 'Hyper Hyper Inc.', 'Address5', '', 'Description5', 0.9),
38 | ('EE059082Q', 'Mega Enterprises Corp.', 'Address6', 'Tel6', 'Description6', 1.0)
39 | ],
40 | columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight')
41 | )
42 | self.a_few_strings = pd.Series(['BB016741P', 'BB082744L', 'BB098762D', 'BB099931J', 'BB072982K', 'BB059082Q'])
43 | self.one_string = pd.Series(['BB0'])
44 | self.two_strings = pd.Series(['Hyper', 'Hyp'])
45 | self.whatever_series_1 = pd.Series(['whatever'])
46 | self.expected_result_with_zeroes = pd.DataFrame(
47 | [
48 | (1, 'Hyper Startup Incorporated', 0.08170638, 'whatever', 0),
49 | (0, 'Mega Enterprises Corporation', 0., 'whatever', 0),
50 | (2, 'Hyper Startup Inc.', 0., 'whatever', 0),
51 | (3, 'Hyper-Startup Inc.', 0., 'whatever', 0),
52 | (4, 'Hyper Hyper Inc.', 0., 'whatever', 0),
53 | (5, 'Mega Enterprises Corp.', 0., 'whatever', 0)
54 | ],
55 | columns=['left_index', 'left_Customer Name', 'similarity', 'right_side', 'right_index']
56 | )
57 | self.expected_result_centroid = pd.Series(
58 | [
59 | 'Mega Enterprises Corporation',
60 | 'Hyper Startup Inc.',
61 | 'Hyper Startup Inc.',
62 | 'Hyper Startup Inc.',
63 | 'Hyper Hyper Inc.',
64 | 'Mega Enterprises Corporation'
65 | ],
66 | name='group_rep_Customer Name'
67 | )
68 | self.expected_result_centroid_with_index_col = pd.DataFrame(
69 | [
70 | (0, 'Mega Enterprises Corporation'),
71 | (2, 'Hyper Startup Inc.'),
72 | (2, 'Hyper Startup Inc.'),
73 | (2, 'Hyper Startup Inc.'),
74 | (4, 'Hyper Hyper Inc.'),
75 | (0, 'Mega Enterprises Corporation')
76 | ],
77 | columns=['group_rep_index', 'group_rep_Customer Name']
78 | )
79 | self.expected_result_first = pd.Series(
80 | [
81 | 'Mega Enterprises Corporation',
82 | 'Hyper Startup Incorporated',
83 | 'Hyper Startup Incorporated',
84 | 'Hyper Startup Incorporated',
85 | 'Hyper Hyper Inc.',
86 | 'Mega Enterprises Corporation'
87 | ],
88 | name='group_rep_Customer Name'
89 | )
90 |
91 |
92 | class StringGrouperConfigTest(unittest.TestCase):
93 |
94 | def test_config_defaults(self):
95 | """Empty initialisation should set default values"""
96 | config = StringGrouperConfig()
97 | self.assertEqual(config.min_similarity, DEFAULT_MIN_SIMILARITY)
98 | self.assertEqual(config.max_n_matches, 20)
99 | self.assertEqual(config.regex, DEFAULT_REGEX)
100 | self.assertEqual(config.ngram_size, DEFAULT_NGRAM_SIZE)
101 | self.assertEqual(config.number_of_processes, DEFAULT_N_PROCESSES)
102 | self.assertEqual(config.ignore_case, DEFAULT_IGNORE_CASE)
103 |
104 | def test_config_immutable(self):
105 | """Configurations should be immutable"""
106 | config = StringGrouperConfig()
107 | with self.assertRaises(Exception) as _:
108 | config.min_similarity = 0.1
109 |
110 | def test_config_non_default_values(self):
111 | """Configurations should be immutable"""
112 | config = StringGrouperConfig(min_similarity=0.1, max_n_matches=100, number_of_processes=1)
113 | self.assertEqual(0.1, config.min_similarity)
114 | self.assertEqual(100, config.max_n_matches)
115 | self.assertEqual(1, config.number_of_processes)
116 |
117 |
118 | class StringGrouperTest(unittest.TestCase):
119 |
120 | def test_auto_blocking_single_DataFrame(self):
121 | """tests whether automatic blocking yields consistent results"""
122 | # This function will force an OverflowError to occur when
123 | # the input Series have a combined length above a given number:
124 | # OverflowThreshold. This will in turn trigger automatic splitting
125 | # of the Series/matrices into smaller blocks when n_blocks = None
126 |
127 | sort_cols = ['right_index', 'left_index']
128 |
129 | def fix_row_order(df):
130 | return df.sort_values(sort_cols).reset_index(drop=True)
131 |
132 | simple_example = SimpleExample()
133 | df1 = simple_example.customers_df2['Customer Name']
134 |
135 | # first do manual blocking
136 | sg = StringGrouper(df1, min_similarity=0.1)
137 | pd.testing.assert_series_equal(sg.master, df1)
138 | self.assertEqual(sg.duplicates, None)
139 |
140 | matches = fix_row_order(sg.match_strings(df1, n_blocks=(1, 1)))
141 | self.assertEqual(sg._config.n_blocks, (1, 1))
142 |
143 | # Create a custom wrapper for this StringGrouper instance's
144 | # _build_matches() method which will later be used to
145 | # mock _build_matches().
146 | # Note that we have to define the wrapper here because
147 | # _build_matches() is a non-static function of StringGrouper
148 | # and needs access to the specific StringGrouper instance sg
149 | # created here.
150 | def mock_build_matches(OverflowThreshold,
151 | real_build_matches=sg._build_matches):
152 | def wrapper(left_matrix,
153 | right_matrix,
154 | nnz_rows=None,
155 | sort=True):
156 | if (left_matrix.shape[0] + right_matrix.shape[0]) > \
157 | OverflowThreshold:
158 | raise OverflowError
159 | return real_build_matches(left_matrix, right_matrix, None)
160 | return wrapper
161 |
162 | def do_test_with(OverflowThreshold):
163 | nonlocal sg # allows reference to sg, as sg will be modified below
164 | # Now let us mock sg._build_matches:
165 | sg._build_matches = Mock(side_effect=mock_build_matches(OverflowThreshold))
166 | sg.clear_data()
167 | matches_auto = fix_row_order(sg.match_strings(df1, n_blocks=None))
168 | pd.testing.assert_series_equal(sg.master, df1)
169 | pd.testing.assert_frame_equal(matches, matches_auto)
170 | self.assertEqual(sg._config.n_blocks, None)
171 | # Note that _build_matches is called more than once if and only if
172 | # a split occurred (that is, there was more than one pair of
173 | # matrix-blocks multiplied)
174 | if len(sg._left_Series) + len(sg._right_Series) > \
175 | OverflowThreshold:
176 | # Assert that split occurred:
177 | self.assertGreater(sg._build_matches.call_count, 1)
178 | else:
179 | # Assert that split did not occur:
180 | self.assertEqual(sg._build_matches.call_count, 1)
181 |
182 | # now test auto blocking by forcing an OverflowError when the
183 | # combined Series' lengths exceed the thresholds 100, 30, 20, 15
184 |
185 | do_test_with(OverflowThreshold=100) # does not trigger auto blocking
186 | do_test_with(OverflowThreshold=30)
187 | do_test_with(OverflowThreshold=20)
188 | do_test_with(OverflowThreshold=15)
189 | # do_test_with(OverflowThreshold=12)
190 |
191 | def test_n_blocks_single_DataFrame(self):
192 | """tests whether manual blocking yields consistent results"""
193 | sort_cols = ['right_index', 'left_index']
194 |
195 | def fix_row_order(df):
196 | return df.sort_values(sort_cols).reset_index(drop=True)
197 |
198 | simple_example = SimpleExample()
199 | df1 = simple_example.customers_df2['Customer Name']
200 |
201 | matches11 = fix_row_order(match_strings(df1, min_similarity=0.1))
202 |
203 | matches12 = fix_row_order(
204 | match_strings(df1, n_blocks=(1, 2), min_similarity=0.1))
205 | pd.testing.assert_frame_equal(matches11, matches12)
206 |
207 | matches13 = fix_row_order(
208 | match_strings(df1, n_blocks=(1, 3), min_similarity=0.1))
209 | pd.testing.assert_frame_equal(matches11, matches13)
210 |
211 | matches14 = fix_row_order(
212 | match_strings(df1, n_blocks=(1, 4), min_similarity=0.1))
213 | pd.testing.assert_frame_equal(matches11, matches14)
214 |
215 | matches15 = fix_row_order(
216 | match_strings(df1, n_blocks=(1, 5), min_similarity=0.1))
217 | pd.testing.assert_frame_equal(matches11, matches15)
218 |
219 | matches16 = fix_row_order(
220 | match_strings(df1, n_blocks=(1, 6), min_similarity=0.1))
221 | pd.testing.assert_frame_equal(matches11, matches16)
222 |
223 | matches17 = fix_row_order(
224 | match_strings(df1, n_blocks=(1, 7), min_similarity=0.1))
225 | pd.testing.assert_frame_equal(matches11, matches17)
226 |
227 | matches18 = fix_row_order(
228 | match_strings(df1, n_blocks=(1, 8), min_similarity=0.1))
229 | pd.testing.assert_frame_equal(matches11, matches18)
230 |
231 | matches21 = fix_row_order(
232 | match_strings(df1, n_blocks=(2, 1), min_similarity=0.1))
233 | pd.testing.assert_frame_equal(matches11, matches21)
234 |
235 | matches22 = fix_row_order(
236 | match_strings(df1, n_blocks=(2, 2), min_similarity=0.1))
237 | pd.testing.assert_frame_equal(matches11, matches22)
238 |
239 | matches32 = fix_row_order(
240 | match_strings(df1, n_blocks=(3, 2), min_similarity=0.1))
241 | pd.testing.assert_frame_equal(matches11, matches32)
242 |
243 | # Create a custom wrapper for this StringGrouper instance's
244 | # _build_matches() method which will later be used to
245 | # mock _build_matches().
246 | # Note that we have to define the wrapper here because
247 | # _build_matches() is a non-static function of StringGrouper
248 | # and needs access to the specific StringGrouper instance sg
249 | # created here.
250 | sg = StringGrouper(df1, min_similarity=0.1)
251 |
252 | def mock_build_matches(OverflowThreshold,
253 | real_build_matches=sg._build_matches):
254 | def wrapper(left_matrix,
255 | right_matrix,
256 | nnz_rows=None,
257 | sort=True):
258 | if (left_matrix.shape[0] + right_matrix.shape[0]) > \
259 | OverflowThreshold:
260 | raise OverflowError
261 | return real_build_matches(left_matrix, right_matrix, None)
262 | return wrapper
263 |
264 | def test_overflow_error_with(OverflowThreshold, n_blocks):
265 | nonlocal sg
266 | sg._build_matches = Mock(side_effect=mock_build_matches(OverflowThreshold))
267 | sg.clear_data()
268 | max_left_block_size = (len(df1)//n_blocks[0]
269 | + (1 if len(df1) % n_blocks[0] > 0 else 0))
270 | max_right_block_size = (len(df1)//n_blocks[1]
271 | + (1 if len(df1) % n_blocks[1] > 0 else 0))
272 | if (max_left_block_size + max_right_block_size) > OverflowThreshold:
273 | with self.assertRaises(Exception):
274 | _ = sg.match_strings(df1, n_blocks=n_blocks)
275 | else:
276 | matches_manual = fix_row_order(sg.match_strings(df1, n_blocks=n_blocks))
277 | pd.testing.assert_frame_equal(matches11, matches_manual)
278 |
279 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(1, 1))
281 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(2, 1))
282 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(1, 2))
283 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(4, 4))
284 |
285 | def test_n_blocks_both_DataFrames(self):
286 | """tests whether manual blocking yields consistent results"""
287 | sort_cols = ['right_index', 'left_index']
288 |
289 | def fix_row_order(df):
290 | return df.sort_values(sort_cols).reset_index(drop=True)
291 |
292 | simple_example = SimpleExample()
293 | df1 = simple_example.customers_df['Customer Name']
294 | df2 = simple_example.customers_df2['Customer Name']
295 |
296 | matches11 = fix_row_order(match_strings(df1, df2, min_similarity=0.1))
297 |
298 | matches12 = fix_row_order(
299 | match_strings(df1, df2, n_blocks=(1, 2), min_similarity=0.1))
300 | pd.testing.assert_frame_equal(matches11, matches12)
301 |
302 | matches13 = fix_row_order(
303 | match_strings(df1, df2, n_blocks=(1, 3), min_similarity=0.1))
304 | pd.testing.assert_frame_equal(matches11, matches13)
305 |
306 | matches14 = fix_row_order(
307 | match_strings(df1, df2, n_blocks=(1, 4), min_similarity=0.1))
308 | pd.testing.assert_frame_equal(matches11, matches14)
309 |
310 | matches15 = fix_row_order(
311 | match_strings(df1, df2, n_blocks=(1, 5), min_similarity=0.1))
312 | pd.testing.assert_frame_equal(matches11, matches15)
313 |
314 | matches16 = fix_row_order(
315 | match_strings(df1, df2, n_blocks=(1, 6), min_similarity=0.1))
316 | pd.testing.assert_frame_equal(matches11, matches16)
317 |
318 | matches17 = fix_row_order(
319 | match_strings(df1, df2, n_blocks=(1, 7), min_similarity=0.1))
320 | pd.testing.assert_frame_equal(matches11, matches17)
321 |
322 | matches18 = fix_row_order(
323 | match_strings(df1, df2, n_blocks=(1, 8), min_similarity=0.1))
324 | pd.testing.assert_frame_equal(matches11, matches18)
325 |
326 | matches21 = fix_row_order(
327 | match_strings(df1, df2, n_blocks=(2, 1), min_similarity=0.1))
328 | pd.testing.assert_frame_equal(matches11, matches21)
329 |
330 | matches22 = fix_row_order(
331 | match_strings(df1, df2, n_blocks=(2, 2), min_similarity=0.1))
332 | pd.testing.assert_frame_equal(matches11, matches22)
333 |
334 | matches32 = fix_row_order(
335 | match_strings(df1, df2, n_blocks=(3, 2), min_similarity=0.1))
336 | pd.testing.assert_frame_equal(matches11, matches32)
337 |
338 | def test_n_blocks_bad_option_value(self):
339 | """Tests that bad option values for n_blocks are caught"""
340 | simple_example = SimpleExample()
341 | df1 = simple_example.customers_df2['Customer Name']
342 | with self.assertRaises(Exception):
343 | _ = match_strings(df1, n_blocks=2)
344 | with self.assertRaises(Exception):
345 | _ = match_strings(df1, n_blocks=(0, 2))
346 | with self.assertRaises(Exception):
347 | _ = match_strings(df1, n_blocks=(1, 2.5))
348 | with self.assertRaises(Exception):
349 | _ = match_strings(df1, n_blocks=(1, 2, 3))
350 | with self.assertRaises(Exception):
351 | _ = match_strings(df1, n_blocks=(1, ))
352 |
353 | def test_tfidf_dtype_bad_option_value(self):
354 | """Tests that bad option values for n_blocks are caught"""
355 | simple_example = SimpleExample()
356 | df1 = simple_example.customers_df2['Customer Name']
357 | with self.assertRaises(Exception):
358 | _ = match_strings(df1, tfidf_matrix_dtype=None)
359 | with self.assertRaises(Exception):
360 | _ = match_strings(df1, tfidf_matrix_dtype=0)
361 | with self.assertRaises(Exception):
362 | _ = match_strings(df1, tfidf_matrix_dtype='whatever')
363 |
364 | def test_compute_pairwise_similarities(self):
365 | """tests the high-level function compute_pairwise_similarities"""
366 | simple_example = SimpleExample()
367 | df1 = simple_example.customers_df['Customer Name']
368 | df2 = simple_example.expected_result_centroid
369 | similarities = compute_pairwise_similarities(df1, df2)
370 | expected_result = pd.Series(
371 | [
372 | 1.0,
373 | 0.6336195351561589,
374 | 1.0000000000000004,
375 | 1.0000000000000004,
376 | 1.0,
377 | 0.826462625999832
378 | ],
379 | name='similarity'
380 | )
381 | expected_result = expected_result.astype(np.float64)
382 | pd.testing.assert_series_equal(expected_result, similarities)
383 | sg = StringGrouper(df1, df2)
384 | similarities = sg.compute_pairwise_similarities(df1, df2)
385 | pd.testing.assert_series_equal(expected_result, similarities)
386 |
387 | def test_compute_pairwise_similarities_data_integrity(self):
388 | """tests that an exception is raised whenever the lengths of the two input series of the high-level function
389 | compute_pairwise_similarities are unequal"""
390 | simple_example = SimpleExample()
391 | df1 = simple_example.customers_df['Customer Name']
392 | df2 = simple_example.expected_result_centroid
393 | with self.assertRaises(Exception):
394 | _ = compute_pairwise_similarities(df1, df2[:-2])
395 |
396 | @patch('string_grouper.string_grouper.StringGrouper')
397 | def test_group_similar_strings(self, mock_StringGouper):
398 | """mocks StringGrouper to test if the high-level function group_similar_strings utilizes it as expected"""
399 | mock_StringGrouper_instance = mock_StringGouper.return_value
400 | mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
401 | mock_StringGrouper_instance.get_groups.return_value = 'whatever'
402 |
403 | test_series_1 = None
404 | test_series_id_1 = None
405 | df = group_similar_strings(
406 | test_series_1,
407 | string_ids=test_series_id_1
408 | )
409 |
410 | mock_StringGrouper_instance.fit.assert_called_once()
411 | mock_StringGrouper_instance.get_groups.assert_called_once()
412 | self.assertEqual(df, 'whatever')
413 |
414 | @patch('string_grouper.string_grouper.StringGrouper')
415 | def test_match_most_similar(self, mock_StringGouper):
416 | """mocks StringGrouper to test if the high-level function match_most_similar utilizes it as expected"""
417 | mock_StringGrouper_instance = mock_StringGouper.return_value
418 | mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
419 | mock_StringGrouper_instance.get_groups.return_value = 'whatever'
420 |
421 | test_series_1 = None
422 | test_series_2 = None
423 | test_series_id_1 = None
424 | test_series_id_2 = None
425 | df = match_most_similar(
426 | test_series_1,
427 | test_series_2,
428 | master_id=test_series_id_1,
429 | duplicates_id=test_series_id_2
430 | )
431 |
432 | mock_StringGrouper_instance.fit.assert_called_once()
433 | mock_StringGrouper_instance.get_groups.assert_called_once()
434 | self.assertEqual(df, 'whatever')
435 |
436 | @patch('string_grouper.string_grouper.StringGrouper')
437 | def test_match_strings(self, mock_StringGouper):
438 | """mocks StringGrouper to test if the high-level function match_strings utilizes it as expected"""
439 | mock_StringGrouper_instance = mock_StringGouper.return_value
440 | mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
441 | mock_StringGrouper_instance.get_matches.return_value = 'whatever'
442 |
443 | test_series_1 = None
444 | test_series_id_1 = None
445 | df = match_strings(test_series_1, master_id=test_series_id_1)
446 |
447 | mock_StringGrouper_instance.fit.assert_called_once()
448 | mock_StringGrouper_instance.get_matches.assert_called_once()
449 | self.assertEqual(df, 'whatever')
450 |
451 | @patch(
452 | 'string_grouper.string_grouper.StringGrouper._fix_diagonal',
453 | side_effect=mock_symmetrize_matrix
454 | )
455 | def test_match_list_diagonal_without_the_fix(self, mock_fix_diagonal):
456 | """test fails whenever _matches_list's number of self-joins is not equal to the number of strings"""
457 | # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets;
458 | # for small datasets setting max_n_matches=1 reproduces the bug
459 | simple_example = SimpleExample()
460 | df = simple_example.customers_df['Customer Name']
461 | matches = match_strings(df, max_n_matches=1)
462 | mock_fix_diagonal.assert_called_once()
463 | num_self_joins = len(matches[matches['left_index'] == matches['right_index']])
464 | num_strings = len(df)
465 | self.assertNotEqual(num_self_joins, num_strings)
466 |
467 | def test_match_list_diagonal(self):
468 | """This test ensures that all self-joins are present"""
469 | # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets;
470 | # for small datasets setting max_n_matches=1 reproduces the bug
471 | simple_example = SimpleExample()
472 | df = simple_example.customers_df['Customer Name']
473 | matches = match_strings(df, max_n_matches=1)
474 | num_self_joins = len(matches[matches['left_index'] == matches['right_index']])
475 | num_strings = len(df)
476 | self.assertEqual(num_self_joins, num_strings)
477 |
478 | def test_zero_min_similarity(self):
479 | """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are
480 | returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic"""
481 | simple_example = SimpleExample()
482 | s_master = simple_example.customers_df['Customer Name']
483 | s_dup = simple_example.whatever_series_1
484 | matches = match_strings(s_master, s_dup, min_similarity=0)
485 | pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches)
486 |
487 | def test_get_non_matches_empty_case(self):
488 | """This test ensures that _get_non_matches() returns an empty DataFrame when all pairs of strings match"""
489 | simple_example = SimpleExample()
490 | s_master = simple_example.a_few_strings
491 | s_dup = simple_example.one_string
492 | sg = StringGrouper(s_master, s_dup, max_n_matches=len(s_master), min_similarity=0).fit()
493 | self.assertTrue(sg._get_non_matches_list().empty)
494 |
495 | def test_n_grams_case_unchanged(self):
496 | """Should return all ngrams in a string with case"""
497 | test_series = pd.Series(['aaa'])
498 | # Explicitly do not ignore case
499 | sg = StringGrouper(test_series, ignore_case=False)
500 | expected_result = ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']
501 | self.assertListEqual(expected_result, sg.n_grams('McDonalds'))
502 |
503 | def test_n_grams_ignore_case_to_lower(self):
504 | """Should return all case insensitive ngrams in a string"""
505 | test_series = pd.Series(['aaa'])
506 | # Explicitly ignore case
507 | sg = StringGrouper(test_series, ignore_case=True)
508 | expected_result = ['mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds']
509 | self.assertListEqual(expected_result, sg.n_grams('McDonalds'))
510 |
511 | def test_n_grams_ignore_case_to_lower_with_defaults(self):
512 | """Should return all case insensitive ngrams in a string"""
513 | test_series = pd.Series(['aaa'])
514 | # Implicit default case (i.e. default behaviour)
515 | sg = StringGrouper(test_series)
516 | expected_result = ['mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds']
517 | self.assertListEqual(expected_result, sg.n_grams('McDonalds'))
518 |
519 | def test_build_matrix(self):
520 | """Should create a csr matrix only master"""
521 | test_series = pd.Series(['foo', 'bar', 'baz'])
522 | sg = StringGrouper(test_series)
523 | master, dupe = sg._get_tf_idf_matrices()
524 | c = csr_matrix([[0., 0., 1.],
525 | [1., 0., 0.],
526 | [0., 1., 0.]])
527 | np.testing.assert_array_equal(c.toarray(), master.toarray())
528 | np.testing.assert_array_equal(c.toarray(), dupe.toarray())
529 |
530 | def test_build_matrix_master_and_duplicates(self):
531 | """Should create a csr matrix for master and duplicates"""
532 | test_series_1 = pd.Series(['foo', 'bar', 'baz'])
533 | test_series_2 = pd.Series(['foo', 'bar', 'bop'])
534 | sg = StringGrouper(test_series_1, test_series_2)
535 | master, dupe = sg._get_tf_idf_matrices()
536 | master_expected = csr_matrix([[0., 0., 0., 1.],
537 | [1., 0., 0., 0.],
538 | [0., 1., 0., 0.]])
539 | dupes_expected = csr_matrix([[0., 0., 0., 1.],
540 | [1., 0., 0., 0.],
541 | [0., 0., 1., 0.]])
542 |
543 | np.testing.assert_array_equal(master_expected.toarray(), master.toarray())
544 | np.testing.assert_array_equal(dupes_expected.toarray(), dupe.toarray())
545 |
546 | def test_build_matches(self):
547 | """Should create the cosine similarity matrix of two series"""
548 | test_series_1 = pd.Series(['foo', 'bar', 'baz'])
549 | test_series_2 = pd.Series(['foo', 'bar', 'bop'])
550 | sg = StringGrouper(test_series_1, test_series_2)
551 | master, dupe = sg._get_tf_idf_matrices()
552 |
553 | expected_matches = np.array([[1., 0., 0.],
554 | [0., 1., 0.],
555 | [0., 0., 0.]])
556 | np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe, None).toarray())
557 |
558 | def test_build_matches_list(self):
559 | """Should create the cosine similarity matrix of two series"""
560 | test_series_1 = pd.Series(['foo', 'bar', 'baz'])
561 | test_series_2 = pd.Series(['foo', 'bar', 'bop'])
562 | sg = StringGrouper(test_series_1, test_series_2)
563 | sg = sg.fit()
564 | master = [0, 1]
565 | dupe_side = [0, 1]
566 | similarity = [1.0, 1.0]
567 | expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity})
568 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
569 | pd.testing.assert_frame_equal(expected_df, sg._matches_list)
570 |
571 | def test_case_insensitive_build_matches_list(self):
572 | """Should create the cosine similarity matrix of two case insensitive series"""
573 | test_series_1 = pd.Series(['foo', 'BAR', 'baz'])
574 | test_series_2 = pd.Series(['FOO', 'bar', 'bop'])
575 | sg = StringGrouper(test_series_1, test_series_2)
576 | sg = sg.fit()
577 | master = [0, 1]
578 | dupe_side = [0, 1]
579 | similarity = [1.0, 1.0]
580 | expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity})
581 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
582 | pd.testing.assert_frame_equal(expected_df, sg._matches_list)
583 |
584 | def test_get_matches_two_dataframes(self):
585 | test_series_1 = pd.Series(['foo', 'bar', 'baz'])
586 | test_series_2 = pd.Series(['foo', 'bar', 'bop'])
587 | sg = StringGrouper(test_series_1, test_series_2).fit()
588 | left_side = ['foo', 'bar']
589 | left_index = [0, 1]
590 | right_side = ['foo', 'bar']
591 | right_index = [0, 1]
592 | similarity = [1.0, 1.0]
593 | expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
594 | 'similarity': similarity,
595 | 'right_side': right_side, 'right_index': right_index})
596 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
597 | pd.testing.assert_frame_equal(expected_df, sg.get_matches())
598 |
599 | def test_get_matches_single(self):
600 | test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo'])
601 | sg = StringGrouper(test_series_1)
602 | sg = sg.fit()
603 | left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
604 | right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
605 | right_index = [0, 3, 1, 2, 0, 3]
606 | left_index = [0, 0, 1, 2, 3, 3]
607 | similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
608 | expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
609 | 'similarity': similarity,
610 | 'right_side': right_side, 'right_index': right_index})
611 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
612 | pd.testing.assert_frame_equal(expected_df, sg.get_matches())
613 |
614 | def test_get_matches_1_series_1_id_series(self):
615 | test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo'])
616 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3'])
617 | sg = StringGrouper(test_series_1, master_id=test_series_id_1)
618 | sg = sg.fit()
619 | right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
620 | right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3']
621 | right_index = [0, 3, 1, 2, 0, 3]
622 | left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
623 | left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3']
624 | left_index = [0, 0, 1, 2, 3, 3]
625 | similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
627 | expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
628 | 'similarity': similarity,
629 | 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index})
630 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
631 | pd.testing.assert_frame_equal(expected_df, sg.get_matches())
632 |
633 | def test_get_matches_2_series_2_id_series(self):
634 | test_series_1 = pd.Series(['foo', 'bar', 'baz'])
635 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2'])
636 | test_series_2 = pd.Series(['foo', 'bar', 'bop'])
637 | test_series_id_2 = pd.Series(['B0', 'B1', 'B2'])
638 | sg = StringGrouper(test_series_1, test_series_2, duplicates_id=test_series_id_2,
639 | master_id=test_series_id_1).fit()
640 | left_side = ['foo', 'bar']
641 | left_side_id = ['A0', 'A1']
642 | left_index = [0, 1]
643 | right_side = ['foo', 'bar']
644 | right_side_id = ['B0', 'B1']
645 | right_index = [0, 1]
646 | similarity = [1.0, 1.0]
647 | expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
648 | 'similarity': similarity,
649 | 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index})
650 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
651 | pd.testing.assert_frame_equal(expected_df, sg.get_matches())
652 |
653 | def test_get_matches_raises_exception_if_unexpected_options_given(self):
654 | # When the input id data does not correspond with its string data:
655 | test_series_1 = pd.Series(['foo', 'bar', 'baz'])
656 | bad_test_series_id_1 = pd.Series(['A0', 'A1'])
657 | good_test_series_id_1 = pd.Series(['A0', 'A1', 'A2'])
658 | test_series_2 = pd.Series(['foo', 'bar', 'bop'])
659 | bad_test_series_id_2 = pd.Series(['B0', 'B1'])
660 | good_test_series_id_2 = pd.Series(['B0', 'B1', 'B2'])
661 | with self.assertRaises(Exception):
662 | _ = StringGrouper(test_series_1, master_id=bad_test_series_id_1)
663 | with self.assertRaises(Exception):
664 | _ = StringGrouper(test_series_1, duplicates=test_series_2, duplicates_id=bad_test_series_id_2,
665 | master_id=good_test_series_id_1)
666 |
667 | # When the input data is ok but the option combinations are invalid:
668 | with self.assertRaises(Exception):
669 | _ = StringGrouper(test_series_1, test_series_2, master_id=good_test_series_id_1)
670 | with self.assertRaises(Exception):
671 | _ = StringGrouper(test_series_1, test_series_2, duplicates_id=good_test_series_id_2)
672 | with self.assertRaises(Exception):
673 | _ = StringGrouper(test_series_1, duplicates_id=good_test_series_id_2)
674 | with self.assertRaises(Exception):
675 | _ = StringGrouper(test_series_1, master_id=good_test_series_id_1, duplicates_id=good_test_series_id_2)
676 | with self.assertRaises(Exception):
677 | _ = StringGrouper(test_series_1, master_id=good_test_series_id_1, ignore_index=True, replace_na=True)
678 | # Here we force an exception by making the number of index-levels of duplicates different from master:
679 | # and setting replace_na=True
680 | test_series_2.index = pd.MultiIndex.from_tuples(list(zip(list('ABC'), [0, 1, 2])))
681 | with self.assertRaises(Exception):
682 | _ = StringGrouper(test_series_1, duplicates=test_series_2, replace_na=True)
683 |
684 | def test_get_groups_single_df_group_rep_default(self):
685 | """Should return a pd.Series object with the same length as the original df. The series object will contain
686 | a list of the grouped strings"""
687 | simple_example = SimpleExample()
688 | customers_df = simple_example.customers_df
689 | pd.testing.assert_series_equal(
690 | simple_example.expected_result_centroid,
691 | group_similar_strings(
692 | customers_df['Customer Name'],
693 | min_similarity=0.6,
694 | ignore_index=True
695 | )
696 | )
697 | sg = StringGrouper(customers_df['Customer Name'])
698 | pd.testing.assert_series_equal(
699 | simple_example.expected_result_centroid,
700 | sg.group_similar_strings(
701 | customers_df['Customer Name'],
702 | min_similarity=0.6,
703 | ignore_index=True
704 | )
705 | )
706 |
707 | def test_get_groups_single_valued_series(self):
708 | """This test ensures that get_groups() returns a single-valued DataFrame or Series object
709 | since the input-series is also single-valued. This test was created in response to a bug discovered
710 | by George Walker"""
711 | pd.testing.assert_frame_equal(
712 | pd.DataFrame([(0, "hello")], columns=['group_rep_index', 'group_rep']),
713 | group_similar_strings(
714 | pd.Series(["hello"]),
715 | min_similarity=0.6
716 | )
717 | )
718 | pd.testing.assert_series_equal(
719 | pd.Series(["hello"], name='group_rep'),
720 | group_similar_strings(
721 | pd.Series(["hello"]),
722 | min_similarity=0.6,
723 | ignore_index=True
724 | )
725 | )
726 | pd.testing.assert_frame_equal(
727 | pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']),
728 | match_most_similar(
729 | pd.Series(["hello"]),
730 | pd.Series(["hello"]),
731 | min_similarity=0.6
732 | )
733 | )
734 | pd.testing.assert_frame_equal(
735 | pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']),
736 | match_most_similar(
737 | pd.Series(["hello"]),
738 | pd.Series(["hello"]),
739 | min_similarity=0.6,
740 | max_n_matches=20
741 | )
742 | )
743 | pd.testing.assert_series_equal(
744 | pd.Series(["hello"], name='most_similar_master'),
745 | match_most_similar(
746 | pd.Series(["hello"]),
747 | pd.Series(["hello"]),
748 | min_similarity=0.6,
749 | ignore_index=True
750 | )
751 | )
752 |
753 | def test_get_groups_single_df_keep_index(self):
754 | """Should return a pd.Series object with the same length as the original df. The series object will contain
755 | a list of the grouped strings with their indexes displayed in columns"""
756 | simple_example = SimpleExample()
757 | customers_df = simple_example.customers_df
758 | pd.testing.assert_frame_equal(
759 | simple_example.expected_result_centroid_with_index_col,
760 | group_similar_strings(
761 | customers_df['Customer Name'],
762 | min_similarity=0.6,
763 | ignore_index=False
764 | )
765 | )
766 |
767 | def test_get_groups_single_df_group_rep_centroid(self):
768 | """Should return a pd.Series object with the same length as the original df. The series object will contain
769 | a list of the grouped strings"""
770 | simple_example = SimpleExample()
771 | customers_df = simple_example.customers_df
772 | pd.testing.assert_series_equal(
773 | simple_example.expected_result_first,
774 | group_similar_strings(
775 | customers_df['Customer Name'],
776 | group_rep='first',
777 | min_similarity=0.6,
778 | ignore_index=True
779 | )
780 | )
781 |
782 | def test_get_groups_single_df_group_rep_bad_option_value(self):
783 | """Should raise an exception when group_rep value given is neither 'centroid' nor 'first'"""
784 | simple_example = SimpleExample()
785 | customers_df = simple_example.customers_df
786 | with self.assertRaises(Exception):
787 | _ = group_similar_strings(
788 | customers_df['Customer Name'],
789 | group_rep='nonsense',
790 | min_similarity=0.6
791 | )
792 |
793 | def test_get_groups_single_df(self):
794 | """Should return a pd.Series object with the same length as the original df. The series object will contain
795 | a list of the grouped strings"""
796 | test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
797 | sg = StringGrouper(test_series_1, ignore_index=True)
798 | sg = sg.fit()
799 | result = sg.get_groups()
800 | expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='group_rep')
801 | pd.testing.assert_series_equal(expected_result, result)
802 |
803 | def test_get_groups_1_string_series_1_id_series(self):
804 | """Should return a pd.DataFrame object with the same length as the original df. The series object will contain
805 | a list of the grouped strings"""
806 | test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
807 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3'])
808 | sg = StringGrouper(test_series_1, master_id=test_series_id_1, ignore_index=True)
809 | sg = sg.fit()
810 | result = sg.get_groups()
811 | expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])),
812 | columns=['group_rep_id', 'group_rep'])
813 | pd.testing.assert_frame_equal(expected_result, result)
814 |
    def test_get_groups_two_df(self):
        """Should return a pd.Series object with the same length as the dupes. The series will contain the master
        string that matches each dupe with the highest similarity"""
        test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1, test_series_2, ignore_index=True)
        sg = sg.fit()
        result = sg.get_groups()
        expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master')
        pd.testing.assert_series_equal(expected_result, result)
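        # Calling match_most_similar on the already-fitted StringGrouper should
        # agree with get_groups for the same master/duplicates pair.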
        result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3)
        pd.testing.assert_series_equal(expected_result, result)

    def test_get_groups_2_string_series_2_id_series(self):
        """Should return a pd.DataFrame object with the same length as the dupes. The DataFrame will contain the
        master string (and its id) that matches each dupe with the highest similarity"""
        test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        test_series_id_1 = pd.Series(['A0', 'A1', 'A2'])
        test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3'])
        sg = StringGrouper(test_series_1,
                           test_series_2,
                           master_id=test_series_id_1,
                           duplicates_id=test_series_id_2,
                           ignore_index=True)
        sg = sg.fit()
        result = sg.get_groups()
        expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])),
                                       columns=['most_similar_master_id', 'most_similar_master'])
        pd.testing.assert_frame_equal(expected_result, result)

    def test_get_groups_2_string_series_2_numeric_id_series_with_missing_master_value(self):
        """Should return a pd.DataFrame object with the same length as the dupes. The DataFrame will contain the
        master string (and its id) that matches each dupe with the highest similarity"""
        test_series_1 = pd.Series(['foooo', 'bar', 'foooo'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        test_series_id_1 = pd.Series([0, 1, 2], dtype="Int64")
        test_series_id_2 = pd.Series([100, 101, 102, 103], dtype="Int64")
        sg = StringGrouper(test_series_1,
                           test_series_2,
                           master_id=test_series_id_1,
                           duplicates_id=test_series_id_2,
                           ignore_index=True)
        sg = sg.fit()
        result = sg.get_groups()
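        # 'baz' has no match in master, so it maps to itself and keeps its own
        # dupes-side id (102); the expected frame is therefore cast to the
        # nullable Int64 dtype.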
        expected_result = pd.DataFrame(list(zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo'])),
                                       columns=['most_similar_master_id', 'most_similar_master']
                                       ).astype(dtype={"most_similar_master_id": "Int64",
                                                       "most_similar_master": "str"})
        pd.testing.assert_frame_equal(expected_result, result)

    def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(self):
        """Should return a pd.DataFrame object with the same length as the dupes. The DataFrame will contain the
        master string (and its index) that matches each dupe with the highest similarity"""
        test_series_1 = pd.Series(['foooo', 'bar', 'foooo'], index=pd.Index([0, 1, 2], dtype="Int64"))
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'], index=pd.Index([100, 101, 102, 103], dtype="Int64"))
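        # replace_na=True appears to be needed here so that dupes without a master
        # match (here 'baz') receive their own index values in the output instead of NA.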
        sg = StringGrouper(test_series_1, test_series_2, replace_na=True)
        sg = sg.fit()
        result = sg.get_groups()
        expected_result = pd.DataFrame(list(zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo'])),
                                       columns=['most_similar_index', 'most_similar_master'],
                                       index=test_series_2.index).astype(dtype={"most_similar_index": "Int64",
                                                                                "most_similar_master": "str"})
        pd.testing.assert_frame_equal(expected_result, result)

    def test_get_groups_two_df_same_similarity(self):
        """Should return a pd.Series object with the same length as the dupes. If two master strings match a dupe
        with the same similarity, the first one is chosen"""
        test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1, test_series_2, ignore_index=True)
        sg = sg.fit()
        result = sg.get_groups()
        expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master')
        pd.testing.assert_series_equal(expected_result, result)

    def test_get_groups_4_df_same_similarity(self):
        """Should return a pd.DataFrame object with the same length as the dupes. If two master strings match a dupe
        with the same similarity, the first one is chosen"""
        test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3'])
        test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3'])
        sg = StringGrouper(test_series_1,
                           test_series_2,
                           master_id=test_series_id_1,
                           duplicates_id=test_series_id_2,
                           ignore_index=True)
        sg = sg.fit()
        result = sg.get_groups()
        expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])),
                                       columns=['most_similar_master_id', 'most_similar_master'])
        pd.testing.assert_frame_equal(expected_result, result)

    def test_get_groups_two_df_no_match(self):
        """Should return a pd.Series object with the same length as the dupes. If no match is found in master,
        the dupe itself is returned"""
        test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
        test_series_2 = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1, test_series_2, ignore_index=True)
        sg = sg.fit()
        result = sg.get_groups()
        expected_result = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'], name='most_similar_master')
        pd.testing.assert_series_equal(expected_result, result)

    def test_get_groups_4_df_no_match(self):
        """Should return a pd.DataFrame object with the same length as the dupes. If no match is found in master,
        the dupe itself is returned"""
        test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
        test_series_2 = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
        test_series_id_1 = pd.Series(['A0', 'A1', 'A2'])
        test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])
        sg = StringGrouper(test_series_1,
                           test_series_2,
                           master_id=test_series_id_1,
                           duplicates_id=test_series_id_2,
                           ignore_index=True)
        sg = sg.fit()
        result = sg.get_groups()
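        # 'dooz' has no match in master, so it maps to itself and keeps its own
        # dupes-side id 'B1'.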
        expected_result = pd.DataFrame(
            list(zip(['A0', 'B1', 'A1', 'A2', 'A0'], ['foooo', 'dooz', 'bar', 'baz', 'foooo'])),
            columns=['most_similar_master_id', 'most_similar_master']
        )
        pd.testing.assert_frame_equal(expected_result, result)

    def test_get_groups_raises_exception(self):
        """Should raise an exception if called before the StringGrouper is fit"""
        test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1, test_series_2)
        with self.assertRaises(StringGrouperNotFitException):
            _ = sg.get_groups()

    def test_add_match_raises_exception_if_string_not_present(self):
        """Should raise a ValueError when either side of the new match is absent from the fitted series"""
        test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1).fit()
        sg2 = StringGrouper(test_series_1, test_series_2).fit()
        with self.assertRaises(ValueError):
            sg.add_match('doesnt exist', 'baz')
        with self.assertRaises(ValueError):
            sg.add_match('baz', 'doesnt exist')
        with self.assertRaises(ValueError):
            sg2.add_match('doesnt exist', 'baz')
        with self.assertRaises(ValueError):
            sg2.add_match('baz', 'doesnt exist')

    def test_add_match_single_occurrence(self):
        """Should add the match if there are no exact duplicates"""
        test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1).fit()
        sg.add_match('no match', 'baz')
        matches = sg.get_matches()
        matches = matches[(matches.left_side == 'no match') & (matches.right_side == 'baz')]
        self.assertEqual(1, matches.shape[0])
        sg2 = StringGrouper(test_series_1, test_series_2).fit()
        sg2.add_match('no match', 'bar')
        matches = sg2.get_matches()
        matches = matches[(matches.left_side == 'no match') & (matches.right_side == 'bar')]
        self.assertEqual(1, matches.shape[0])

    def test_add_match_single_group_matches_symmetric(self):
        """New matches added to a StringGrouper with only a master series should be symmetric"""
        test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
        sg = StringGrouper(test_series_1).fit()
        sg.add_match('no match', 'baz')
        matches = sg.get_matches()
        matches_1 = matches[(matches.left_side == 'no match') & (matches.right_side == 'baz')]
        self.assertEqual(1, matches_1.shape[0])
        matches_2 = matches[(matches.left_side == 'baz') & (matches.right_side == 'no match')]
        self.assertEqual(1, matches_2.shape[0])

    def test_add_match_multiple_occurrences(self):
        """Should add multiple matches if there are exact duplicates"""
        test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1, test_series_2).fit()
        sg.add_match('foooo', 'baz')
        matches = sg.get_matches()
        matches = matches[(matches.left_side == 'foooo') & (matches.right_side == 'baz')]
        self.assertEqual(2, matches.shape[0])

    def test_remove_match(self):
        """Should remove a match"""
        test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooob'])
        test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
        sg = StringGrouper(test_series_1).fit()
        sg.remove_match('foooo', 'foooob')
        matches = sg.get_matches()
        matches_1 = matches[(matches.left_side == 'foooo') & (matches.right_side == 'foooob')]
        # In the case of only a master series, the matches are symmetric, so both orderings must be removed
        matches_2 = matches[(matches.left_side == 'foooob') & (matches.right_side == 'foooo')]
        self.assertEqual(0, matches_1.shape[0])
        self.assertEqual(0, matches_2.shape[0])

        sg2 = StringGrouper(test_series_1, test_series_2).fit()
        sg2.remove_match('foooo', 'foooob')
        matches = sg2.get_matches()
        matches = matches[(matches.left_side == 'foooo') & (matches.right_side == 'foooob')]
        self.assertEqual(0, matches.shape[0])

    def test_string_grouper_type_error(self):
        """StringGrouper should raise a TypeError if master or duplicates are not a Series of strings"""
        with self.assertRaises(TypeError):
            _ = StringGrouper('foo', 'bar')
        with self.assertRaises(TypeError):
            _ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1]))
        with self.assertRaises(TypeError):
            _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j']))

    def test_prior_matches_added(self):
        """When a new match is added, any pre-existing matches should also be updated"""
        sample = [
            'microsoftoffice 365 home',
            'microsoftoffice 365 pers',
            'microsoft office'
        ]

        df = pd.DataFrame(sample, columns=['name'])

        sg = StringGrouper(df['name'], ignore_index=True)
        sg = sg.fit()

        sg = sg.add_match('microsoft office', 'microsoftoffice 365 home')
        sg = sg.add_match('microsoftoffice 365 pers', 'microsoft office')
        df['deduped'] = sg.get_groups()
        # All strings should now match to the same "master" string
        self.assertEqual(1, len(df.deduped.unique()))


if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------