├── string_grouper ├── test │ ├── __init__.py │ └── test_string_grouper.py └── __init__.py ├── string_grouper_utils ├── test │ ├── __init__.py │ └── test_string_grouper_utils.py ├── __init__.py └── string_grouper_utils.py ├── images ├── Fuzzy_vs_Exact.png ├── BlockMatrix_1_1.png ├── BlockMatrix_1_2.png ├── BlockMatrix_2_2.png ├── ScaledRuntimeContourPlot.png ├── ScaledTimePerComparison.png └── BlockNumberSpaceExploration1.png ├── .gitignore ├── docs ├── references.md ├── references │ ├── compute_pairwise_similarities.md │ ├── group_similar_strings.md │ ├── match_strings.md │ ├── match_most_similar.md │ ├── options_kwargs.md │ └── sg_class.md ├── performance.md ├── index.md └── examples.md ├── tutorials ├── accounts.csv ├── tutorial_1.md ├── zero_similarity.md └── group_representatives.md ├── .github └── workflows │ └── test.yml ├── pyproject.toml ├── mkdocs.yml ├── LICENSE ├── setup.py ├── CHANGELOG.md └── README.md /string_grouper/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /string_grouper_utils/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/Fuzzy_vs_Exact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/Fuzzy_vs_Exact.png -------------------------------------------------------------------------------- /images/BlockMatrix_1_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockMatrix_1_1.png -------------------------------------------------------------------------------- /images/BlockMatrix_1_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockMatrix_1_2.png -------------------------------------------------------------------------------- /images/BlockMatrix_2_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockMatrix_2_2.png -------------------------------------------------------------------------------- /images/ScaledRuntimeContourPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/ScaledRuntimeContourPlot.png -------------------------------------------------------------------------------- /images/ScaledTimePerComparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/ScaledTimePerComparison.png -------------------------------------------------------------------------------- /images/BlockNumberSpaceExploration1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/string_grouper/HEAD/images/BlockNumberSpaceExploration1.png -------------------------------------------------------------------------------- /string_grouper_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .string_grouper_utils import new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness, \ 2 | new_group_rep_by_highest_weight 3 | -------------------------------------------------------------------------------- /string_grouper/__init__.py: -------------------------------------------------------------------------------- 1 | from .string_grouper import compute_pairwise_similarities, group_similar_strings, match_most_similar, 
match_strings, \ 2 | StringGrouperConfig, StringGrouper 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | __pycache__ 4 | */__pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | .ipynb_checkpoints 9 | *.ipynb 10 | 11 | dist/ 12 | build/ 13 | *.egg-info/ 14 | 15 | .DS_Store 16 | 17 | site/ 18 | 19 | 20 | tests.txt 21 | -------------------------------------------------------------------------------- /docs/references.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: References 3 | --- 4 | 5 | 6 | All functions are built using a class **`StringGrouper`**. This class can be used through pre-defined functions, for example the four high level functions above, as well as using a more interactive approach where matches can be added or removed if needed by calling the **`StringGrouper`** class directly. 7 | 8 | -------------------------------------------------------------------------------- /tutorials/accounts.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | AA012345X,mega enterprises corp. 3 | BB016741P,mega enterprises corporation 4 | CC052345T,mega corp. 5 | AA098762D,hyper startup inc. 6 | BB099931J,hyper-startup inc. 7 | CC082744L,hyper startup incorporated 8 | HH072982K,hyper hyper inc. 9 | AA903844B,slow and steady inc. 10 | BB904941H,slow and steady incorporated 11 | CC903844B,slow steady inc. 12 | AA777431C,abc enterprises inc. 13 | BB760431Y,a.b.c. enterprises incorporated 14 | BB750431M,a.b.c. enterprises inc. 15 | ZZ123456H,one and only inc. 
-------------------------------------------------------------------------------- /docs/references/compute_pairwise_similarities.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: compute_pairwise_similarities 3 | --- 4 | 5 | 6 | ## Arguments 7 | 8 | ```python 9 | compute_pairwise_similarities(string_series_1: pd.Series, 10 | string_series_2: pd.Series, 11 | **kwargs) -> pd.Series 12 | ``` 13 | 14 | 15 | ## Result 16 | 17 | Returns a `Series` of cosine similarity scores the same length and index as `string_series_1`. Each score is the cosine similarity between the pair of strings in the same position (row) in the two input `Series`, `string_series_1` and `string_series_2`, as the position of the score in the output `Series`. This can be seen as an element-wise comparison between the two input `Series`. 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | test: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | python-version: [3.9, 3.11, 3.12.3] 14 | os: [ubuntu-latest, windows-latest] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dev-package 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install poetry 28 | poetry install 29 | python -m pip install -e . 
30 | 31 | - name: Run tests 32 | run: python -m unittest 33 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "string_grouper" 3 | version = "0.7.1" 4 | description = "String grouper contains functions to do string matching using TF-IDF and the cosine similarity." 5 | authors = [ 6 | {name = "Chris van den Berg"}, 7 | {name = "ParticularMiner"}, 8 | ] 9 | 10 | maintainers = [ 11 | {name = "Chris van den Berg"}, 12 | {name = "Guillaume Pressiat"}, 13 | ] 14 | 15 | 16 | license = "MIT License" 17 | readme = "README.md" 18 | 19 | packages = [ 20 | { include = "string_grouper" }, 21 | { include = "string_grouper_utils" }, 22 | ] 23 | 24 | [tool.poetry.dependencies] 25 | python = "^3.9" 26 | pandas = "^2.0" 27 | scipy = ">=1.4.1" 28 | scikit-learn = "^1.4.0" 29 | numpy = "^1.26.0" 30 | sparse_dot_topn = ">=1.1.0" 31 | loguru = ">0.7.0" 32 | 33 | [build-system] 34 | requires = ["poetry-core"] 35 | build-backend = "poetry.core.masonry.api" 36 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: String Grouper 2 | 3 | repo_url: https://github.com/bergvca/string_grouper 4 | 5 | 6 | theme: 7 | name: material 8 | features: 9 | - navigation.tabs 10 | # - navigation.tabs.sticky 11 | 12 | nav: 13 | - Home: 14 | - index.md 15 | - References: 16 | - references/match_strings.md 17 | - references/match_most_similar.md 18 | - references/group_similar_strings.md 19 | - references/compute_pairwise_similarities.md 20 | - references/options_kwargs.md 21 | - references/sg_class.md 22 | - Examples: 23 | - examples.md 24 | - Performance: 25 | - performance.md 26 | 27 | 28 | markdown_extensions: 29 | - toc: 30 | toc_depth: 3 31 | - pymdownx.highlight: 32 | anchor_linenums: true 33 | line_spans: __span 
34 | pygments_lang_class: true 35 | - pymdownx.inlinehilite 36 | - pymdownx.snippets 37 | - pymdownx.superfences 38 | - admonition 39 | - pymdownx.details 40 | - pymdownx.superfences 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Chris van den Berg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import pathlib 3 | 4 | # The directory containing this file 5 | HERE = pathlib.Path(__file__).parent 6 | 7 | # The text of the README file 8 | README = (HERE / "README.md").read_text() 9 | 10 | setup( 11 | name='string_grouper', 12 | version='0.7.0', 13 | packages=['string_grouper', 'string_grouper_utils'], 14 | license='MIT License', 15 | description='String grouper contains functions to do string matching using TF-IDF and the cosine similarity. ' 16 | 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html', 17 | author='Chris van den Berg', 18 | long_description=README, 19 | long_description_content_type="text/markdown", 20 | author_email='fake_email@gmail.com', 21 | url='https://github.com/Bergvca/string_grouper', 22 | zip_safe=False, 23 | python_requires='>3.7', 24 | install_requires=['pandas>=2.0' 25 | , 'scipy>=1.4.1' 26 | , 'scikit-learn>=1.4.0' 27 | , 'numpy>=1.26.0, < 2.0' 28 | , 'sparse_dot_topn>=1.1.0' 29 | , 'loguru>=0.7' 30 | ] 31 | ) 32 | 33 | -------------------------------------------------------------------------------- /docs/references/group_similar_strings.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: group_similar_strings 3 | --- 4 | 5 | 6 | ## Arguments 7 | 8 | ```python 9 | 10 | 11 | 12 | group_similar_strings(strings_to_group: pd.Series, 13 | string_ids: Optional[pd.Series], 14 | **kwargs) -> Union[pd.DataFrame, pd.Series] 15 | ``` 16 | 17 | 18 | ## Result 19 | 20 | Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. 
(See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the group-representatives are chosen.) 21 | 22 | If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings. If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`. 23 | 24 | For example, an input Series with values: `['foooo', 'foooob', 'bar']` will return `['foooo', 'foooo', 'bar']`. Here `'foooo'` and `'foooob'` are grouped together into group `'foooo'` because they are found to be similar. Another example can be found [below](#dedup). 25 | 26 | If `ignore_index=False`, the output is a `DataFrame` containing the above output `Series` as one of its columns with the same name. The remaining column(s) correspond to the index (or index-levels) of `strings_to_group` and contain the index-labels of the group-representatives as values. These columns have the same names as their counterparts prefixed by the string `'group_rep_'`. 27 | 28 | If `string_ids` is also given, then the IDs from `string_ids` corresponding to the group-representatives are also returned in an additional column (with the same name as `string_ids` prefixed as described above). If `string_ids` has no name, it is assumed to have the name `'id'` before being prefixed. 
29 | 30 | 31 | -------------------------------------------------------------------------------- /docs/references/match_strings.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: match_strings 3 | --- 4 | 5 | 6 | ## Arguments 7 | 8 | ```python 9 | match_strings(master: pd.Series, 10 | duplicates: Optional[pd.Series], 11 | master_id: Optional[pd.Series], 12 | duplicates_id: Optional[pd.Series], 13 | **kwargs) -> pd.DataFrame 14 | ``` 15 | 16 | ## Result 17 | 18 | Returns a `DataFrame` containing similarity-scores of all matching pairs of highly similar strings from `master` (and `duplicates` if given). Each matching pair in the output appears in its own row/record consisting of 19 | 20 | 21 | 1. its "left" part: a string (with/without its index-label) from `master`, 22 | 2. its similarity score, and 23 | 3. its "right" part: a string (with/without its index-label) from `duplicates` (or `master` if `duplicates` is not given), 24 | 25 | in that order. Thus the column-names of the output are a collection of three groups: 26 | 27 | 1. The name of `master` and the name(s) of its index (or index-levels) all prefixed by the - string `'left_'`, 28 | 2. `'similarity'` whose column has the similarity-scores as values, and 29 | 3. The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`. 30 | 31 | 32 | Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) 33 | 34 | If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above. 
Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above. 35 | 36 | In other words, if only parameter `master` is given, the function will return pairs of highly similar strings within `master`. This can be seen as a self-join where both `'left_'` and `'right_'` prefixed columns come from `master`. If both parameters `master` and `duplicates` are given, it will return pairs of highly similar strings between `master` and `duplicates`. This can be seen as an inner-join where `'left_'` and `'right_'` prefixed columns come from `master` and `duplicates` respectively. 37 | 38 | The function also supports optionally inputting IDs (`master_id` and `duplicates_id`) corresponding to the strings being matched. In which case, the output includes two additional columns whose names are the names of these optional `Series` prefixed by `'left_'` and `'right_'` accordingly, and containing the IDs corresponding to the strings in the output. If any of these `Series` has no name, then it assumes the name `'id'` and is then prefixed as described above. 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/references/match_most_similar.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: match_most_similar 3 | --- 4 | 5 | 6 | ## Arguments 7 | 8 | ```python 9 | match_most_similar(master: pd.Series, 10 | duplicates: Optional[pd.Series], 11 | master_id: Optional[pd.Series], 12 | duplicates_id: Optional[pd.Series], 13 | **kwargs) -> Union[pd.DataFrame, pd.Series] 14 | ``` 15 | 16 | ## Result 17 | 18 | If `ignore_index=True`, returns a `Series` of strings, where for each string in `duplicates` the most similar string in `master` is returned. 
If there are no similar strings in `master` for a given string in `duplicates` (because there is no potential match where the cosine similarity is above the threshold \[default: 0.8\]) then the original string in `duplicates` is returned. The output `Series` thus has the same length and index as `duplicates`. 19 | 20 | For example, if an input `Series` with the values `\['foooo', 'bar', 'baz'\]` is passed as the argument `master`, and `\['foooob', 'bar', 'new'\]` as the values of the argument `duplicates`, the function will return a `Series` with values: `\['foooo', 'bar', 'new'\]`. 21 | 22 | The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`. If `master` has no name, it is assumed to have the name `'master'` before being prefixed. 23 | 24 | If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) 25 | 26 | Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`. 
27 | 28 | If both parameters `master_id` and `duplicates_id` are also given, then a `DataFrame` is always returned with the same column(s) as described above, but with an additional column containing those IDs from these input `Series` corresponding to the output strings. This column's name is the same as that of `master_id` prefixed in the same way as described above. If `master_id` has no name, it is assumed to have the name `'master_id'` before being prefixed. 29 | 30 | -------------------------------------------------------------------------------- /docs/references/options_kwargs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Options / **kwargs 3 | --- 4 | 5 | All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used: 6 | 7 | ## Tokenization settings 8 | 9 | * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. 10 | * **`regex`**: The regex string used to clean-up the input string. Default is `r"[,-./]|\s"`. 11 | * **`ignore_case`**: Determines whether or not letter case in strings should be ignored. Defaults to `True`. 12 | * **`normalize_to_ascii`**: Determines whether or not unicode to ascii normalization is done. Defaults to `True`. 13 | 14 | ## Match and output settings 15 | 16 | * **`max_n_matches`**: The maximum number of matching strings in `master` allowed per string in `duplicates`. Default is 20. 17 | * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. 18 | Defaults to `0.8` 19 | * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md).) 20 | * **`ignore_index`**: Determines whether indexes are ignored or not. 
If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) 21 | * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) 22 | 23 | ## Performance settings 24 | 25 | * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to 26 | `number of cores on a machine - 1.` 27 | * **`n_blocks`**: This parameter is a tuple of two `int`s provided to help boost performance, if possible, of processing large DataFrames (see [Subsection Performance](#perf)), by splitting the DataFrames into `n_blocks[0]` blocks for the left operand (of the underlying matrix multiplication) and into `n_blocks[1]` blocks for the right operand before performing the string-comparisons block-wise. Defaults to `None`, in which case automatic splitting occurs if an `OverflowError` would otherwise occur. 28 | 29 | ## Other settings 30 | 31 | * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float64`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) 32 | * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation. 
33 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [0.7.1] - 2025-01-23 9 | 10 | ### Changed 11 | * Code wise, nothing changed. However, the version number is cleaned up in pyproject.toml 12 | * Cleaned up documentation and readme.md. Most documentation is moved from the `readme.md` to: 13 | [https://bergvca.github.io/string_grouper](https://bergvca.github.io/string_grouper/). 14 | 15 | 16 | ## [0.7.0] - 2025-01-23 17 | 18 | ### Changed 19 | 20 | * sparse_dot_topn_for_blocks and topn dependencies are removed and replaced by sparse_dot_topn official library from ING Bank, this is a big change: it may have impacts from old code using string_grouper 21 | * `n_blocks` None is now the default value for `n_blocks` and optimal numbers of blocks will be guessed based on empirical observation to split data into smaller chunks (based on input data size) 22 | * sparse_dot_topn now integrates a [block/chunk strategy](https://github.com/ing-bank/sparse_dot_topn?tab=readme-ov-file#distributing-the-top-n-multiplication-of-two-large-o10m-sparse-matrices-over-a-cluster). This strategy is used in string_grouper. 23 | 24 | 25 | ### Added 26 | 27 | * a new parameter normalize_to_ascii to normalize unicode character to ascii ones 28 | * loguru dependency is introduced to print messages to user 29 | 30 | 31 | ## [0.6.1] - 2021-10-19 32 | 33 | * `n_blocks` Added "guesstimate" as default value for `n_blocks`. This will guess an optimal number of blocks 34 | based on empirical observation. 
35 | 36 | 37 | ## [0.6.0] - 2021-09-21 38 | 39 | ### Added 40 | 41 | * matrix-blocking/splitting as a performance-enhancer (see [README.md](https://github.com/Bergvca/string_grouper/tree/master/#performance) for details) 42 | * new keyword arguments `force_symmetries` and `n_blocks` (see [README.md](https://github.com/Bergvca/string_grouper/tree/master/#kwargs) for details) 43 | * new dependency on packages `topn` and `sparse_dot_topn_for_blocks` to help with the matrix-blocking 44 | * capability to reuse a previously initialized StringGrouper (that is, the corpus can now persist across high-level function calls like `match_strings()`. See [README.md](https://github.com/Bergvca/string_grouper/tree/master/#corpus) for details.) 45 | 46 | 47 | ## [0.5.0] - 2021-06-11 48 | 49 | ### Added 50 | 51 | * Added new keyword argument **`tfidf_matrix_dtype`** (the datatype for the tf-idf values of the matrix components). Allowed values are `numpy.float32` and `numpy.float64` (used by the required external package `sparse_dot_topn` version 0.3.1). Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) 52 | 53 | ### Changed 54 | 55 | * Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1 56 | * Changed the default datatype for cosine similarities from numpy.float64 to numpy.float32 to boost computational performance at the expense of numerical precision. 57 | * Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if `duplicates` is not given). 58 | * Changed warning issued when the condition \[`include_zeroes=True` and `min_similarity` ≤ 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met to an exception. 
59 | 60 | ### Removed 61 | 62 | * Removed the keyword argument `suppress_warning` 63 | 64 | ## [0.4.0] - 2021-04-11 65 | 66 | ### Added 67 | 68 | * Added group representative functionality - by default the centroid is used. From [@ParticularMiner](https://github.com/ParticularMiner) 69 | * Added string_grouper_utils package with additional group-representative functionality: 70 | * new_group_rep_by_earliest_timestamp 71 | * new_group_rep_by_completeness 72 | * new_group_rep_by_highest_weight 73 | 74 | From [@ParticularMiner](https://github.com/ParticularMiner) 75 | * Original indices are now added by default to output of `group_similar_strings`, `match_most_similar` and `match_strings`. 76 | From [@ParticularMiner](https://github.com/ParticularMiner) 77 | * `compute_pairwise_similarities` function From [@ParticularMiner](https://github.com/ParticularMiner) 78 | 79 | ### Changed 80 | 81 | * Default group representative is now the centroid. Used to be the first string in the series belonging to a group. 82 | From [@ParticularMiner](https://github.com/ParticularMiner) 83 | * Output of `match_most_similar` and `match_strings` is now a `pandas.DataFrame` object instead of a `pandas.Series` 84 | by default. From [@ParticularMiner](https://github.com/ParticularMiner) 85 | * Fixed a bug which occurs when min_similarity=0. From [@ParticularMiner](https://github.com/ParticularMiner) -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | ## Performance 2 | 3 | 4 | Semilogx plots of run-times of `match_strings()` vs the number of blocks (`n_blocks[1]`) into which the right matrix-operand of the dataset (663 000 strings from sec__edgar_company_info.csv) was split before performing the string comparison. As shown in the legend, each plot corresponds to the number `n_blocks[0]` of blocks into which the left matrix-operand was split. 
5 | ![Semilogx](https://raw.githubusercontent.com/Bergvca/string_grouper/master/images/BlockNumberSpaceExploration1.png) 6 | 7 | String comparison, as implemented by `string_grouper`, is essentially matrix 8 | multiplication. A pandas Series of strings is converted (tokenized) into a 9 | matrix. Then that matrix is multiplied by itself (or another) transposed. 10 | 11 | Here is an illustration of multiplication of two matrices ***D*** and ***M***T: 12 | ![Block Matrix 1 1](https://raw.githubusercontent.com/Bergvca/string_grouper/master/images/BlockMatrix_1_1.png) 13 | 14 | It turns out that when the matrix (or Series) is very large, the computer 15 | proceeds quite slowly with the multiplication (apparently due to the RAM being 16 | too full). Some computers give up with an `OverflowError`. 17 | 18 | To circumvent this issue, `string_grouper` now allows the division of the Series 19 | into smaller chunks (or blocks) and multiplies the chunks one pair at a time 20 | instead to get the same result: 21 | 22 | ![Block Matrix 2 2](https://raw.githubusercontent.com/Bergvca/string_grouper/master/images/BlockMatrix_2_2.png) 23 | 24 | But surprise ... the run-time of the process is sometimes drastically reduced 25 | as a result. For example, the speed-up of the following call is about 500% 26 | (here, the Series is divided into 200 blocks on the right operand, that is, 27 | 1 block on the left × 200 on the right) compared to the same call with no 28 | splitting \[`n_blocks=(1, 1)`, the default, which is what previous versions 29 | (0.5.0 and earlier) of `string_grouper` did\]: 30 | 31 | ```python 32 | # A DataFrame of 668 000 records: 33 | companies = pd.read_csv('data/sec__edgar_company_info.csv') 34 | 35 | # The following call is more than 6 times faster than earlier versions of 36 | # match_strings() (that is, when n_blocks=(1, 1))! 
37 | match_strings(companies['Company Name'], n_blocks=(1, 200)) 38 | ``` 39 | 40 | Further exploration of the block number space ([see plot above](#Semilogx)) has revealed that for any fixed 41 | number of right blocks, the run-time gets longer the larger the number of left 42 | blocks specified. For this reason, it is recommended *not* to split the left matrix. 43 | 44 | ![Block Matrix 1 2](https://raw.githubusercontent.com/Bergvca/string_grouper/master/images/BlockMatrix_1_2.png) 45 | 46 | In general, 47 | 48 |    ***total runtime*** = `n_blocks[0]` × `n_blocks[1]` × ***mean runtime per block-pair*** 49 | 50 |                           = ***Left Operand Size*** × ***Right Operand Size*** × 51 | 52 |                                ***mean runtime per block-pair*** / (***Left Block Size*** × ***Right Block Size***) 53 | 54 | So for given left and right operands, minimizing the ***total runtime*** is the same as minimizing the 55 | 56 |    ***runtime per string-pair comparison*** ≝
                              ***mean runtime per block-pair*** / (***Left Block Size*** × ***Right Block Size***) 57 | 58 | 59 | [Below is a log-log-log contour plot](#ContourPlot) of the ***runtime per string-pair comparison*** scaled by its value 60 | at ***Left Block Size*** = ***Right Block Size*** = 5000. Here, ***Block Size*** 61 | is the number of strings in that block, and ***mean runtime per block-pair*** is the time taken for the following call to run: 62 | ```python 63 | # note the parameter order! 64 | match_strings(right_Series, left_Series, n_blocks=(1, 1)) 65 | ``` 66 | where `left_Series` and `right_Series`, corresponding to ***Left Block*** and ***Right Block*** respectively, are random subsets of the Series `companies['Company Name']` from the 67 | [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file. 68 | 69 | ![ContourPlot](https://raw.githubusercontent.com/Bergvca/string_grouper/master/images/ScaledRuntimeContourPlot.png) 70 | 71 | It can be seen that when `right_Series` is roughly the size of 80 000 (denoted by the 72 | white dashed line in the contour plot above), the runtime per string-pair comparison is at 73 | its lowest for any fixed `left_Series` size. Above ***Right Block Size*** = 80 000, the 74 | matrix-multiplication routine begins to feel the limits of the computer's 75 | available memory space and thus its performance deteriorates, as evidenced by the increase 76 | in runtime per string-pair comparison there (above the white dashed line). This knowledge 77 | could serve as a guide for estimating the optimum block numbers — 78 | namely those that divide the Series into blocks of size roughly equal to 79 | 80 000 for the right operand (or `right_Series`). 80 | 81 | So what are the optimum block number values for *any* given Series? That is 82 | anyone's guess, and may likely depend on the data itself. 
Furthermore, as hinted above, 83 | the answer may vary from computer to computer. 84 | 85 | We however encourage the user to make judicious use of the `n_blocks` 86 | parameter to boost performance of `string_grouper` whenever possible. 87 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: String Grouper 3 | --- 4 | 5 | **`string_grouper`** is a library that makes finding groups of similar strings within a single, or multiple, lists of strings easy — and fast. **`string_grouper`** uses **tf-idf** to calculate [**cosine similarities**](http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/) within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html). 
6 | 7 | ## Install 8 | 9 | ```bash 10 | pip install string-grouper 11 | ``` 12 | 13 | or see the releases [here](https://github.com/bergvca/string_grouper/releases) 14 | 15 | ## First usage 16 | 17 | ```python 18 | import pandas as pd 19 | from string_grouper import match_strings 20 | 21 | #https://github.com/ngshya/pfsm/blob/master/data/sec_edgar_company_info.csv 22 | company_names = './data/sec_edgar_company_info.csv' 23 | # We only look at the first 50k as an example: 24 | companies = pd.read_csv(company_names)[0:50000] 25 | # Create all matches: 26 | matches = match_strings(companies['Company Name']) 27 | # Look at only the non-exact matches: 28 | matches[matches['left_Company Name'] != matches['right_Company Name']].head() 29 | ``` 30 | 31 | As shown above, the library may be used together with `pandas`, and contains four high level functions (`match_strings`, `match_most_similar`, `group_similar_strings`, and `compute_pairwise_similarities`) that can be used directly, and one class (`StringGrouper`) that allows for a more interactive approach. 32 | 33 | The permitted calling patterns of the four functions, and their return types, are: 34 | 35 | | Function | Parameters | `pandas` Return Type | 36 | | -------------: |:-------------|:-----:| 37 | | `match_strings`| `(master, **kwargs)`| `DataFrame` | 38 | | `match_strings`| `(master, duplicates, **kwargs)`| `DataFrame` | 39 | | `match_strings`| `(master, master_id=id_series, **kwargs)`| `DataFrame` | 40 | | `match_strings`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` | 41 | 42 | 43 | ## With Polars 44 | 45 | For the moment polars is not yet supported natively. 
46 | 47 | But you can juggle easily one with the other: 48 | 49 | ```python 50 | import polars as pl 51 | from string_grouper import match_strings 52 | 53 | company_names = 'https://raw.githubusercontent.com/ngshya/pfsm/refs/heads/master/data/sec_edgar_company_info.csv' 54 | # We only look at the first 50k as an example: 55 | companies = pl.read_csv(company_names).slice(0,50000).to_pandas() 56 | # Create all matches: 57 | matches = pl.from_pandas(match_strings(companies['Company Name'])) 58 | # Look at only the non-exact matches: 59 | matches.filter(pl.col('left_Company Name') != pl.col('right_Company Name')).head() 60 | ``` 61 | 62 | ## High Level Functions 63 | In the rest of this document the names, `Series` and `DataFrame`, refer to the familiar `pandas` object types. 64 | 65 | As shown above, the library may be used together with `pandas`, and contains four high level functions (`match_strings`, `match_most_similar`, `group_similar_strings`, and `compute_pairwise_similarities`) that can be used directly, and one class (`StringGrouper`) that allows for a more interactive approach. 
66 | 67 | The permitted calling patterns of the four functions, and their return types, are: 68 | 69 | | Function | Parameters | `pandas` Return Type | 70 | | -------------: |:-------------|:-----:| 71 | | `match_strings`| `(master, **kwargs)`| `DataFrame` | 72 | | `match_strings`| `(master, duplicates, **kwargs)`| `DataFrame` | 73 | | `match_strings`| `(master, master_id=id_series, **kwargs)`| `DataFrame` | 74 | | `match_strings`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` | 75 | | `match_most_similar`| `(master, duplicates, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)| 76 | | `match_most_similar`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` | 77 | | `group_similar_strings`| `(strings_to_group, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)| 78 | | `group_similar_strings`| `(strings_to_group, strings_id, **kwargs)`| `DataFrame` | 79 | | `compute_pairwise_similarities`| `(string_series_1, string_series_2, **kwargs)`| `Series` | 80 | 81 | 82 | 83 | ## Generic Parameters 84 | 85 | |Name | Description | 86 | |:--- | :--- | 87 | |**`master`** | A `Series` of strings to be matched with themselves (or with those in `duplicates`). | 88 | |**`duplicates`** | A `Series` of strings to be matched with those of `master`. | 89 | |**`master_id`** (or `id_series`) | A `Series` of IDs corresponding to the strings in `master`. | 90 | |**`duplicates_id`** | A `Series` of IDs corresponding to the strings in `duplicates`. | 91 | |**`strings_to_group`** | A `Series` of strings to be grouped. | 92 | |**`strings_id`** | A `Series` of IDs corresponding to the strings in `strings_to_group`. | 93 | |**`string_series_1(_2)`** | A `Series` of strings each of which is to be compared with its corresponding string in `string_series_2(_1)`. 
| 94 | |**`**kwargs`** | Keyword arguments (see [below](#kwargs)).| 95 | 96 | 97 | ## StringGrouper Class 98 | 99 | The above-mentioned functions are all built using the [StringGrouper](references/sg_class.md) class. This class can be used for a more 100 | interactive approach: each of the high-level functions listed above also has a `StringGrouper` 101 | method counterpart of the same name and parameters. Calling such a method of any instance of `StringGrouper` will not 102 | rebuild the instance's underlying corpus to make string-comparisons but rather use it to perform the string-comparisons. 103 | The input Series to the method (`master`, `duplicates`, and so on) will thus be encoded, 104 | or transformed, into tf-idf matrices, using this corpus. See [StringGrouper](references/sg_class.md) for further 105 | details. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # String Grouper 2 | 3 | [![pypi](https://badgen.net/pypi/v/string-grouper)](https://pypi.org/project/string-grouper) 4 | [![license](https://badgen.net/pypi/license/string_grouper)](https://github.com/Bergvca/string_grouper) 5 | [![lastcommit](https://badgen.net/github/last-commit/Bergvca/string_grouper)](https://github.com/Bergvca/string_grouper) 6 | [![codecov](https://codecov.io/gh/Bergvca/string_grouper/branch/master/graph/badge.svg?token=AGK441CQDT)](https://codecov.io/gh/Bergvca/string_grouper) 7 | [![PyPI Downloads](https://static.pepy.tech/badge/string-grouper)](https://pepy.tech/projects/string-grouper) 8 | 9 | 10 | <details>
11 | Click to see image 12 |
13 |
14 | 15 | The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`). 16 | 17 | The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. 18 | 19 | The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score. 20 | 21 |
———
22 | 23 | This image was designed using the graph-visualization software Gephi 0.9.2 with data generated by `string_grouper` operating on the [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file. 24 | 25 | --- 26 |
27 | 28 | **`string_grouper`** is a library that makes finding groups of similar strings within a single, or multiple, lists of 29 | strings easy — and _fast_. **`string_grouper`** uses **tf-idf** to calculate [**cosine similarities**](https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a) 30 | within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html). 31 | 32 | 33 | ## Installing 34 | 35 | `pip install string-grouper` 36 | 37 | ## Speed 38 | 39 | **`string_grouper`** leverages the blazingly fast [sparse_dot_topn](https://github.com/ing-bank/sparse_dot_topn) library 40 | to calculate cosine similarities. 41 | 42 | ```python 43 | s = datetime.datetime.now() 44 | matches = match_strings(names['Company Name'], number_of_processes = 4) 45 | 46 | e = datetime.datetime.now() 47 | diff = (e - s) 48 | str(diff) 49 | ``` 50 | Results in: 51 | 52 | `00:05:34.65` On an Intel i7-6500U CPU @ 2.50GHz, where `len(names)` = 663 000 53 | 54 | *in other words*, 55 | the library is able to perform fuzzy matching of 663 000 names in _five and a half minutes_ 56 | on a 2015 consumer CPU using 4 cores. 
57 | 58 | ## Simple Match 59 | 60 | ```python 61 | import pandas as pd 62 | from string_grouper import match_strings 63 | 64 | company_names = 'sec__edgar_company_info.csv' 65 | companies = pd.read_csv(company_names) 66 | # Create all matches: 67 | matches = match_strings(companies['Company Name']) 68 | # Look at only the non-exact matches: 69 | matches[matches['left_Company Name'] != matches['right_Company Name']].head() 70 | ``` 71 | 72 | | | left_index | left_Company Name | similarity | right_Company Name | right_index | 73 | |----:|-------------:|:------------------------------------------------------------|-------------:|:----------------------------------------|--------------:| 74 | | 15 | 14 | 0210, LLC | 0.870291 | 90210 LLC | 4211 | 75 | | 167 | 165 | 1 800 MUTUALS ADVISOR SERIES | 0.931615 | 1 800 MUTUALS ADVISORS SERIES | 166 | 76 | | 168 | 166 | 1 800 MUTUALS ADVISORS SERIES | 0.931615 | 1 800 MUTUALS ADVISOR SERIES | 165 | 77 | | 172 | 168 | 1 800 RADIATOR FRANCHISE INC | 1 | 1-800-RADIATOR FRANCHISE INC. 
| 201 | 78 | | 178 | 173 | 1 FINANCIAL MARKETPLACE SECURITIES LLC /BD | 0.949364 | 1 FINANCIAL MARKETPLACE SECURITIES, LLC | 174 | 79 | 80 | 81 | ## Group Similar Strings and Find most Common 82 | 83 | ```python 84 | companies[["group-id", "name_deduped"]] = group_similar_strings(companies['Company Name']) 85 | companies.groupby('name_deduped')['Line Number'].count().sort_values(ascending=False).head(10) 86 | ``` 87 | | name_deduped | Line Number | 88 | |:---------------------------------------------------|--------------:| 89 | | ADVISORS DISCIPLINED TRUST | 1747 | 90 | | NUVEEN TAX EXEMPT UNIT TRUST SERIES 1 | 916 | 91 | | GUGGENHEIM DEFINED PORTFOLIOS, SERIES 1200 | 652 | 92 | | U S TECHNOLOGIES INC | 632 | 93 | | CAPITAL MANAGEMENT LLC | 628 | 94 | | CLAYMORE SECURITIES DEFINED PORTFOLIOS, SERIES 200 | 611 | 95 | | E ACQUISITION CORP | 561 | 96 | | CAPITAL PARTNERS LP | 561 | 97 | | FIRST TRUST COMBINED SERIES 1 | 560 | 98 | | PRINCIPAL LIFE INCOME FUNDINGS TRUST 20 | 544 | 99 | 100 | ## Documentation 101 | 102 | The documentation can be found [here](https://bergvca.github.io/string_grouper/) 103 | -------------------------------------------------------------------------------- /string_grouper_utils/string_grouper_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import List, Optional, Union 3 | from dateutil.parser import parse 4 | from dateutil.tz import UTC 5 | from numbers import Number 6 | from datetime import datetime 7 | import re 8 | import pydoc 9 | 10 | 11 | def new_group_rep_by_earliest_timestamp(grouped_data: pd.DataFrame, 12 | group_col: Union[str, int], 13 | record_id_col: Union[str, int], 14 | timestamps: Union[pd.Series, str, int], 15 | record_name_col: Optional[Union[str, int]] = None, 16 | parserinfo=None, 17 | **kwargs) -> Union[pd.DataFrame, pd.Series]: 18 | """ 19 | Selects the oldest string in each group as group-representative. 
20 | :param grouped_data: The grouped DataFrame 21 | :param group_col: The name or positional index of the column in grouped_data containing the groups 22 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs 23 | (This will appear in the output) 24 | :param timestamps: pandas.Series or the column name (str) or column positional index (int) in grouped_data 25 | This contains the timestamps of the strings to be grouped. 26 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with 27 | all groups' members' names. (This will appear in the output.) 28 | :param parserinfo: (See below.) 29 | :param **kwargs: (See below.) 30 | parserinfo and kwargs are the same arguments as those you would pass to dateutil.parser.parse. They help in 31 | interpreting the string inputs which are to be parsed into datetime datatypes. 32 | 33 | FYI, the dateutil.parser.parse documentation for these arguments follows: 34 | """ 35 | if isinstance(timestamps, pd.Series): 36 | if len(grouped_data) != len(timestamps): 37 | raise Exception('Both grouped_data and timestamps must be pandas.Series of the same length.') 38 | else: 39 | timestamps = get_column(timestamps, grouped_data) 40 | weights = parse_timestamps(timestamps, parserinfo, **kwargs) 41 | return group_rep_transform('idxmin', weights, grouped_data, group_col, record_id_col, record_name_col) 42 | 43 | 44 | def new_group_rep_by_completeness(grouped_data: pd.DataFrame, 45 | group_col: Union[str, int], 46 | record_id_col: Union[str, int], 47 | record_name_col: Optional[Union[str, int]] = None, 48 | tested_cols: Optional[Union[pd.DataFrame, List[Union[str, int]]]] = None 49 | ) -> Union[pd.DataFrame, pd.Series]: 50 | """ 51 | Selects the string in the group with the most filled-in row/record as group-representative. 
52 | :param grouped_data: The grouped DataFrame 53 | :param group_col: The name or positional index of the column in grouped_data containing the groups 54 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs 55 | (This will appear in the output) 56 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with 57 | all groups' members' names. (This will appear in the output.) 58 | :param tested_cols: (Optional) pandas.DataFrame or list of column names/indices of grouped_data whose 59 | filled-in statuses are used to determine the new group-representative. 60 | If it is None then the entire group_data itself is used 61 | The input DataFrame of fields of the strings to be grouped. 62 | """ 63 | if isinstance(tested_cols, pd.DataFrame): 64 | if len(grouped_data) != len(tested_cols): 65 | raise Exception('Both grouped_data and tested_cols must be pandas.DataFrame of the same length.') 66 | elif tested_cols is not None: 67 | tested_cols = get_column(tested_cols, grouped_data) 68 | else: 69 | tested_cols = grouped_data 70 | 71 | def is_notnull_and_not_empty(x): 72 | if x == '' or pd.isnull(x): 73 | return 0 74 | else: 75 | return 1 76 | 77 | weights = tested_cols.applymap(is_notnull_and_not_empty).sum(axis=1) 78 | return group_rep_transform('idxmax', weights, grouped_data, group_col, record_id_col, record_name_col) 79 | 80 | 81 | def new_group_rep_by_highest_weight(grouped_data: pd.DataFrame, 82 | group_col: Union[str, int], 83 | record_id_col: Union[str, int], 84 | weights: Union[pd.Series, str, int], 85 | record_name_col: Optional[Union[str, int]] = None, 86 | ) -> Union[pd.DataFrame, pd.Series]: 87 | """ 88 | Selects the string in the group with the largest weight as group-representative. 
89 | :param grouped_data: The grouped DataFrame 90 | :param group_col: The name or positional index of the column in grouped_data containing the groups 91 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs 92 | (This will appear in the output) 93 | :param weights: pandas.Series or the column name (str) or column positional index (int) in grouped_data 94 | containing the user-defined weights of the strings to be grouped 95 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with 96 | all groups' members' names. (This will appear in the output.) 97 | """ 98 | if isinstance(weights, pd.Series): 99 | if len(grouped_data) != len(weights): 100 | raise Exception('Both grouped_data and weights must be pandas.Series of the same length.') 101 | else: 102 | weights = get_column(weights, grouped_data) 103 | return group_rep_transform('idxmax', weights, grouped_data, group_col, record_id_col, record_name_col) 104 | 105 | 106 | def group_rep_transform(method: str, 107 | weights: pd.Series, 108 | grouped_data, 109 | group_col, 110 | record_id_col, 111 | record_name_col) -> Union[pd.Series, pd.DataFrame]: 112 | stashed_index = grouped_data.index 113 | group_of_master_id = get_column(group_col, grouped_data).reset_index(drop=True) 114 | group_of_master_id = group_of_master_id.rename('raw_group_id').reset_index().rename(columns={'index': 'weight'}) 115 | group_of_master_id['weight'] = weights.reset_index(drop=True) 116 | group_of_master_id['group_rep'] = \ 117 | group_of_master_id.groupby('raw_group_id', sort=False)['weight'].transform(method) 118 | record_id_col = get_column(record_id_col, grouped_data) 119 | new_rep = record_id_col.iloc[group_of_master_id.group_rep].reset_index(drop=True).rename(None) 120 | if record_name_col is None: 121 | output = new_rep 122 | else: 123 | record_name_col = get_column(record_name_col, grouped_data) 124 | new_rep_name = 
record_name_col.iloc[group_of_master_id.group_rep].reset_index(drop=True).rename(None) 125 | output = pd.concat([new_rep, new_rep_name], axis=1) 126 | output.index = stashed_index 127 | return output 128 | 129 | 130 | def get_column(col: Union[str, int, List[Union[str, int]]], data: pd.DataFrame): 131 | if isinstance(col, str): 132 | return data.loc[:, col] 133 | elif isinstance(col, int): 134 | return data.iloc[:, col] 135 | elif isinstance(col, List): 136 | return pd.concat([get_column(m, data) for m in col], axis=1) 137 | 138 | 139 | def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Series: 140 | error_msg = "timestamps must be a Series of date-like or datetime-like strings" 141 | error_msg += " or datetime datatype or pandas Timestamp datatype or numbers" 142 | if is_series_of_type(str, timestamps): 143 | # if any of the strings is not datetime-like raise an exception 144 | if timestamps.to_frame().applymap(is_date).squeeze().all(): 145 | # convert strings to numpy datetime64 146 | return timestamps.transform(lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC)) 147 | elif is_series_of_type(type(pd.Timestamp('15-1-2000')), timestamps): 148 | # convert pandas Timestamps to numpy datetime64 149 | return timestamps.transform(lambda x: x.to_numpy()) 150 | elif is_series_of_type(datetime, timestamps): 151 | # convert python datetimes to numpy datetime64 152 | return timestamps.transform(lambda x: x.astimezone(UTC)) 153 | elif is_series_of_type(Number, timestamps): 154 | return timestamps 155 | raise Exception(error_msg) 156 | 157 | 158 | def is_date(string, parserinfo=None, **kwargs): 159 | """ 160 | Return whether the string can be interpreted as a date. 161 | :param string: str, string to check for date 162 | :param parserinfo: (See below.) 163 | :param **kwargs: (See below.) 164 | parserinfo and kwargs are the same arguments as those you would pass to dateutil.parser.parse. 
They help in 165 | interpreting the string inputs which are to be parsed into datetime datatypes. 166 | """ 167 | try: 168 | parse(string, parserinfo, **kwargs) 169 | return True 170 | except ValueError: 171 | return False 172 | 173 | 174 | def is_series_of_type(what: type, series_to_test: pd.Series) -> bool: 175 | if series_to_test.to_frame().applymap( 176 | lambda x: not isinstance(x, what) 177 | ).squeeze().any(): 178 | return False 179 | return True 180 | 181 | 182 | # The following lines modify and append the kwargs portion of the docstring of dateutil.parser.parse to 183 | # the docstring of new_group_rep_by_earliest_timestamp: 184 | parse_docstring_kwargs = re.search(':param parserinfo:.*?:return:', pydoc.render_doc(parse), flags=re.DOTALL).group(0) 185 | parse_docstring_kwargs = re.sub( 186 | '``timestr``', 187 | 'the strings containing the date/time-stamps', 188 | parse_docstring_kwargs 189 | ) 190 | new_group_rep_by_earliest_timestamp.__doc__ = new_group_rep_by_earliest_timestamp.__doc__ + \ 191 | parse_docstring_kwargs[:-9] 192 | -------------------------------------------------------------------------------- /docs/references/sg_class.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: String Grouper Class 3 | --- 4 | 5 | 6 | ## Concept 7 | 8 | All functions are built using a class **`StringGrouper`**. This class can be used through pre-defined functions, for example the four high level functions above, as well as using a more interactive approach where matches can be added or removed if needed by calling the **`StringGrouper`** class directly. 9 | 10 | 11 | The four functions mentioned above all create a `StringGrouper` object behind the scenes and call different functions on it. The `StringGrouper` class keeps track of all tuples of similar strings and creates the groups out of these. 
Since matches are often not perfect, a common workflow is to: 12 | 13 | ## Example 1 - reuse the same tf-idf corpus without rebuilding 14 | 15 | ```python 16 | # Build a corpus using strings in the pandas Series master: 17 | sg = StringGrouper(master) 18 | # The following method-calls will compare strings first in 19 | # pandas Series new_master_1 and next in new_master_2 20 | # using the corpus already built above without rebuilding or 21 | # changing it in any way: 22 | matches1 = sg.match_strings(new_master_1) 23 | matches2 = sg.match_strings(new_master_2) 24 | ``` 25 | 26 | ## Example 2 - add and remove matches 27 | 28 | 1. Create matches 29 | 2. Manually inspect the results 30 | 3. Add and remove matches where necessary 31 | 4. Create groups of similar strings 32 | 33 | The `StringGrouper` class allows for this without having to re-calculate the cosine similarity matrix. See below for an example. 34 | 35 | 36 | ```python 37 | company_names = './data/sec_edgar_company_info.csv' 38 | companies = pd.read_csv(company_names) 39 | ``` 40 | 41 | 1. Create matches 42 | 43 | 44 | ```python 45 | # Create a new StringGrouper 46 | string_grouper = StringGrouper(companies['Company Name'], ignore_index=True) 47 | # Check if the ngram function does what we expect: 48 | string_grouper.n_grams('McDonalds') 49 | ``` 50 | 51 | ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds'] 52 | 53 | ```python 54 | string_grouper.n_grams('ÀbracâDABRÀ') 55 | ``` 56 | 57 | ['abr', 'bra', 'rac', 'aca', 'cad', 'ada', 'dab', 'abr', 'bra'] 58 | 59 | ```python 60 | # Now fit the StringGrouper - this will take a while since we are calculating cosine similarities on 600k strings 61 | string_grouper = string_grouper.fit() 62 | ``` 63 | 64 | ```python 65 | # Add the grouped strings 66 | companies['deduplicated_name'] = string_grouper.get_groups() 67 | ``` 68 | 69 | Suppose we know that PWC HOLDING CORP and PRICEWATERHOUSECOOPERS LLP are the same company. 
StringGrouper will not match these since they are not similar enough. 70 | 71 | 72 | ```python 73 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')] 74 | ``` 75 | 76 | 77 |
78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 |
Line NumberCompany NameCompany CIK Keydeduplicated_name
478441478442PRICEWATERHOUSECOOPERS LLP /TA1064284PRICEWATERHOUSECOOPERS LLP /TA
478442478443PRICEWATERHOUSECOOPERS LLP1186612PRICEWATERHOUSECOOPERS LLP /TA
478443478444PRICEWATERHOUSECOOPERS SECURITIES LLC1018444PRICEWATERHOUSECOOPERS LLP /TA
112 |
113 | 114 | 115 | ```python 116 | companies[companies.deduplicated_name.str.contains('PWC')] 117 | ``` 118 | 119 | 120 |
121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 |
Line NumberCompany NameCompany CIK Keydeduplicated_name
485535485536PWC CAPITAL INC.1690640PWC CAPITAL INC.
485536485537PWC HOLDING CORP1456450PWC HOLDING CORP
485537485538PWC INVESTORS, LLC1480311PWC INVESTORS, LLC
485538485539PWC REAL ESTATE VALUE FUND I LLC1668928PWC REAL ESTATE VALUE FUND I LLC
485539485540PWC SECURITIES CORP /BD1023989PWC SECURITIES CORP /BD
485540485541PWC SECURITIES CORPORATION1023989PWC SECURITIES CORPORATION
485541485542PWCC LTD1172241PWCC LTD
485542485543PWCG BROKERAGE, INC.67301PWCG BROKERAGE, INC.
190 |
191 | 192 | 193 | We can add these with the add function: 194 | 195 | 196 | ```python 197 | string_grouper = string_grouper.add_match('PRICEWATERHOUSECOOPERS LLP', 'PWC HOLDING CORP') 198 | companies['deduplicated_name'] = string_grouper.get_groups() 199 | # Now lets check again: 200 | 201 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')] 202 | ``` 203 | 204 | 205 |
206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 |
Line NumberCompany NameCompany CIK Keydeduplicated_name
478441478442PRICEWATERHOUSECOOPERS LLP /TA1064284PRICEWATERHOUSECOOPERS LLP /TA
478442478443PRICEWATERHOUSECOOPERS LLP1186612PRICEWATERHOUSECOOPERS LLP /TA
478443478444PRICEWATERHOUSECOOPERS SECURITIES LLC1018444PRICEWATERHOUSECOOPERS LLP /TA
485536485537PWC HOLDING CORP1456450PRICEWATERHOUSECOOPERS LLP /TA
247 |
248 | 249 | 250 | This can also be used to merge two groups: 251 | 252 | 253 | ```python 254 | string_grouper = string_grouper.add_match('PRICEWATERHOUSECOOPERS LLP', 'ZUCKER MICHAEL') 255 | companies['deduplicated_name'] = string_grouper.get_groups() 256 | 257 | # Now lets check again: 258 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')] 259 | ``` 260 | 261 | 262 |
263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 |
Line NumberCompany NameCompany CIK Keydeduplicated_name
478441478442PRICEWATERHOUSECOOPERS LLP /TA1064284PRICEWATERHOUSECOOPERS LLP /TA
478442478443PRICEWATERHOUSECOOPERS LLP1186612PRICEWATERHOUSECOOPERS LLP /TA
478443478444PRICEWATERHOUSECOOPERS SECURITIES LLC1018444PRICEWATERHOUSECOOPERS LLP /TA
485536485537PWC HOLDING CORP1456450PRICEWATERHOUSECOOPERS LLP /TA
662585662586ZUCKER MICHAEL1629018PRICEWATERHOUSECOOPERS LLP /TA
662604662605ZUCKERMAN MICHAEL1303321PRICEWATERHOUSECOOPERS LLP /TA
662605662606ZUCKERMAN MICHAEL1496366PRICEWATERHOUSECOOPERS LLP /TA
325 |
326 | 327 | 328 | We can remove strings from groups in the same way: 329 | 330 | 331 | ```python 332 | string_grouper = string_grouper.remove_match('PRICEWATERHOUSECOOPERS LLP', 'ZUCKER MICHAEL') 333 | companies['deduplicated_name'] = string_grouper.get_groups() 334 | 335 | # Now lets check again: 336 | companies[companies.deduplicated_name.str.contains('PRICEWATERHOUSECOOPERS LLP')] 337 | ``` 338 | 339 | 340 |
341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 |
Line NumberCompany NameCompany CIK Keydeduplicated_name
478441478442PRICEWATERHOUSECOOPERS LLP /TA1064284PRICEWATERHOUSECOOPERS LLP /TA
478442478443PRICEWATERHOUSECOOPERS LLP1186612PRICEWATERHOUSECOOPERS LLP /TA
478443478444PRICEWATERHOUSECOOPERS SECURITIES LLC1018444PRICEWATERHOUSECOOPERS LLP /TA
485536485537PWC HOLDING CORP1456450PRICEWATERHOUSECOOPERS LLP /TA
382 |
383 | 384 | -------------------------------------------------------------------------------- /tutorials/tutorial_1.md: -------------------------------------------------------------------------------- 1 | # Finding Duplicates With IDs In String Grouper 2 | 3 | ## Introduction 4 | 5 | A common requirement in data clean-up is the scenario where a data set (database, pandas DataFrame) has multiple database records for the same entity and duplicates need to be found. This example will not cover the task of merging or removing duplicate records — what it will do is use String Grouper to find duplicate records using the match_strings function and the optional IDs functionality. 6 | 7 | For the example we will use [this](accounts.csv) simple data set. The number of rows is not important, the 'name' column has a number of typical cases of types of variations in spelling. 8 | 9 | ``` 10 | id,name 11 | AA012345X,mega enterprises corp. 12 | BB016741P,mega enterprises corporation 13 | CC052345T,mega corp. 14 | AA098762D,hyper startup inc. 15 | BB099931J,hyper-startup inc. 16 | CC082744L,hyper startup incorporated 17 | HH072982K,hyper hyper inc. 18 | AA903844B,slow and steady inc. 19 | BB904941H,slow and steady incorporated 20 | CC903844B,slow steady inc. 21 | AA777431C,abc enterprises inc. 22 | BB760431Y,a.b.c. enterprises incorporated 23 | BB750431M,a.b.c. enterprises inc. 24 | ZZ123456H,one and only inc. 25 | ``` 26 | 27 | ## Example 28 | 29 | The steps below will process the above sample file using String Grouper to search for matches in the values in the 'name' column. The results shown in the tables at each step are based on the sample data above. 
30 | 31 | ### Setup 32 | 33 | ```python 34 | import pandas as pd 35 | from string_grouper import match_strings 36 | ``` 37 | 38 | ### Import Data 39 | 40 | ***Tip:*** Assuming the data set will come from an external database, for optimum performance only do an export of the ID column, and the text column that matching will be done on, and convert the text data column (**not the ID column**) to lower case. 41 | 42 | #### Import the sample data. 43 | 44 | ```python 45 | accounts = pd.read_csv('string_grouper/tutorials/accounts.csv') 46 | # Show dataframe 47 | accounts 48 | ``` 49 | 50 | #### Result (first three rows only shown): 51 | 52 |
53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 |
idname
0AA012345Xmega enterprises corp.
1BB016741Pmega enterprises corporation
2CC052345Tmega corp.
.........
84 |
85 | 86 | 87 | ### Find matches, assign to new pandas variable 88 | 89 | Next, use the `match_strings` function and pass the 'name' column as the argument to the `master` parameter, and the 'id' column as the argument to the `master_id` parameter. 90 | 91 | **N.B.** In production with a real data set, depending on its size, the following command can/may take a number of minutes — ***no update/progress indicator is shown***. This obviously also depends on the performance of the computer used. Memory and hard disk performance are a factor, as well as the CPU. String Grouper uses pandas which, in turn, uses NumPy, so matching is not done by computationally intensive looping, but by [array mathematics](https://realpython.com/numpy-array-programming/) — but it still may take some time to process large data sets. 92 | 93 | ```python 94 | matches = match_strings(accounts['name'], master_id = accounts['id'], ignore_index=True) 95 | matches 96 | ``` 97 | This will return a pandas DataFrame as below. The values (company) we will focus on in this example will be those that have variations in the name of the fictitious company, 'Hyper Startup Inc.'. 98 | 99 | 100 |
101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 |
left_idleft_namesimilarityright_nameright_id
..................
3AA098762Dhyper startup inc.1.00hyper-startup inc.BB099931J
4AA098762Dhyper startup inc.1.00hyper startup inc.AA098762D
5BB099931Jhyper-startup inc.1.00hyper-startup inc.BB099931J
6BB099931Jhyper-startup inc.1.00hyper startup inc.AA098762D
7CC082744Lhyper startup incorporated1.00hyper startup incorporatedCC082744L
..................
171 |
172 | 173 | 174 | In a pattern-matching process, each value in a row of the column being matched is checked against *every other value* in the column. 175 | 176 | Processing this using typical Python looping code would mean, in the case of a 100,000 row data set, that the total iterations would be 100,000² = 10 Billion. Processing that number of iterations might require replacing the CPU of the computer after each investigation! Well maybe not ... but you *would* have time for a few cups of coffee. String Grouper works in a totally different way. 177 | 178 | In the resultant DataFrame above, we see the IDs (AA098762D, BB099931J) having each a group of two values — once where a close match is found, and once where its own record (value) is found. The third ID, CC082744L, is only returned once, even though it is pretty clear that it would be a variation of our fictitious company 'Hyper Startup Inc.' 179 | 180 | 181 | ### Using the 'Minimum Similarity' keyword argument 182 | 183 | String Grouper has a number of configuration options (see the **kwargs** in README.md). The option of interest in the above case is `min_similarity`. 184 | 185 | The default minimum similarity is 0.8. It can be seen that more matches may be found by reducing the minimum similarity from 0.8 to, for example, 0.7. 186 | 187 | ```python 188 | matches = match_strings(accounts['name'], master_id = accounts['id'], ignore_index = True, min_similarity = 0.7) 189 | ``` 190 | 191 | ***Tip:*** If the data set being matched is large, and you wish to experiment with the minimum similarity option, it may be helpful to import only a limited data set during testing, and increase to the full data set when ready. The number of rows imported can be specified in this way: 192 | 193 | ```python 194 | # We only look at the first 50k as an example 195 | accounts = pd.read_csv('/path/to/folder/huge_file.csv')[0:50000] 196 | ``` 197 | 198 | Back to our example ... 
changing the option to `min_similarity = 0.7` returns this: 199 | 200 |
201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 |
left_idleft_namesimilarityright_nameright_id
..................
5AA098762Dhyper startup inc.1.00hyper-startup inc.BB099931J
6AA098762Dhyper startup inc.1.00hyper startup inc.AA098762D
7AA098762Dhyper startup inc.0.78hyper startup incorporatedCC082744L
8BB099931Jhyper-startup inc.1.00hyper-startup inc.BB099931J
9BB099931Jhyper-startup inc.1.00hyper startup inc.AA098762D
10BB099931Jhyper-startup inc.0.78hyper startup incorporatedCC082744L
11CC082744Lhyper startup incorporated1.00hyper startup incorporatedCC082744L
12CC082744Lhyper startup incorporated0.78hyper-startup inc.BB099931J
13CC082744Lhyper startup incorporated0.78hyper startup inc.AA098762D
14HH072982Khyper hyper inc.1.00hyper hyper inc.HH072982K
..................
311 |
312 | 313 | Now we see the IDs — AA098762D, BB099931J, CC082744L — have further matches. Each 'name' value has two other matching rows (IDs). However, we see that setting minimum similarity to 0.7 has still not matched 'hyper hyper inc.' (ID HH072982K) even though a person would judge that the 'name' is a match. The minimum similarity setting can be adjusted up and down until it is considered that most duplicates are being matched. If so, we can progress. 314 | 315 | ### Removing identical rows 316 | 317 | Once we are happy with the level of matching, we can remove the rows where the IDs are the same. Having the original (database) IDs for the rows means that we can precisely remove identical rows — that is, we are not removing matches based on similar values, but on the exact (database) IDs: 318 | 319 | ```python 320 | dupes = matches[matches.left_id != matches.right_id] 321 | dupes 322 | ``` 323 | And we see the following for the company name we have been following: 324 | 325 | 326 |
327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 |
left_idleft_namesimilarityright_nameright_id
..................
5AA098762Dhyper startup inc.1.00hyper-startup inc.BB099931J
7AA098762Dhyper startup inc.0.78hyper startup incorporatedCC082744L
9BB099931Jhyper-startup inc.1.00hyper startup inc.AA098762D
10BB099931Jhyper-startup inc.0.78hyper startup incorporatedCC082744L
12CC082744Lhyper startup incorporated0.78hyper-startup inc.BB099931J
13CC082744Lhyper startup incorporated0.78hyper startup inc.AA098762D
..................
405 |
406 | 407 | ***N.B.** the pandas index number 14 has gone because the left and right side IDs were identical.* 408 | 409 | ### Reduce data to unique rows having duplicate IDs 410 | 411 | Finally we reduce the data to a pandas Series ready for exporting with one row for each record that has any duplicates. 412 | 413 | ```python 414 | company_dupes = pd.DataFrame(dupes.left_id.unique()).squeeze().rename('company_id') 415 | company_dupes 416 | ``` 417 | 418 | This gives the following result: 419 | 420 | ``` 421 | 0 AA012345X 422 | 1 BB016741P 423 | 2 AA098762D 424 | 3 BB099931J 425 | 4 CC082744L 426 | 5 AA903844B 427 | 6 BB904941H 428 | 7 AA777431C 429 | 8 BB760431Y 430 | 9 BB750431M 431 | Name: company_id, dtype: object 432 | ``` 433 | 434 | How this is processed, as with any database clean-up, is out of the scope of this tutorial. A first step however could be: 435 | 436 | 1. Import the list of database IDs into the relevant database as a temporary table 437 | 1. Do an inner-join with the original table the data was exported from and sort ascending by the 'name' column 438 | 439 | This will return filtered rows with the 'name' field in adjacent rows showing similar matched strings. 440 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | 2 | In this section we will cover a few use cases for which string_grouper may be used. We will use the same data set of company names as used in: [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html). 
3 | 4 | ### Find all matches within a single data set 5 | 6 | 7 | ```python 8 | import pandas as pd 9 | import numpy as np 10 | from string_grouper import match_strings, match_most_similar, \ 11 | group_similar_strings, compute_pairwise_similarities, \ 12 | StringGrouper 13 | ``` 14 | 15 | 16 | ```python 17 | company_names = './data/sec_edgar_company_info.csv' 18 | # We only look at the first 50k as an example: 19 | companies = pd.read_csv(company_names)[0:50000] 20 | # Create all matches: 21 | matches = match_strings(companies['Company Name']) 22 | # Look at only the non-exact matches: 23 | matches[matches['left_Company Name'] != matches['right_Company Name']].head() 24 | ``` 25 | 26 | 27 |
28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 |
left_indexleft_Company Namesimilarityright_Company Nameright_index
15140210, LLC0.87029190210 LLC4211
1671651 800 MUTUALS ADVISOR SERIES0.9316151 800 MUTUALS ADVISORS SERIES166
1681661 800 MUTUALS ADVISORS SERIES0.9316151 800 MUTUALS ADVISOR SERIES165
1721681 800 RADIATOR FRANCHISE INC1.0000001-800-RADIATOR FRANCHISE INC.201
1781731 FINANCIAL MARKETPLACE SECURITIES LLC ...0.9493641 FINANCIAL MARKETPLACE SECURITIES, LLC174
82 |
83 | 84 | 85 | ### Find all matches in between two data sets. 86 | The `match_strings` function finds similar items between two data sets as well. This can be seen as an inner join between two data sets: 87 | 88 | 89 | ```python 90 | # Create a small set of artificial company names: 91 | duplicates = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar', 'B4UTRADE COM CORP']) 92 | # Create all matches: 93 | matches = match_strings(companies['Company Name'], duplicates) 94 | matches 95 | ``` 96 | 97 | 98 |
99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 |
left_indexleft_Company Namesimilarityright_sideright_index
012012 SMILE.COMMUNICATIONS LTD0.944092012 SMILE.COMMUNICATIONS1
149777B.A.S. MEDIA GROUP0.854383S MEDIA GROUP0
249855B4UTRADE COM CORP1.000000B4UTRADE COM CORP3
349856B4UTRADE COM INC0.810217B4UTRADE COM CORP3
449857B4UTRADE CORP0.878276B4UTRADE COM CORP3
153 |
154 | 155 | 156 | Out of the four company names in `duplicates`, three companies are found in the original company data set. One company is found three times. 157 | 158 | ### Finding duplicates from a (database extract to) DataFrame where IDs for rows are supplied. 159 | 160 | A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps though the process with an example data set is available. 161 | 162 | 163 | ### For a second data set, find only the most similar match 164 | 165 | In the example above, it's possible that multiple matches are found for a single string. Sometimes we just want a string to match with a single most similar string. If there are no similar strings found, the original string should be returned: 166 | 167 | 168 | ```python 169 | # Create a small set of artificial company names: 170 | new_companies = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar', 'B4UTRADE COM CORP'],\ 171 | name='New Company') 172 | # Create all matches: 173 | matches = match_most_similar(companies['Company Name'], new_companies, ignore_index=True) 174 | # Display the results: 175 | pd.concat([new_companies, matches], axis=1) 176 | ``` 177 | 178 | 179 |
180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 |
New Companymost_similar_Company Name
0S MEDIA GROUPB.A.S. MEDIA GROUP
1012 SMILE.COMMUNICATIONS012 SMILE.COMMUNICATIONS LTD
2foo barfoo bar
3B4UTRADE COM CORPB4UTRADE COM CORP
211 |
212 | 213 | 214 | 215 | ### Deduplicate a single data set and show items with most duplicates 216 | 217 | The `group_similar_strings` function groups strings that are similar using a single linkage clustering algorithm. That is, if item A and item B are similar; and item B and item C are similar; but the similarity between A and C is below the threshold; then all three items are grouped together. 218 | 219 | ```python 220 | # Add the grouped strings: 221 | companies['deduplicated_name'] = group_similar_strings(companies['Company Name'], 222 | ignore_index=True) 223 | # Show items with most duplicates: 224 | companies.groupby('deduplicated_name')['Line Number'].count().sort_values(ascending=False).head(10) 225 | ``` 226 | 227 | 228 | 229 | 230 | deduplicated_name 231 | ADVISORS DISCIPLINED TRUST 1824 232 | AGL LIFE ASSURANCE CO SEPARATE ACCOUNT 183 233 | ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-FUNDS, LLC 116 234 | AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 2001-1 87 235 | ACE SECURITIES CORP. HOME EQUITY LOAN TRUST, SERIES 2006-HE2 57 236 | ASSET-BACKED PASS-THROUGH CERTIFICATES SERIES 2004-W1 40 237 | ALLSTATE LIFE GLOBAL FUNDING TRUST 2005-3 39 238 | ALLY AUTO RECEIVABLES TRUST 2014-1 33 239 | ANDERSON ROBERT E / 28 240 | ADVENT INTERNATIONAL GPE VIII LIMITED PARTNERSHIP 28 241 | Name: Line Number, dtype: int64 242 | 243 | 244 | The `group_similar_strings` function also works with IDs: imagine a `DataFrame` (`customers_df`) with the following content: 245 | ```python 246 | # Create a small set of artificial customer names: 247 | customers_df = pd.DataFrame( 248 | [ 249 | ('BB016741P', 'Mega Enterprises Corporation'), 250 | ('CC082744L', 'Hyper Startup Incorporated'), 251 | ('AA098762D', 'Hyper Startup Inc.'), 252 | ('BB099931J', 'Hyper-Startup Inc.'), 253 | ('HH072982K', 'Hyper Hyper Inc.') 254 | ], 255 | columns=('Customer ID', 'Customer Name') 256 | ).set_index('Customer ID') 257 | # Display the data: 258 | customers_df 259 | ``` 260 | 261 |
262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 |
Customer Name
Customer ID
BB016741PMega Enterprises Corporation
CC082744LHyper Startup Incorporated
AA098762DHyper Startup Inc.
BB099931JHyper-Startup Inc.
HH072982KHyper Hyper Inc.
296 |
297 | 298 | The output of `group_similar_strings` can be directly used as a mapping table: 299 | ```python 300 | # Group customers with similar names: 301 | customers_df[["group-id", "name_deduped"]] = \ 302 | group_similar_strings(customers_df["Customer Name"]) 303 | # Display the mapping table: 304 | customers_df 305 | ``` 306 | 307 |
308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 |
Customer Namegroup-idname_deduped
Customer ID
BB016741PMega Enterprises CorporationBB016741PMega Enterprises Corporation
CC082744LHyper Startup IncorporatedCC082744LHyper Startup Incorporated
AA098762DHyper Startup Inc.AA098762DHyper Startup Inc.
BB099931JHyper-Startup Inc.AA098762DHyper Startup Inc.
HH072982KHyper Hyper Inc.HH072982KHyper Hyper Inc.
356 |
357 | 358 | Note that here `customers_df` initially had only one column "Customer Name" (before the `group_similar_strings` function call); and it acquired two more columns "group-id" (the index-column) and "name_deduped" after the call through a "[setting with enlargement](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#setting-with-enlargement)" (a `pandas` feature). 359 | 360 | ### Simply compute the cosine similarities of pairs of strings 361 | 362 | Sometimes we have pairs of strings that have already been matched but whose similarity scores need to be computed. For this purpose we provide the function `compute_pairwise_similarities`: 363 | 364 | ```python 365 | # Create a small DataFrame of pairs of strings: 366 | pair_s = pd.DataFrame( 367 | [ 368 | ('Mega Enterprises Corporation', 'Mega Enterprises Corporation'), 369 | ('Hyper Startup Inc.', 'Hyper Startup Incorporated'), 370 | ('Hyper Startup Inc.', 'Hyper Startup Inc.'), 371 | ('Hyper Startup Inc.', 'Hyper-Startup Inc.'), 372 | ('Hyper Hyper Inc.', 'Hyper Hyper Inc.'), 373 | ('Mega Enterprises Corporation', 'Mega Enterprises Corp.') 374 | ], 375 | columns=('left', 'right') 376 | ) 377 | # Display the data: 378 | pair_s 379 | ``` 380 | 381 | 382 | 383 | 384 |
385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 |
leftright
0Mega Enterprises CorporationMega Enterprises Corporation
1Hyper Startup Inc.Hyper Startup Incorporated
2Hyper Startup Inc.Hyper Startup Inc.
3Hyper Startup Inc.Hyper-Startup Inc.
4Hyper Hyper Inc.Hyper Hyper Inc.
5Mega Enterprises CorporationMega Enterprises Corp.
426 |
427 | 428 | 429 | 430 | 431 | ```python 432 | # Compute their cosine similarities and display them: 433 | pair_s['similarity'] = compute_pairwise_similarities(pair_s['left'], pair_s['right']) 434 | pair_s 435 | ``` 436 | 437 | 438 | 439 | 440 |
441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 |
leftrightsimilarity
0Mega Enterprises CorporationMega Enterprises Corporation1.000000
1Hyper Startup Inc.Hyper Startup Incorporated0.633620
2Hyper Startup Inc.Hyper Startup Inc.1.000000
3Hyper Startup Inc.Hyper-Startup Inc.1.000000
4Hyper Hyper Inc.Hyper Hyper Inc.1.000000
5Mega Enterprises CorporationMega Enterprises Corp.0.826463
489 |
"""Unit tests for the group-representative helper functions in
``string_grouper_utils``:

* ``new_group_rep_by_earliest_timestamp`` -- pick, per group, the member with
  the earliest timestamp (numbers are also accepted and compared as-is);
* ``new_group_rep_by_highest_weight``     -- pick the member with the highest
  weight;
* ``new_group_rep_by_completeness``       -- pick the member with the most
  filled-in (non-empty) fields.

Each helper returns one representative per input row: a ``pd.Series`` of
representative IDs when no record column is given, otherwise a
``pd.DataFrame`` of (representative ID, record-column value) pairs.
"""
import unittest

import pandas as pd
from dateutil.parser import parse

from string_grouper_utils.string_grouper_utils import (
    new_group_rep_by_completeness,
    new_group_rep_by_earliest_timestamp,
    new_group_rep_by_highest_weight,
)


class SimpleExample(object):
    """Shared fixture: a small pre-grouped customer table plus the expected
    group representatives for each selection strategy.

    Expected-result attribute suffixes:
        TS -- earliest timestamp, returned as a Series (no record column);
        T  -- earliest timestamp, returned as a DataFrame;
        TW -- earliest "timestamp" when the numeric ``weight`` column is
              supplied as the timestamp input;
        W  -- highest weight;
        C  -- most complete record.
    """

    def __init__(self):
        # Columns: record ID, name, three "completeness" fields (some left
        # empty on purpose), a weight, a timezone-aware timestamp string, and
        # the group (ID, name) each record was assigned to.
        self.customers_df = pd.DataFrame(
            [
                ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2,
                 '2014-12-30 10:55:00-02:00', 'EE059082Q', 'Mega Enterprises Corp.'),
                ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5, '2017-01-01 20:23:15-05:00',
                 'BB099931J', 'Hyper-Startup Inc.'),
                ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3,
                 '2020-10-20 15:29:30+02:00', 'BB099931J', 'Hyper-Startup Inc.'),
                ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1,
                 '2013-07-01 03:34:45-05:00', 'BB099931J', 'Hyper-Startup Inc.'),
                ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9, '2005-09-11 11:56:00-07:00',
                 'HH072982K', 'Hyper Hyper Inc.'),
                ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0,
                 '1998-04-14 09:21:11+00:00', 'EE059082Q', 'Mega Enterprises Corp.')
            ],
            columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight', 'timestamp',
                     'group ID', 'group name')
        )
        # new_group_rep_by_earliest_timestamp(customers_df, 'group ID', 'Customer ID', 'timestamp')
        self.expected_result_TS = pd.Series(
            [
                'EE059082Q',
                'BB099931J',
                'BB099931J',
                'BB099931J',
                'HH072982K',
                'EE059082Q',
            ]
        )
        # new_group_rep_by_earliest_timestamp(
        #     customers_df, 'group ID', 'Customer ID', 'timestamp', 'Customer Name')
        self.expected_result_T = pd.DataFrame(
            [
                ('EE059082Q', 'Mega Enterprises Corp.'),
                ('BB099931J', 'Hyper-Startup Inc.'),
                ('BB099931J', 'Hyper-Startup Inc.'),
                ('BB099931J', 'Hyper-Startup Inc.'),
                ('HH072982K', 'Hyper Hyper Inc.'),
                ('EE059082Q', 'Mega Enterprises Corp.')
            ]
        )
        # new_group_rep_by_earliest_timestamp(
        #     customers_df, 'group ID', 'Customer ID', customers_df['weight'], 'Customer Name')
        # -- plain numbers accepted in place of timestamps.
        self.expected_result_TW = pd.DataFrame(
            [
                ('BB016741P', 'Mega Enterprises Corporation'),
                ('BB099931J', 'Hyper-Startup Inc.'),
                ('BB099931J', 'Hyper-Startup Inc.'),
                ('BB099931J', 'Hyper-Startup Inc.'),
                ('HH072982K', 'Hyper Hyper Inc.'),
                ('BB016741P', 'Mega Enterprises Corporation')
            ]
        )
        # new_group_rep_by_highest_weight(
        #     customers_df, 'group ID', 'Customer ID', 'weight', 'Customer Name')
        self.expected_result_W = pd.DataFrame(
            [
                ('EE059082Q', 'Mega Enterprises Corp.'),
                ('CC082744L', 'Hyper Startup Incorporated'),
                ('CC082744L', 'Hyper Startup Incorporated'),
                ('CC082744L', 'Hyper Startup Incorporated'),
                ('HH072982K', 'Hyper Hyper Inc.'),
                ('EE059082Q', 'Mega Enterprises Corp.')
            ]
        )
        # new_group_rep_by_completeness(
        #     customers_df, 'group ID', 'Customer ID', 'Customer Name')
        # (The previous comment here wrongly cited new_group_rep_by_highest_weight.)
        self.expected_result_C = pd.DataFrame(
            [
                ('BB016741P', 'Mega Enterprises Corporation'),
                ('AA098762D', 'Hyper Startup Inc.'),
                ('AA098762D', 'Hyper Startup Inc.'),
                ('AA098762D', 'Hyper Startup Inc.'),
                ('HH072982K', 'Hyper Hyper Inc.'),
                ('BB016741P', 'Mega Enterprises Corporation')
            ]
        )


class StringGrouperUtilTest(unittest.TestCase):
    """Behavioural tests for the three group-representative helpers."""

    def setUp(self):
        # Fresh fixture per test so mutating tests cannot leak state.
        self.example = SimpleExample()
        self.customers_df = self.example.customers_df

    # ------------------------------------------------------------------
    # new_group_rep_by_earliest_timestamp
    # ------------------------------------------------------------------
    def test_group_rep_by_timestamp_return_series(self):
        """Without a record column the helper returns a pd.Series -- one
        representative ID per input row, the group member with the earliest
        timestamp."""
        result = new_group_rep_by_earliest_timestamp(
            self.customers_df, 'group ID', 'Customer ID', 'timestamp')
        pd.testing.assert_series_equal(self.example.expected_result_TS, result)

    def test_group_rep_by_timestamp_return_dataframe(self):
        """With a record column the helper returns a pd.DataFrame of
        (representative ID, record value) pairs, one row per input row."""
        result = new_group_rep_by_earliest_timestamp(
            self.customers_df, 'group ID', 'Customer ID', 'timestamp', 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_T, result)

    def test_group_rep_by_timestamp_series_input(self):
        """Timestamps may be passed as a pd.Series instead of a column name."""
        result = new_group_rep_by_earliest_timestamp(
            self.customers_df, 'group ID', 'Customer ID',
            self.customers_df['timestamp'], 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_T, result)

    def test_group_rep_by_timestamp_input_series_length(self):
        """A timestamps Series shorter than grouped_data must raise."""
        with self.assertRaises(Exception):
            _ = new_group_rep_by_earliest_timestamp(
                self.customers_df, 'group ID', 'Customer ID',
                self.customers_df['timestamp'].iloc[:-2], 'Customer Name')

    def test_group_rep_by_timestamp_bad_input_timestamp_strings(self):
        """A Series of strings that is not datetime-like must raise."""
        with self.assertRaises(Exception):
            _ = new_group_rep_by_earliest_timestamp(
                self.customers_df, 'group ID', 'Customer ID',
                self.customers_df['Customer ID'], 'Customer Name')

    def test_group_rep_by_timestamp_pandas_timestamps(self):
        """pd.Timestamp values (not just strings) are accepted as timestamps."""
        df = self.customers_df.copy()
        df['timestamp'] = df['timestamp'].transform(lambda t: pd.Timestamp(t))
        result = new_group_rep_by_earliest_timestamp(
            df, 'group ID', 'Customer ID', df['timestamp'], 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_T, result)

    def test_group_rep_by_timestamp_dateutil_timestamps(self):
        """dateutil-parsed datetime values are accepted as timestamps."""
        df = self.customers_df.copy()
        df['timestamp'] = df['timestamp'].transform(lambda t: parse(t))
        result = new_group_rep_by_earliest_timestamp(
            df, 'group ID', 'Customer ID', df['timestamp'], 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_T, result)

    def test_group_rep_by_timestamp_bad_nonstring_timestamps(self):
        """A mixture of datetime-like and number-like timestamps must raise."""
        df = self.customers_df.copy()
        df.at[0, 'timestamp'] = 1.0
        with self.assertRaises(Exception):
            _ = new_group_rep_by_earliest_timestamp(
                df, 'group ID', 'Customer ID', df['timestamp'], 'Customer Name')

    def test_group_rep_by_timestamp_input_numbers(self):
        """A purely numeric Series is accepted: the smallest number per group
        plays the role of the earliest timestamp."""
        result = new_group_rep_by_earliest_timestamp(
            self.customers_df, 'group ID', 'Customer ID',
            self.customers_df['weight'], 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_TW, result)

    # ------------------------------------------------------------------
    # new_group_rep_by_highest_weight
    # ------------------------------------------------------------------
    def test_group_rep_by_weight(self):
        """Representative is the group member with the highest weight
        (weights given by column name)."""
        result = new_group_rep_by_highest_weight(
            self.customers_df, 'group ID', 'Customer ID', 'weight', 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_W, result)

    def test_group_rep_by_weight_input_series(self):
        """Weights may be passed as a pd.Series instead of a column name."""
        result = new_group_rep_by_highest_weight(
            self.customers_df, 'group ID', 'Customer ID',
            self.customers_df['weight'], 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_W, result)

    def test_group_rep_by_weight_input_series_length(self):
        """A weights Series shorter than grouped_data must raise."""
        with self.assertRaises(Exception):
            _ = new_group_rep_by_highest_weight(
                self.customers_df, 'group ID', 'Customer ID',
                self.customers_df['weight'].iloc[:-2], 'Customer Name')

    # ------------------------------------------------------------------
    # new_group_rep_by_completeness
    # ------------------------------------------------------------------
    def test_group_rep_by_completeness_column_list(self):
        """Representative is the member with the most filled-in fields among
        the tested columns, given here as positional column indexes."""
        result = new_group_rep_by_completeness(
            self.customers_df, 'group ID', 'Customer ID', 'Customer Name',
            [1, 2, 3, 4])
        pd.testing.assert_frame_equal(self.example.expected_result_C, result)

    def test_group_rep_by_completeness_no_columns(self):
        """Omitting tested_cols checks completeness over the whole record."""
        result = new_group_rep_by_completeness(
            self.customers_df, 'group ID', 'Customer ID', 'Customer Name')
        pd.testing.assert_frame_equal(self.example.expected_result_C, result)

    def test_group_rep_by_completeness_input_dataframe(self):
        """tested_cols may also be given as a DataFrame of the same length."""
        result = new_group_rep_by_completeness(
            self.customers_df, 'group ID', 'Customer ID', 'Customer Name',
            self.customers_df)
        pd.testing.assert_frame_equal(self.example.expected_result_C, result)

    def test_group_rep_by_completeness_input_dataframe_length(self):
        """A tested_cols DataFrame shorter than grouped_data must raise."""
        with self.assertRaises(Exception):
            _ = new_group_rep_by_completeness(
                self.customers_df, 'group ID', 'Customer ID', 'Customer Name',
                self.customers_df.iloc[:-2, :])


if __name__ == '__main__':
    unittest.main()
| companies_df = pd.read_csv('data/sec__edgar_company_info.csv')[0:50000] 13 | ``` 14 | 15 | 16 | ```python 17 | master = companies_df['Company Name'] 18 | master_id = companies_df['Line Number'] 19 | duplicates = pd.Series(["ADVISORS DISCIPLINED TRUST", "ADVISORS DISCIPLINED TRUST '18"]) 20 | duplicates_id = pd.Series([3, 5]) 21 | ``` 22 | 23 | #### When ID's are passed as arguments: 24 | By default, zero-similarity matches are found and output when `min_similarity = 0`: 25 | 26 | 27 | ```python 28 | string_grouper = StringGrouper( 29 | master = master, 30 | duplicates=duplicates, 31 | master_id=master_id, 32 | duplicates_id=duplicates_id, 33 | ignore_index=True, 34 | min_similarity = 0, 35 | max_n_matches = 10000, 36 | regex = "[,-./#]" 37 | ).fit() 38 | string_grouper.get_matches() 39 | ``` 40 | 41 | 42 | 43 | 44 |
45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 |
left_Company Nameleft_Line Numbersimilarityright_idright_side
0#1 ARIZONA DISCOUNT PROPERTIES LLC30.0911573ADVISORS DISCIPLINED TRUST
1#1 ARIZONA DISCOUNT PROPERTIES LLC30.0638615ADVISORS DISCIPLINED TRUST '18
205 CAT THIEF/GOLD IN MY STARS LLC210.0153133ADVISORS DISCIPLINED TRUST
305 CAT THIEF/GOLD IN MY STARS LLC210.0107285ADVISORS DISCIPLINED TRUST '18
405 DIXIE UNION/UNDER FIRE LLC220.0253973ADVISORS DISCIPLINED TRUST
..................
99995ALLDREDGE WILLIAM T217460.0000003ADVISORS DISCIPLINED TRUST
99996ALLEN SAMUEL R221830.0000005ADVISORS DISCIPLINED TRUST '18
99997ATSP INNOVATIONS, LLC452730.0000005ADVISORS DISCIPLINED TRUST '18
99998ATLAS IDF, LP448770.0000005ADVISORS DISCIPLINED TRUST '18
99999AU LEO Y455350.0000005ADVISORS DISCIPLINED TRUST '18
147 |

100000 rows × 5 columns

148 |
149 | 150 | 151 | 152 | #### `StringGrouper` also includes option `include_zeroes`: 153 | 154 | 155 | ```python 156 | string_grouper = StringGrouper( 157 | master = master, 158 | duplicates=duplicates, 159 | master_id=master_id, 160 | duplicates_id=duplicates_id, 161 | ignore_index=True, 162 | min_similarity = 0, 163 | max_n_matches = 10000, 164 | regex = "[,-./#]", 165 | include_zeroes = False 166 | ).fit() 167 | string_grouper.get_matches() 168 | ``` 169 | 170 | 171 | 172 | 173 |
174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 |
left_Company Nameleft_Line Numbersimilarityright_idright_side
0#1 ARIZONA DISCOUNT PROPERTIES LLC30.0911573ADVISORS DISCIPLINED TRUST
1#1 ARIZONA DISCOUNT PROPERTIES LLC30.0638615ADVISORS DISCIPLINED TRUST '18
205 CAT THIEF/GOLD IN MY STARS LLC210.0153133ADVISORS DISCIPLINED TRUST
305 CAT THIEF/GOLD IN MY STARS LLC210.0107285ADVISORS DISCIPLINED TRUST '18
405 DIXIE UNION/UNDER FIRE LLC220.0253973ADVISORS DISCIPLINED TRUST
..................
28754BAAPLIFE3-2015, LLC499760.0218305ADVISORS DISCIPLINED TRUST '18
28755BAAPLIFE4-2016, LLC499770.0309833ADVISORS DISCIPLINED TRUST
28756BAAPLIFE4-2016, LLC499770.0217065ADVISORS DISCIPLINED TRUST '18
28757BABA JOE DIAMOND VENTURES US INC.499890.0270643ADVISORS DISCIPLINED TRUST
28758BABA JOE DIAMOND VENTURES US INC.499890.0189605ADVISORS DISCIPLINED TRUST '18
276 |

28759 rows × 5 columns

277 |
278 | 279 | 280 | 281 | #### `get_matches` option `include_zeroes` can override `StringGrouper` default: 282 | 283 | 284 | ```python 285 | string_grouper.get_matches(include_zeroes=True) 286 | ``` 287 | 288 | 289 | 290 | 291 |
292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 |
left_Company Nameleft_Line Numbersimilarityright_idright_side
0#1 ARIZONA DISCOUNT PROPERTIES LLC30.0911573ADVISORS DISCIPLINED TRUST
1#1 ARIZONA DISCOUNT PROPERTIES LLC30.0638615ADVISORS DISCIPLINED TRUST '18
205 CAT THIEF/GOLD IN MY STARS LLC210.0153133ADVISORS DISCIPLINED TRUST
305 CAT THIEF/GOLD IN MY STARS LLC210.0107285ADVISORS DISCIPLINED TRUST '18
405 DIXIE UNION/UNDER FIRE LLC220.0253973ADVISORS DISCIPLINED TRUST
..................
99995ALLDREDGE WILLIAM T217460.0000003ADVISORS DISCIPLINED TRUST
99996ALLEN SAMUEL R221830.0000005ADVISORS DISCIPLINED TRUST '18
99997ATSP INNOVATIONS, LLC452730.0000005ADVISORS DISCIPLINED TRUST '18
99998ATLAS IDF, LP448770.0000005ADVISORS DISCIPLINED TRUST '18
99999AU LEO Y455350.0000005ADVISORS DISCIPLINED TRUST '18
394 |

100000 rows × 5 columns

395 |
396 | 397 | 398 | 399 | #### When no ID's are passed as arguments and indexes are not set: 400 | Default indexes are output: 401 | 402 | 403 | ```python 404 | string_grouper = StringGrouper( 405 | master = master, 406 | duplicates=duplicates, 407 | min_similarity = 0, 408 | max_n_matches = 10000, 409 | regex = "[,-./#]" 410 | ).fit() 411 | string_grouper.get_matches() 412 | ``` 413 | 414 | 415 | 416 | 417 |
418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 |
left_indexleft_Company Namesimilarityright_sideright_index
02#1 ARIZONA DISCOUNT PROPERTIES LLC0.091157ADVISORS DISCIPLINED TRUST0
12#1 ARIZONA DISCOUNT PROPERTIES LLC0.063861ADVISORS DISCIPLINED TRUST '181
22005 CAT THIEF/GOLD IN MY STARS LLC0.015313ADVISORS DISCIPLINED TRUST0
32005 CAT THIEF/GOLD IN MY STARS LLC0.010728ADVISORS DISCIPLINED TRUST '181
42105 DIXIE UNION/UNDER FIRE LLC0.025397ADVISORS DISCIPLINED TRUST0
..................
9999521745ALLDREDGE WILLIAM T0.000000ADVISORS DISCIPLINED TRUST0
9999622182ALLEN SAMUEL R0.000000ADVISORS DISCIPLINED TRUST '181
9999745272ATSP INNOVATIONS, LLC0.000000ADVISORS DISCIPLINED TRUST '181
9999844876ATLAS IDF, LP0.000000ADVISORS DISCIPLINED TRUST '181
9999945534AU LEO Y0.000000ADVISORS DISCIPLINED TRUST '181
520 |

100000 rows × 5 columns

521 |
522 | 523 | 524 | 525 | #### When no ID's are passed as arguments but indexes are set: 526 | Indexes are output: 527 | 528 | 529 | ```python 530 | master.index = pd.Index(master_id) 531 | duplicates.index = pd.Index(duplicates_id) 532 | string_grouper = StringGrouper( 533 | master = master, 534 | duplicates=duplicates, 535 | min_similarity = 0, 536 | max_n_matches = 10000, 537 | regex = "[,-./#]" 538 | ).fit() 539 | string_grouper.get_matches() 540 | ``` 541 | 542 | 543 | 544 | 545 |
546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 |
left_Line Numberleft_Company Namesimilarityright_sideright_index
03#1 ARIZONA DISCOUNT PROPERTIES LLC0.091157ADVISORS DISCIPLINED TRUST3
13#1 ARIZONA DISCOUNT PROPERTIES LLC0.063861ADVISORS DISCIPLINED TRUST '185
22105 CAT THIEF/GOLD IN MY STARS LLC0.015313ADVISORS DISCIPLINED TRUST3
32105 CAT THIEF/GOLD IN MY STARS LLC0.010728ADVISORS DISCIPLINED TRUST '185
42205 DIXIE UNION/UNDER FIRE LLC0.025397ADVISORS DISCIPLINED TRUST3
..................
9999521746ALLDREDGE WILLIAM T0.000000ADVISORS DISCIPLINED TRUST3
9999622183ALLEN SAMUEL R0.000000ADVISORS DISCIPLINED TRUST '185
9999745273ATSP INNOVATIONS, LLC0.000000ADVISORS DISCIPLINED TRUST '185
9999844877ATLAS IDF, LP0.000000ADVISORS DISCIPLINED TRUST '185
9999945535AU LEO Y0.000000ADVISORS DISCIPLINED TRUST '185
648 |

100000 rows × 5 columns

649 |
650 | 651 | 652 | -------------------------------------------------------------------------------- /tutorials/group_representatives.md: -------------------------------------------------------------------------------- 1 | # Group Representatives 2 | ------ 3 | 4 | 5 | ```python 6 | import pandas as pd 7 | from string_grouper import group_similar_strings 8 | ``` 9 | 10 | We have already seen that string_grouper has a function group_similar_strings() that partitions a Series of strings into groups based on their degree of mutual similarity. To represent each group, group_similar_strings() chooses one member of the group. The default choice is the so-called ***centroid*** of the group. 11 | 12 | The **centroid** of a group of similar strings is that string in the group which has the highest ***similarity aggregate***. 13 | 14 | The **similarity aggregate** of a string is the sum of all the cosine similarities between it and the strings that it matches. 15 | 16 | This choice can also be specified by setting the following keyword argument of group_similar_strings: 17 | `group_rep='centroid'`. 18 | 19 | group_similar_strings() has an alternative choice of group representative which is specified by setting `group_rep='first'`. This choice is merely the first member of the group according to its index (that is, its position in the order of appearance of members in the group). Though somewhat arbitrary, this choice is the fastest and can be used for large datasets whenever the choice of group-representative is not important. 20 | 21 | |`group_rep='first'`| 22 | |:---:| 23 | |**`group_rep='centroid'`**| 24 | 25 | But the user may not be satisfied with group_similar_strings()' only two available choices. For example, he/she might prefer the earliest recorded string in the group to represent the group (if timestamp metadata is available). 
Fortunately, there are three other choices available in an auxiliary module named `string_grouper_utils` included in the package and which can be imported whenever necessary: 26 | 27 | 28 | ```python 29 | from string_grouper_utils import new_group_rep_by_highest_weight, \ 30 | new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness 31 | ``` 32 | 33 | string_grouper_utils provides three high-level functions `new_group_rep_by_highest_weight()`, `new_group_rep_by_earliest_timestamp()`, and `new_group_rep_by_completeness()`. These functions change the group-representatives of data that have already been grouped (by group_similar_strings(), for example). 34 | 35 | Let us create a DataFrame with some artificial timestamped records: 36 | 37 | 38 | ```python 39 | customers_df = pd.DataFrame( 40 | [ 41 | ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2, '2014-12-30 10:55:00-02:00'), 42 | ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5, '2017-01-01 20:23:15-05:00'), 43 | ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3, '2020-10-20 15:29:30+02:00'), 44 | ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1, '2013-07-01 03:34:45-05:00'), 45 | ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9, '2005-09-11 11:56:00-07:00'), 46 | ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0, '1998-04-14 09:21:11+00:00') 47 | ], 48 | columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight', 'timestamp') 49 | ).set_index('Customer ID') 50 | ``` 51 | 52 | **NB.** These 'timestamps' are not actual `pandas Timestamp` datatypes --- they are strings. 
If we like, we could convert these strings to `pandas Timestamp` datatypes or datetime datatypes (from python module `datetime`), but this is not necessary because string_grouper_utils can deal with these strings just as they are and can automatically _parse_ them into (localized) `pandas Timestamp` datatypes internally for comparison as we shall soon see. 53 | 54 | Also, in this example we have used the most general timestamps, that is, each string has a date together with time-of-day and timezone information. This is not always necessary; for example, if desired, only date information can be contained in each string. 55 | 56 | Let us display the DataFrame: 57 | 58 | 59 | ```python 60 | customers_df 61 | ``` 62 | 63 | 64 | 65 | 66 |
67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 |
Customer NameAddressTelDescriptionweighttimestamp
Customer ID
BB016741PMega Enterprises CorporationAddress0Tel0Description00.22014-12-30 10:55:00-02:00
CC082744LHyper Startup IncorporatedTel10.52017-01-01 20:23:15-05:00
AA098762DHyper Startup Inc.Address2Tel2Description20.32020-10-20 15:29:30+02:00
BB099931JHyper-Startup Inc.Address3Tel3Description30.12013-07-01 03:34:45-05:00
HH072982KHyper Hyper Inc.Address4Description40.92005-09-11 11:56:00-07:00
EE059082QMega Enterprises Corp.Address5Tel5Description51.01998-04-14 09:21:11+00:00
145 |
146 | 147 | 148 | 149 | ## group_similar_strings() 150 | 151 | With the following command, we can create a mapping table with the groupings that group_similar_strings() finds. Here the keyword argument `group_rep` is not explicitly set. It therefore takes on the default value `'centroid'`. 152 | 153 | 154 | ```python 155 | customers_df[['group rep ID', 'group rep']] = \ 156 | group_similar_strings( 157 | customers_df['Customer Name'], 158 | min_similarity=0.6) 159 | ``` 160 | 161 | Let's display the mapping table: 162 | 163 | 164 | ```python 165 | customers_df 166 | ``` 167 | 168 | 169 | 170 | 171 |
172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 |
Customer NameAddressTelDescriptionweighttimestampgroup rep IDgroup rep
Customer ID
BB016741PMega Enterprises CorporationAddress0Tel0Description00.22014-12-30 10:55:00-02:00BB016741PMega Enterprises Corporation
CC082744LHyper Startup IncorporatedTel10.52017-01-01 20:23:15-05:00AA098762DHyper Startup Inc.
AA098762DHyper Startup Inc.Address2Tel2Description20.32020-10-20 15:29:30+02:00AA098762DHyper Startup Inc.
BB099931JHyper-Startup Inc.Address3Tel3Description30.12013-07-01 03:34:45-05:00AA098762DHyper Startup Inc.
HH072982KHyper Hyper Inc.Address4Description40.92005-09-11 11:56:00-07:00HH072982KHyper Hyper Inc.
EE059082QMega Enterprises Corp.Address5Tel5Description51.01998-04-14 09:21:11+00:00BB016741PMega Enterprises Corporation
266 |
267 | 268 | 269 | 270 | Let's try this again, this time with group_rep='first': 271 | 272 | 273 | ```python 274 | customers_df[['group rep ID', 'group rep']] = \ 275 | group_similar_strings( 276 | customers_df['Customer Name'], 277 | group_rep='first', 278 | min_similarity=0.6) 279 | ``` 280 | 281 | Displaying the new mapping table shows the differences from the result above: 282 | 283 | 284 | ```python 285 | customers_df 286 | ``` 287 | 288 | 289 | 290 | 291 |
292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 |
Customer NameAddressTelDescriptionweighttimestampgroup rep IDgroup rep
Customer ID
BB016741PMega Enterprises CorporationAddress0Tel0Description00.22014-12-30 10:55:00-02:00BB016741PMega Enterprises Corporation
CC082744LHyper Startup IncorporatedTel10.52017-01-01 20:23:15-05:00CC082744LHyper Startup Incorporated
AA098762DHyper Startup Inc.Address2Tel2Description20.32020-10-20 15:29:30+02:00CC082744LHyper Startup Incorporated
BB099931JHyper-Startup Inc.Address3Tel3Description30.12013-07-01 03:34:45-05:00CC082744LHyper Startup Incorporated
HH072982KHyper Hyper Inc.Address4Description40.92005-09-11 11:56:00-07:00HH072982KHyper Hyper Inc.
EE059082QMega Enterprises Corp.Address5Tel5Description51.01998-04-14 09:21:11+00:00BB016741PMega Enterprises Corporation
386 |
387 | 388 | 389 | 390 | Remember it displays the same groups! Only the group names (representatives) have changed. 391 | 392 | ## new_group_rep_by_earliest_timestamp() 393 | 394 | As mentioned above, there are still more choices of group-representatives available. Let's use the `new_group_rep_by_earliest_timestamp()` function: 395 | 396 | 397 | ```python 398 | customers_df.reset_index(inplace=True) 399 | customers_df[['group rep ID', 'group rep']] = \ 400 | new_group_rep_by_earliest_timestamp( 401 | grouped_data=customers_df, 402 | group_col='group rep ID', 403 | record_id_col='Customer ID', 404 | record_name_col='Customer Name', 405 | timestamps='timestamp', 406 | dayfirst=False 407 | ) 408 | ``` 409 | 410 | Notice that this time ***the function operates on already grouped data*** (such as the mapping table that was output by group_similar_strings() above). Thus ***the column of the input grouped data containing the groups*** (here either 'group rep ID' or 'group rep') ***must be specified as argument group_col in addition to the column containing the group members*** (here either 'Customer ID' or 'Customer Name') ***in argument record_id_col***. 411 | 412 | Argument record_name_col is optional and will appear in the output alongside the new group-representatives chosen from record_id_col only if specified. 413 | 414 | The keyword argument `dayfirst` used here is one that is also used in python module dateutil's parser.parse() function. This option specifies whether to interpret the first value in an ambiguous 3-integer date (e.g. 01/05/09) as the day ('True') or month ('False'). If keyword argument `yearfirst` is set to 'True', this distinguishes between YDM and YMD. 
415 | 416 | The other possible keyword arguments that can be used are detailed in the docstring (help) of new_group_rep_by_earliest_timestamp(): 417 | 418 | 419 | ```python 420 | help(new_group_rep_by_earliest_timestamp) 421 | ``` 422 | 423 | Help on function new_group_rep_by_earliest_timestamp in module string_grouper_utils.string_grouper_utils: 424 | 425 | new_group_rep_by_earliest_timestamp(grouped_data: pandas.core.frame.DataFrame, group_col: Union[str, int], record_id_col: Union[str, int], timestamps: Union[pandas.core.series.Series, str, int], record_name_col: Union[str, int, NoneType] = None, parserinfo=None, **kwargs) -> Union[pandas.core.frame.DataFrame, pandas.core.series.Series] 426 | Selects the oldest string in each group as group-representative. 427 | :param grouped_data: The grouped DataFrame 428 | :param group_col: The name or positional index of the column in grouped_data containing the groups 429 | :param record_id_col: The name or positional index of the column in grouped_data with all groups' members' IDs 430 | (This will appear in the output) 431 | :param timestamps: pandas.Series or the column name (str) or column positional index (int) in grouped_data 432 | This contains the timestamps of the strings to be grouped. 433 | :param record_name_col: (Optional) The name or positional index of the column in grouped_data with 434 | all groups' members' names. (This will appear in the output.) 435 | :param parserinfo: (See below.) 436 | :param **kwargs: (See below.) 437 | parserinfo and kwargs are the same arguments as those you would pass to dateutil.parser.parse. They help in 438 | interpreting the string inputs which are to be parsed into datetime datatypes. 439 | 440 | FYI, the dateutil.parser.parse documentation for these arguments follows: 441 | :param parserinfo: 442 | A :class:`parserinfo` object containing parameters for the parser. 443 | If ``None``, the default arguments to the :class:`parserinfo` 444 | constructor are used. 
445 | 446 | The ``**kwargs`` parameter takes the following keyword arguments: 447 | 448 | :param default: 449 | The default datetime object, if this is a datetime object and not 450 | ``None``, elements specified in the strings containing the date/time-stamps replace elements in the 451 | default object. 452 | 453 | :param ignoretz: 454 | If set ``True``, time zones in parsed strings are ignored and a naive 455 | :class:`datetime` object is returned. 456 | 457 | :param tzinfos: 458 | Additional time zone names / aliases which may be present in the 459 | string. This argument maps time zone names (and optionally offsets 460 | from those time zones) to time zones. This parameter can be a 461 | dictionary with timezone aliases mapping time zone names to time 462 | zones or a function taking two parameters (``tzname`` and 463 | ``tzoffset``) and returning a time zone. 464 | 465 | The timezones to which the names are mapped can be an integer 466 | offset from UTC in seconds or a :class:`tzinfo` object. 467 | 468 | .. doctest:: 469 | :options: +NORMALIZE_WHITESPACE 470 | 471 | >>> from dateutil.parser import parse 472 | >>> from dateutil.tz import gettz 473 | >>> tzinfos = {"BRST": -7200, "CST": gettz("America/Chicago")} 474 | >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos) 475 | datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -7200)) 476 | >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos) 477 | datetime.datetime(2012, 1, 19, 17, 21, 478 | tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago')) 479 | 480 | This parameter is ignored if ``ignoretz`` is set. 481 | 482 | :param dayfirst: 483 | Whether to interpret the first value in an ambiguous 3-integer date 484 | (e.g. 01/05/09) as the day (``True``) or month (``False``). If 485 | ``yearfirst`` is set to ``True``, this distinguishes between YDM and 486 | YMD. If set to ``None``, this value is retrieved from the current 487 | :class:`parserinfo` object (which itself defaults to ``False``). 
488 | 489 | :param yearfirst: 490 | Whether to interpret the first value in an ambiguous 3-integer date 491 | (e.g. 01/05/09) as the year. If ``True``, the first number is taken to 492 | be the year, otherwise the last number is taken to be the year. If 493 | this is set to ``None``, the value is retrieved from the current 494 | :class:`parserinfo` object (which itself defaults to ``False``). 495 | 496 | :param fuzzy: 497 | Whether to allow fuzzy parsing, allowing for string like "Today is 498 | January 1, 2047 at 8:21:00AM". 499 | 500 | :param fuzzy_with_tokens: 501 | If ``True``, ``fuzzy`` is automatically set to True, and the parser 502 | will return a tuple where the first element is the parsed 503 | :class:`datetime.datetime` datetimestamp and the second element is 504 | a tuple containing the portions of the string which were ignored: 505 | 506 | .. doctest:: 507 | 508 | >>> from dateutil.parser import parse 509 | >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True) 510 | (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at ')) 511 | 512 | 513 | 514 | 515 | ```python 516 | customers_df 517 | ``` 518 | 519 | 520 | 521 | 522 |
523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 |
Customer IDCustomer NameAddressTelDescriptionweighttimestampgroup rep IDgroup rep
0BB016741PMega Enterprises CorporationAddress0Tel0Description00.22014-12-30 10:55:00-02:00EE059082QMega Enterprises Corp.
1CC082744LHyper Startup IncorporatedTel10.52017-01-01 20:23:15-05:00BB099931JHyper-Startup Inc.
2AA098762DHyper Startup Inc.Address2Tel2Description20.32020-10-20 15:29:30+02:00BB099931JHyper-Startup Inc.
3BB099931JHyper-Startup Inc.Address3Tel3Description30.12013-07-01 03:34:45-05:00BB099931JHyper-Startup Inc.
4HH072982KHyper Hyper Inc.Address4Description40.92005-09-11 11:56:00-07:00HH072982KHyper Hyper Inc.
5EE059082QMega Enterprises Corp.Address5Tel5Description51.01998-04-14 09:21:11+00:00EE059082QMega Enterprises Corp.
613 |
614 | 615 | 616 | 617 | Here the group-member with the earliest timestamp has been chosen as group-representative for each group. Notice that even though the timestamp data is input as strings, the function is able to treat them as if they were datetime (or pandas Timestamp) datatypes. 618 | 619 | ## new_group_rep_by_highest_weight() and new_group_rep_by_completeness() 620 | 621 | The other two utility functions `new_group_rep_by_highest_weight()` and `new_group_rep_by_completeness()` operate in a similar way to new_group_rep_by_earliest_timestamp(): 622 | 623 | 1. new_group_rep_by_highest_weight() chooses the group-member with the highest weight as group-representative for each group. The weight of each member is assigned as desired by the user, and provided as an argument to the function. The weights could also be a specified column in the input grouped data (mapping table). 624 | 625 | 2. new_group_rep_by_completeness() chooses the group member with the most filled-in fields in its row as group-representative for each group. 626 | 627 | 628 | ```python 629 | customers_df[['group rep ID', 'group rep']] = \ 630 | new_group_rep_by_highest_weight( 631 | grouped_data=customers_df, 632 | group_col='group rep ID', 633 | record_id_col='Customer ID', 634 | weights='weight', 635 | record_name_col='Customer Name' 636 | ) 637 | ``` 638 | 639 | 640 | ```python 641 | customers_df 642 | ``` 643 | 644 | 645 | 646 | 647 |
648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 |
Customer IDCustomer NameAddressTelDescriptionweighttimestampgroup rep IDgroup rep
0BB016741PMega Enterprises CorporationAddress0Tel0Description00.22014-12-30 10:55:00-02:00EE059082QMega Enterprises Corp.
1CC082744LHyper Startup IncorporatedTel10.52017-01-01 20:23:15-05:00CC082744LHyper Startup Incorporated
2AA098762DHyper Startup Inc.Address2Tel2Description20.32020-10-20 15:29:30+02:00CC082744LHyper Startup Incorporated
3BB099931JHyper-Startup Inc.Address3Tel3Description30.12013-07-01 03:34:45-05:00CC082744LHyper Startup Incorporated
4HH072982KHyper Hyper Inc.Address4Description40.92005-09-11 11:56:00-07:00HH072982KHyper Hyper Inc.
5EE059082QMega Enterprises Corp.Address5Tel5Description51.01998-04-14 09:21:11+00:00EE059082QMega Enterprises Corp.
738 |
739 | 740 | 741 | 742 | 743 | ```python 744 | customers_df[['group rep ID', 'group rep']] = \ 745 | new_group_rep_by_completeness( 746 | grouped_data=customers_df, 747 | group_col='group rep ID', 748 | record_id_col='Customer ID', 749 | record_name_col='Customer Name', 750 | tested_cols=['Address', 'Tel', 'Description'] 751 | ) 752 | ``` 753 | 754 | **N.B.** If argument tested_cols is not given, new_group_rep_by_completeness() will test the filled-in status of all the fields of grouped_data for each group member. 755 | 756 | 757 | ```python 758 | customers_df 759 | ``` 760 | 761 | 762 | 763 | 764 |
765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 853 | 854 |
Customer IDCustomer NameAddressTelDescriptionweighttimestampgroup rep IDgroup rep
0BB016741PMega Enterprises CorporationAddress0Tel0Description00.22014-12-30 10:55:00-02:00BB016741PMega Enterprises Corporation
1CC082744LHyper Startup IncorporatedTel10.52017-01-01 20:23:15-05:00AA098762DHyper Startup Inc.
2AA098762DHyper Startup Inc.Address2Tel2Description20.32020-10-20 15:29:30+02:00AA098762DHyper Startup Inc.
3BB099931JHyper-Startup Inc.Address3Tel3Description30.12013-07-01 03:34:45-05:00AA098762DHyper Startup Inc.
4HH072982KHyper Hyper Inc.Address4Description40.92005-09-11 11:56:00-07:00HH072982KHyper Hyper Inc.
5EE059082QMega Enterprises Corp.Address5Tel5Description51.01998-04-14 09:21:11+00:00BB016741PMega Enterprises Corporation
855 |
856 | 857 | 858 | 859 | 860 | ```python 861 | 862 | ``` 863 | -------------------------------------------------------------------------------- /string_grouper/test/test_string_grouper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | import numpy as np 4 | from scipy.sparse import csr_matrix 5 | from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \ 6 | DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ 7 | StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ 8 | match_most_similar, group_similar_strings, match_strings, \ 9 | compute_pairwise_similarities 10 | from unittest.mock import patch, Mock 11 | 12 | 13 | def mock_symmetrize_matrix(x: csr_matrix) -> csr_matrix: 14 | return x 15 | 16 | 17 | class SimpleExample(object): 18 | def __init__(self): 19 | self.customers_df = pd.DataFrame( 20 | [ 21 | ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2), 22 | ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5), 23 | ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3), 24 | ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1), 25 | ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9), 26 | ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0) 27 | ], 28 | columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight') 29 | ) 30 | self.customers_df2 = pd.DataFrame( 31 | [ 32 | ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2), 33 | ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5), 34 | ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3), 35 | ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1), 36 | ('DD012339M', 'HyperStartup Inc.', 'Address4', 'Tel4', 'Description4', 
0.1), 37 | ('HH072982K', 'Hyper Hyper Inc.', 'Address5', '', 'Description5', 0.9), 38 | ('EE059082Q', 'Mega Enterprises Corp.', 'Address6', 'Tel6', 'Description6', 1.0) 39 | ], 40 | columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight') 41 | ) 42 | self.a_few_strings = pd.Series(['BB016741P', 'BB082744L', 'BB098762D', 'BB099931J', 'BB072982K', 'BB059082Q']) 43 | self.one_string = pd.Series(['BB0']) 44 | self.two_strings = pd.Series(['Hyper', 'Hyp']) 45 | self.whatever_series_1 = pd.Series(['whatever']) 46 | self.expected_result_with_zeroes = pd.DataFrame( 47 | [ 48 | (1, 'Hyper Startup Incorporated', 0.08170638, 'whatever', 0), 49 | (0, 'Mega Enterprises Corporation', 0., 'whatever', 0), 50 | (2, 'Hyper Startup Inc.', 0., 'whatever', 0), 51 | (3, 'Hyper-Startup Inc.', 0., 'whatever', 0), 52 | (4, 'Hyper Hyper Inc.', 0., 'whatever', 0), 53 | (5, 'Mega Enterprises Corp.', 0., 'whatever', 0) 54 | ], 55 | columns=['left_index', 'left_Customer Name', 'similarity', 'right_side', 'right_index'] 56 | ) 57 | self.expected_result_centroid = pd.Series( 58 | [ 59 | 'Mega Enterprises Corporation', 60 | 'Hyper Startup Inc.', 61 | 'Hyper Startup Inc.', 62 | 'Hyper Startup Inc.', 63 | 'Hyper Hyper Inc.', 64 | 'Mega Enterprises Corporation' 65 | ], 66 | name='group_rep_Customer Name' 67 | ) 68 | self.expected_result_centroid_with_index_col = pd.DataFrame( 69 | [ 70 | (0, 'Mega Enterprises Corporation'), 71 | (2, 'Hyper Startup Inc.'), 72 | (2, 'Hyper Startup Inc.'), 73 | (2, 'Hyper Startup Inc.'), 74 | (4, 'Hyper Hyper Inc.'), 75 | (0, 'Mega Enterprises Corporation') 76 | ], 77 | columns=['group_rep_index', 'group_rep_Customer Name'] 78 | ) 79 | self.expected_result_first = pd.Series( 80 | [ 81 | 'Mega Enterprises Corporation', 82 | 'Hyper Startup Incorporated', 83 | 'Hyper Startup Incorporated', 84 | 'Hyper Startup Incorporated', 85 | 'Hyper Hyper Inc.', 86 | 'Mega Enterprises Corporation' 87 | ], 88 | name='group_rep_Customer Name' 89 | ) 90 | 91 | 92 
class StringGrouperConfigTest(unittest.TestCase):

    def test_config_defaults(self):
        """Empty initialisation should set default values"""
        config = StringGrouperConfig()
        self.assertEqual(config.min_similarity, DEFAULT_MIN_SIMILARITY)
        self.assertEqual(config.max_n_matches, 20)
        self.assertEqual(config.regex, DEFAULT_REGEX)
        self.assertEqual(config.ngram_size, DEFAULT_NGRAM_SIZE)
        self.assertEqual(config.number_of_processes, DEFAULT_N_PROCESSES)
        self.assertEqual(config.ignore_case, DEFAULT_IGNORE_CASE)

    def test_config_immutable(self):
        """Configurations should be immutable"""
        config = StringGrouperConfig()
        with self.assertRaises(Exception) as _:
            config.min_similarity = 0.1

    def test_config_non_default_values(self):
        # Docstring fixed: it was copy-pasted from test_config_immutable.
        """Explicitly passed non-default option values should be stored"""
        config = StringGrouperConfig(min_similarity=0.1, max_n_matches=100, number_of_processes=1)
        self.assertEqual(0.1, config.min_similarity)
        self.assertEqual(100, config.max_n_matches)
        self.assertEqual(1, config.number_of_processes)


class StringGrouperTest(unittest.TestCase):

    def test_auto_blocking_single_DataFrame(self):
        """tests whether automatic blocking yields consistent results"""
        # This function will force an OverflowError to occur when
        # the input Series have a combined length above a given number:
        # OverflowThreshold. This will in turn trigger automatic splitting
        # of the Series/matrices into smaller blocks when n_blocks = None

        sort_cols = ['right_index', 'left_index']

        def fix_row_order(df):
            # Canonical row order so results can be compared across blockings.
            return df.sort_values(sort_cols).reset_index(drop=True)

        simple_example = SimpleExample()
        df1 = simple_example.customers_df2['Customer Name']

        # first do manual blocking
        sg = StringGrouper(df1, min_similarity=0.1)
        pd.testing.assert_series_equal(sg.master, df1)
        self.assertEqual(sg.duplicates, None)

        matches = fix_row_order(sg.match_strings(df1, n_blocks=(1, 1)))
        self.assertEqual(sg._config.n_blocks, (1, 1))

        # Create a custom wrapper for this StringGrouper instance's
        # _build_matches() method which will later be used to
        # mock _build_matches().
        # Note that we have to define the wrapper here because
        # _build_matches() is a non-static function of StringGrouper
        # and needs access to the specific StringGrouper instance sg
        # created here.
        def mock_build_matches(OverflowThreshold,
                               real_build_matches=sg._build_matches):
            def wrapper(left_matrix,
                        right_matrix,
                        nnz_rows=None,
                        sort=True):
                # Simulate an overflow for large combined inputs so that
                # auto-blocking is forced to split them.
                if (left_matrix.shape[0] + right_matrix.shape[0]) > \
                        OverflowThreshold:
                    raise OverflowError
                return real_build_matches(left_matrix, right_matrix, None)
            return wrapper

        def do_test_with(OverflowThreshold):
            nonlocal sg  # allows reference to sg, as sg will be modified below
            # Now let us mock sg._build_matches:
            sg._build_matches = Mock(side_effect=mock_build_matches(OverflowThreshold))
            sg.clear_data()
            matches_auto = fix_row_order(sg.match_strings(df1, n_blocks=None))
            pd.testing.assert_series_equal(sg.master, df1)
            pd.testing.assert_frame_equal(matches, matches_auto)
            self.assertEqual(sg._config.n_blocks, None)
            # Note that _build_matches is called more than once if and only if
            # a split occurred (that is, there was more than one pair of
            # matrix-blocks multiplied)
            if len(sg._left_Series) + len(sg._right_Series) > \
                    OverflowThreshold:
                # Assert that split occurred:
                self.assertGreater(sg._build_matches.call_count, 1)
            else:
                # Assert that split did not occur:
                self.assertEqual(sg._build_matches.call_count, 1)

        # now test auto blocking by forcing an OverflowError when the
        # combined Series' lengths is greater than 10, 5, 3, 2

        do_test_with(OverflowThreshold=100)  # does not trigger auto blocking
        do_test_with(OverflowThreshold=30)
        do_test_with(OverflowThreshold=20)
        do_test_with(OverflowThreshold=15)
        # do_test_with(OverflowThreshold=12)

    def test_n_blocks_single_DataFrame(self):
        """tests whether manual blocking yields consistent results"""
        sort_cols = ['right_index', 'left_index']

        def fix_row_order(df):
            # Canonical row order so results can be compared across blockings.
            return df.sort_values(sort_cols).reset_index(drop=True)
198 | simple_example = SimpleExample() 199 | df1 = simple_example.customers_df2['Customer Name'] 200 | 201 | matches11 = fix_row_order(match_strings(df1, min_similarity=0.1)) 202 | 203 | matches12 = fix_row_order( 204 | match_strings(df1, n_blocks=(1, 2), min_similarity=0.1)) 205 | pd.testing.assert_frame_equal(matches11, matches12) 206 | 207 | matches13 = fix_row_order( 208 | match_strings(df1, n_blocks=(1, 3), min_similarity=0.1)) 209 | pd.testing.assert_frame_equal(matches11, matches13) 210 | 211 | matches14 = fix_row_order( 212 | match_strings(df1, n_blocks=(1, 4), min_similarity=0.1)) 213 | pd.testing.assert_frame_equal(matches11, matches14) 214 | 215 | matches15 = fix_row_order( 216 | match_strings(df1, n_blocks=(1, 5), min_similarity=0.1)) 217 | pd.testing.assert_frame_equal(matches11, matches15) 218 | 219 | matches16 = fix_row_order( 220 | match_strings(df1, n_blocks=(1, 6), min_similarity=0.1)) 221 | pd.testing.assert_frame_equal(matches11, matches16) 222 | 223 | matches17 = fix_row_order( 224 | match_strings(df1, n_blocks=(1, 7), min_similarity=0.1)) 225 | pd.testing.assert_frame_equal(matches11, matches17) 226 | 227 | matches18 = fix_row_order( 228 | match_strings(df1, n_blocks=(1, 8), min_similarity=0.1)) 229 | pd.testing.assert_frame_equal(matches11, matches18) 230 | 231 | matches21 = fix_row_order( 232 | match_strings(df1, n_blocks=(2, 1), min_similarity=0.1)) 233 | pd.testing.assert_frame_equal(matches11, matches21) 234 | 235 | matches22 = fix_row_order( 236 | match_strings(df1, n_blocks=(2, 2), min_similarity=0.1)) 237 | pd.testing.assert_frame_equal(matches11, matches22) 238 | 239 | matches32 = fix_row_order( 240 | match_strings(df1, n_blocks=(3, 2), min_similarity=0.1)) 241 | pd.testing.assert_frame_equal(matches11, matches32) 242 | 243 | # Create a custom wrapper for this StringGrouper instance's 244 | # _build_matches() method which will later be used to 245 | # mock _build_matches(). 
246 | # Note that we have to define the wrapper here because 247 | # _build_matches() is a non-static function of StringGrouper 248 | # and needs access to the specific StringGrouper instance sg 249 | # created here. 250 | sg = StringGrouper(df1, min_similarity=0.1) 251 | 252 | def mock_build_matches(OverflowThreshold, 253 | real_build_matches=sg._build_matches): 254 | def wrapper(left_matrix, 255 | right_matrix, 256 | nnz_rows=None, 257 | sort=True): 258 | if (left_matrix.shape[0] + right_matrix.shape[0]) > \ 259 | OverflowThreshold: 260 | raise OverflowError 261 | return real_build_matches(left_matrix, right_matrix, None) 262 | return wrapper 263 | 264 | def test_overflow_error_with(OverflowThreshold, n_blocks): 265 | nonlocal sg 266 | sg._build_matches = Mock(side_effect=mock_build_matches(OverflowThreshold)) 267 | sg.clear_data() 268 | max_left_block_size = (len(df1)//n_blocks[0] 269 | + (1 if len(df1) % n_blocks[0] > 0 else 0)) 270 | max_right_block_size = (len(df1)//n_blocks[1] 271 | + (1 if len(df1) % n_blocks[1] > 0 else 0)) 272 | if (max_left_block_size + max_right_block_size) > OverflowThreshold: 273 | with self.assertRaises(Exception): 274 | _ = sg.match_strings(df1, n_blocks=n_blocks) 275 | else: 276 | matches_manual = fix_row_order(sg.match_strings(df1, n_blocks=n_blocks)) 277 | pd.testing.assert_frame_equal(matches11, matches_manual) 278 | 279 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(1, 1)) 280 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(1, 1)) 281 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(2, 1)) 282 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(1, 2)) 283 | test_overflow_error_with(OverflowThreshold=20, n_blocks=(4, 4)) 284 | 285 | def test_n_blocks_both_DataFrames(self): 286 | """tests whether manual blocking yields consistent results""" 287 | sort_cols = ['right_index', 'left_index'] 288 | 289 | def fix_row_order(df): 290 | return df.sort_values(sort_cols).reset_index(drop=True) 291 | 
292 | simple_example = SimpleExample() 293 | df1 = simple_example.customers_df['Customer Name'] 294 | df2 = simple_example.customers_df2['Customer Name'] 295 | 296 | matches11 = fix_row_order(match_strings(df1, df2, min_similarity=0.1)) 297 | 298 | matches12 = fix_row_order( 299 | match_strings(df1, df2, n_blocks=(1, 2), min_similarity=0.1)) 300 | pd.testing.assert_frame_equal(matches11, matches12) 301 | 302 | matches13 = fix_row_order( 303 | match_strings(df1, df2, n_blocks=(1, 3), min_similarity=0.1)) 304 | pd.testing.assert_frame_equal(matches11, matches13) 305 | 306 | matches14 = fix_row_order( 307 | match_strings(df1, df2, n_blocks=(1, 4), min_similarity=0.1)) 308 | pd.testing.assert_frame_equal(matches11, matches14) 309 | 310 | matches15 = fix_row_order( 311 | match_strings(df1, df2, n_blocks=(1, 5), min_similarity=0.1)) 312 | pd.testing.assert_frame_equal(matches11, matches15) 313 | 314 | matches16 = fix_row_order( 315 | match_strings(df1, df2, n_blocks=(1, 6), min_similarity=0.1)) 316 | pd.testing.assert_frame_equal(matches11, matches16) 317 | 318 | matches17 = fix_row_order( 319 | match_strings(df1, df2, n_blocks=(1, 7), min_similarity=0.1)) 320 | pd.testing.assert_frame_equal(matches11, matches17) 321 | 322 | matches18 = fix_row_order( 323 | match_strings(df1, df2, n_blocks=(1, 8), min_similarity=0.1)) 324 | pd.testing.assert_frame_equal(matches11, matches18) 325 | 326 | matches21 = fix_row_order( 327 | match_strings(df1, df2, n_blocks=(2, 1), min_similarity=0.1)) 328 | pd.testing.assert_frame_equal(matches11, matches21) 329 | 330 | matches22 = fix_row_order( 331 | match_strings(df1, df2, n_blocks=(2, 2), min_similarity=0.1)) 332 | pd.testing.assert_frame_equal(matches11, matches22) 333 | 334 | matches32 = fix_row_order( 335 | match_strings(df1, df2, n_blocks=(3, 2), min_similarity=0.1)) 336 | pd.testing.assert_frame_equal(matches11, matches32) 337 | 338 | def test_n_blocks_bad_option_value(self): 339 | """Tests that bad option values for n_blocks are 
caught""" 340 | simple_example = SimpleExample() 341 | df1 = simple_example.customers_df2['Customer Name'] 342 | with self.assertRaises(Exception): 343 | _ = match_strings(df1, n_blocks=2) 344 | with self.assertRaises(Exception): 345 | _ = match_strings(df1, n_blocks=(0, 2)) 346 | with self.assertRaises(Exception): 347 | _ = match_strings(df1, n_blocks=(1, 2.5)) 348 | with self.assertRaises(Exception): 349 | _ = match_strings(df1, n_blocks=(1, 2, 3)) 350 | with self.assertRaises(Exception): 351 | _ = match_strings(df1, n_blocks=(1, )) 352 | 353 | def test_tfidf_dtype_bad_option_value(self): 354 | """Tests that bad option values for n_blocks are caught""" 355 | simple_example = SimpleExample() 356 | df1 = simple_example.customers_df2['Customer Name'] 357 | with self.assertRaises(Exception): 358 | _ = match_strings(df1, tfidf_matrix_dtype=None) 359 | with self.assertRaises(Exception): 360 | _ = match_strings(df1, tfidf_matrix_dtype=0) 361 | with self.assertRaises(Exception): 362 | _ = match_strings(df1, tfidf_matrix_dtype='whatever') 363 | 364 | def test_compute_pairwise_similarities(self): 365 | """tests the high-level function compute_pairwise_similarities""" 366 | simple_example = SimpleExample() 367 | df1 = simple_example.customers_df['Customer Name'] 368 | df2 = simple_example.expected_result_centroid 369 | similarities = compute_pairwise_similarities(df1, df2) 370 | expected_result = pd.Series( 371 | [ 372 | 1.0, 373 | 0.6336195351561589, 374 | 1.0000000000000004, 375 | 1.0000000000000004, 376 | 1.0, 377 | 0.826462625999832 378 | ], 379 | name='similarity' 380 | ) 381 | expected_result = expected_result.astype(np.float64) 382 | pd.testing.assert_series_equal(expected_result, similarities) 383 | sg = StringGrouper(df1, df2) 384 | similarities = sg.compute_pairwise_similarities(df1, df2) 385 | pd.testing.assert_series_equal(expected_result, similarities) 386 | 387 | def test_compute_pairwise_similarities_data_integrity(self): 388 | """tests that an exception is 
raised whenever the lengths of the two input series of the high-level function 389 | compute_pairwise_similarities are unequal""" 390 | simple_example = SimpleExample() 391 | df1 = simple_example.customers_df['Customer Name'] 392 | df2 = simple_example.expected_result_centroid 393 | with self.assertRaises(Exception): 394 | _ = compute_pairwise_similarities(df1, df2[:-2]) 395 | 396 | @patch('string_grouper.string_grouper.StringGrouper') 397 | def test_group_similar_strings(self, mock_StringGouper): 398 | """mocks StringGrouper to test if the high-level function group_similar_strings utilizes it as expected""" 399 | mock_StringGrouper_instance = mock_StringGouper.return_value 400 | mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance 401 | mock_StringGrouper_instance.get_groups.return_value = 'whatever' 402 | 403 | test_series_1 = None 404 | test_series_id_1 = None 405 | df = group_similar_strings( 406 | test_series_1, 407 | string_ids=test_series_id_1 408 | ) 409 | 410 | mock_StringGrouper_instance.fit.assert_called_once() 411 | mock_StringGrouper_instance.get_groups.assert_called_once() 412 | self.assertEqual(df, 'whatever') 413 | 414 | @patch('string_grouper.string_grouper.StringGrouper') 415 | def test_match_most_similar(self, mock_StringGouper): 416 | """mocks StringGrouper to test if the high-level function match_most_similar utilizes it as expected""" 417 | mock_StringGrouper_instance = mock_StringGouper.return_value 418 | mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance 419 | mock_StringGrouper_instance.get_groups.return_value = 'whatever' 420 | 421 | test_series_1 = None 422 | test_series_2 = None 423 | test_series_id_1 = None 424 | test_series_id_2 = None 425 | df = match_most_similar( 426 | test_series_1, 427 | test_series_2, 428 | master_id=test_series_id_1, 429 | duplicates_id=test_series_id_2 430 | ) 431 | 432 | mock_StringGrouper_instance.fit.assert_called_once() 433 | 
mock_StringGrouper_instance.get_groups.assert_called_once() 434 | self.assertEqual(df, 'whatever') 435 | 436 | @patch('string_grouper.string_grouper.StringGrouper') 437 | def test_match_strings(self, mock_StringGouper): 438 | """mocks StringGrouper to test if the high-level function match_strings utilizes it as expected""" 439 | mock_StringGrouper_instance = mock_StringGouper.return_value 440 | mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance 441 | mock_StringGrouper_instance.get_matches.return_value = 'whatever' 442 | 443 | test_series_1 = None 444 | test_series_id_1 = None 445 | df = match_strings(test_series_1, master_id=test_series_id_1) 446 | 447 | mock_StringGrouper_instance.fit.assert_called_once() 448 | mock_StringGrouper_instance.get_matches.assert_called_once() 449 | self.assertEqual(df, 'whatever') 450 | 451 | @patch( 452 | 'string_grouper.string_grouper.StringGrouper._fix_diagonal', 453 | side_effect=mock_symmetrize_matrix 454 | ) 455 | def test_match_list_diagonal_without_the_fix(self, mock_fix_diagonal): 456 | """test fails whenever _matches_list's number of self-joins is not equal to the number of strings""" 457 | # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; 458 | # for small datasets setting max_n_matches=1 reproduces the bug 459 | simple_example = SimpleExample() 460 | df = simple_example.customers_df['Customer Name'] 461 | matches = match_strings(df, max_n_matches=1) 462 | mock_fix_diagonal.assert_called_once() 463 | num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) 464 | num_strings = len(df) 465 | self.assertNotEqual(num_self_joins, num_strings) 466 | 467 | def test_match_list_diagonal(self): 468 | """This test ensures that all self-joins are present""" 469 | # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; 470 | # for small datasets setting max_n_matches=1 reproduces the bug 471 | 
simple_example = SimpleExample() 472 | df = simple_example.customers_df['Customer Name'] 473 | matches = match_strings(df, max_n_matches=1) 474 | num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) 475 | num_strings = len(df) 476 | self.assertEqual(num_self_joins, num_strings) 477 | 478 | def test_zero_min_similarity(self): 479 | """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are 480 | returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic""" 481 | simple_example = SimpleExample() 482 | s_master = simple_example.customers_df['Customer Name'] 483 | s_dup = simple_example.whatever_series_1 484 | matches = match_strings(s_master, s_dup, min_similarity=0) 485 | pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches) 486 | 487 | def test_get_non_matches_empty_case(self): 488 | """This test ensures that _get_non_matches() returns an empty DataFrame when all pairs of strings match""" 489 | simple_example = SimpleExample() 490 | s_master = simple_example.a_few_strings 491 | s_dup = simple_example.one_string 492 | sg = StringGrouper(s_master, s_dup, max_n_matches=len(s_master), min_similarity=0).fit() 493 | self.assertTrue(sg._get_non_matches_list().empty) 494 | 495 | def test_n_grams_case_unchanged(self): 496 | """Should return all ngrams in a string with case""" 497 | test_series = pd.Series(pd.Series(['aaa'])) 498 | # Explicit do not ignore case 499 | sg = StringGrouper(test_series, ignore_case=False) 500 | expected_result = ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds'] 501 | self.assertListEqual(expected_result, sg.n_grams('McDonalds')) 502 | 503 | def test_n_grams_ignore_case_to_lower(self): 504 | """Should return all case insensitive ngrams in a string""" 505 | test_series = pd.Series(pd.Series(['aaa'])) 506 | # Explicit ignore case 507 | sg = StringGrouper(test_series, ignore_case=True) 508 | expected_result = ['mcd', 
'cdo', 'don', 'ona', 'nal', 'ald', 'lds'] 509 | self.assertListEqual(expected_result, sg.n_grams('McDonalds')) 510 | 511 | def test_n_grams_ignore_case_to_lower_with_defaults(self): 512 | """Should return all case insensitive ngrams in a string""" 513 | test_series = pd.Series(pd.Series(['aaa'])) 514 | # Implicit default case (i.e. default behaviour) 515 | sg = StringGrouper(test_series) 516 | expected_result = ['mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds'] 517 | self.assertListEqual(expected_result, sg.n_grams('McDonalds')) 518 | 519 | def test_build_matrix(self): 520 | """Should create a csr matrix only master""" 521 | test_series = pd.Series(['foo', 'bar', 'baz']) 522 | sg = StringGrouper(test_series) 523 | master, dupe = sg._get_tf_idf_matrices() 524 | c = csr_matrix([[0., 0., 1.], 525 | [1., 0., 0.], 526 | [0., 1., 0.]]) 527 | np.testing.assert_array_equal(c.toarray(), master.toarray()) 528 | np.testing.assert_array_equal(c.toarray(), dupe.toarray()) 529 | 530 | def test_build_matrix_master_and_duplicates(self): 531 | """Should create a csr matrix for master and duplicates""" 532 | test_series_1 = pd.Series(['foo', 'bar', 'baz']) 533 | test_series_2 = pd.Series(['foo', 'bar', 'bop']) 534 | sg = StringGrouper(test_series_1, test_series_2) 535 | master, dupe = sg._get_tf_idf_matrices() 536 | master_expected = csr_matrix([[0., 0., 0., 1.], 537 | [1., 0., 0., 0.], 538 | [0., 1., 0., 0.]]) 539 | dupes_expected = csr_matrix([[0., 0., 0., 1.], 540 | [1., 0., 0., 0.], 541 | [0., 0., 1., 0.]]) 542 | 543 | np.testing.assert_array_equal(master_expected.toarray(), master.toarray()) 544 | np.testing.assert_array_equal(dupes_expected.toarray(), dupe.toarray()) 545 | 546 | def test_build_matches(self): 547 | """Should create the cosine similarity matrix of two series""" 548 | test_series_1 = pd.Series(['foo', 'bar', 'baz']) 549 | test_series_2 = pd.Series(['foo', 'bar', 'bop']) 550 | sg = StringGrouper(test_series_1, test_series_2) 551 | master, dupe = 
sg._get_tf_idf_matrices() 552 | 553 | expected_matches = np.array([[1., 0., 0.], 554 | [0., 1., 0.], 555 | [0., 0., 0.]]) 556 | np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe, None).toarray()) 557 | 558 | def test_build_matches_list(self): 559 | """Should create the cosine similarity matrix of two series""" 560 | test_series_1 = pd.Series(['foo', 'bar', 'baz']) 561 | test_series_2 = pd.Series(['foo', 'bar', 'bop']) 562 | sg = StringGrouper(test_series_1, test_series_2) 563 | sg = sg.fit() 564 | master = [0, 1] 565 | dupe_side = [0, 1] 566 | similarity = [1.0, 1.0] 567 | expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) 568 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) 569 | pd.testing.assert_frame_equal(expected_df, sg._matches_list) 570 | 571 | def test_case_insensitive_build_matches_list(self): 572 | """Should create the cosine similarity matrix of two case insensitive series""" 573 | test_series_1 = pd.Series(['foo', 'BAR', 'baz']) 574 | test_series_2 = pd.Series(['FOO', 'bar', 'bop']) 575 | sg = StringGrouper(test_series_1, test_series_2) 576 | sg = sg.fit() 577 | master = [0, 1] 578 | dupe_side = [0, 1] 579 | similarity = [1.0, 1.0] 580 | expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) 581 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) 582 | pd.testing.assert_frame_equal(expected_df, sg._matches_list) 583 | 584 | def test_get_matches_two_dataframes(self): 585 | test_series_1 = pd.Series(['foo', 'bar', 'baz']) 586 | test_series_2 = pd.Series(['foo', 'bar', 'bop']) 587 | sg = StringGrouper(test_series_1, test_series_2).fit() 588 | left_side = ['foo', 'bar'] 589 | left_index = [0, 1] 590 | right_side = ['foo', 'bar'] 591 | right_index = [0, 1] 592 | similarity = [1.0, 1.0] 593 | expected_df = 
pd.DataFrame({'left_index': left_index, 'left_side': left_side, 594 | 'similarity': similarity, 595 | 'right_side': right_side, 'right_index': right_index}) 596 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) 597 | pd.testing.assert_frame_equal(expected_df, sg.get_matches()) 598 | 599 | def test_get_matches_single(self): 600 | test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo']) 601 | sg = StringGrouper(test_series_1) 602 | sg = sg.fit() 603 | left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] 604 | right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] 605 | right_index = [0, 3, 1, 2, 0, 3] 606 | left_index = [0, 0, 1, 2, 3, 3] 607 | similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 608 | expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 609 | 'similarity': similarity, 610 | 'right_side': right_side, 'right_index': right_index}) 611 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) 612 | pd.testing.assert_frame_equal(expected_df, sg.get_matches()) 613 | 614 | def test_get_matches_1_series_1_id_series(self): 615 | test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo']) 616 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) 617 | sg = StringGrouper(test_series_1, master_id=test_series_id_1) 618 | sg = sg.fit() 619 | right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] 620 | right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] 621 | right_index = [0, 3, 1, 2, 0, 3] 622 | left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] 623 | left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] 624 | left_index = [0, 0, 1, 2, 3, 3] 625 | similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 626 | similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 627 | expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 628 | 'similarity': similarity, 629 | 'right_id': right_side_id, 'right_side': right_side, 
'right_index': right_index}) 630 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) 631 | pd.testing.assert_frame_equal(expected_df, sg.get_matches()) 632 | 633 | def test_get_matches_2_series_2_id_series(self): 634 | test_series_1 = pd.Series(['foo', 'bar', 'baz']) 635 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) 636 | test_series_2 = pd.Series(['foo', 'bar', 'bop']) 637 | test_series_id_2 = pd.Series(['B0', 'B1', 'B2']) 638 | sg = StringGrouper(test_series_1, test_series_2, duplicates_id=test_series_id_2, 639 | master_id=test_series_id_1).fit() 640 | left_side = ['foo', 'bar'] 641 | left_side_id = ['A0', 'A1'] 642 | left_index = [0, 1] 643 | right_side = ['foo', 'bar'] 644 | right_side_id = ['B0', 'B1'] 645 | right_index = [0, 1] 646 | similarity = [1.0, 1.0] 647 | expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 648 | 'similarity': similarity, 649 | 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index}) 650 | expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) 651 | pd.testing.assert_frame_equal(expected_df, sg.get_matches()) 652 | 653 | def test_get_matches_raises_exception_if_unexpected_options_given(self): 654 | # When the input id data does not correspond with its string data: 655 | test_series_1 = pd.Series(['foo', 'bar', 'baz']) 656 | bad_test_series_id_1 = pd.Series(['A0', 'A1']) 657 | good_test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) 658 | test_series_2 = pd.Series(['foo', 'bar', 'bop']) 659 | bad_test_series_id_2 = pd.Series(['B0', 'B1']) 660 | good_test_series_id_2 = pd.Series(['B0', 'B1', 'B2']) 661 | with self.assertRaises(Exception): 662 | _ = StringGrouper(test_series_1, master_id=bad_test_series_id_1) 663 | with self.assertRaises(Exception): 664 | _ = StringGrouper(test_series_1, duplicates=test_series_2, duplicates_id=bad_test_series_id_2, 665 | 
master_id=good_test_series_id_1) 666 | 667 | # When the input data is ok but the option combinations are invalid: 668 | with self.assertRaises(Exception): 669 | _ = StringGrouper(test_series_1, test_series_2, master_id=good_test_series_id_1) 670 | with self.assertRaises(Exception): 671 | _ = StringGrouper(test_series_1, test_series_2, duplicates_id=good_test_series_id_2) 672 | with self.assertRaises(Exception): 673 | _ = StringGrouper(test_series_1, duplicates_id=good_test_series_id_2) 674 | with self.assertRaises(Exception): 675 | _ = StringGrouper(test_series_1, master_id=good_test_series_id_1, duplicates_id=good_test_series_id_2) 676 | with self.assertRaises(Exception): 677 | _ = StringGrouper(test_series_1, master_id=good_test_series_id_1, ignore_index=True, replace_na=True) 678 | # Here we force an exception by making the number of index-levels of duplicates different from master: 679 | # and setting replace_na=True 680 | test_series_2.index = pd.MultiIndex.from_tuples(list(zip(list('ABC'), [0, 1, 2]))) 681 | with self.assertRaises(Exception): 682 | _ = StringGrouper(test_series_1, duplicates=test_series_2, replace_na=True) 683 | 684 | def test_get_groups_single_df_group_rep_default(self): 685 | """Should return a pd.Series object with the same length as the original df. 
The series object will contain 686 | a list of the grouped strings""" 687 | simple_example = SimpleExample() 688 | customers_df = simple_example.customers_df 689 | pd.testing.assert_series_equal( 690 | simple_example.expected_result_centroid, 691 | group_similar_strings( 692 | customers_df['Customer Name'], 693 | min_similarity=0.6, 694 | ignore_index=True 695 | ) 696 | ) 697 | sg = StringGrouper(customers_df['Customer Name']) 698 | pd.testing.assert_series_equal( 699 | simple_example.expected_result_centroid, 700 | sg.group_similar_strings( 701 | customers_df['Customer Name'], 702 | min_similarity=0.6, 703 | ignore_index=True 704 | ) 705 | ) 706 | 707 | def test_get_groups_single_valued_series(self): 708 | """This test ensures that get_groups() returns a single-valued DataFrame or Series object 709 | since the input-series is also single-valued. This test was created in response to a bug discovered 710 | by George Walker""" 711 | pd.testing.assert_frame_equal( 712 | pd.DataFrame([(0, "hello")], columns=['group_rep_index', 'group_rep']), 713 | group_similar_strings( 714 | pd.Series(["hello"]), 715 | min_similarity=0.6 716 | ) 717 | ) 718 | pd.testing.assert_series_equal( 719 | pd.Series(["hello"], name='group_rep'), 720 | group_similar_strings( 721 | pd.Series(["hello"]), 722 | min_similarity=0.6, 723 | ignore_index=True 724 | ) 725 | ) 726 | pd.testing.assert_frame_equal( 727 | pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']), 728 | match_most_similar( 729 | pd.Series(["hello"]), 730 | pd.Series(["hello"]), 731 | min_similarity=0.6 732 | ) 733 | ) 734 | pd.testing.assert_frame_equal( 735 | pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']), 736 | match_most_similar( 737 | pd.Series(["hello"]), 738 | pd.Series(["hello"]), 739 | min_similarity=0.6, 740 | max_n_matches=20 741 | ) 742 | ) 743 | pd.testing.assert_series_equal( 744 | pd.Series(["hello"], name='most_similar_master'), 745 | 
match_most_similar( 746 | pd.Series(["hello"]), 747 | pd.Series(["hello"]), 748 | min_similarity=0.6, 749 | ignore_index=True 750 | ) 751 | ) 752 | 753 | def test_get_groups_single_df_keep_index(self): 754 | """Should return a pd.Series object with the same length as the original df. The series object will contain 755 | a list of the grouped strings with their indexes displayed in columns""" 756 | simple_example = SimpleExample() 757 | customers_df = simple_example.customers_df 758 | pd.testing.assert_frame_equal( 759 | simple_example.expected_result_centroid_with_index_col, 760 | group_similar_strings( 761 | customers_df['Customer Name'], 762 | min_similarity=0.6, 763 | ignore_index=False 764 | ) 765 | ) 766 | 767 | def test_get_groups_single_df_group_rep_centroid(self): 768 | """Should return a pd.Series object with the same length as the original df. The series object will contain 769 | a list of the grouped strings""" 770 | simple_example = SimpleExample() 771 | customers_df = simple_example.customers_df 772 | pd.testing.assert_series_equal( 773 | simple_example.expected_result_first, 774 | group_similar_strings( 775 | customers_df['Customer Name'], 776 | group_rep='first', 777 | min_similarity=0.6, 778 | ignore_index=True 779 | ) 780 | ) 781 | 782 | def test_get_groups_single_df_group_rep_bad_option_value(self): 783 | """Should raise an exception when group_rep value given is neither 'centroid' nor 'first'""" 784 | simple_example = SimpleExample() 785 | customers_df = simple_example.customers_df 786 | with self.assertRaises(Exception): 787 | _ = group_similar_strings( 788 | customers_df['Customer Name'], 789 | group_rep='nonsense', 790 | min_similarity=0.6 791 | ) 792 | 793 | def test_get_groups_single_df(self): 794 | """Should return a pd.Series object with the same length as the original df. 
The series object will contain 795 | a list of the grouped strings""" 796 | test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 797 | sg = StringGrouper(test_series_1, ignore_index=True) 798 | sg = sg.fit() 799 | result = sg.get_groups() 800 | expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='group_rep') 801 | pd.testing.assert_series_equal(expected_result, result) 802 | 803 | def test_get_groups_1_string_series_1_id_series(self): 804 | """Should return a pd.DataFrame object with the same length as the original df. The series object will contain 805 | a list of the grouped strings""" 806 | test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 807 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) 808 | sg = StringGrouper(test_series_1, master_id=test_series_id_1, ignore_index=True) 809 | sg = sg.fit() 810 | result = sg.get_groups() 811 | expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])), 812 | columns=['group_rep_id', 'group_rep']) 813 | pd.testing.assert_frame_equal(expected_result, result) 814 | 815 | def test_get_groups_two_df(self): 816 | """Should return a pd.Series object with the length of the dupes. 
The series will contain the master string 817 | that matches the dupe with the highest similarity""" 818 | test_series_1 = pd.Series(['foooo', 'bar', 'baz']) 819 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 820 | sg = StringGrouper(test_series_1, test_series_2, ignore_index=True) 821 | sg = sg.fit() 822 | result = sg.get_groups() 823 | expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master') 824 | pd.testing.assert_series_equal(expected_result, result) 825 | result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3) 826 | pd.testing.assert_series_equal(expected_result, result) 827 | 828 | def test_get_groups_2_string_series_2_id_series(self): 829 | """Should return a pd.DataFrame object with the length of the dupes. The series will contain the master string 830 | that matches the dupe with the highest similarity""" 831 | test_series_1 = pd.Series(['foooo', 'bar', 'baz']) 832 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 833 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) 834 | test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3']) 835 | sg = StringGrouper(test_series_1, 836 | test_series_2, 837 | master_id=test_series_id_1, 838 | duplicates_id=test_series_id_2, 839 | ignore_index=True) 840 | sg = sg.fit() 841 | result = sg.get_groups() 842 | expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])), 843 | columns=['most_similar_master_id', 'most_similar_master']) 844 | pd.testing.assert_frame_equal(expected_result, result) 845 | 846 | def test_get_groups_2_string_series_2_numeric_id_series_with_missing_master_value(self): 847 | """Should return a pd.DataFrame object with the length of the dupes. 
The series will contain the master string 848 | that matches the dupe with the highest similarity""" 849 | test_series_1 = pd.Series(['foooo', 'bar', 'foooo']) 850 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 851 | test_series_id_1 = pd.Series([0, 1, 2], dtype = "Int64") 852 | test_series_id_2 = pd.Series([100, 101, 102, 103], dtype = "Int64") 853 | sg = StringGrouper(test_series_1, 854 | test_series_2, 855 | master_id=test_series_id_1, 856 | duplicates_id=test_series_id_2, 857 | ignore_index=True) 858 | sg = sg.fit() 859 | result = sg.get_groups() 860 | expected_result = pd.DataFrame(list(zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo'])), 861 | columns=['most_similar_master_id', 'most_similar_master'] 862 | ).astype(dtype= {"most_similar_master_id":"Int64", 863 | "most_similar_master":"str"}) 864 | pd.testing.assert_frame_equal(expected_result, result) 865 | 866 | def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(self): 867 | """Should return a pd.DataFrame object with the length of the dupes. The series will contain the master string 868 | that matches the dupe with the highest similarity""" 869 | test_series_1 = pd.Series(['foooo', 'bar', 'foooo'], index = pd.Index([0, 1, 2], dtype = "Int64")) 870 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'], index = pd.Index([100, 101, 102, 103], dtype = "Int64")) 871 | sg = StringGrouper(test_series_1, test_series_2, replace_na=True) 872 | sg = sg.fit() 873 | result = sg.get_groups() 874 | expected_result = pd.DataFrame(list(zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo'])), 875 | columns=['most_similar_index', 'most_similar_master'], 876 | index=test_series_2.index).astype(dtype= {"most_similar_index":"Int64", 877 | "most_similar_master":"str"}) 878 | pd.testing.assert_frame_equal(expected_result, result) 879 | 880 | def test_get_groups_two_df_same_similarity(self): 881 | """Should return a pd.Series object with the length of the dupes. 
If there are two dupes with the same 882 | similarity, the first one is chosen""" 883 | test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo']) 884 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 885 | sg = StringGrouper(test_series_1, test_series_2, ignore_index=True) 886 | sg = sg.fit() 887 | result = sg.get_groups() 888 | expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master') 889 | pd.testing.assert_series_equal(expected_result, result) 890 | 891 | def test_get_groups_4_df_same_similarity(self): 892 | """Should return a pd.DataFrame object with the length of the dupes. If there are two dupes with the same 893 | similarity, the first one is chosen""" 894 | test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo']) 895 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 896 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) 897 | test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3']) 898 | sg = StringGrouper(test_series_1, 899 | test_series_2, 900 | master_id=test_series_id_1, 901 | duplicates_id=test_series_id_2, 902 | ignore_index=True) 903 | sg = sg.fit() 904 | result = sg.get_groups() 905 | expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])), 906 | columns=['most_similar_master_id', 'most_similar_master']) 907 | pd.testing.assert_frame_equal(expected_result, result) 908 | 909 | def test_get_groups_two_df_no_match(self): 910 | """Should return a pd.Series object with the length of the dupes. 
If no match is found in dupes, 911 | the original will be returned""" 912 | test_series_1 = pd.Series(['foooo', 'bar', 'baz']) 913 | test_series_2 = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob']) 914 | sg = StringGrouper(test_series_1, test_series_2, ignore_index=True) 915 | sg = sg.fit() 916 | result = sg.get_groups() 917 | expected_result = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'], name='most_similar_master') 918 | pd.testing.assert_series_equal(expected_result, result) 919 | 920 | def test_get_groups_4_df_no_match(self): 921 | """Should return a pd.DataFrame object with the length of the dupes. If no match is found in dupes, 922 | the original will be returned""" 923 | test_series_1 = pd.Series(['foooo', 'bar', 'baz']) 924 | test_series_2 = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob']) 925 | test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) 926 | test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4']) 927 | sg = StringGrouper(test_series_1, 928 | test_series_2, 929 | master_id=test_series_id_1, 930 | duplicates_id=test_series_id_2, 931 | ignore_index=True) 932 | sg = sg.fit() 933 | result = sg.get_groups() 934 | expected_result = pd.DataFrame(list(zip( 935 | ['A0', 'B1', 'A1', 'A2', 'A0'], ['foooo', 'dooz', 'bar', 'baz', 'foooo'] 936 | )), 937 | columns=['most_similar_master_id', 'most_similar_master'] 938 | ) 939 | pd.testing.assert_frame_equal(expected_result, result) 940 | 941 | def test_get_groups_raises_exception(self): 942 | """Should raise an exception if called before the StringGrouper is fit""" 943 | test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo']) 944 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 945 | sg = StringGrouper(test_series_1, test_series_2) 946 | with self.assertRaises(StringGrouperNotFitException): 947 | _ = sg.get_groups() 948 | 949 | def test_add_match_raises_exception_if_string_not_present(self): 950 | test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) 951 | 
test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 952 | sg = StringGrouper(test_series_1).fit() 953 | sg2 = StringGrouper(test_series_1, test_series_2).fit() 954 | with self.assertRaises(ValueError): 955 | sg.add_match('doesnt exist', 'baz') 956 | with self.assertRaises(ValueError): 957 | sg.add_match('baz', 'doesnt exist') 958 | with self.assertRaises(ValueError): 959 | sg2.add_match('doesnt exist', 'baz') 960 | with self.assertRaises(ValueError): 961 | sg2.add_match('baz', 'doesnt exist') 962 | 963 | def test_add_match_single_occurence(self): 964 | """Should add the match if there are no exact duplicates""" 965 | test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) 966 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 967 | sg = StringGrouper(test_series_1).fit() 968 | sg.add_match('no match', 'baz') 969 | matches = sg.get_matches() 970 | matches = matches[(matches.left_side == 'no match') & (matches.right_side == 'baz')] 971 | self.assertEqual(1, matches.shape[0]) 972 | sg2 = StringGrouper(test_series_1, test_series_2).fit() 973 | sg2.add_match('no match', 'bar') 974 | matches = sg2.get_matches() 975 | matches = matches[(matches.left_side == 'no match') & (matches.right_side == 'bar')] 976 | self.assertEqual(1, matches.shape[0]) 977 | 978 | def test_add_match_single_group_matches_symmetric(self): 979 | """New matches that are added to a SG with only a master series should be symmetric""" 980 | test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) 981 | sg = StringGrouper(test_series_1).fit() 982 | sg.add_match('no match', 'baz') 983 | matches = sg.get_matches() 984 | matches_1 = matches[(matches.left_side == 'no match') & (matches.right_side == 'baz')] 985 | self.assertEqual(1, matches_1.shape[0]) 986 | matches_2 = matches[(matches.left_side == 'baz') & (matches.right_side == 'no match')] 987 | self.assertEqual(1, matches_2.shape[0]) 988 | 989 | def test_add_match_multiple_occurences(self): 990 | """Should add 
multiple matches if there are exact duplicates""" 991 | test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) 992 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 993 | sg = StringGrouper(test_series_1, test_series_2).fit() 994 | sg.add_match('foooo', 'baz') 995 | matches = sg.get_matches() 996 | matches = matches[(matches.left_side == 'foooo') & (matches.right_side == 'baz')] 997 | self.assertEqual(2, matches.shape[0]) 998 | 999 | def test_remove_match(self): 1000 | """Should remove a match""" 1001 | test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooob']) 1002 | test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) 1003 | sg = StringGrouper(test_series_1).fit() 1004 | sg.remove_match('foooo', 'foooob') 1005 | matches = sg.get_matches() 1006 | matches_1 = matches[(matches.left_side == 'foooo') & (matches.right_side == 'foooob')] 1007 | # In the case of only a master series, the matches are recursive, so both variants are to be removed 1008 | matches_2 = matches[(matches.left_side == 'foooob') & (matches.right_side == 'foooo')] 1009 | self.assertEqual(0, matches_1.shape[0]) 1010 | self.assertEqual(0, matches_2.shape[0]) 1011 | 1012 | sg2 = StringGrouper(test_series_1, test_series_2).fit() 1013 | sg2.remove_match('foooo', 'foooob') 1014 | matches = sg2.get_matches() 1015 | matches = matches[(matches.left_side == 'foooo') & (matches.right_side == 'foooob')] 1016 | self.assertEqual(0, matches.shape[0]) 1017 | 1018 | def test_string_grouper_type_error(self): 1019 | """StringGrouper should raise an typeerror master or duplicates are not a series of strings""" 1020 | with self.assertRaises(TypeError): 1021 | _ = StringGrouper('foo', 'bar') 1022 | with self.assertRaises(TypeError): 1023 | _ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1])) 1024 | with self.assertRaises(TypeError): 1025 | _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j'])) 1026 | 1027 | def test_prior_matches_added(self): 1028 | 
"""When a new match is added, any pre-existing matches should also be updated""" 1029 | sample = [ 1030 | 'microsoftoffice 365 home', 1031 | 'microsoftoffice 365 pers', 1032 | 'microsoft office' 1033 | ] 1034 | 1035 | df = pd.DataFrame(sample, columns=['name']) 1036 | 1037 | sg = StringGrouper(df['name'], ignore_index=True) 1038 | sg = sg.fit() 1039 | 1040 | sg = sg.add_match('microsoft office', 'microsoftoffice 365 home') 1041 | sg = sg.add_match('microsoftoffice 365 pers', 'microsoft office') 1042 | df['deduped'] = sg.get_groups() 1043 | # All strings should now match to the same "master" string 1044 | self.assertEqual(1, len(df.deduped.unique())) 1045 | 1046 | 1047 | if __name__ == '__main__': 1048 | unittest.main() 1049 | --------------------------------------------------------------------------------