├── .flake8 ├── .git-blame-ignore-revs ├── .github ├── dependabot.yml ├── scripts │ └── asv_markdown.py └── workflows │ ├── benchmark-bot.yml │ ├── codeql-analysis.yml │ ├── lock.yml │ └── pythonpackage.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── THANKS.md ├── benchmarks ├── asv.conf.json ├── benchmarks │ ├── __init__.py │ ├── canonical.py │ ├── canonical_gazetteer.py │ ├── canonical_matching.py │ ├── common.py │ └── datasets │ │ ├── restaurant-1.csv │ │ ├── restaurant-2.csv │ │ ├── restaurant-nophone-training.csv │ │ └── restaurant-nophone.csv └── setup.py ├── dedupe ├── __init__.py ├── _typing.py ├── api.py ├── backport.py ├── blocking.py ├── branch_and_bound.py ├── canonical.py ├── canopy_index.py ├── clustering.py ├── convenience.py ├── core.py ├── cpredicates.pyx ├── datamodel.py ├── index.py ├── labeler.py ├── levenshtein.py ├── predicate_functions.py ├── predicates.py ├── py.typed ├── serializer.py ├── tfidf.py ├── training.py └── variables │ ├── __init__.py │ ├── base.py │ ├── categorical_type.py │ ├── exact.py │ ├── exists.py │ ├── interaction.py │ ├── latlong.py │ ├── price.py │ ├── set.py │ └── string.py ├── docs ├── API-documentation.rst ├── Bibliography.rst ├── Examples.rst ├── Makefile ├── Troubleshooting.rst ├── Variable-definition.rst ├── _static │ ├── css │ │ ├── bootstrap.css │ │ └── custom.css │ ├── images │ │ ├── dedupeio-logo-reversed.png │ │ └── dedupeio-logo.png │ └── js │ │ ├── bootstrap.min.js │ │ └── jquery.min.js ├── _templates │ └── layout.html ├── conf.py ├── how-it-works │ ├── Choosing-a-good-threshold.rst │ ├── Grouping-duplicates.rst │ ├── How-it-works.rst │ ├── Making-smart-comparisons.rst │ ├── Matching-records.rst │ └── Special-Cases.rst ├── index.rst └── requirements.txt ├── pyproject.toml ├── requirements.txt ├── setup.py └── tests ├── duplicateCluster_memory_case.py ├── test_api.py ├── test_blocking.py ├── test_canonical.py ├── test_core.py ├── test_cpredicates.py ├── test_dedupe.py ├── test_exists.py ├── test_labeler.py ├── test_memory.sh ├── test_predicate_functions.py ├── test_predicates.py ├── test_price.py ├── test_serializer.py ├── test_tfidf.py └── test_training.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=160 3 | extend-ignore = E203 -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # .git-blame-ignore-revs 2 | # Blacken 3 | c28cd1363f9fcf3bb9c7769615e02bfc08ba45b1 4 | 9e01ccf2e7eacabe0cd1ee16c5158ba417104897 5 | 442edec76a27f7d76f01c89de7327c35cbb898d7 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | ignore: 17 | # Optional: Official actions have moving tags like v1; 18 | # if you use those, you don't need updates. 19 | - dependency-name: "actions/*" 20 | -------------------------------------------------------------------------------- /.github/scripts/asv_markdown.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def format(element): 5 | if is_float(element): 6 | f = float(element) 7 | 8 | return "{0:.3}".format(f) 9 | 10 | else: 11 | return element 12 | 13 | 14 | def is_float(element): 15 | try: 16 | float(element) 17 | except ValueError: 18 | return False 19 | else: 20 | return True 21 | 22 | 23 | def to_markdown(data): 24 | preamble = """# {tests} ([diff](https://github.com/dedupeio/dedupe/compare/{base_commit}...{head_commit})): 25 | | | before | after | ratio | benchmark | 26 | |- |-: |-: |-: |-|\n""".format( 27 | **data 28 | ) 29 | 30 | full_table = preamble + "\n".join( 31 | "|" + "|".join(row) + "|" for row in data["comparisons"] 32 | ) 33 | 34 | return full_table 35 | 36 | 37 | def parse(asv_input): 38 | result = re.match( 39 | r"^\n(?P.*?):\n\n before after ratio\n \[(?P.+)\] \[(?P.+)\]\n <(?P.+)> <(?P.+)> *\n(?P.*)", 40 | asv_input, 41 | re.DOTALL, 42 | ) 43 | 44 | test_details = result.groupdict() 45 | 46 | raw_comparisons = test_details.pop("raw_comparisons").splitlines() 47 | comparisons = ( 48 | [row[:2].strip()] + row[2:].split(maxsplit=3) for row in raw_comparisons 49 | ) 50 | test_details["comparisons"] = [ 51 | [indicator, format(value_a), format(value_b), ratio, test] 52 | for indicator, value_a, value_b, ratio, test in comparisons 53 | ] 54 | return test_details 55 | 56 | 57 | if __name__ == "__main__": 58 | import sys 59 | 60 | print("hello", file=sys.stderr) 61 | asv_input = sys.stdin.read() 62 | print(asv_input, file=sys.stderr) 63 | 64 | print(to_markdown(parse(asv_input))) 65 | -------------------------------------------------------------------------------- /.github/workflows/benchmark-bot.yml: -------------------------------------------------------------------------------- 1 | # from https://github.com/pandas-dev/pandas/blob/d42a148cd83e06b5e5ef1fb6424e337d5b5efaa5/.github/workflows/asv-bot.yml 2 | name: "Benchmark Bot" 3 | 4 | on: 5 | issue_comment: # Pull requests are also issues 6 | types: 7 | - created 8 | 9 | env: 10 | COMMENT: ${{github.event.comment.body}} 11 | 12 | jobs: 13 | benchmarks: 14 | name: "Run benchmarks" 15 | if: startsWith(github.event.comment.body, '@benchmark') 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | shell: bash -el {0} 20 | 21 | concurrency: 22 | # Set concurrency to prevent abuse(full runs are ~5.5 hours !!!) 
23 | # each user can only run one concurrent benchmark bot at a time 24 | # We don't cancel in progress jobs, but if you want to benchmark multiple PRs, 25 | # you're gonna have to wait 26 | group: ${{ github.actor }}-benchmarks 27 | cancel-in-progress: false 28 | 29 | steps: 30 | - name: Install hub 31 | run: sudo apt-get install -y hub 32 | 33 | - name: Setup git 34 | uses: actions/checkout@v3 35 | 36 | # Since this was triggered by a comment, not a PR, 37 | # the `actions/checkout` action will pull 38 | # the default branch (AKA main). We need to checkout the PR branch. 39 | # From https://github.com/actions/checkout/issues/331#issuecomment-925405415 40 | - name: Checkout Pull Request 41 | run: | 42 | 43 | hub pr checkout ${{ github.event.issue.number }} 44 | echo "Checked out SHA:" 45 | git log -1 --format='%H' 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 48 | 49 | - name: Set up Python 3.10 50 | uses: actions/setup-python@v3 51 | with: 52 | python-version: "3.10" 53 | 54 | - name: Install dependencies 55 | run: | 56 | pip install --upgrade pip 57 | pip install -r requirements.txt 58 | pip install . 59 | 60 | - name: Run benchmarks 61 | id: bench 62 | continue-on-error: true # This is a fake failure, asv will exit code 1 for regressions 63 | run: | 64 | # extracting the regex, see https://stackoverflow.com/a/36798723 65 | REGEX=$(echo "$COMMENT" | sed -n "s/^@benchmark\s\+-b\s*\(\S*\).*$/\1/p") 66 | if [ -z "$REGEX" ]; then 67 | BENCHMARKS="" 68 | else 69 | BENCHMARKS="-b $REGEX" 70 | fi 71 | cd benchmarks 72 | asv check -E existing 73 | git remote add upstream https://github.com/dedupeio/dedupe.git 74 | git fetch upstream 75 | asv machine --yes 76 | asv continuous --show-stderr -f 1.1 $BENCHMARKS upstream/main HEAD | cat 77 | echo 'BENCH_OUTPUT<> $GITHUB_ENV 78 | asv compare -f 1.1 upstream/main HEAD | python ../.github/scripts/asv_markdown.py >> $GITHUB_ENV 79 | echo 'EOF' >> $GITHUB_ENV 80 | echo "REGEX=$REGEX" >> $GITHUB_ENV 81 | 82 | - name: Add comment with results 83 | uses: actions/github-script@v6 84 | env: 85 | BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} 86 | REGEX: ${{env.REGEX}} 87 | with: 88 | script: | 89 | const ENV_VARS = process.env 90 | const run_url = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` 91 | github.rest.issues.createComment({ 92 | issue_number: context.issue.number, 93 | owner: context.repo.owner, 94 | repo: context.repo.repo, 95 | body: ENV_VARS["BENCH_OUTPUT"] + '\n\n[(logs)](' + run_url + ')' 96 | }) 97 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '17 21 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v3 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v3 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v3 71 | -------------------------------------------------------------------------------- /.github/workflows/lock.yml: -------------------------------------------------------------------------------- 1 | name: 'Lock Threads' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 1/7 * *' 6 | workflow_dispatch: 7 | 8 | permissions: 9 | issues: write 10 | 11 | concurrency: 12 | group: lock 13 | 14 | jobs: 15 | action: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: dessant/lock-threads@v5 19 | with: 20 | process-only: 'issues' 21 | issue-inactive-days: '14' 22 | log-output: true 23 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | with: 12 | python-version: "3.12" 13 | - name: Install dependencies 14 | run: | 15 | pip install --upgrade pip 16 | pip install . 17 | pip install -r requirements.txt 18 | - name: flake8 19 | run: flake8 dedupe tests benchmarks/benchmarks 20 | - name: isort 21 | if: always() 22 | run: isort --check-only . 23 | - name: black 24 | if: always() 25 | run: black . 
--check 26 | - name: mypy 27 | if: always() 28 | run: mypy 29 | test: 30 | timeout-minutes: 40 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | os: [windows-latest, macos-latest, ubuntu-latest] 36 | python-version: [3.8, 3.9, "3.10", "3.11", "3.12"] 37 | 38 | steps: 39 | - uses: actions/checkout@v2 40 | - name: Set up Python ${{ matrix.python-version }} 41 | uses: actions/setup-python@v2 42 | with: 43 | python-version: ${{ matrix.python-version }} 44 | - name: Install dependencies 45 | run: | 46 | pip install --upgrade pip 47 | pip install -e . 48 | - name: Install test dependencies 49 | run: pip install -r requirements.txt 50 | - name: pytest 51 | run: pytest 52 | - name: Code Coverage 53 | uses: codecov/codecov-action@v4 54 | env: 55 | OS: ${{ matrix.os }} 56 | PYTHON: '3.7' 57 | with: 58 | env_vars: OS,PYTHON 59 | - name: Integration tests 60 | # Do everything twice: The first time is training and generates settings, 61 | # the second time it tests using a static settings file. 62 | run: | 63 | python -m pip install ./benchmarks 64 | python benchmarks/benchmarks/canonical.py 65 | python benchmarks/benchmarks/canonical.py 66 | python benchmarks/benchmarks/canonical_matching.py 67 | python benchmarks/benchmarks/canonical_matching.py 68 | python benchmarks/benchmarks/canonical_gazetteer.py 69 | python benchmarks/benchmarks/canonical_gazetteer.py 70 | settings_file_persists: 71 | runs-on: ubuntu-latest 72 | steps: 73 | - name: checkout main 74 | uses: actions/checkout@v2 75 | with: 76 | ref: main 77 | - uses: actions/setup-python@v2 78 | - name: Install dependencies 79 | run: | 80 | pip install --upgrade pip 81 | pip install . 82 | python -m pip install ./benchmarks 83 | - name: Run on canonical on main 84 | run: python benchmarks/benchmarks/canonical.py 85 | - name: checkout this PR branch 86 | uses: actions/checkout@v2 87 | with: 88 | clean: false 89 | - name: Install any new dependencies 90 | run: pip install . 
91 | - name: Run on canonical with setting file created on main 92 | run: python benchmarks/benchmarks/canonical.py 93 | wheels: 94 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 95 | needs: [test, lint, settings_file_persists] 96 | name: Build wheels on ${{ matrix.os }} 97 | runs-on: ${{ matrix.os }} 98 | strategy: 99 | matrix: 100 | os: [windows-latest, macos-latest, ubuntu-latest] 101 | steps: 102 | - uses: actions/checkout@v2 103 | - uses: actions/setup-python@v2 104 | - name: Build wheels 105 | uses: pypa/cibuildwheel@v2.21.3 106 | env: 107 | CIBW_ARCHS_MACOS: x86_64 arm64 universal2 108 | - name: Build sdist 109 | run: | 110 | pip install build 111 | python -m build --sdist 112 | - name: Publish wheels to PyPI 113 | env: 114 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 115 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 116 | run: | 117 | pip install twine 118 | twine upload --skip-existing wheelhouse/*.whl 119 | twine upload dist/* 120 | continue-on-error: true 121 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .coverage* 3 | htmlcov 4 | cpredicates.c 5 | *.code-workspace 6 | libdistance-0.2.1 7 | build 8 | _build 9 | *.pyc 10 | logfile 11 | *.*~ 12 | *.o 13 | *.so 14 | *.py.* 15 | *.*gz 16 | *.html 17 | .#* 18 | *.*# 19 | *.json 20 | examples/output/*.* 21 | examples/csv_example/csv_example_output.csv 22 | *output.csv 23 | examples/mysql_example/*.txt* 24 | *.db 25 | kernprof.py 26 | possible_classifiers 27 | .DS_Store 28 | mysql.cnf 29 | *settings 30 | *.egg-info 31 | ENV 32 | dist 33 | src/*.c 34 | 35 | .coverage 36 | htmlcov 37 | 38 | !benchmarks/asv.conf.json 39 | benchmarks/.asv/* 40 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.4.2 4 | hooks: 5 | - id: black 6 | - repo: https://github.com/pycqa/isort 7 | rev: 5.13.2 8 | hooks: 9 | - id: isort 10 | name: isort (python) 11 | - repo: https://github.com/pycqa/flake8 12 | rev: "7.1.0" 13 | hooks: 14 | - id: flake8 15 | args: [--config=.flake8] 16 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 2 | 3 | # Required 4 | version: 2 5 | 6 | # Set the OS, Python version and other tools you might need 7 | build: 8 | os: ubuntu-22.04 9 | tools: 10 | python: "3.12" 11 | 12 | # Build documentation in the docs/ directory with Sphinx 13 | sphinx: 14 | configuration: docs/conf.py 15 | 16 | # Build documentation with MkDocs 17 | #mkdocs: 18 | # configuration: mkdocs.yml 19 | 20 | # Optionally build your docs in additional formats such as PDF and ePub 21 | formats: all 22 | 23 | # Optionally set the version of Python and requirements required to build your docs 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt 27 | - method: pip 28 | path: . 
29 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 3.0.2 2 | - Fixed regression in Exists predicate 3 | 4 | # 3.0.1 5 | - Fixed regression in Exists predicate 6 | 7 | 8 | # 3.0.0 9 | - Development in python packaging made supporting the previous namespace approach for 10 | variable plugins untenable. Since we had to redo the way we defined the data model, 11 | we took the opportunity to explicity instantiate variable objects. 12 | 13 | # 2.0.6 14 | - fixed bug that was preventing learning of index predicates in Dedupe mode 15 | 16 | # 2.0.3 17 | - Improved memory performance of connected components 18 | 19 | 20 | # 2.0 21 | 22 | - Python 3 only 23 | - Static typing and type Hints 24 | - Incorporate sqlite to extend normal API to millions of records 25 | - Multiprocessing enabled for Windows 26 | - Multiprocessing mode changed to spawn for Mac OS X 27 | - Moved from CamelCase to lowercase_with_underscore for method names. 28 | - Dropped ability to save indices in save settings. 29 | - Moved from Deduper.match -> Dedupe.partition, RecordLink.match -> RecordLink.join, Gazetteer.match -> Gazetteer.search 30 | - Renamed Matching.blocker -> Matching.fingerprinter 31 | - Moved to autodoc for documentation 32 | - Dropped threshold methods 33 | - matchBlocks has been replaced by score, which takes pairs of records not blocks 34 | 35 | # 1.10.0 36 | - Dropped python 2.7 support 37 | 38 | # 1.9.4 39 | - Cleaned up block learning 40 | 41 | # 1.9.3 42 | - Improved performance of connected components algorithm with very large components 43 | - Fixed pickling unpickling bug of Index predicate classes 44 | 45 | # 1.9.0 46 | - Implemented a disagreement based active labeler to improve blocking recall 47 | 48 | # 1.8.2 49 | - removed shelve-backed persistence in blocking data in favor of an improved in-memory implementation 50 | 51 | # 1.8.0 52 | - matchBlocks is not a generator; match is now optionally a generator. If the 53 | generator option is turned of for the Gazette match is lazy 54 | 55 | # 1.7.8 56 | - Speed up blocking, on our way to 3-predicates 57 | 58 | # 1.7.5 59 | - Significantly reduced memory footprint during connected_components 60 | 61 | # 1.7.3 62 | - Significantly reduced memory footprint during scoreDuplicates 63 | 64 | # 1.7.2 65 | - Improper release 66 | 67 | # 1.7.1 68 | - TempShelve class that addresses various bugs related to cleaning up tempoary shelves 69 | 70 | # 1.7.0 71 | - Added `target` argument to blocker and predicates for changing the behavior 72 | of the predicates for the target and source dataset if we are linking. 
73 | 74 | # 1.6.8 75 | - Use file-backed blocking with dbm, dramatically increases size of data that can be handled without special programming 76 | 77 | # 1.6.7 78 | - Reduce memory footprint of matching 79 | 80 | # 1.6.0 81 | - Simplify .train method 82 | 83 | # 1.5.5 84 | - Levenshtein search based index predicates thanks to @mattandahalfew 85 | 86 | # 1.5.0 87 | - simplified the sample API, this might be a breaking change for some 88 | - the active learner interface is now more modular to allow for a different learner 89 | - random sampling of pairs has been improved for linking case and 90 | dedupe case, h/t to @MarkusShepherd 91 | 92 | ## 1.4.15 93 | - frozendicts have finally been removed 94 | - first N char predicates return their entire length if length is less 95 | than N, instead of nothing 96 | - crossvalidation is skipped in active learning if using default rlr learner 97 | 98 | ## 1.4.5 99 | - Block indexes can now be persisted by using the index=True argument 100 | in the writeSettings method 101 | 102 | ## 1.4.1 103 | - Now uses C version of double metaphone for speed 104 | - Much faster compounding of blocks in block learning 105 | 106 | ## 1.4.0 107 | - Block learning now tries to minimize the total number of comparisons 108 | not just the comparisons of distinct records. This decouples makes 109 | block learning from learning classifier learning. This change has 110 | requires new, different arguments to the train method. 111 | 112 | ## 1.3.8 113 | - Console labeler now shows fields in the order they are defined in 114 | the data model. The labeler also reports number of labeled examples 115 | - `pud` argument added to the `train` method. Proportion of uncovered 116 | dupes. This deprecates `uncovered_dupes` argument 117 | 118 | ## 1.3.0 119 | - If we have enough training data, consider Compound predicates of length 3 in addition to predicates of length 2 120 | 121 | ## 1.1.1 122 | - None now treated as missing data indicator. Warnings for deprecations of older types of missing data indicators 123 | 124 | ## 1.1.0 125 | Features 126 | - Handle FuzzyCategoricalType in datamodel 127 | 128 | ## 1.0.0 129 | Features 130 | - Speed up learning 131 | - Parallelize sampling 132 | - Optional [CRF Edit Distance](https://dedupe.readthedocs.io/en/latest/Variable-definition.html#optional-edit-distance) 133 | 134 | ## 0.8.0 135 | Support for Python 3.4 added. Support for Python 2.6 dropped. 136 | 137 | Features 138 | - Windows OS supported 139 | - train method has argument for not considering index predicates 140 | - TfIDFNGram Index Predicate added (for shorter string) 141 | - SuffixArray Predicate 142 | - Double Metaphone Predicates 143 | - Predicates for numbers, OrderOfMagnitude, Round 144 | - Set Predicate OrderOfCardinality 145 | - Final, learned predicates list will now often be smaller without 146 | loss of coverage 147 | - Variables refactored to support external extensions like 148 | https://github.com/datamade/dedupe-variable-address 149 | - Categorical distance, regularized logistic regression, affine gap 150 | distance, canonicalization have been turned into separate libraries. 
151 | - Simplejson is now dependency 152 | 153 | ## 0.7.5 154 | Features 155 | - Individual record cluster membership scores 156 | - New predicates 157 | - New Exists Variable Type 158 | 159 | Bug Fixes 160 | - Latlong predicate fixed 161 | - Set TFIDF canopy working properly 162 | 163 | ## 0.7.4 164 | Features 165 | - Sampling methods now use blocked sampling 166 | 167 | ## 0.7.0 168 | Version 0.7.0 is backwards compatible, except for the match method of Gazetteer class 169 | 170 | Features 171 | - new index, unindex, and match methods in Gazetter Matching. Useful for 172 | streaming matching 173 | 174 | ## 0.6.0 175 | Version 0.6.0 is *not* backwards compatible. 176 | 177 | Features : 178 | - new Text, ShortString, and exact string types 179 | - multiple variables can be defined on same field 180 | - new Gazette linker for matching dirty records against a master list 181 | - performance improvements, particularly in memory usage 182 | - canonicalize function in dedupe.convenience for creating a canonical representation of a cluster of records 183 | - tons of bugfixes 184 | 185 | API breaks 186 | - when initializing an ActiveMatching object, `variable_definition` replaces `field_definition` and is a list of dictionaries instead of a dictionary. See the documentation for details 187 | - also when initializing a Matching object, `num_processes` has been replaced by `num_cores`, which now defaults to the 188 | number of cpus on the machine 189 | - when initializing a StaticMatching object, `settings_file` is now expected to be a file object not a string. The `readTraining`, `writeTraining`, `writeSettings` methods also all now expect file objects 190 | 191 | 192 | ## 0.5 193 | Version 0.5 is *not* backwards compatible. 194 | 195 | Features : 196 | 197 | - Special case code for linking two datasets that, individually are unique 198 | - Parallel processing using python standard library multiprocessing 199 | - Much faster canopy creation using zope.index 200 | - Asynchronous active learning methods 201 | 202 | API breaks : 203 | - `duplicateClusters` has been removed, it has been replaced by 204 | `match` and `matchBlocks` 205 | - `goodThreshold` has been removed, it has been replaced by 206 | `threshold` and `thresholdBlocks` 207 | - the meaning of `train` has changed. To train from training file use `readTraining`. To use console labeling, pass a dedupe instance to the `consoleLabel` function 208 | - The convenience function dataSample has been removed. It has been replaced by 209 | the `sample` methods 210 | - It is no longer necessary to pass `frozendicts` to `Matching` classes 211 | - `blockingFunction` has been removed and been replaced by the `blocker` method 212 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Gregg" 5 | given-names: "Forest" 6 | - family-names: "Eder" 7 | given-names: "Derek" 8 | title: "dedupe" 9 | version: 2.0.11 10 | date-released: 2022-01-27 11 | url: "https://github.com/dedupeio/dedupe" 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at info@datamade.us. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Reporting issues 2 | 3 | When reporting issues please include as much detail as possible about your 4 | operating system, dedupe version and python version. Whenever possible, please 5 | also include a brief, self-contained code example that demonstrates the problem. 6 | 7 | If dedupe is raising an exception, please paste a [full traceback](https://en.wikipedia.org/wiki/Stack_trace). 8 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | * Forest Gregg 2 | * Derek Eder 3 | * Nikit Saraf 4 | * Mark Huberty 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Forest Gregg, Derek Eder, DataMade and Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include dedupe/cpredicates.pyx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dedupe Python Library 2 | 3 | [![Tests Passing](https://github.com/dedupeio/dedupe/workflows/tests/badge.svg)](https://github.com/dedupeio/dedupe/actions?query=workflow%3Atests)[![codecov](https://codecov.io/gh/dedupeio/dedupe/branch/main/graph/badge.svg?token=aauKUrTEgh)](https://codecov.io/gh/dedupeio/dedupe) 4 | 5 | _dedupe is a python library that uses machine learning to perform fuzzy matching, deduplication and entity resolution quickly on structured data._ 6 | 7 | __dedupe__ will help you: 8 | 9 | * __remove duplicate entries__ from a spreadsheet of names and addresses 10 | * __link a list__ with customer information to another with order history, even without unique customer IDs 11 | * take a database of campaign contributions and __figure out which ones were made by the same person__, even if the names were entered slightly differently for each record 12 | 13 | dedupe takes in human training data and comes up with the best rules for your dataset to quickly and automatically find similar records, even with very large databases. 14 | 15 | ## Important links 16 | * Documentation: https://docs.dedupe.io/ 17 | * Repository: https://github.com/dedupeio/dedupe 18 | * Issues: https://github.com/dedupeio/dedupe/issues 19 | * Mailing list: https://groups.google.com/forum/#!forum/open-source-deduplication 20 | * Examples: https://github.com/dedupeio/dedupe-examples 21 | 22 | ## dedupe library consulting 23 | 24 | If you or your organization would like professional assistance in working with the dedupe library, Dedupe.io LLC offers consulting services. [Read more about pricing and available services here](https://dedupe.io/pricing/#consulting). 25 | 26 | ## Tools built with dedupe 27 | 28 | ### [Dedupe.io](https://dedupe.io/) 29 | A cloud service powered by the dedupe library for de-duplicating and finding matches in your data. It provides a step-by-step wizard for uploading your data, setting up a model, training, clustering and reviewing the results. 30 | 31 | [Dedupe.io](https://dedupe.io/) also supports record linkage across data sources and continuous matching and training through an [API](https://apidocs.dedupe.io/en/latest/). 32 | 33 | For more, see the [Dedupe.io product site](https://dedupe.io/), [tutorials on how to use it](https://dedupe.io/tutorial/intro-to-dedupe-io.html), and [differences between it and the dedupe library](https://dedupe.io/documentation/should-i-use-dedupeio-or-the-dedupe-python-library.html). 34 | 35 | Dedupe is well adopted by the Python community. Check out this [blogpost](https://medium.com/district-data-labs/basics-of-entity-resolution-with-python-and-dedupe-bc87440b64d4), 36 | a YouTube video on how to use [Dedupe with Python](https://youtu.be/McsTWXeURhA) and a Youtube video on how to apply [Dedupe at scale using Spark](https://youtu.be/q9HPUYmiwjE?t=2704). 37 | 38 | 39 | ### [csvdedupe](https://github.com/dedupeio/csvdedupe) 40 | Command line tool for de-duplicating and [linking](https://github.com/dedupeio/csvdedupe#csvlink-usage) CSV files. Read about it on [Source Knight-Mozilla OpenNews](https://source.opennews.org/en-US/articles/introducing-cvsdedupe/). 
41 | 42 | ## Installation 43 | 44 | ### Using dedupe 45 | 46 | If you only want to use dedupe, install it this way: 47 | 48 | ```bash 49 | pip install dedupe 50 | ``` 51 | 52 | Familiarize yourself with [dedupe's API](https://docs.dedupe.io/en/latest/API-documentation.html), and get started on your project. Need inspiration? Have a look at [some examples](https://github.com/dedupeio/dedupe-examples). 53 | 54 | ### Developing dedupe 55 | 56 | We recommend using [virtualenv](http://virtualenv.readthedocs.org/en/latest/virtualenv.html) and [virtualenvwrapper](http://virtualenvwrapper.readthedocs.org/en/latest/install.html) for working in a virtualized development environment. [Read how to set up virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/). 57 | 58 | Once you have virtualenvwrapper set up, 59 | 60 | ```bash 61 | mkvirtualenv dedupe 62 | git clone https://github.com/dedupeio/dedupe.git 63 | cd dedupe 64 | pip install -e . --config-settings editable_mode=compat 65 | pip install -r requirements.txt 66 | ``` 67 | 68 | If these tests pass, then everything should have been installed correctly! 69 | 70 | ```bash 71 | pytest 72 | ``` 73 | 74 | Afterwards, whenever you want to work on dedupe, 75 | 76 | ```bash 77 | workon dedupe 78 | ``` 79 | 80 | ## Testing 81 | Unit tests of core dedupe functions 82 | ```bash 83 | pytest 84 | ``` 85 | 86 | #### Test using canonical dataset from Bilenko's research 87 | 88 | Using Deduplication 89 | ```bash 90 | python -m pip install -e ./benchmarks 91 | python benchmarks/benchmarks/canonical.py 92 | ``` 93 | 94 | Using Record Linkage 95 | ```bash 96 | python -m pip install -e ./benchmarks 97 | python benchmarks/benchmarks/canonical_matching.py 98 | ``` 99 | 100 | 101 | ## Team 102 | 103 | * Forest Gregg, DataMade 104 | * Derek Eder, DataMade 105 | 106 | ## Credits 107 | 108 | Dedupe is based on Mikhail Yuryevich Bilenko's Ph.D. dissertation: [*Learnable Similarity Functions and their Application to Record Linkage and Clustering*](http://www.cs.utexas.edu/~ml/papers/marlin-dissertation-06.pdf). 109 | 110 | ## Errors / Bugs 111 | 112 | If something is not behaving intuitively, it is a bug, and should be reported. 113 | [Report it here](https://github.com/dedupeio/dedupe/issues) 114 | 115 | 116 | ## Note on Patches/Pull Requests 117 | 118 | * Fork the project. 119 | * Make your feature addition or bug fix. 120 | * Send us a pull request. Bonus points for topic branches. 121 | 122 | ## Copyright 123 | 124 | Copyright (c) 2022 Forest Gregg and Derek Eder. Released under the [MIT License](https://github.com/dedupeio/dedupe/blob/main/LICENSE). 125 | 126 | Third-party copyright in this distribution is noted where applicable. 127 | 128 | ## Citing Dedupe 129 | If you use Dedupe in an academic work, please give this citation: 130 | 131 | Forest Gregg and Derek Eder. 2022. Dedupe. https://github.com/dedupeio/dedupe. 
132 | -------------------------------------------------------------------------------- /THANKS.md: -------------------------------------------------------------------------------- 1 | # Thanks To 2 | 3 | * Jon Markel for the Illinois campaign contributions data used in the mysql_example, which he got from 4 | the [Illinois State Board of Elections](http://www.elections.il.gov/) 5 | 6 | * [Daniel Müllner](http://math.stanford.edu/~muellner/) for his wonderful [fastcluster](http://math.stanford.edu/~muellner/fastcluster.html) library and the many changes he made at our request 7 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/benchmarks/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | from itertools import combinations 5 | 6 | import dedupe 7 | from benchmarks import common 8 | 9 | 10 | def make_report(data, clustering): 11 | true_dupes = common.get_true_dupes(data) 12 | predicted_dupes = set() 13 | for cluser_id, _ in clustering: 14 | for pair in combinations(cluser_id, 2): 15 | predicted_dupes.add(frozenset(pair)) 16 | 17 | return common.Report.from_scores(true_dupes, predicted_dupes) 18 | 19 | 20 | class Canonical: 21 | settings_file = common.DATASETS_DIR / "canonical_learned_settings" 22 | data_file = common.DATASETS_DIR / "restaurant-nophone-training.csv" 23 | 24 | def setup(self): 25 | self.data = common.load_data(self.data_file) 26 | training_pairs = dedupe.training_data_dedupe(self.data, "unique_id", 5000) 27 | self.training_pairs_filelike = io.StringIO() 28 | dedupe.serializer.write_training(training_pairs, self.training_pairs_filelike) 29 | self.training_pairs_filelike.seek(0) 30 | 31 | def make_report(self, clustering): 32 | return make_report(self.data, clustering) 33 | 34 | def run(self, use_settings=False): 35 | deduper: dedupe.StaticDedupe | dedupe.Dedupe 36 | 37 | if use_settings and os.path.exists(self.settings_file): 38 | with open(self.settings_file, "rb") as f: 39 | deduper = dedupe.StaticDedupe(f) 40 | 41 | else: 42 | variables = [ 43 | dedupe.variables.String("name"), 44 | dedupe.variables.Exact("name"), 45 | dedupe.variables.String("address"), 46 | dedupe.variables.ShortString("cuisine", has_missing=True), 47 | dedupe.variables.ShortString("city"), 48 | ] 49 | 50 | deduper = dedupe.Dedupe(variables, num_cores=5) 51 | deduper.prepare_training( 52 | self.data, training_file=self.training_pairs_filelike, sample_size=10000 53 | ) 54 | deduper.train(index_predicates=True) 55 | with open(self.settings_file, "wb") as f: 56 | deduper.write_settings(f) 57 | 58 | return deduper.partition(self.data, threshold=0.5) 59 | 60 | def time_run(self): 61 | return self.run() 62 | 63 | def peakmem_run(self): 64 | return self.run() 65 | 66 | def track_precision(self): 67 | return self.make_report(self.run()).precision 68 | 69 | def track_recall(self): 70 | return self.make_report(self.run()).recall 71 | 72 | 73 | def cli(): 74 | common.configure_logging() 75 | 76 | can = Canonical() 77 | can.setup() 78 | 79 | t0 = time.time() 80 | clustering = can.run(use_settings=True) 81 | elapsed = time.time() - t0 82 | 83 | 
print(can.make_report(clustering)) 84 | print(f"ran in {elapsed} seconds") 85 | 86 | 87 | if __name__ == "__main__": 88 | cli() 89 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical_gazetteer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import dedupe 5 | from benchmarks import canonical_matching, common 6 | 7 | 8 | def make_report(data, clustering): 9 | true_dupes = canonical_matching.get_true_dupes(data) 10 | predicted_dupes = { 11 | frozenset([a, b]) for a, result in clustering for b, score in result 12 | } 13 | return common.Report.from_scores(true_dupes, predicted_dupes) 14 | 15 | 16 | class Gazetteer(canonical_matching.Matching): 17 | settings_file = common.DATASETS_DIR / "canonical_gazetteer_learned_settings" 18 | data_1_file = common.DATASETS_DIR / "restaurant-1.csv" 19 | data_2_file = common.DATASETS_DIR / "restaurant-2.csv" 20 | 21 | params = [None] # placholder 22 | 23 | def make_report(self, clustering): 24 | return make_report(self.data, clustering) 25 | 26 | def run(self, kwargs, use_settings=False): 27 | data_1, data_2 = self.data 28 | gazetteer: dedupe.StaticGazetteer | dedupe.Gazetteer 29 | 30 | if use_settings and os.path.exists(self.settings_file): 31 | with open(self.settings_file, "rb") as f: 32 | gazetteer = dedupe.StaticGazetteer(f) 33 | else: 34 | variables = [ 35 | dedupe.variables.String("name"), 36 | dedupe.variables.String("address"), 37 | dedupe.variables.String("cuisine"), 38 | dedupe.variables.String("city"), 39 | ] 40 | 41 | gazetteer = dedupe.Gazetteer(variables) 42 | gazetteer.prepare_training( 43 | data_1, 44 | data_2, 45 | training_file=self.training_pairs_filelike, 46 | sample_size=10000, 47 | ) 48 | gazetteer.train() 49 | 50 | with open(self.settings_file, "wb") as f: 51 | gazetteer.write_settings(f) 52 | 53 | gazetteer.index(data_2) 54 | gazetteer.unindex(data_2) 55 | gazetteer.index(data_2) 56 | 57 | return gazetteer.search(data_1, n_matches=1, generator=True) 58 | 59 | 60 | def cli(): 61 | common.configure_logging() 62 | 63 | gaz = Gazetteer() 64 | gaz.setup(None) 65 | 66 | t0 = time.time() 67 | clustering = gaz.run(None, use_settings=True) 68 | elapsed = time.time() - t0 69 | 70 | print(gaz.make_report(clustering)) 71 | print(f"ran in {elapsed} seconds") 72 | 73 | 74 | if __name__ == "__main__": 75 | cli() 76 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical_matching.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | 5 | import dedupe 6 | from benchmarks import common 7 | 8 | 9 | def get_true_dupes(data): 10 | data_1, data_2 = data 11 | all_data = data_1.copy() 12 | all_data.update(data_2) 13 | return common.get_true_dupes(all_data) 14 | 15 | 16 | def make_report(data, clustering): 17 | true_dupes = get_true_dupes(data) 18 | predicted_dupes = {frozenset(pair) for pair, _ in clustering} 19 | return common.Report.from_scores(true_dupes, predicted_dupes) 20 | 21 | 22 | class Matching: 23 | settings_file = common.DATASETS_DIR / "canonical_data_matching_learned_settings" 24 | data_1_file = common.DATASETS_DIR / "restaurant-1.csv" 25 | data_2_file = common.DATASETS_DIR / "restaurant-2.csv" 26 | 27 | params = [ 28 | {"threshold": 0.5}, 29 | {"threshold": 0.5, "constraint": "many-to-one"}, 30 | ] 31 | param_names = ["kwargs"] 32 | 33 | def setup(self, kwargs): 34 | data_1 
= common.load_data(self.data_1_file) 35 | data_2 = common.load_data(self.data_2_file) 36 | 37 | self.data = (data_1, data_2) 38 | training_pairs = dedupe.training_data_link(data_1, data_2, "unique_id", 5000) 39 | self.training_pairs_filelike = io.StringIO() 40 | dedupe.serializer.write_training(training_pairs, self.training_pairs_filelike) 41 | self.training_pairs_filelike.seek(0) 42 | 43 | def run(self, kwargs, use_settings=False): 44 | data_1, data_2 = self.data 45 | deduper: dedupe.StaticRecordLink | dedupe.RecordLink 46 | 47 | if use_settings and os.path.exists(self.settings_file): 48 | with open(self.settings_file, "rb") as f: 49 | deduper = dedupe.StaticRecordLink(f) 50 | else: 51 | variables = [ 52 | dedupe.variables.String("name"), 53 | dedupe.variables.String("address"), 54 | dedupe.variables.String("cuisine"), 55 | dedupe.variables.String("city"), 56 | ] 57 | deduper = dedupe.RecordLink(variables) 58 | deduper.prepare_training( 59 | data_1, 60 | data_2, 61 | training_file=self.training_pairs_filelike, 62 | sample_size=10000, 63 | ) 64 | deduper.train() 65 | with open(self.settings_file, "wb") as f: 66 | deduper.write_settings(f) 67 | 68 | return deduper.join(data_1, data_2, **kwargs) 69 | 70 | def make_report(self, clustering): 71 | return make_report(self.data, clustering) 72 | 73 | def time_run(self, kwargs): 74 | return self.run(kwargs) 75 | 76 | def peakmem_run(self, kwargs): 77 | return self.run(kwargs) 78 | 79 | def track_precision(self, kwargs): 80 | return self.make_report(self.run(kwargs)).precision 81 | 82 | def track_recall(self, kwargs): 83 | return self.make_report(self.run(kwargs)).recall 84 | 85 | 86 | def cli(): 87 | common.configure_logging() 88 | 89 | m = Matching() 90 | for kwargs in m.params: 91 | m.setup(kwargs) 92 | print() 93 | print(f"running with kwargs: {kwargs}") 94 | t0 = time.time() 95 | clustering = m.run(kwargs=kwargs, use_settings=True) 96 | elapsed = time.time() - t0 97 | 98 | print(m.make_report(clustering)) 99 | print(f"ran in {elapsed} seconds") 100 | 101 | 102 | if __name__ == "__main__": 103 | cli() 104 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/common.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import optparse 4 | import re 5 | from dataclasses import dataclass 6 | from itertools import groupby 7 | from pathlib import Path 8 | 9 | DATASETS_DIR = Path(__file__).parent / "datasets" 10 | 11 | 12 | def pre_process(column): 13 | column = re.sub(" +", " ", column) 14 | column = re.sub("\n", " ", column) 15 | column = column.strip().strip('"').strip("'").lower() 16 | if not column: 17 | column = None 18 | return column 19 | 20 | 21 | def load_data(pathlike): 22 | data_d = {} 23 | with open(pathlike) as f: 24 | reader = csv.DictReader(f) 25 | for i, row in enumerate(reader): 26 | clean_row = {k: pre_process(v) for (k, v) in row.items()} 27 | data_d[str(pathlike) + str(i)] = clean_row 28 | 29 | return data_d 30 | 31 | 32 | def configure_logging() -> None: 33 | optp = optparse.OptionParser() 34 | optp.add_option( 35 | "-v", 36 | "--verbose", 37 | dest="verbose", 38 | action="count", 39 | help="Increase verbosity (specify multiple times for more)", 40 | ) 41 | opts, _ = optp.parse_args() 42 | log_level = logging.WARNING 43 | if opts.verbose: 44 | if opts.verbose == 1: 45 | log_level = logging.INFO 46 | elif opts.verbose >= 2: 47 | log_level = logging.DEBUG 48 | logging.basicConfig(level=log_level) 49 | 50 | 51 | 
def get_true_dupes(data: dict) -> set: 52 | duplicates = set() 53 | for _, pair in groupby( 54 | sorted(data.items(), key=lambda x: x[1]["unique_id"]), 55 | key=lambda x: x[1]["unique_id"], 56 | ): 57 | pair_l = list(pair) 58 | if len(pair_l) == 2: 59 | a, b = pair_l 60 | duplicates.add(frozenset((a[0], b[0]))) 61 | return duplicates 62 | 63 | 64 | @dataclass 65 | class Report: 66 | # TODO add more and replace calculations with sklearn 67 | n_true: int 68 | n_found: int 69 | precision: float 70 | recall: float 71 | 72 | @classmethod 73 | def from_scores(cls, true_dupes: set, found_dupes: set): 74 | true_positives = found_dupes.intersection(true_dupes) 75 | 76 | n_true = len(true_dupes) 77 | n_found = len(found_dupes) 78 | precision = len(true_positives) / n_found 79 | recall = len(true_positives) / n_true 80 | 81 | return cls(n_true, n_found, precision, recall) 82 | -------------------------------------------------------------------------------- /benchmarks/setup.py: -------------------------------------------------------------------------------- 1 | # Dummy file to allow editable installs 2 | from setuptools import find_packages, setup 3 | 4 | if __name__ == "__main__": 5 | setup( 6 | name="benchmarks", 7 | packages=find_packages(), 8 | package_data={ 9 | # If any package contains *.txt or *.json files, include them: 10 | "": ["*.csv"], 11 | # And include any files found in the 'mypackage/data' directory: 12 | "benchmarks": ["datasets/*"], 13 | }, 14 | ) 15 | -------------------------------------------------------------------------------- /dedupe/__init__.py: -------------------------------------------------------------------------------- 1 | from dedupe.api import ( # noqa: F401 2 | Dedupe, 3 | Gazetteer, 4 | RecordLink, 5 | StaticDedupe, 6 | StaticGazetteer, 7 | StaticRecordLink, 8 | ) 9 | from dedupe.convenience import ( # noqa: F401 10 | canonicalize, 11 | console_label, 12 | training_data_dedupe, 13 | training_data_link, 14 | ) 15 | from dedupe.serializer import read_training, write_training # noqa: F401 16 | 17 | __all__ = [ 18 | "Dedupe", 19 | "Gazetteer", 20 | "RecordLink", 21 | "StaticDedupe", 22 | "StaticGazetteer", 23 | "StaticRecordLink", 24 | "canonicalize", 25 | "console_label", 26 | "training_data_dedupe", 27 | "training_data_link", 28 | "read_training", 29 | "write_training", 30 | ] 31 | -------------------------------------------------------------------------------- /dedupe/_typing.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import ( 3 | TYPE_CHECKING, 4 | Any, 5 | Callable, 6 | Dict, 7 | FrozenSet, 8 | Iterable, 9 | Iterator, 10 | List, 11 | Literal, 12 | Mapping, 13 | MutableSequence, 14 | Protocol, 15 | Sequence, 16 | Tuple, 17 | Type, 18 | TypedDict, 19 | Union, 20 | runtime_checkable, 21 | ) 22 | 23 | import numpy 24 | import numpy.typing 25 | 26 | if TYPE_CHECKING: 27 | from dedupe.predicates import Predicate 28 | 29 | 30 | RecordDict = Mapping[str, Any] 31 | RecordID = Union[int, str] 32 | RecordIDDType = Union[Type[int], Tuple[Type[str], Literal[256]]] 33 | RecordIDPair = Union[Tuple[int, int], Tuple[str, str]] 34 | RecordInt = Tuple[int, RecordDict] 35 | RecordStr = Tuple[str, RecordDict] 36 | Record = Union[RecordInt, RecordStr] 37 | RecordPairInt = Tuple[RecordInt, RecordInt] 38 | RecordPairStr = Tuple[RecordStr, RecordStr] 39 | RecordPairs = Union[Iterator[RecordPairInt], Iterator[RecordPairStr]] 40 | BlockInt = List[RecordPairInt] 41 | BlockStr = List[RecordPairStr] 42 | Block = 
Union[RecordPairInt, RecordPairStr] 43 | BlocksInt = Iterator[BlockInt] 44 | BlocksStr = Iterator[BlockStr] 45 | Blocks = Union[BlocksInt, BlocksStr] 46 | ClusterInt = Tuple[ 47 | Tuple[int, ...], Union[numpy.typing.NDArray[numpy.float64], Tuple[float, ...]] 48 | ] 49 | ClusterStr = Tuple[ 50 | Tuple[str, ...], Union[numpy.typing.NDArray[numpy.float64], Tuple[float, ...]] 51 | ] 52 | ClustersInt = Iterable[ClusterInt] 53 | ClustersStr = Iterable[ClusterStr] 54 | Clusters = Union[ClustersInt, ClustersStr] 55 | 56 | DataInt = Mapping[int, RecordDict] 57 | DataStr = Mapping[str, RecordDict] 58 | Data = Union[DataInt, DataStr] 59 | 60 | RecordDictPair = Tuple[RecordDict, RecordDict] 61 | RecordDictPairs = List[RecordDictPair] 62 | ArrayLinks = Iterable[numpy.ndarray] 63 | TupleLinksInt = Iterable[Tuple[Tuple[int, int], float]] 64 | TupleLinksStr = Iterable[Tuple[Tuple[str, str], float]] 65 | TupleLinks = Union[TupleLinksInt, TupleLinksStr] 66 | Links = Union[ArrayLinks, TupleLinks] 67 | LookupResultsInt = Iterable[Tuple[int, Tuple[Tuple[int, float], ...]]] 68 | LookupResultsStr = Iterable[Tuple[str, Tuple[Tuple[str, float], ...]]] 69 | LookupResults = Union[LookupResultsInt, LookupResultsStr] 70 | JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"] 71 | Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]] 72 | CustomComparator = Callable[[Any, Any], Union[int, float]] 73 | Scores = Union[numpy.memmap, numpy.ndarray] 74 | Labels = List[Literal[0, 1]] 75 | LabelsLike = Iterable[Literal[0, 1]] 76 | Cover = Dict["Predicate", FrozenSet[int]] 77 | ComparisonCoverInt = Dict["Predicate", FrozenSet[Tuple[int, int]]] 78 | ComparisonCoverStr = Dict["Predicate", FrozenSet[Tuple[str, str]]] 79 | ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr] 80 | PredicateFunction = Callable[[Any], FrozenSet[str]] 81 | 82 | 83 | class TrainingData(TypedDict): 84 | match: MutableSequence[RecordDictPair] 85 | distinct: MutableSequence[RecordDictPair] 86 | 87 | 88 | # Takes pairs of records and generates a (n_samples X n_features) array 89 | FeaturizerFunction = Callable[ 90 | [Sequence[RecordDictPair]], numpy.typing.NDArray[numpy.float64] 91 | ] 92 | 93 | 94 | class Classifier(Protocol): 95 | """Takes an array of pairwise distances and computes the likelihood they are a pair.""" 96 | 97 | def fit(self, X: numpy.typing.NDArray[numpy.float64], y: LabelsLike) -> None: ... 98 | 99 | def predict_proba( 100 | self, X: numpy.typing.NDArray[numpy.float64] 101 | ) -> numpy.typing.NDArray[numpy.float64]: ... 102 | 103 | 104 | class ClosableJoinable(Protocol): 105 | def close(self) -> None: ... 106 | 107 | def join(self) -> None: ... 108 | 109 | 110 | class Variable(Protocol): 111 | name: str 112 | predicates: List["Predicate"] 113 | has_missing: bool 114 | 115 | def __len__(self) -> int: ... 
116 | 117 | 118 | @runtime_checkable 119 | class FieldVariable(Variable, Protocol): 120 | field: str 121 | comparator: Comparator 122 | 123 | 124 | class InteractionVariable(Variable, Protocol): 125 | interaction_fields: List[str] 126 | 127 | 128 | MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable] 129 | 130 | PathLike = Union[str, os.PathLike] 131 | -------------------------------------------------------------------------------- /dedupe/backport.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | if platform.system() == "Darwin": 4 | import multiprocessing 5 | 6 | ctx = multiprocessing.get_context("spawn") 7 | Queue = ctx.Queue 8 | Process = ctx.Process 9 | Pool = ctx.Pool 10 | SimpleQueue = ctx.SimpleQueue 11 | Lock = ctx.Lock 12 | RLock = ctx.RLock 13 | else: 14 | from multiprocessing import ( # type: ignore # noqa 15 | Lock, 16 | Pool, 17 | Process, 18 | Queue, 19 | RLock, 20 | SimpleQueue, 21 | ) 22 | -------------------------------------------------------------------------------- /dedupe/blocking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from __future__ import annotations 3 | 4 | import logging 5 | import time 6 | from collections import defaultdict 7 | from typing import TYPE_CHECKING 8 | 9 | if TYPE_CHECKING: 10 | from typing import ( 11 | Any, 12 | Callable, 13 | DefaultDict, 14 | Generator, 15 | Iterable, 16 | List, 17 | Sequence, 18 | Union, 19 | ) 20 | 21 | import dedupe.predicates 22 | from dedupe._typing import Data, Record, RecordID 23 | from dedupe.index import Index 24 | 25 | Docs = Union[Iterable[str], Iterable[Iterable[str]]] 26 | IndexList = DefaultDict[str, List[dedupe.predicates.IndexPredicate]] 27 | 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | def index_list() -> IndexList: 33 | return defaultdict(list) 34 | 35 | 36 | class Fingerprinter: 37 | """Takes in a record and returns all blocks that record belongs to""" 38 | 39 | def __init__(self, predicates: Iterable[dedupe.predicates.Predicate]) -> None: 40 | self.predicates = predicates 41 | 42 | self.index_fields: dict[str, IndexList] 43 | self.index_fields = defaultdict(index_list) 44 | """ 45 | A dictionary of all the fingerprinter methods that use an 46 | index of data field values. The keys are the field names, 47 | which can be useful to know for indexing the data. 48 | """ 49 | 50 | self.index_predicates = [] 51 | 52 | for full_predicate in predicates: 53 | for predicate in full_predicate: 54 | if hasattr(predicate, "index"): 55 | self.index_fields[predicate.field][predicate.type].append(predicate) 56 | self.index_predicates.append(predicate) 57 | 58 | def __call__( 59 | self, records: Iterable[Record], target: bool = False 60 | ) -> Generator[tuple[str, RecordID]]: 61 | """ 62 | Generate the predicates for records. Yields tuples of (predicate, 63 | record_id). 64 | 65 | Args: 66 | records: A sequence of tuples of (record_id, 67 | record_dict). Can often be created by 68 | `data_dict.items()`. 69 | target: Indicates whether the data should be treated as 70 | the target data. This effects the behavior of 71 | search predicates. If `target` is set to 72 | `True`, an search predicate will return the 73 | value itself. If `target` is set to `False` the 74 | search predicate will return all possible 75 | values within the specified search distance. 
76 | 77 | Let's say we have a 78 | `LevenshteinSearchPredicate` with an associated 79 | distance of `1` on a `"name"` field; and we 80 | have a record like `{"name": "thomas"}`. If the 81 | `target` is set to `True` then the predicate 82 | will return `"thomas"`. If `target` is set to 83 | `False`, then the blocker could return 84 | `"thomas"`, `"tomas"`, and `"thoms"`. By using 85 | the `target` argument on one of your datasets, 86 | you will dramatically reduce the total number 87 | of comparisons without a loss of accuracy. 88 | 89 | .. code:: python 90 | 91 | > data = [(1, {'name' : 'bob'}), (2, {'name' : 'suzanne'})] 92 | > blocked_ids = deduper.fingerprinter(data) 93 | > print list(blocked_ids) 94 | [('foo:1', 1), ..., ('bar:1', 100)] 95 | 96 | """ 97 | 98 | start_time = time.perf_counter() 99 | predicates = [ 100 | (":" + str(i), predicate) for i, predicate in enumerate(self.predicates) 101 | ] 102 | 103 | for i, record in enumerate(records): 104 | record_id, instance = record 105 | 106 | for pred_id, predicate in predicates: 107 | block_keys = predicate(instance, target=target) 108 | for block_key in block_keys: 109 | yield block_key + pred_id, record_id 110 | 111 | if i and i % 10000 == 0: 112 | logger.info( 113 | "%(iteration)d, %(elapsed)f2 seconds", 114 | {"iteration": i, "elapsed": time.perf_counter() - start_time}, 115 | ) 116 | 117 | def reset_indices(self) -> None: 118 | """ 119 | Fingerprinter indices can take up a lot of memory. If you are 120 | done with blocking, the method will reset the indices to free up. 121 | If you need to block again, the data will need to be re-indexed. 122 | """ 123 | for predicate in self.index_predicates: 124 | predicate.reset() 125 | 126 | def index(self, docs: Docs, field: str) -> None: 127 | """ 128 | Add docs to the indices used by fingerprinters. 129 | 130 | Some fingerprinter methods depend upon having an index of 131 | values that a field may have in the data. This method adds 132 | those values to the index. If you don't have any fingerprinter 133 | methods that use an index, this method will do nothing. 134 | 135 | Args: 136 | docs: an iterator of values from your data to index. While 137 | not required, it is recommended that docs be a unique 138 | set of of those values. Indexing can be an expensive 139 | operation. 140 | field: fieldname or key associated with the values you are 141 | indexing 142 | 143 | """ 144 | indices = extractIndices(self.index_fields[field]) 145 | 146 | for doc in docs: 147 | if doc: 148 | for _, index, preprocess in indices: 149 | index.index(preprocess(doc)) 150 | 151 | for index_type, index, _ in indices: 152 | index.initSearch() 153 | 154 | for predicate in self.index_fields[field][index_type]: 155 | logger.debug("Canopy: %s", str(predicate)) 156 | predicate.index = index 157 | predicate.bust_cache() 158 | 159 | def unindex(self, docs: Docs, field: str) -> None: 160 | """Remove docs from indices used by fingerprinters 161 | 162 | Args: 163 | docs: an iterator of values from your data to remove. While 164 | not required, it is recommended that docs be a unique 165 | set of of those values. Indexing can be an expensive 166 | operation. 
167 | field: fieldname or key associated with the values you are 168 | unindexing 169 | """ 170 | 171 | indices = extractIndices(self.index_fields[field]) 172 | 173 | for doc in docs: 174 | if doc: 175 | for _, index, preprocess in indices: 176 | try: 177 | index.unindex(preprocess(doc)) 178 | except KeyError: 179 | pass 180 | 181 | for index_type, index, _ in indices: 182 | index.initSearch() 183 | 184 | for predicate in self.index_fields[field][index_type]: 185 | logger.debug("Canopy: %s", str(predicate)) 186 | predicate.index = index 187 | predicate.bust_cache() 188 | 189 | def index_all(self, data: Data) -> None: 190 | for field in self.index_fields: 191 | unique_fields = {record[field] for record in data.values() if record[field]} 192 | self.index(unique_fields, field) 193 | 194 | 195 | def extractIndices( 196 | index_fields: IndexList, 197 | ) -> Sequence[tuple[str, Index, Callable[[Any], Any]]]: 198 | indices = [] 199 | for index_type, predicates in index_fields.items(): 200 | predicate = predicates[0] 201 | index = predicate.index 202 | preprocess = predicate.preprocess 203 | if predicate.index is None: 204 | index = predicate.initIndex() 205 | assert index is not None 206 | indices.append((index_type, index, preprocess)) 207 | 208 | return indices 209 | -------------------------------------------------------------------------------- /dedupe/branch_and_bound.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | from typing import Any, Collection, Iterable, Mapping, Tuple 5 | 6 | from ._typing import Cover 7 | from .predicates import Predicate 8 | 9 | Partial = Tuple[Predicate, ...] 10 | 11 | 12 | def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int: 13 | return len(frozenset.union(*dupe_cover.values())) if dupe_cover else 0 14 | 15 | 16 | def _remove_dominated(coverage: Cover, dominator: Predicate) -> Cover: 17 | dominant_cover = coverage[dominator] 18 | 19 | return { 20 | pred: cover 21 | for pred, cover in coverage.items() 22 | if not (dominator.cover_count <= pred.cover_count and dominant_cover >= cover) 23 | } 24 | 25 | 26 | def _uncovered_by( 27 | coverage: Mapping[Any, frozenset[int]], covered: frozenset[int] 28 | ) -> dict[Any, frozenset[int]]: 29 | return { 30 | pred: still_uncovered 31 | for pred, uncovered in coverage.items() 32 | if (still_uncovered := uncovered - covered) 33 | } 34 | 35 | 36 | def _order_by( 37 | candidates: Mapping[Predicate, Collection[Any]], p: Predicate 38 | ) -> tuple[int, float]: 39 | return (len(candidates[p]), -p.cover_count) 40 | 41 | 42 | def _score(partial: Iterable[Predicate]) -> float: 43 | return sum(p.cover_count for p in partial) 44 | 45 | 46 | def search(original_cover: Cover, target: int, calls: int) -> Partial: 47 | def _covered(partial: Partial) -> int: 48 | return ( 49 | len(frozenset.union(*(original_cover[p] for p in partial))) 50 | if partial 51 | else 0 52 | ) 53 | 54 | cheapest_score = float("inf") 55 | cheapest: Partial = () 56 | 57 | start: tuple[Cover, Partial] = (original_cover, ()) 58 | to_explore = [start] 59 | 60 | while to_explore and calls: 61 | candidates, partial = to_explore.pop() 62 | 63 | covered = _covered(partial) 64 | score = _score(partial) 65 | 66 | if covered < target: 67 | window = cheapest_score - score 68 | candidates = { 69 | p: cover for p, cover in candidates.items() if p.cover_count < window 70 | } 71 | 72 | reachable = _reachable(candidates) + covered 73 | 74 | if candidates and reachable >= 
target: 75 | order_by = functools.partial(_order_by, candidates) 76 | best = max(candidates, key=order_by) 77 | 78 | reduced = _remove_dominated(candidates, best) 79 | to_explore.append((reduced, partial)) 80 | 81 | remaining = _uncovered_by(candidates, candidates[best]) 82 | to_explore.append((remaining, partial + (best,))) 83 | 84 | elif score < cheapest_score: 85 | cheapest = partial 86 | cheapest_score = score 87 | 88 | calls -= 1 89 | 90 | return cheapest 91 | -------------------------------------------------------------------------------- /dedupe/canonical.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Sequence 2 | 3 | import numpy 4 | import numpy.typing 5 | from affinegap import normalizedAffineGapDistance as affine 6 | 7 | from dedupe._typing import Comparator, RecordDict 8 | 9 | 10 | def getCentroid(attribute_variants: Sequence[str], comparator: Comparator) -> str: 11 | """ 12 | Takes in a list of attribute values for a field, 13 | evaluates the centroid using the comparator, 14 | & returns the centroid (i.e. the 'best' value for the field) 15 | """ 16 | 17 | n = len(attribute_variants) 18 | 19 | distance_matrix = numpy.zeros([n, n]) 20 | 21 | # populate distance matrix by looping through elements of matrix triangle 22 | for i in range(0, n): 23 | for j in range(0, i): 24 | distance = comparator(attribute_variants[i], attribute_variants[j]) 25 | distance_matrix[i, j] = distance_matrix[j, i] = distance 26 | 27 | average_distance = distance_matrix.mean(0) 28 | 29 | # there can be ties for minimum, average distance string 30 | min_dist_indices: numpy.typing.NDArray[numpy.int_] 31 | min_dist_indices = numpy.where(average_distance == average_distance.min())[0] 32 | 33 | if len(min_dist_indices) > 1: 34 | centroid = breakCentroidTie(attribute_variants, min_dist_indices) 35 | else: 36 | centroid_index = min_dist_indices[0] 37 | centroid = attribute_variants[centroid_index] 38 | 39 | return centroid 40 | 41 | 42 | def breakCentroidTie( 43 | attribute_variants: Sequence[str], 44 | min_dist_indices: numpy.typing.NDArray[numpy.int_], 45 | ) -> str: 46 | """ 47 | Finds centroid when there are multiple values w/ min avg distance 48 | (e.g. 
any dupe cluster of 2) right now this selects the first 49 | among a set of ties, but can be modified to break ties in strings 50 | by selecting the longest string 51 | 52 | """ 53 | return attribute_variants[min_dist_indices[0]] 54 | 55 | 56 | def getCanonicalRep(record_cluster: Sequence[RecordDict]) -> Mapping[str, str]: 57 | """ 58 | Given a list of records within a duplicate cluster, constructs a 59 | canonical representation of the cluster by finding canonical 60 | values for each field 61 | 62 | """ 63 | canonical_rep = {} 64 | 65 | keys = record_cluster[0].keys() 66 | 67 | for key in keys: 68 | key_values = [] 69 | for record in record_cluster: 70 | # assume non-empty values always better than empty value 71 | # for canonical record 72 | if record.get(key): 73 | key_values.append(record[key]) 74 | if key_values: 75 | canonical_rep[key] = getCentroid(key_values, affine) 76 | else: 77 | canonical_rep[key] = "" 78 | 79 | return canonical_rep 80 | -------------------------------------------------------------------------------- /dedupe/canopy_index.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import math 5 | from typing import Iterable 6 | 7 | import numpy 8 | from BTrees.Length import Length 9 | from zope.index.text.cosineindex import CosineIndex 10 | from zope.index.text.lexicon import Lexicon 11 | from zope.index.text.setops import mass_weightedUnion 12 | from zope.index.text.textindex import TextIndex 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class CanopyIndex(TextIndex): # pragma: no cover 18 | def __init__(self) -> None: 19 | lexicon = CanopyLexicon() 20 | self.index = CosineIndex(lexicon) 21 | self.lexicon = lexicon 22 | 23 | def initSearch(self) -> None: 24 | N = len(self.index._docweight) 25 | threshold = int(max(1000, N * 0.05)) 26 | 27 | stop_words = [] 28 | self._wids_dict = {} 29 | 30 | bucket = self.index.family.IF.Bucket 31 | for wid, docs in self.index._wordinfo.items(): 32 | if len(docs) > threshold: 33 | stop_words.append(wid) 34 | continue 35 | 36 | if isinstance(docs, dict): 37 | docs = bucket(docs) 38 | self.index._wordinfo[wid] = docs 39 | 40 | idf = numpy.log1p(N / len(docs)) 41 | term = self.lexicon._words[wid] 42 | 43 | self._wids_dict[term] = (wid, idf) 44 | 45 | for wid in stop_words: 46 | word = self.lexicon._words.pop(wid) 47 | del self.lexicon._wids[word] 48 | logger.info(f"Removing stop word {word}") 49 | del self.index._wordinfo[wid] 50 | 51 | def apply( 52 | self, 53 | query_list: Iterable[str], 54 | threshold: float, 55 | start: int = 0, 56 | count: int | None = None, 57 | ) -> list[tuple[float, int]]: 58 | _wids_dict = self._wids_dict 59 | _wordinfo = self.index._wordinfo 60 | l_pow = float.__pow__ 61 | 62 | L = [] 63 | qw = 0.0 64 | 65 | for term in query_list: 66 | wid, weight = _wids_dict.get(term, (None, None)) 67 | if wid is None: 68 | continue 69 | docs = _wordinfo[wid] 70 | L.append((docs, weight)) 71 | qw += l_pow(weight, 2) 72 | 73 | results = mass_weightedUnion(L) 74 | 75 | qw = math.sqrt(qw) 76 | filtered_results: list[tuple[float, int]] = results.byValue(qw * threshold) 77 | 78 | return filtered_results 79 | 80 | 81 | class CanopyLexicon(Lexicon): # pragma: no cover 82 | def sourceToWordIds(self, last: list | None = None) -> list[int]: 83 | if last is None: 84 | last = [] 85 | if not isinstance(self.wordCount, Length): # type: ignore[has-type] 86 | self.wordCount = Length(self.wordCount()) # type: ignore[has-type] 87 
| self.wordCount._p_deactivate() 88 | return list(map(self._getWordIdCreate, last)) 89 | -------------------------------------------------------------------------------- /dedupe/cpredicates.pyx: -------------------------------------------------------------------------------- 1 | # cython: c_string_type=unicode, c_string_encoding=utf8, infertypes=True, language_level=3 2 | 3 | cpdef list ngrams(basestring field, int n): 4 | """ngrams returns all contiguous sequences of n characters 5 | of a given field. 6 | 7 | :param field: the string to be sequenced 8 | :param n: the number of characters to be included in each gram 9 | 10 | usage: 11 | >>> from dedupe.dedupe.predicated import ngrams 12 | >>> ngrams("deduplicate", 3) 13 | ['ded', 'edu', 'dup', 'upl', 'pli', 'lic', 'ica', 'cat', 'ate'] 14 | """ 15 | cdef unicode ufield = _ustring(field) 16 | 17 | cdef int i 18 | cdef int n_char = len(ufield) 19 | cdef int n_grams = n_char - n + 1 20 | cdef list grams = [ufield[i:i+n] for i in range(n_grams)] 21 | return grams 22 | 23 | 24 | cpdef frozenset unique_ngrams(basestring field, int n): 25 | """unique_ngrams returns all contiguous unique sequences of n characters 26 | of a given field. 27 | 28 | :param field: the string to be sequenced 29 | :param n: the number of characters to be included in each gram 30 | 31 | usage: 32 | >>> from dedupe.dedupe.predicated import unique_ngrams 33 | >>> unique_ngrams("mississippi", 2) 34 | {"mi", "is", "ss", "si", "ip", "pp", "pi"} 35 | """ 36 | cdef unicode ufield = _ustring(field) 37 | 38 | cdef int i 39 | cdef int n_char = len(ufield) 40 | cdef int n_grams = n_char - n + 1 41 | cdef set grams = {ufield[i:i+n] for i in range(n_grams)} 42 | return frozenset(grams) 43 | 44 | 45 | cpdef frozenset initials(basestring field, int n): 46 | """returns a tuple containing the first n chars of a field. 47 | The whole field is returned if n is greater than the field length. 48 | 49 | :param field: the string 50 | :type n: int 51 | 52 | usage: 53 | >>> initials("dedupe", 7) 54 | ('dedupe', ) 55 | >>> initials("deduplication", 7) 56 | ('dedupli', ) 57 | """ 58 | cdef unicode ufield = _ustring(field) 59 | 60 | return frozenset((ufield[:n],)) 61 | 62 | 63 | cdef unicode _ustring(basestring s): 64 | if type(s) is unicode: 65 | # fast path for most common case(s) 66 | return s 67 | else : # safe because of basestring 68 | return s 69 | -------------------------------------------------------------------------------- /dedupe/datamodel.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copyreg 4 | import types 5 | from collections.abc import Mapping 6 | from typing import TYPE_CHECKING, cast 7 | 8 | import numpy 9 | 10 | from dedupe._typing import FieldVariable 11 | from dedupe.variables.interaction import InteractionType 12 | 13 | if TYPE_CHECKING: 14 | from typing import Collection, Generator, Iterable, Sequence 15 | 16 | from dedupe._typing import ( 17 | Comparator, 18 | InteractionVariable, 19 | RecordDict, 20 | RecordDictPair, 21 | Variable, 22 | ) 23 | from dedupe.predicates import Predicate 24 | 25 | 26 | class DataModel: 27 | version = 2 28 | 29 | def __init__(self, variable_definitions: Collection[Variable]): 30 | for item in variable_definitions: 31 | if isinstance(item, Mapping): 32 | raise ValueError( 33 | "It looks like you are trying to use a variable definition " 34 | "composed of dictionaries. dedupe 3.0 uses variable objects " 35 | 'directly. 
So instead of [{"field": "name", "type": "String"}] ' 36 | 'we now do [dedupe.variables.String("name")].' 37 | ) 38 | 39 | variable_definitions = list(variable_definitions) 40 | if not variable_definitions: 41 | raise ValueError("The variable definitions cannot be empty") 42 | if not any(variable.predicates for variable in variable_definitions): 43 | raise ValueError( 44 | "At least one of the variable types needs to be a type" 45 | "other than 'Custom'. 'Custom' types have no associated" 46 | "blocking rules" 47 | ) 48 | 49 | # This is a protocol check, not a class inheritance check 50 | self.field_variables: list[FieldVariable] = [ 51 | variable 52 | for variable in variable_definitions 53 | if isinstance(variable, FieldVariable) 54 | ] 55 | 56 | # we need to keep track of ordering of variables because in 57 | # order to calculate derived fields like interaction and missing 58 | # data fields. 59 | columns: list[Variable] = [] 60 | for variable in self.field_variables: 61 | if len(variable) == 1: 62 | columns.append(variable) 63 | elif len(variable) > 1: 64 | assert hasattr(variable, "higher_vars") 65 | columns.extend(variable.higher_vars) 66 | 67 | self._derived_start = len(columns) 68 | 69 | # i'm not really satisfied with how we are dealing with interactions 70 | # here. seems like there should be a cleaner path, but i don't see it 71 | # today 72 | columns += interactions(variable_definitions, self.field_variables) 73 | 74 | self._missing_field_indices = missing_field_indices(columns) 75 | self._interaction_indices = interaction_indices(columns) 76 | 77 | self._len = len(columns) + len(self._missing_field_indices) 78 | 79 | def __len__(self) -> int: 80 | return self._len 81 | 82 | # Changing this from a property to just a normal attribute causes 83 | # pickling problems, because we are removing static methods from 84 | # their class context. 
This could be fixed by defining comparators 85 | # outside of classes in fieldclasses 86 | @property 87 | def _field_comparators( 88 | self, 89 | ) -> Generator[tuple[str, Comparator, int, int]]: 90 | start = 0 91 | stop = 0 92 | for var in self.field_variables: 93 | stop = start + len(var) 94 | comparator = cast("Comparator", var.comparator) 95 | yield (var.field, comparator, start, stop) 96 | start = stop 97 | 98 | @property 99 | def predicates(self) -> set[Predicate]: 100 | predicates = set() 101 | for var in self.field_variables: 102 | for predicate in var.predicates: 103 | predicates.add(predicate) 104 | return predicates 105 | 106 | def distances( 107 | self, record_pairs: Sequence[RecordDictPair] 108 | ) -> numpy.typing.NDArray[numpy.float64]: 109 | num_records = len(record_pairs) 110 | 111 | distances = numpy.empty((num_records, len(self)), "f4") 112 | 113 | for i, (record_1, record_2) in enumerate(record_pairs): 114 | for field, compare, start, stop in self._field_comparators: 115 | if record_1[field] is not None and record_2[field] is not None: 116 | distances[i, start:stop] = compare(record_1[field], record_2[field]) 117 | elif hasattr(compare, "missing"): 118 | distances[i, start:stop] = compare(record_1[field], record_2[field]) 119 | else: 120 | distances[i, start:stop] = numpy.nan 121 | 122 | distances = self._add_derived_distances(distances) 123 | 124 | return distances 125 | 126 | def _add_derived_distances( 127 | self, distances: numpy.typing.NDArray[numpy.float64] 128 | ) -> numpy.typing.NDArray[numpy.float64]: 129 | current_column = self._derived_start 130 | 131 | for indices in self._interaction_indices: 132 | distances[:, current_column] = numpy.prod(distances[:, indices], axis=1) 133 | current_column += 1 134 | 135 | is_missing = numpy.isnan(distances[:, :current_column]) 136 | 137 | distances[:, :current_column][is_missing] = 0 138 | 139 | if self._missing_field_indices: 140 | distances[:, current_column:] = ( 141 | 1 - is_missing[:, self._missing_field_indices] 142 | ) 143 | 144 | return distances 145 | 146 | def check(self, record: RecordDict) -> None: 147 | for field, _, _, _ in self._field_comparators: 148 | if field not in record: 149 | raise ValueError( 150 | "Records do not line up with data model. 
" 151 | "The field '%s' is in data_model but not " 152 | "in a record" % field 153 | ) 154 | 155 | def __getstate__(self): 156 | d = self.__dict__ 157 | d["object_version"] = self.version 158 | return d 159 | 160 | def __setstate__(self, d): 161 | version = d.pop("object_version", None) 162 | if version is None and "_variables" in d: 163 | d["_len"] = len(d.pop("_variables")) 164 | d["primary_variables"] = d.pop("primary_fields") 165 | elif version == 1: 166 | d["field_variables"] = d.pop("primary_variables") 167 | 168 | self.__dict__ = d 169 | 170 | 171 | def interactions( 172 | variables: Iterable[Variable], primary_variables: Iterable[FieldVariable] 173 | ) -> list[InteractionVariable]: 174 | field_d = {field.name: field for field in primary_variables} 175 | 176 | interactions: list[InteractionVariable] = [] 177 | for variable in variables: 178 | if isinstance(variable, InteractionType): 179 | variable.expandInteractions(field_d) 180 | interactions.extend(variable.higher_vars) 181 | return interactions 182 | 183 | 184 | def missing_field_indices(variables: list[Variable]) -> list[int]: 185 | return [i for i, var in enumerate(variables) if var.has_missing] 186 | 187 | 188 | def interaction_indices(variables: list[Variable]) -> list[list[int]]: 189 | var_names = [var.name for var in variables] 190 | indices = [] 191 | for var in variables: 192 | if hasattr(var, "interaction_fields"): 193 | interaction_indices = [var_names.index(f) for f in var.interaction_fields] 194 | indices.append(interaction_indices) 195 | return indices 196 | 197 | 198 | def reduce_method(m): 199 | return (getattr, (m.__self__, m.__func__.__name__)) 200 | 201 | 202 | copyreg.pickle(types.MethodType, reduce_method) 203 | -------------------------------------------------------------------------------- /dedupe/index.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from typing import MutableMapping, Tuple 8 | 9 | Doc = Tuple[str, ...] 
10 | 11 | 12 | class Index(ABC): 13 | _doc_to_id: MutableMapping[Doc, int] 14 | 15 | @abstractmethod 16 | def __init__(self) -> None: # pragma: no cover 17 | pass 18 | 19 | @abstractmethod 20 | def index(self, doc: Doc) -> None: # pragma: no cover 21 | pass 22 | 23 | @abstractmethod 24 | def unindex(self, doc: Doc) -> None: # pragma: no cover 25 | pass 26 | 27 | @abstractmethod # pragma: no cover 28 | def search(self, doc: Doc, threshold: int | float = 0) -> list[int]: 29 | pass 30 | 31 | @abstractmethod 32 | def initSearch(self) -> None: # pragma: no cover 33 | pass 34 | -------------------------------------------------------------------------------- /dedupe/levenshtein.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import Levenshtein_search 4 | 5 | from .core import Enumerator 6 | from .index import Index 7 | 8 | 9 | class LevenshteinIndex(Index): 10 | _doc_to_id: Dict[str, int] # type: ignore[assignment] 11 | 12 | def __init__(self) -> None: 13 | self.index_key = Levenshtein_search.populate_wordset(-1, []) 14 | self._doc_to_id = Enumerator(start=1) 15 | 16 | def index(self, doc: str) -> None: # type: ignore[override] 17 | if doc not in self._doc_to_id: 18 | self._doc_to_id[doc] 19 | Levenshtein_search.add_string(self.index_key, doc) 20 | 21 | def unindex(self, doc: str) -> None: # type: ignore[override] 22 | del self._doc_to_id[doc] 23 | Levenshtein_search.clear_wordset(self.index_key) 24 | self.index_key = Levenshtein_search.populate_wordset(-1, list(self._doc_to_id)) 25 | 26 | def initSearch(self) -> None: 27 | pass 28 | 29 | def search(self, doc: str, threshold: int = 0) -> List[int]: # type: ignore[override] 30 | matching_docs = Levenshtein_search.lookup(self.index_key, doc, threshold) 31 | if matching_docs: 32 | return [self._doc_to_id[match] for match, _, _ in matching_docs] 33 | else: 34 | return [] 35 | 36 | def __del__(self) -> None: 37 | Levenshtein_search.clear_wordset(self.index_key) 38 | -------------------------------------------------------------------------------- /dedupe/predicate_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | from itertools import chain 3 | from math import copysign, floor, log10 4 | from typing import Any, FrozenSet, Sequence, Tuple, Union 5 | 6 | from doublemetaphone import doublemetaphone 7 | 8 | # This allows to import all cpredicate functions from this module. 
9 | from dedupe.cpredicates import initials, ngrams, unique_ngrams # noqa: F401 10 | 11 | words = re.compile(r"[\w']+").findall 12 | integers = re.compile(r"\d+").findall 13 | start_word = re.compile(r"^([\w']+)").match 14 | two_start_words = re.compile(r"^([\w']+\W+[\w']+)").match 15 | start_integer = re.compile(r"^(\d+)").match 16 | alpha_numeric = re.compile(r"(?=[a-zA-Z]*\d)[a-zA-Z\d]+").findall 17 | 18 | 19 | def wholeFieldPredicate(field: str) -> FrozenSet[str]: 20 | """return the whole field as a string""" 21 | return frozenset((str(field),)) 22 | 23 | 24 | def tokenFieldPredicate(field: str) -> FrozenSet[str]: 25 | """returns the tokens""" 26 | return frozenset(words(field)) 27 | 28 | 29 | def firstTokenPredicate(field: str) -> FrozenSet[str]: 30 | first_token = start_word(field) 31 | if first_token: 32 | return frozenset(first_token.groups()) 33 | else: 34 | return frozenset() 35 | 36 | 37 | def firstTwoTokensPredicate(field: str) -> FrozenSet[str]: 38 | first_two_tokens = two_start_words(field) 39 | if first_two_tokens: 40 | return frozenset(first_two_tokens.groups()) 41 | else: 42 | return frozenset() 43 | 44 | 45 | def commonIntegerPredicate(field: str) -> FrozenSet[str]: 46 | """return any integers""" 47 | 48 | # `str(int(i))` removes leading zeros, e.g. `str(int("0001")) = "1"` 49 | return frozenset(str(int(i)) for i in integers(field)) 50 | 51 | 52 | def alphaNumericPredicate(field: str) -> FrozenSet[str]: 53 | return frozenset(alpha_numeric(field)) 54 | 55 | 56 | def nearIntegersPredicate(field: str) -> FrozenSet[str]: 57 | """for any integer N in field return the integers N-1, N and N+1""" 58 | string_ints = integers(field) 59 | near_ints = set() 60 | for s in string_ints: 61 | num = int(s) 62 | near_ints.add(str(num - 1)) 63 | near_ints.add(str(num)) 64 | near_ints.add(str(num + 1)) 65 | 66 | return frozenset(near_ints) 67 | 68 | 69 | def hundredIntegerPredicate(field: str) -> FrozenSet[str]: 70 | return frozenset(str(int(i))[:-2] + "00" for i in integers(field)) 71 | 72 | 73 | def hundredIntegersOddPredicate(field: str) -> FrozenSet[str]: 74 | return frozenset(str(int(i))[:-2] + "0" + str(int(i) % 2) for i in integers(field)) 75 | 76 | 77 | def firstIntegerPredicate(field: str) -> FrozenSet[str]: 78 | first_token = start_integer(field) 79 | if first_token: 80 | return frozenset(first_token.groups()) 81 | else: 82 | return frozenset() 83 | 84 | 85 | def ngramsTokens(field: Sequence[Any], n: int) -> FrozenSet[str]: 86 | grams = set() 87 | n_tokens = len(field) 88 | for i in range(n_tokens): 89 | for j in range(i + n, min(n_tokens, i + n) + 1): 90 | grams.add(" ".join(str(tok) for tok in field[i:j])) 91 | return frozenset(grams) 92 | 93 | 94 | def commonTwoTokens(field: str) -> FrozenSet[str]: 95 | return ngramsTokens(field.split(), 2) 96 | 97 | 98 | def commonThreeTokens(field: str) -> FrozenSet[str]: 99 | return ngramsTokens(field.split(), 3) 100 | 101 | 102 | def fingerprint(field: str) -> FrozenSet[str]: 103 | return frozenset(("".join(sorted(field.split())),)) 104 | 105 | 106 | def oneGramFingerprint(field: str) -> FrozenSet[str]: 107 | return frozenset(("".join(sorted({*field.replace(" ", "")})),)) 108 | 109 | 110 | def twoGramFingerprint(field: str) -> FrozenSet[str]: 111 | if len(field) > 1: 112 | return frozenset(("".join(sorted(unique_ngrams(field.replace(" ", ""), 2))),)) 113 | else: 114 | return frozenset() 115 | 116 | 117 | def commonFourGram(field: str) -> FrozenSet[str]: 118 | """return 4-grams""" 119 | return frozenset(unique_ngrams(field.replace(" ", ""), 
4)) 120 | 121 | 122 | def commonSixGram(field: str) -> FrozenSet[str]: 123 | """return 6-grams""" 124 | return frozenset(unique_ngrams(field.replace(" ", ""), 6)) 125 | 126 | 127 | def sameThreeCharStartPredicate(field: str) -> FrozenSet[str]: 128 | """return first three characters""" 129 | return frozenset(initials(field.replace(" ", ""), 3)) 130 | 131 | 132 | def sameFiveCharStartPredicate(field: str) -> FrozenSet[str]: 133 | """return first five characters""" 134 | return frozenset(initials(field.replace(" ", ""), 5)) 135 | 136 | 137 | def sameSevenCharStartPredicate(field: str) -> FrozenSet[str]: 138 | """return first seven characters""" 139 | return frozenset(initials(field.replace(" ", ""), 7)) 140 | 141 | 142 | def suffixArray(field: str) -> FrozenSet[str]: 143 | n = len(field) - 4 144 | if n > 0: 145 | return frozenset(field[i:] for i in range(0, n)) 146 | else: 147 | return frozenset() 148 | 149 | 150 | def sortedAcronym(field: str) -> FrozenSet[str]: 151 | return frozenset(("".join(sorted(each[0] for each in field.split())),)) 152 | 153 | 154 | def doubleMetaphone(field: str) -> FrozenSet[str]: 155 | return frozenset(metaphone for metaphone in doublemetaphone(field) if metaphone) 156 | 157 | 158 | def metaphoneToken(field: str) -> FrozenSet[str]: 159 | return frozenset( 160 | metaphone_token 161 | for metaphone_token in chain( 162 | *(doublemetaphone(token) for token in field.split()) 163 | ) 164 | if metaphone_token 165 | ) 166 | 167 | 168 | def wholeSetPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 169 | return frozenset((str(field_set),)) 170 | 171 | 172 | def commonSetElementPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 173 | """return set as individual elements""" 174 | 175 | return frozenset(str(item) for item in field_set) 176 | 177 | 178 | def commonTwoElementsPredicate(field: Sequence[Any]) -> FrozenSet[str]: 179 | return ngramsTokens(sorted(field), 2) 180 | 181 | 182 | def commonThreeElementsPredicate(field: Sequence[Any]) -> FrozenSet[str]: 183 | return ngramsTokens(sorted(field), 3) 184 | 185 | 186 | def lastSetElementPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 187 | return frozenset((str(max(field_set)),)) 188 | 189 | 190 | def firstSetElementPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 191 | return frozenset((str(min(field_set)),)) 192 | 193 | 194 | def magnitudeOfCardinality(field_set: Sequence[Any]) -> FrozenSet[str]: 195 | return orderOfMagnitude(len(field_set)) 196 | 197 | 198 | def latLongGridPredicate(field: Tuple[float], digits: int = 1) -> FrozenSet[str]: 199 | """ 200 | Given a lat / long pair, return the grid coordinates at the 201 | nearest base value. e.g., (42.3, -5.4) returns a grid at 0.1 202 | degree resolution of 0.1 degrees of latitude ~ 7km, so this is 203 | effectively a 14km lat grid. This is imprecise for longitude, 204 | since 1 degree of longitude is 0km at the poles, and up to 111km 205 | at the equator. But it should be reasonably precise given some 206 | prior logical block (e.g., country). 
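    Illustrative example (hypothetical coordinates, with the default digits=1):

        >>> latLongGridPredicate((42.37, -87.91))
        frozenset({'(42.4, -87.9)'})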
207 | """ 208 | if any(field): 209 | return frozenset((str(tuple(round(dim, digits) for dim in field)),)) 210 | else: 211 | return frozenset() 212 | 213 | 214 | def orderOfMagnitude(field: Union[int, float]) -> FrozenSet[str]: 215 | if field > 0: 216 | return frozenset((str(int(round(log10(field)))),)) 217 | else: 218 | return frozenset() 219 | 220 | 221 | # Thanks to http://stackoverflow.com/questions/3410976/how-to-round-a-number-to-significant-figures-in-python 222 | def roundTo1(field: float) -> FrozenSet[str]: 223 | abs_num = abs(field) 224 | order = int(floor(log10(abs_num))) 225 | rounded = round(abs_num, -order) 226 | return frozenset((str(int(copysign(rounded, field))),)) 227 | -------------------------------------------------------------------------------- /dedupe/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/dedupe/py.typed -------------------------------------------------------------------------------- /dedupe/serializer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Iterator, TextIO 3 | 4 | from dedupe._typing import TrainingData 5 | 6 | 7 | def _from_json(json_object: Any) -> Any: 8 | if "__class__" in json_object: 9 | if json_object["__class__"] == "frozenset": 10 | return frozenset(json_object["__value__"]) 11 | if json_object["__class__"] == "tuple": 12 | return tuple(json_object["__value__"]) 13 | return json_object 14 | 15 | 16 | def hint_tuples(item: Any) -> Any: 17 | if isinstance(item, tuple): 18 | return {"__class__": "tuple", "__value__": [hint_tuples(e) for e in item]} 19 | if isinstance(item, list): 20 | return [hint_tuples(e) for e in item] 21 | if isinstance(item, dict): 22 | return {key: hint_tuples(value) for key, value in item.items()} 23 | else: 24 | return item 25 | 26 | 27 | class TupleEncoder(json.JSONEncoder): 28 | def encode(self, obj: Any) -> Any: 29 | return super().encode(hint_tuples(obj)) 30 | 31 | def iterencode(self, obj: Any, _one_shot: bool = False) -> Iterator[str]: 32 | return super().iterencode(hint_tuples(obj)) 33 | 34 | def default(self, python_object: Any) -> Any: 35 | if isinstance(python_object, frozenset): 36 | return {"__class__": "frozenset", "__value__": list(python_object)} 37 | return super().default(python_object) 38 | 39 | 40 | def read_training(training_file: TextIO) -> Any: 41 | """ 42 | Read training from previously built training data file object 43 | 44 | Args: 45 | training_file: file object containing the training data 46 | 47 | Returns: 48 | A dictionary with two keys, `match` and `distinct`. See the inverse, 49 | :func:`write_training`. 50 | """ 51 | return json.load(training_file, object_hook=_from_json) 52 | 53 | 54 | def write_training(labeled_pairs: TrainingData, file_obj: TextIO) -> None: 55 | """ 56 | Write a JSON file that contains labeled examples 57 | 58 | Args: 59 | labeled_pairs: A dictionary with two keys, `match` and `distinct`. 60 | The values are lists that can contain pairs of records 61 | file_obj: file object to write training data to 62 | 63 | .. 
code:: python 64 | 65 | examples = { 66 | "match": [ 67 | ({'name' : 'Georgie Porgie'}, {'name' : 'George Porgie'}), 68 | ], 69 | "distinct": [ 70 | ({'name' : 'Georgie Porgie'}, {'name' : 'Georgette Porgette'}), 71 | ], 72 | } 73 | with open('training.json', 'w') as f: 74 | dedupe.write_training(examples, f) 75 | 76 | """ 77 | json.dump(labeled_pairs, file_obj, cls=TupleEncoder, ensure_ascii=True) 78 | -------------------------------------------------------------------------------- /dedupe/tfidf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import logging 3 | from typing import List, Tuple 4 | 5 | from dedupe.canopy_index import CanopyIndex 6 | from dedupe.core import Enumerator 7 | from dedupe.index import Index 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | Doc = Tuple[str, ...] 12 | 13 | 14 | class TfIdfIndex(Index): 15 | def __init__(self) -> None: 16 | self._index = CanopyIndex() 17 | self._doc_to_id = Enumerator(start=1) 18 | self._parseTerms = self._index.lexicon.parseTerms 19 | 20 | def index(self, doc: Doc) -> None: 21 | if doc not in self._doc_to_id: 22 | i = self._doc_to_id[doc] 23 | self._index.index_doc(i, doc) 24 | 25 | def unindex(self, doc) -> None: 26 | i = self._doc_to_id.pop(doc) 27 | self._index.unindex_doc(i) 28 | self.initSearch() 29 | 30 | def initSearch(self) -> None: 31 | self._index.initSearch() 32 | 33 | def search(self, doc: Doc, threshold: float = 0) -> List[int]: 34 | query_list = self._parseTerms(doc) 35 | 36 | if query_list: 37 | results = [ 38 | center for score, center in self._index.apply(query_list, threshold) 39 | ] 40 | else: 41 | results = [] 42 | 43 | return results 44 | -------------------------------------------------------------------------------- /dedupe/variables/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CustomType as Custom 2 | from .categorical_type import CategoricalType as Categorical 3 | from .exact import ExactType as Exact 4 | from .exists import ExistsType as Exists 5 | from .interaction import InteractionType as Interaction 6 | from .latlong import LatLongType as LatLong 7 | from .price import PriceType as Price 8 | from .set import SetType as Set 9 | from .string import ShortStringType as ShortString 10 | from .string import StringType as String 11 | from .string import TextType as Text 12 | 13 | __all__ = [ 14 | "Custom", 15 | "Categorical", 16 | "Exact", 17 | "Exists", 18 | "Interaction", 19 | "LatLong", 20 | "Price", 21 | "Set", 22 | "ShortString", 23 | "String", 24 | "Text", 25 | ] 26 | -------------------------------------------------------------------------------- /dedupe/variables/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from dedupe import predicates 6 | 7 | if TYPE_CHECKING: 8 | from typing import Any, ClassVar, Iterable, Sequence 9 | 10 | from dedupe._typing import Comparator, CustomComparator, PredicateFunction 11 | from dedupe._typing import Variable as VariableProtocol 12 | 13 | 14 | class Variable: 15 | name: str 16 | type: ClassVar[str] 17 | predicates: list[predicates.Predicate] 18 | higher_vars: Sequence[VariableProtocol] 19 | 20 | def __len__(self) -> int: 21 | return 1 22 | 23 | def __repr__(self) -> str: 24 | return self.name 25 | 26 | def __hash__(self) -> int: 27 | return hash(self.name) 28 | 29 | def __eq__(self, other: Any) -> bool: 
30 | other_name: str = other.name 31 | return self.name == other_name 32 | 33 | def __init__(self, has_missing: bool = False): 34 | self.has_missing = has_missing 35 | 36 | def __getstate__(self) -> dict[str, Any]: 37 | odict = self.__dict__.copy() 38 | odict["predicates"] = None 39 | 40 | return odict 41 | 42 | 43 | class DerivedType(Variable): 44 | type = "Derived" 45 | 46 | def __init__(self, name: str, var_type: str, **kwargs): 47 | self.name = f"({str(name)}: {str(var_type)})" 48 | super().__init__(**kwargs) 49 | 50 | 51 | class FieldType(Variable): 52 | _index_thresholds: Sequence[float] = [] 53 | _index_predicates: Sequence[type[predicates.IndexPredicate]] = [] 54 | _predicate_functions: Sequence[PredicateFunction] = () 55 | _Predicate: type[predicates.SimplePredicate] = predicates.SimplePredicate 56 | comparator: Comparator 57 | 58 | def __init__(self, field: str, name: str | None = None, has_missing: bool = False): 59 | self.field = field 60 | 61 | if name is None: 62 | self.name = f"({self.field}: {self.type})" 63 | else: 64 | self.name = name 65 | 66 | self.predicates = [ 67 | self._Predicate(pred, self.field) for pred in self._predicate_functions 68 | ] 69 | 70 | self.predicates += indexPredicates( 71 | self._index_predicates, self._index_thresholds, self.field 72 | ) 73 | 74 | self.has_missing = has_missing 75 | if self.has_missing: 76 | exists_pred = predicates.ExistsPredicate(self.field) 77 | self.predicates.append(exists_pred) 78 | 79 | 80 | class CustomType(FieldType): 81 | type = "Custom" 82 | 83 | def __init__( 84 | self, 85 | field: str, 86 | comparator: CustomComparator, 87 | name: str | None = None, 88 | **kwargs, 89 | ): 90 | super().__init__(field, **kwargs) 91 | 92 | if comparator is None: 93 | raise ValueError( 94 | "You must define a comparator function for the Custom class" 95 | ) 96 | else: 97 | self.comparator = comparator 98 | 99 | if name is None: 100 | self.name = f"({self.field}: {self.type}, {self.comparator.__name__})" 101 | else: 102 | self.name = name 103 | 104 | 105 | def indexPredicates( 106 | predicates: Iterable[type[predicates.IndexPredicate]], 107 | thresholds: Sequence[float], 108 | field: str, 109 | ) -> list[predicates.IndexPredicate]: 110 | index_predicates = [] 111 | for predicate in predicates: 112 | for threshold in thresholds: 113 | index_predicates.append(predicate(threshold, field)) 114 | 115 | return index_predicates 116 | -------------------------------------------------------------------------------- /dedupe/variables/categorical_type.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Sequence 4 | 5 | from categorical import CategoricalComparator 6 | 7 | from dedupe import predicates 8 | from dedupe._typing import PredicateFunction 9 | from dedupe.variables.base import DerivedType, FieldType 10 | 11 | 12 | class CategoricalType(FieldType): 13 | type = "Categorical" 14 | _predicate_functions: list[PredicateFunction] = [predicates.wholeFieldPredicate] 15 | 16 | def __init__(self, field: str, categories: Sequence[str], **kwargs): 17 | super().__init__(field, **kwargs) 18 | 19 | self.comparator = CategoricalComparator(categories) # type: ignore[assignment] 20 | 21 | self.higher_vars = [] 22 | for higher_var in self.comparator.dummy_names: # type: ignore[attr-defined] 23 | dummy_var = DerivedType(higher_var, "Dummy", has_missing=False) 24 | self.higher_vars.append(dummy_var) 25 | 26 | def __len__(self) -> int: 27 | return 
len(self.higher_vars) 28 | -------------------------------------------------------------------------------- /dedupe/variables/exact.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from dedupe import predicates 4 | from dedupe.variables.base import FieldType 5 | 6 | 7 | class ExactType(FieldType): 8 | _predicate_functions = [predicates.wholeFieldPredicate] 9 | type = "Exact" 10 | 11 | @staticmethod 12 | def comparator(field_1: Any, field_2: Any) -> int: 13 | if field_1 == field_2: 14 | return 1 15 | else: 16 | return 0 17 | -------------------------------------------------------------------------------- /dedupe/variables/exists.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from categorical import CategoricalComparator 6 | 7 | from dedupe._typing import PredicateFunction 8 | from dedupe.variables.base import DerivedType, FieldType 9 | 10 | 11 | class ExistsType(FieldType): 12 | type = "Exists" 13 | _predicate_functions: list[PredicateFunction] = [] 14 | 15 | def __init__(self, field: str, **kwargs): 16 | super().__init__(field, **kwargs) 17 | 18 | self.cat_comparator = CategoricalComparator([0, 1]) 19 | 20 | self.higher_vars = [] 21 | for higher_var in self.cat_comparator.dummy_names: 22 | dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing) 23 | self.higher_vars.append(dummy_var) 24 | 25 | def comparator(self, field_1: Any, field_2: Any) -> list[int]: 26 | if field_1 and field_2: 27 | return self.cat_comparator(1, 1) 28 | elif field_1 or field_2: 29 | return self.cat_comparator(0, 1) 30 | else: 31 | return self.cat_comparator(0, 0) 32 | 33 | def __len__(self) -> int: 34 | return len(self.higher_vars) 35 | 36 | # This flag tells fieldDistances in dedupe.core to pass 37 | # missing values (None) into the comparator 38 | comparator.missing = True # type: ignore 39 | -------------------------------------------------------------------------------- /dedupe/variables/interaction.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import itertools 4 | from typing import Mapping 5 | 6 | from dedupe._typing import FieldVariable, InteractionVariable 7 | from dedupe.variables.base import Variable 8 | 9 | 10 | class InteractionType(Variable): 11 | type = "Interaction" 12 | higher_vars: list[InteractionVariable] 13 | 14 | def __init__(self, *args: str, **kwargs): 15 | self.interactions = list(args) 16 | 17 | self.name = "(Interaction: %s)" % str(self.interactions) 18 | self.interaction_fields = self.interactions 19 | 20 | super().__init__(**kwargs) 21 | 22 | def expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None: 23 | self.interaction_fields = self.atomicInteractions( 24 | self.interactions, field_model 25 | ) 26 | for field in self.interaction_fields: 27 | if field_model[field].has_missing: 28 | self.has_missing = True 29 | 30 | self.categorical(field_model) 31 | 32 | def categorical(self, field_model: Mapping[str, FieldVariable]) -> None: 33 | categoricals = [ 34 | field 35 | for field in self.interaction_fields 36 | if hasattr(field_model[field], "higher_vars") 37 | ] 38 | noncategoricals = [ 39 | field 40 | for field in self.interaction_fields 41 | if not hasattr(field_model[field], "higher_vars") 42 | ] 43 | 44 | dummies = [field_model[field].higher_vars for field in categoricals] # type: 
ignore[attr-defined] 45 | 46 | self.higher_vars = [] 47 | for combo in itertools.product(*dummies): 48 | var_names = [field.name for field in combo] + noncategoricals 49 | higher_var = InteractionType(*var_names, has_missing=self.has_missing) 50 | self.higher_vars.append(higher_var) 51 | 52 | def atomicInteractions( 53 | self, interactions: list[str], field_model: Mapping[str, FieldVariable] 54 | ) -> list[str]: 55 | atomic_interactions = [] 56 | 57 | for field in interactions: 58 | try: 59 | field_model[field] 60 | except KeyError: 61 | raise KeyError( 62 | "The interaction variable %s is " 63 | "not a named variable in the variable " 64 | "definition" % field 65 | ) 66 | 67 | if hasattr(field_model[field], "interaction_fields"): 68 | sub_interactions = field_model[field].interaction_fields # type: ignore[attr-defined] 69 | atoms = self.atomicInteractions(sub_interactions, field_model) 70 | atomic_interactions.extend(atoms) 71 | else: 72 | atomic_interactions.append(field) 73 | 74 | return atomic_interactions 75 | -------------------------------------------------------------------------------- /dedupe/variables/latlong.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from math import sqrt 4 | 5 | from haversine import haversine 6 | 7 | from dedupe import predicates 8 | from dedupe.variables.base import FieldType 9 | 10 | 11 | class LatLongType(FieldType): 12 | type = "LatLong" 13 | 14 | _predicate_functions = [predicates.latLongGridPredicate] 15 | 16 | @staticmethod 17 | def comparator(x: tuple[float, float], y: tuple[float, float]) -> float: 18 | return sqrt(haversine(x, y)) 19 | -------------------------------------------------------------------------------- /dedupe/variables/price.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy 4 | 5 | from dedupe import predicates 6 | from dedupe.variables.base import FieldType 7 | 8 | 9 | class PriceType(FieldType): 10 | _predicate_functions = [ 11 | predicates.orderOfMagnitude, 12 | predicates.wholeFieldPredicate, 13 | predicates.roundTo1, 14 | ] 15 | type = "Price" 16 | 17 | @staticmethod 18 | def comparator(price_1: int | float, price_2: int | float) -> float: 19 | if price_1 <= 0: 20 | return numpy.nan 21 | elif price_2 <= 0: 22 | return numpy.nan 23 | else: 24 | return abs(numpy.log10(price_1) - numpy.log10(price_2)) 25 | -------------------------------------------------------------------------------- /dedupe/variables/set.py: -------------------------------------------------------------------------------- 1 | from typing import Collection, Iterable, Optional 2 | 3 | from simplecosine.cosine import CosineSetSimilarity 4 | 5 | from dedupe import predicates 6 | from dedupe.variables.base import FieldType 7 | 8 | 9 | class SetType(FieldType): 10 | type = "Set" 11 | 12 | _predicate_functions = ( 13 | predicates.wholeSetPredicate, 14 | predicates.commonSetElementPredicate, 15 | predicates.lastSetElementPredicate, 16 | predicates.commonTwoElementsPredicate, 17 | predicates.commonThreeElementsPredicate, 18 | predicates.magnitudeOfCardinality, 19 | predicates.firstSetElementPredicate, 20 | ) 21 | 22 | _index_predicates = ( 23 | predicates.TfidfSetSearchPredicate, 24 | predicates.TfidfSetCanopyPredicate, 25 | ) 26 | _index_thresholds = (0.2, 0.4, 0.6, 0.8) 27 | 28 | def __init__( 29 | self, field: str, corpus: Optional[Iterable[Collection[str]]] = None, **kwargs 30 | ): 31 | 
super().__init__(field, **kwargs) 32 | 33 | if corpus is None: 34 | corpus = [] 35 | 36 | self.comparator = CosineSetSimilarity(corpus) # type: ignore[assignment] 37 | -------------------------------------------------------------------------------- /dedupe/variables/string.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Optional, Sequence, Type 2 | 3 | from affinegap import normalizedAffineGapDistance as affineGap 4 | from highered import CRFEditDistance 5 | from simplecosine.cosine import CosineTextSimilarity 6 | 7 | from dedupe import predicates 8 | from dedupe._typing import PredicateFunction 9 | from dedupe.variables.base import FieldType, indexPredicates 10 | 11 | crfEd = CRFEditDistance() 12 | 13 | base_predicates = ( 14 | predicates.wholeFieldPredicate, 15 | predicates.firstTokenPredicate, 16 | predicates.firstTwoTokensPredicate, 17 | predicates.commonIntegerPredicate, 18 | predicates.nearIntegersPredicate, 19 | predicates.firstIntegerPredicate, 20 | predicates.hundredIntegerPredicate, 21 | predicates.hundredIntegersOddPredicate, 22 | predicates.alphaNumericPredicate, 23 | predicates.sameThreeCharStartPredicate, 24 | predicates.sameFiveCharStartPredicate, 25 | predicates.sameSevenCharStartPredicate, 26 | predicates.commonTwoTokens, 27 | predicates.commonThreeTokens, 28 | predicates.fingerprint, 29 | predicates.oneGramFingerprint, 30 | predicates.twoGramFingerprint, 31 | predicates.sortedAcronym, 32 | ) 33 | 34 | 35 | class BaseStringType(FieldType): 36 | _Predicate = predicates.StringPredicate 37 | _predicate_functions: Sequence[PredicateFunction] = () 38 | 39 | def __init__(self, *args, **kwargs): 40 | super().__init__(*args, **kwargs) 41 | 42 | self.predicates += indexPredicates( 43 | ( 44 | predicates.LevenshteinCanopyPredicate, 45 | predicates.LevenshteinSearchPredicate, 46 | ), 47 | (1, 2, 3, 4), 48 | self.field, 49 | ) 50 | 51 | 52 | class ShortStringType(BaseStringType): 53 | type = "ShortString" 54 | 55 | _predicate_functions = base_predicates + ( 56 | predicates.commonFourGram, 57 | predicates.commonSixGram, 58 | predicates.tokenFieldPredicate, 59 | predicates.suffixArray, 60 | predicates.doubleMetaphone, 61 | predicates.metaphoneToken, 62 | ) 63 | 64 | _index_predicates: Sequence[Type[predicates.IndexPredicate]] = [ 65 | predicates.TfidfNGramCanopyPredicate, 66 | predicates.TfidfNGramSearchPredicate, 67 | ] 68 | _index_thresholds = (0.2, 0.4, 0.6, 0.8) 69 | 70 | def __init__( 71 | self, field: str, name: Optional[str] = None, crf: bool = False, **kwargs 72 | ): 73 | super().__init__(field, name=name, **kwargs) 74 | 75 | if crf: 76 | self.comparator = crfEd # type: ignore[assignment] 77 | else: 78 | self.comparator = affineGap # type: ignore[assignment] 79 | 80 | 81 | class StringType(ShortStringType): 82 | type = "String" 83 | 84 | _index_predicates = [ 85 | predicates.TfidfNGramCanopyPredicate, 86 | predicates.TfidfNGramSearchPredicate, 87 | predicates.TfidfTextCanopyPredicate, 88 | predicates.TfidfTextSearchPredicate, 89 | ] 90 | 91 | 92 | class TextType(BaseStringType): 93 | type = "Text" 94 | 95 | _predicate_functions = base_predicates 96 | 97 | _index_predicates = [ 98 | predicates.TfidfTextCanopyPredicate, 99 | predicates.TfidfTextSearchPredicate, 100 | ] 101 | _index_thresholds = (0.2, 0.4, 0.6, 0.8) 102 | 103 | def __init__(self, field: str, corpus: Optional[Iterable[str]] = None, **kwargs): 104 | super().__init__(field, **kwargs) 105 | 106 | if corpus is None: 107 | corpus = [] 108 | 109 | 
self.comparator = CosineTextSimilarity(corpus) # type: ignore[assignment] 110 | -------------------------------------------------------------------------------- /docs/API-documentation.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | Library Documentation 3 | ===================== 4 | 5 | :class:`Dedupe` Objects 6 | ----------------------- 7 | .. autoclass:: dedupe.Dedupe 8 | 9 | .. code:: python 10 | 11 | # initialize from a defined set of fields 12 | variables = [ 13 | dedupe.variables.String("Site name"), 14 | dedupe.variables.String("Address"), 15 | dedupe.variables.String("Zip", has_missing=True), 16 | dedupe.variables.String("Phone", has_missing=True), 17 | ] 18 | deduper = dedupe.Dedupe(variables) 19 | 20 | .. automethod:: prepare_training 21 | .. automethod:: uncertain_pairs 22 | .. automethod:: mark_pairs 23 | .. automethod:: train 24 | .. automethod:: write_training 25 | .. automethod:: write_settings 26 | .. automethod:: cleanup_training 27 | .. automethod:: partition 28 | 29 | 30 | 31 | :class:`StaticDedupe` Objects 32 | ----------------------------- 33 | .. autoclass:: dedupe.StaticDedupe 34 | 35 | .. code:: python 36 | 37 | with open('learned_settings', 'rb') as f: 38 | matcher = StaticDedupe(f) 39 | 40 | .. automethod:: partition 41 | 42 | 43 | :class:`RecordLink` Objects 44 | --------------------------- 45 | .. autoclass:: dedupe.RecordLink 46 | 47 | .. code:: python 48 | 49 | # initialize from a defined set of fields 50 | variables = [ 51 | dedupe.variables.String("Site name"), 52 | dedupe.variables.String("Address"), 53 | dedupe.variables.String("Zip", has_missing=True), 54 | dedupe.variables.String("Phone", has_missing=True), 55 | ] 56 | deduper = dedupe.RecordLink(variables) 57 | 58 | .. automethod:: prepare_training 59 | .. automethod:: uncertain_pairs 60 | .. automethod:: mark_pairs 61 | .. automethod:: train 62 | .. automethod:: write_training 63 | .. automethod:: write_settings 64 | .. automethod:: cleanup_training 65 | .. automethod:: join 66 | 67 | 68 | :class:`StaticRecordLink` Objects 69 | --------------------------------- 70 | .. autoclass:: dedupe.StaticRecordLink 71 | 72 | .. code:: python 73 | 74 | with open('learned_settings', 'rb') as f: 75 | matcher = StaticRecordLink(f) 76 | 77 | .. automethod:: join 78 | 79 | 80 | :class:`Gazetteer` Objects 81 | -------------------------- 82 | .. autoclass:: dedupe.Gazetteer 83 | 84 | .. code:: python 85 | 86 | # initialize from a defined set of fields 87 | variables = [ 88 | dedupe.variables.String("Site name"), 89 | dedupe.variables.String("Address"), 90 | dedupe.variables.String("Zip", has_missing=True), 91 | dedupe.variables.String("Phone", has_missing=True), 92 | ] 93 | matcher = dedupe.Gazetteer(variables) 94 | 95 | .. automethod:: prepare_training 96 | .. automethod:: uncertain_pairs 97 | .. automethod:: mark_pairs 98 | .. automethod:: train 99 | .. automethod:: write_training 100 | .. automethod:: write_settings 101 | .. automethod:: cleanup_training 102 | .. automethod:: index 103 | .. automethod:: unindex 104 | .. automethod:: search 105 | 106 | 107 | :class:`StaticGazetteer` Objects 108 | -------------------------------- 109 | .. autoclass:: dedupe.StaticGazetteer 110 | 111 | .. code:: python 112 | 113 | with open('learned_settings', 'rb') as f: 114 | matcher = StaticGazetteer(f) 115 | 116 | .. automethod:: index 117 | .. automethod:: unindex 118 | .. automethod:: search 119 | .. automethod:: blocks 120 | .. automethod:: score 121 | .. 
automethod:: many_to_n 122 | 123 | Lower Level Classes and Methods 124 | ------------------------------- 125 | 126 | With the methods documented above, you can work with data into the 127 | millions of records. However, if are working with larger data you 128 | may not be able to load all your data into memory. You'll need 129 | to interact with some of the lower level classes and methods. 130 | 131 | .. seealso:: The `PostgreSQL `_ and `MySQL `_ examples use these lower level classes and methods. 132 | 133 | Dedupe and StaticDedupe 134 | *********************** 135 | 136 | .. currentmodule:: dedupe 137 | 138 | .. class:: Dedupe 139 | :noindex: 140 | 141 | .. attribute:: fingerprinter 142 | 143 | Instance of :class:`dedupe.blocking.Fingerprinter` class if 144 | the :func:`train` has been run, else `None`. 145 | 146 | .. automethod:: pairs 147 | .. automethod:: score 148 | .. automethod:: cluster 149 | 150 | .. class:: StaticDedupe 151 | :noindex: 152 | 153 | .. attribute:: fingerprinter 154 | 155 | Instance of :class:`dedupe.blocking.Fingerprinter` class 156 | 157 | .. method:: pairs(data) 158 | 159 | Same as :func:`dedupe.Dedupe.pairs` 160 | 161 | .. method:: score(pairs) 162 | 163 | Same as :func:`dedupe.Dedupe.score` 164 | 165 | .. method:: cluster(scores, threshold=0.5) 166 | 167 | Same as :func:`dedupe.Dedupe.cluster` 168 | 169 | 170 | RecordLink and StaticRecordLink 171 | ******************************* 172 | 173 | .. class:: RecordLink 174 | :noindex: 175 | 176 | .. attribute:: fingerprinter 177 | 178 | Instance of :class:`dedupe.blocking.Fingerprinter` class if 179 | the :func:`train` has been run, else `None`. 180 | 181 | .. automethod:: pairs 182 | .. automethod:: score 183 | .. automethod:: one_to_one 184 | .. automethod:: many_to_one 185 | 186 | .. class:: StaticRecordLink 187 | :noindex: 188 | 189 | .. attribute:: fingerprinter 190 | 191 | Instance of :class:`dedupe.blocking.Fingerprinter` class 192 | 193 | .. method:: pairs(data_1, data_2) 194 | 195 | Same as :func:`dedupe.RecordLink.pairs` 196 | 197 | .. method:: score(pairs) 198 | 199 | Same as :func:`dedupe.RecordLink.score` 200 | 201 | .. method:: one_to_one(scores, threshold=0.0) 202 | 203 | Same as :func:`dedupe.RecordLink.one_to_one` 204 | 205 | .. method:: many_to_one(scores, threshold=0.0) 206 | 207 | Same as :func:`dedupe.RecordLink.many_to_one` 208 | 209 | 210 | Gazetteer and StaticGazetteer 211 | ***************************** 212 | 213 | .. class:: Gazetteer 214 | :noindex: 215 | 216 | .. attribute:: fingerprinter 217 | 218 | Instance of :class:`dedupe.blocking.Fingerprinter` class if 219 | the :func:`train` has been run, else `None`. 220 | 221 | .. automethod:: blocks 222 | .. automethod:: score 223 | .. automethod:: many_to_n 224 | 225 | .. class:: StaticGazeteer 226 | :noindex: 227 | 228 | .. attribute:: fingerprinter 229 | 230 | Instance of :class:`dedupe.blocking.Fingerprinter` class 231 | 232 | .. method:: blocks(data) 233 | 234 | Same as :func:`dedupe.Gazetteer.blocks` 235 | 236 | .. method:: score(blocks) 237 | 238 | Same as :func:`dedupe.Gazetteer.score` 239 | 240 | .. method:: many_to_n(score_blocks, threshold=0.0, n_matches=1) 241 | 242 | Same as :func:`dedupe.Gazetteer.many_to_n` 243 | 244 | :class:`Fingerprinter` Objects 245 | ****************************** 246 | .. autoclass:: dedupe.blocking.Fingerprinter 247 | 248 | .. automethod:: __call__ 249 | .. autoattribute:: index_fields 250 | .. automethod:: index 251 | .. automethod:: unindex 252 | .. 
automethod:: reset_indices 253 | 254 | 255 | Convenience Functions 256 | --------------------- 257 | 258 | .. autofunction:: dedupe.console_label 259 | .. autofunction:: dedupe.training_data_dedupe 260 | .. autofunction:: dedupe.training_data_link 261 | .. autofunction:: dedupe.canonicalize 262 | .. autofunction:: dedupe.read_training 263 | .. autofunction:: dedupe.write_training 264 | -------------------------------------------------------------------------------- /docs/Bibliography.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Bibliography 3 | ============ 4 | 5 | - http://research.microsoft.com/apps/pubs/default.aspx?id=153478 6 | - http://cs.anu.edu.au/~Peter.Christen/data-matching-book-2012.html 7 | - http://www.umiacs.umd.edu/~getoor/Tutorials/ER\_VLDB2012.pdf 8 | 9 | New School 10 | ---------- 11 | - Steorts, Rebecca C., Rob Hall and Stephen Fienberg. "A Bayesian Approach to Record Linkage and De-duplication" December 2013. http://arxiv.org/abs/1312.4645 12 | 13 | Very beautiful work. Records are matched to latent individuals. O(N) 14 | running time. Unsupervised, but everything hinges on tuning 15 | hyperparameters. This work only contemplates categorical variables. 16 | 17 | 18 | To Read 19 | ------- 20 | - Domingos and Domingos Multi-relational record linkage. http://homes.cs.washington.edu/~pedrod/papers/mrdm04.pdf 21 | - An Entity Based Model for Coreference Resolution http://cs.tulane.edu/~aculotta/pubs/wick09entity.pdf 22 | 23 | -------------------------------------------------------------------------------- /docs/Examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | Dedupe is a library and not a stand-alone command line tool. To 6 | demonstrate its usage, we have come up with a few example recipes for 7 | different sized datasets for you to try out. 8 | 9 | You can view and download the source code for these examples in the 10 | `examples repo `__. 11 | 12 | Or, you can view annotated, "walkthrough" versions online: 13 | 14 | * `Small data deduplication `__ 15 | * `Record Linkage `__ 16 | * `Gazetter example `__ 17 | * `MySQL example `__ 18 | * `Postgres big dedupe example `__ 19 | * `Patent Author Disambiguation `__ -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = -W 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/dedupe.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/dedupe.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/dedupe" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/dedupe" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/Troubleshooting.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | Troubleshooting 3 | *************** 4 | 5 | So you've tried to apply dedupe to your dataset, but you're having some problems. 6 | Once you understand :ref:`how dedupe works `, and you've taken 7 | a look at some of the :doc:`examples`, then this troubleshoooting 8 | guide is your next step. 9 | 10 | Memory Considerations 11 | ===================== 12 | 13 | The top two likely memory bottlenecks, in order of likelihood, are: 14 | 15 | 1. 
Building the index predicates for blocking. If this is a problem, 16 | you can try turning off index blocking rules (and just use predicate 17 | blocking rules) by setting ``index_predicates=False`` in 18 | :meth:`dedupe.Dedupe.train`. 19 | 20 | 2. During `cluster()`. After scoring, we have to compare all the pairwise scores 21 | and build the clusters. dedupe runs a connected-components algorithm to 22 | determine where to begin the clustering, and this is currently done in 23 | memory using python dicts, so it can take substantial memory. 24 | There isn't currently a way to avoid this except to just use fewer records. 25 | 26 | Time Considerations 27 | =================== 28 | 29 | The slowest part of dedupe is probably blocking. A big part of this is building 30 | the index predicates, so the easiest fix is to set `index_predicates=False` 31 | in :meth:`dedupe.Dedupe.train`. 32 | 33 | Blocking could also be slow if dedupe has to apply too many or too complex 34 | blocking rules. You can fix this by reducing the number of blocking rules dedupe has 35 | to learn to cover all the true positives. Either reduce the `recall` parameter 36 | in :meth:`dedupe.Dedupe.train`, or, similarly, just use fewer positive examples 37 | during training. 38 | 39 | Note that you are making a choice here between speed and recall. The less blocking 40 | you do, the faster you go, but the more likely you are to fail to block true positives 41 | together. 42 | 43 | This part of dedupe is still single-threaded and could probably benefit 44 | from parallelization or other code strategies, 45 | although attempts so far haven't proved promising. 46 | 47 | 48 | Improving Accuracy 49 | ================== 50 | 51 | - Inspect your results and see if you can find any patterns: Does dedupe 52 | not seem to be paying enough attention to some detail? 53 | 54 | - Inspect the pairs given to you during :func:`dedupe.console_label`. These 55 | are pairs that dedupe is most confused about. Are these actually confusing 56 | pairs? If so, then great, dedupe is doing about as well as you could expect. 57 | If the pair is obviously a duplicate or obviously not a duplicate, then this 58 | means there is some signal that you should help dedupe to find. 59 | 60 | - Read up on the theory behind each of the variable types. Some of them 61 | are going to work better depending on the situation, so try to understand 62 | them as well as you can. 63 | 64 | - Add other variables. For instance, try treating a field as both a `String` 65 | and as a `Text` variable. If this doesn't cut it, add your own custom 66 | variable that emphasizes the feature that you're really looking for. 67 | For instance, if you have a list of last names, you might want "Smith" 68 | to score well with "Smith-Johnson" (someone got married?). None of the 69 | built-in variables will handle this well, so write your own comparator. 70 | 71 | - Add `Interaction` variables. For instance, if both the "last name" and 72 | "street address" fields score very well, then this is almost a guarantee 73 | that these two records refer to the same person. An `Interaction` variable 74 | can emphasize this to the learner. 75 | 76 | Extending Dedupe 77 | ================ 78 | 79 | If the built-in variables don't cut it, you can write your own variables. 80 | 81 | Take a look at the separately maintained `optional variables 82 | `__ 83 | for examples of how to write your own custom variable types with 84 | your custom comparators and predicates.
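As a starting point, here is a minimal, hypothetical sketch of the kind of custom
comparator described above -- one that lets "Smith" score well against
"Smith-Johnson". It uses the ``dedupe.variables.Custom`` variable, which accepts any
two-argument comparator; the token-overlap scoring itself is only an illustration,
not a recommended metric, and the field names are made up for the example.

.. code:: python

    import dedupe


    def surname_comparator(field_1, field_2):
        # Lower numbers mean "more similar", matching dedupe's
        # distance-style comparators.
        tokens_1 = set(field_1.lower().replace("-", " ").split())
        tokens_2 = set(field_2.lower().replace("-", " ").split())
        if not tokens_1 or not tokens_2:
            return 0.5  # treat empty fields as uninformative (an assumption)
        overlap = len(tokens_1 & tokens_2) / min(len(tokens_1), len(tokens_2))
        return 1.0 - overlap  # full overlap -> distance of 0


    variables = [
        dedupe.variables.Custom("last name", comparator=surname_comparator),
        dedupe.variables.String("address"),
    ]
    deduper = dedupe.Dedupe(variables)

With this comparator, ``surname_comparator("Smith", "Smith-Johnson")`` returns
``0.0``, so the learner can pick up on hyphenated-surname matches that a plain
string distance tends to penalize.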
-------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | 2 | /* ========== */ 3 | /* Typography */ 4 | /* ========== */ 5 | 6 | body { 7 | font-size: 16px; 8 | font-family: 'Montserrat', 'Arial', sans-serif; 9 | color: #555; 10 | } 11 | 12 | h1, h2, h3, h4, h5, h6 { 13 | font-family: 'Roboto', 'Arial', sans-serif; 14 | } 15 | 16 | h1, h1 a, h2, h2 a, h3, h3 a { 17 | color: #666; 18 | font-weight: bold; 19 | } 20 | 21 | /* Override some of the built-in RTD fonts */ 22 | h1, h2, h3, h4, h5, h6, legend, 23 | .rst-content .toctree-wrapper p.caption { 24 | font-family: 'Montserrat', 'Arial', sans-serif; 25 | } 26 | 27 | h1 a:hover, h2 a:hover, h3 a:hover { 28 | color: #333; 29 | } 30 | 31 | a, a:visited, .wy-menu-vertical a { color: #F26F80;} 32 | a:hover { color: #F26F80;} 33 | a:active, a:focus { outline: 0;} 34 | 35 | .navbar-nav > li > a, 36 | .navbar-brand { 37 | height: 40px; 38 | } 39 | 40 | .navbar-nav > li > a { 41 | padding-top: 10.5px; 42 | line-height: 20.57px; 43 | } 44 | 45 | .navbar { 46 | min-height: 40px; 47 | } 48 | 49 | .navbar-fixed-top .navbar-collapse { 50 | padding-right: 15px; 51 | } 52 | 53 | .navbar-default .navbar-nav > li > a { 54 | color: #555; 55 | } 56 | 57 | .navbar-logo { 58 | height: 15px; 59 | margin: 13px; 60 | } 61 | 62 | .nav > li { 63 | font-size: 0.9em; 64 | margin-right: 1.1px; 65 | } 66 | 67 | .navbar-default .navbar-nav > li > a:hover { 68 | color: #F26F80; 69 | background-color: transparent; 70 | } 71 | 72 | .navbar-brand { 73 | font-size: 17px; 74 | padding: 11px 15px; 75 | } 76 | 77 | /* ================ */ 78 | /* Navs and footers */ 79 | /* ================ */ 80 | 81 | .footer { 82 | background-color: #CAD8E7; 83 | padding: 20px 0; 84 | margin-top: 20px; 85 | font-size: 0.9em; 86 | } 87 | 88 | .footer a { 89 | color: #29446F; 90 | } 91 | 92 | /* Remove Bootstrap navbar stuff */ 93 | .navbar > .container .navbar-brand, 94 | .navbar > .container-fluid .navbar-brand { 95 | margin-left: 0; 96 | } 97 | 98 | /* Pad headers so that the navbar doesn't headbutt them */ 99 | /* (Thx to Chris Coyer on CSS-Tricks for the elegant sol'n) */ 100 | /* https://css-tricks.com/hash-tag-links-padding/ */ 101 | h1:before, h2:before, h3:before, 102 | h4:before, h4:before, h6:before { 103 | display: block; 104 | content: " "; 105 | margin-top: -55px; 106 | height: 55px; 107 | opacity: 0; 108 | pointer-events: none; 109 | } 110 | 111 | /* =================== */ 112 | /* RTD theme overrides */ 113 | /* =================== */ 114 | 115 | /* Remove sidebar background colors */ 116 | .wy-nav-side, .wy-side-nav-search { 117 | background: #fcfcfc; 118 | } 119 | 120 | /* Remove dark colors from the background */ 121 | .wy-body-for-nav { 122 | background: #fcfcfc; 123 | background-image: none; 124 | } 125 | 126 | /* Give the nav a little bit of padding */ 127 | .wy-nav-side { 128 | padding-left: 10px; 129 | top: 52px; 130 | } 131 | 132 | /* Restyle mobile nav */ 133 | .wy-nav-top { 134 | background-color: #f8f8f8; 135 | border-bottom: 1px solid #e7e7e7; 136 | } 137 | 138 | /* Give the hamburger menu styles like Bootstrap */ 139 | .wy-nav-top i { 140 | padding: 5px 10px; 141 | margin-top: 8px; 142 | margin-bottom: 8px; 143 | background-color: transparent; 144 | background-image: none; 145 | border: 1px solid #7777; 146 | border-radius: 4px; 147 | border-color: #ddd; 148 | color: #777; 149 | } 150 | 151 | .wy-nav-top i:hover { 152 | 
background-color: #ddd; 153 | } 154 | 155 | /* Pad the content just a lil */ 156 | .wy-nav-content { 157 | margin-top: 52px; 158 | margin-left: auto; 159 | margin-right: auto; 160 | } 161 | 162 | /* Sidebar text should always be dark */ 163 | .wy-side-nav-search, 164 | .wy-side-nav-search > a, 165 | .wy-side-nav-search .wy-dropdown > a, 166 | .wy-side-nav-search > div.version { 167 | color: #777777 168 | } 169 | 170 | /* Hover styles for the menu links */ 171 | .wy-menu > ul > li > a:hover, 172 | .wy-menu > ul > li > a:focus { 173 | background-color: #EFEFEF; 174 | color: #F26F80; 175 | font-weight: bold; 176 | } 177 | 178 | /*Make the search bar less garish */ 179 | .wy-side-nav-search input[type="text"] { 180 | border-color: #fcfcfc; /* hacky override for the gross border */ 181 | border-radius: 5px; 182 | } 183 | 184 | /* Right-align the sidebar text */ 185 | .wy-menu-vertical { 186 | text-align: right; 187 | } 188 | 189 | /* Version box in the lower left-hand corner */ 190 | .rst-versions { 191 | background: #ababab; 192 | border-top: none; 193 | } 194 | 195 | .rst-versions .rst-current-version { 196 | background-color: #7d7d7d; 197 | color: #ffffff; 198 | } 199 | 200 | .rst-versions .rst-other-versions { 201 | color: #4e4e4e; 202 | } 203 | 204 | /* ============= */ 205 | /* Media queries */ 206 | /* ============= */ 207 | 208 | /* Remove Dedupe.io navbar at small breakpoints */ 209 | @media (max-width: 768px) { 210 | 211 | #dedupe-nav { 212 | display: none; 213 | } 214 | 215 | .wy-nav-side { 216 | top: 0; 217 | } 218 | 219 | .wy-nav-content { 220 | margin-top: 0; 221 | } 222 | 223 | .wy-nav-top a { 224 | text-decoration: none; 225 | } 226 | } 227 | 228 | /* Get rid of mysterious dark void that appears on big screens */ 229 | @media screen and (min-width: 1400px) { 230 | .wy-nav-content-wrap { 231 | background: #fcfcfc; 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /docs/_static/images/dedupeio-logo-reversed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/docs/_static/images/dedupeio-logo-reversed.png -------------------------------------------------------------------------------- /docs/_static/images/dedupeio-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/docs/_static/images/dedupeio-logo.png -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {% block extrahead %} 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 18 | {% endblock %} 19 | 20 | {% block extrabody %} 21 | 22 | 45 | {% endblock %} 46 | -------------------------------------------------------------------------------- /docs/how-it-works/Choosing-a-good-threshold.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Choosing a Good Threshold 3 | ========================= 4 | 5 | Dedupe can predict the *probability* that a pair of records are 6 | duplicates. So, how should we decide that a pair of records really are 7 | duplicates? 8 | 9 | To answer this question we need to know something about Precision and 10 | Recall. 
Why don't you check out the `Wikipedia 11 | page `__ and come 12 | back here. 13 | 14 | There's always a trade-off between precision and recall. That's okay. As 15 | long as we know how much we care about precision vs. recall, `we can 16 | define an F-score `__ that will 17 | let us find a threshold for deciding when records are duplicates *that 18 | is optimal for our priorities*. 19 | 20 | Typically, the way that we find that threshold is by looking at the true 21 | precision and recall of some data where we know the true labels - 22 | where we know the real duplicates. However, we will only get a good 23 | threshold if the labeled examples are representative of the data we are 24 | trying to classify. 25 | 26 | So here's the problem - the labeled examples that we make with Dedupe 27 | are not at all representative, and that's by design. In the active 28 | learning step, we are not trying to find the most representative data 29 | examples. We're trying to find the ones that will teach us the most. 30 | 31 | The approach we take here is to take a random sample of blocked data, 32 | and then calculate the pairwise probability that records will be 33 | duplicates within each block. From these probabilities we can calculate 34 | the expected number of duplicate and distinct pairs, so we can 35 | calculate the expected precision and recall. 36 | 37 | -------------------------------------------------------------------------------- /docs/how-it-works/Grouping-duplicates.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Grouping Duplicates 3 | =================== 4 | 5 | Once we have calculated the probability that pairs of records are 6 | duplicates or not, we still have a kind of thorny problem because it's 7 | not just pairs of records that can be duplicates. Three, four, or thousands 8 | of records could all refer to the same entity (person, organization, ice 9 | cream flavor, etc.) but we only have pairwise measures. 10 | 11 | Let's say we have measured the following pairwise probabilities between 12 | records A, B, and C. 13 | 14 | :: 15 | 16 | A -- 0.6 -- B -- 0.6 -- C 17 | 18 | The probability that A and B are duplicates is 60%, the probability that 19 | B and C are duplicates is 60%, but what is the probability that A and C 20 | are duplicates? 21 | 22 | Let's say that everything is going perfectly and we can say there's a 23 | 36% probability that A and C are duplicates. We'd probably want to say 24 | that A and C should not be considered duplicates. 25 | 26 | Okay, then should we say that A and B are a duplicate pair and C is a 27 | distinct record, or that A is the distinct record and that B and C are 28 | duplicates? 29 | 30 | Well... this is a thorny problem, and we tried solving it a few 31 | different ways. In the end, we found that **hierarchical clustering with 32 | centroid linkage** gave us the best results. What this algorithm does is 33 | say that all points within some distance of the centroid are part of the 34 | same group. In this example, B would be the centroid - and A, B, and C 35 | would all be put in the same group. 36 | 37 | Unfortunately, a more principled answer does not exist because the 38 | estimated pairwise probabilities are not transitive. 39 | 40 | Clustering the groups depends on us setting a threshold for group 41 | membership -- the distance of the points to the centroid.
Depending on how 42 | we choose that threshold, we'll get very different groups, and we will 43 | want to choose this threshold wisely. 44 | 45 | In recent years, there has been some very exciting research that 46 | solves the problem of turning pairwise distances into clusters by 47 | avoiding making pairwise comparisons altogether. Unfortunately, these 48 | developments are not compatible with Dedupe's pairwise approach. See 49 | `Michael Wick, et al., 2012. "A Discriminative Hierarchical Model for Fast Coreference at Large Scale" `__ 50 | and `Rebecca C. Steorts, et al., 2013. "A Bayesian Approach to Graphical Record Linkage and De-duplication" `__. 51 | 52 | -------------------------------------------------------------------------------- /docs/how-it-works/How-it-works.rst: -------------------------------------------------------------------------------- 1 | .. _how-it-works-label: 2 | 3 | ############ 4 | How it works 5 | ############ 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | Matching-records 11 | Making-smart-comparisons 12 | Grouping-duplicates 13 | Choosing-a-good-threshold 14 | Special-Cases 15 | 16 | 17 | **Problems with real-world data** 18 | 19 | Journalists, academics, and businesses work hard to get big masses of 20 | data to learn about what people or organizations are doing. 21 | Unfortunately, once we get the data, we often can't answer our questions 22 | because we can't tell who is who. 23 | 24 | In much real-world data, we do not have a way of absolutely deciding 25 | whether two records, say ``John Smith`` and ``J. Smith``, are referring 26 | to the same person. If these were records of campaign contribution data, 27 | did a ``John Smith`` give two donations or did ``John Smith`` and maybe 28 | ``Jane Smith`` give one contribution apiece? 29 | 30 | People are pretty good at making these calls, if they have enough 31 | information. For example, I would be pretty confident that the following 32 | two records are about the same person. 33 | 34 | :: 35 | 36 | first name | last name | address | phone | 37 | -------------------------------------------------------------- 38 | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 39 | Robert | Roberts | 1600 Pensylvannia Avenue | | 40 | 41 | If we have to decide which records in our data are about the same person 42 | or organization, then we could just go through by hand, compare every 43 | record, and decide which records are about the same entity. 44 | 45 | This is very, very boring and can take a **long** time. Dedupe is a 46 | software library that can make these decisions about whether records are 47 | about the same thing about as well as a person can, but quickly. 48 | -------------------------------------------------------------------------------- /docs/how-it-works/Making-smart-comparisons.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Making Smart Comparisons 3 | ======================== 4 | 5 | Say we have a magic function that takes in a pair of records and always 6 | returns ``False`` if the pair of records are distinct and ``True`` if the 7 | pair of records refer to the same person or organization. 8 | 9 | Let's say that this function was pretty slow. It always took one second 10 | to return. 11 | 12 | How long would it take to de-duplicate a thousand records? 13 | 14 | Within a dataset of a thousand records, there are :math:`\frac{1{,}000 15 | \times 999}{2} = 499{,}500` unique pairs of records.
If we 16 | compared all of them using our magic function, it would take six days. 17 | 18 | But one second is a **long** time. Let's say we sped it up so that we 19 | can make 10,000 comparisons per second. Now we can get through our 20 | thousand-record-long dataset in less than a minute. 21 | 22 | Feeling good about our super-fast comparison function, let's take on a 23 | dataset of 100,000 records. Now there are 24 | :math:`\frac{100{,}000 \times 99{,}999}{2} = 4{,}999{,}950{,}000` unique possible 25 | pairs. If we compare all of them with our super-fast comparison function, 26 | it will take six days again. 27 | 28 | If we want to work with moderately sized data, we have to find a way of 29 | making fewer comparisons. 30 | 31 | Duplicates are rare 32 | ------------------- 33 | 34 | In real-world data, nearly all possible pairs of records are not 35 | duplicates. 36 | 37 | In this four-record example below, only two pairs of records are 38 | duplicates--(1, 2) and (3, 4), while there are four unique 39 | pairs of records that are not duplicates--(1,3), (1,4), (2,3), and (2,4). 40 | Typically, as the size of the dataset grows, the fraction of pairs of records 41 | that are duplicates gets very small very quickly. 42 | 43 | +-------------+-----------+--------------------------+--------------+----------+ 44 | | first name | last name | address | phone | record_id| 45 | +=============+===========+==========================+==============+==========+ 46 | | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 1 | 47 | +-------------+-----------+--------------------------+--------------+----------+ 48 | | Robert | Roberts | 1600 Pensylvannia Avenue | | 2 | 49 | +-------------+-----------+--------------------------+--------------+----------+ 50 | | steve | Jones | 123 Cowabunga Lane | 555-0000 | 3 | 51 | +-------------+-----------+--------------------------+--------------+----------+ 52 | | Stephen | Janes | 123 Cawabunga Ln | 444-555-0000 | 4 | 53 | +-------------+-----------+--------------------------+--------------+----------+ 54 | 55 | 56 | If we could only compare records that were true duplicates, we wouldn't 57 | run into the explosion of comparisons. Of course, if we already knew where 58 | the true duplicates were, we wouldn't need to compare any individual 59 | records. Unfortunately we don't, but we do quite well if we just compare 60 | records that are somewhat similar. 61 | 62 | Blocking 63 | -------- 64 | 65 | Duplicate records almost always share *something* in common. If we 66 | define groups of data that share something and only compare the records 67 | in that group, or *block*, then we can dramatically reduce the number of 68 | comparisons we will make. If we define these blocks well, then we will make 69 | very few comparisons and still have confidence that we will compare records 70 | that truly are duplicates. 71 | 72 | This task is called blocking, and we approach it in two ways: predicate 73 | blocks and index blocks. 74 | 75 | Predicate blocks 76 | ~~~~~~~~~~~~~~~~ 77 | 78 | A predicate block is a bundle of records that all share a feature -- a 79 | feature produced by a simple function called a predicate. 80 | 81 | Predicate functions take in a record field, and output a set of features 82 | for that field. These features could be "the first 3 characters of the 83 | field," "every word in the field," and so on. Records that share the 84 | same feature become part of a block. 85 | 86 | Let's take an example.
Let's use a "first 3 character" predicate on 87 | the **address field** below: 88 | 89 | +-------------+-----------+--------------------------+--------------+----------+ 90 | | first name | last name | address | phone | record_id| 91 | +=============+===========+==========================+==============+==========+ 92 | | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 1 | 93 | +-------------+-----------+--------------------------+--------------+----------+ 94 | | Robert | Roberts | 1600 Pensylvannia Avenue | | 2 | 95 | +-------------+-----------+--------------------------+--------------+----------+ 96 | | steve | Jones | 123 Cowabunga Lane | 555-0000 | 3 | 97 | +-------------+-----------+--------------------------+--------------+----------+ 98 | | Stephen | Janes | 123 Cawabunga Ln | 444-555-0000 | 4 | 99 | +-------------+-----------+--------------------------+--------------+----------+ 100 | 101 | That leaves us with two blocks - the '160' block, which contains records 102 | 1 and 2, and the '123' block, which contains records 3 and 4. 103 | 104 | :: 105 | 106 | {'160' : (1,2) # tuple of record_ids 107 | '123' : (3,4) 108 | } 109 | 110 | Again, we're applying the "first three characters" predicate function to the 111 | address field in our data; the function outputs the following features -- 112 | 160, 160, 123, 123 -- and then we group together the records that have 113 | identical features into "blocks". 114 | 115 | Other simple predicates Dedupe uses include: 116 | 117 | * whole field 118 | * token field 119 | * common integer 120 | * same three char start 121 | * same five char start 122 | * same seven char start 123 | * near integers 124 | * common four gram 125 | * common six gram 126 | 127 | .. _index-blocks-label: 128 | 129 | Index Blocks 130 | ~~~~~~~~~~~~ 131 | 132 | Dedupe also uses another way of producing blocks: searching an 133 | index. First, we create a special data structure, like an `inverted 134 | index `__, that lets us 135 | quickly find records similar to target records. We populate the index 136 | with all the unique values that appear in the field. 137 | 138 | When blocking, for each record we search the index for values similar to 139 | the record's field. We block together records that share at least one 140 | common search result. 141 | 142 | Index predicates require building an index from all the unique values 143 | in a field. This can take substantial time and memory. Index 144 | predicates are also usually slower than predicate blocking. 145 | 146 | Combining blocking rules 147 | ------------------------ 148 | 149 | If it's good to define blocks of records that share the same 'city' 150 | field, it might be even better to block records that share *both* the 151 | 'city' field *and* the 'zip code' field. Dedupe tries these cross-field 152 | blocks. These combination blocks are called disjunctive blocks. 153 | 154 | Learning good blocking rules for given data 155 | ------------------------------------------- 156 | 157 | Dedupe comes with a long set of predicates, and when these are 158 | combined Dedupe can have hundreds of possible blocking rules to choose 159 | from. We will want to find a small set of these rules that covers 160 | every labeled duplicate pair but minimizes the total number of pairs 161 | dedupe will have to compare. 162 | 163 | We approach this problem by using greedy algorithms, particularly 164 | `Chvatal's Greedy Set-Cover 165 | algorithm `__.
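To make that last step concrete, here is a simplified sketch (not dedupe's actual
rule learner) of a greedy set-cover pass: it keeps picking whichever candidate
blocking rule covers the most still-uncovered labeled duplicate pairs. The rule
names and pair sets below are invented purely for illustration.

.. code:: python

    def greedy_cover(candidate_rules, duplicate_pairs):
        # candidate_rules maps a rule name to the set of labeled
        # duplicate pairs that the rule blocks together.
        uncovered = set(duplicate_pairs)
        chosen = []
        while uncovered:
            # Pick the rule that covers the most remaining pairs.
            best_rule = max(
                candidate_rules,
                key=lambda rule: len(candidate_rules[rule] & uncovered),
            )
            newly_covered = candidate_rules[best_rule] & uncovered
            if not newly_covered:
                break  # remaining pairs can't be covered by any rule
            chosen.append(best_rule)
            uncovered -= newly_covered
        return chosen


    rules = {
        "first 3 chars of address": {(1, 2), (3, 4)},
        "whole name field": {(1, 2)},
        "common integer in phone": {(3, 4), (5, 6)},
    }
    print(greedy_cover(rules, {(1, 2), (3, 4), (5, 6)}))
    # ['first 3 chars of address', 'common integer in phone']

The real learner also weighs how many *total* pairs each rule would generate, so
that the chosen rules cover the labeled duplicates without producing an enormous
number of comparisons.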
166 | 167 | -------------------------------------------------------------------------------- /docs/how-it-works/Matching-records.rst: -------------------------------------------------------------------------------- 1 | **************** 2 | Matching Records 3 | **************** 4 | 5 | If you look at the following two records, you might think it's pretty 6 | clear that they are about the same person. 7 | 8 | :: 9 | 10 | first name | last name | address | phone | 11 | -------------------------------------------------------------- 12 | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 13 | Robert | Roberts | 1600 Pensylvannia Avenue | | 14 | 15 | However, I bet it would be pretty hard for you to explicitly write down 16 | all the reasons why you think these records are about the same Mr. 17 | Roberts. 18 | 19 | Record similarity 20 | ----------------- 21 | 22 | One way that people have approached this problem is by saying that 23 | records that are more similar are more likely to be duplicates. That's a 24 | good first step, but then we have to precisely define what we mean for 25 | two records to be similar. 26 | 27 | The default way that we do this in Dedupe is to use what's called a 28 | string metric. A string metric is a way of taking two strings and 29 | returning a number that is low if the strings are similar and high if 30 | they are dissimilar. One famous string metric is called the Hamming 31 | distance. It counts the number of substitutions that must be made to 32 | turn one string into another. For example, ``roberts`` and ``Roberts`` 33 | would have a Hamming distance of 1 because we have to substitute ``r`` for 34 | ``R`` in order to turn ``roberts`` into ``Roberts``. 35 | 36 | There are lots of different string metrics, and we actually use a metric 37 | called the `Affine Gap Distance `__, which is a 38 | variation on the Hamming distance. 39 | 40 | Record by record or field by field 41 | ---------------------------------- 42 | 43 | When we are calculating whether two records are similar, we could treat 44 | each record as if it were one long string. 45 | 46 | :: 47 | 48 | record_distance = string_distance('bob roberts 1600 pennsylvania ave. 555-0123', 49 | 'Robert Roberts 1600 Pensylvannia Avenue') 50 | 51 | Alternatively, we could compare field by field: 52 | 53 | :: 54 | 55 | record_distance = (string_distance('bob', 'Robert') 56 | + string_distance('roberts', 'Roberts') 57 | + string_distance('1600 pennsylvania ave.', '1600 Pensylvannia Avenue') 58 | + string_distance('555-0123', '')) 59 | 60 | The major advantage of comparing field by field is that we don't have to 61 | treat each field's string distance equally. Maybe we think that it's really 62 | important that the last names and addresses are similar but it's not as 63 | important that the first names and phone numbers are close. We can express 64 | that importance with numeric weights, e.g. 65 | 66 | :: 67 | 68 | record_distance = (0.5 * string_distance('bob', 'Robert') 69 | + 2.0 * string_distance('roberts', 'Roberts') 70 | + 2.0 * string_distance('1600 pennsylvania ave.', '1600 Pensylvannia Avenue') 71 | + 0.5 * string_distance('555-0123', '')) 72 | 73 | Setting weights and making decisions 74 | ------------------------------------ 75 | 76 | Say we set our record\_distance to be this weighted sum of field 77 | distances, just as we had above. Let's say we calculated the 78 | record\_distance and we found that it was the beautiful number **8**. 79 | 80 | That number, by itself, is not that helpful.
Ultimately, we are trying 81 | to decide whether a pair of records are duplicates, and I'm not sure 82 | what decision I should make if I see an 8. Does an 8 mean that the pair 83 | of records are really similar or really far apart, likely or unlikely to 84 | be duplicates? We'd like to define the record distances so that we can 85 | look at the number and decide whether it's a duplicate. 86 | 87 | Also, I really would rather not have to set the weights by hand every 88 | time. It can be very tricky to know which fields are going to matter and 89 | even if I know that some fields are more important, I'm not sure how to 90 | quantify it (is it 2 times more important or 1.3 times)? 91 | 92 | Fortunately, we can solve both problems with a technique called 93 | regularized logistic regression. If we supply pairs of records that we 94 | label as either being duplicates or distinct, then Dedupe will learn a 95 | set of weights such that the record distance can easily be transformed 96 | into our best estimate of the probability that a pair of records are 97 | duplicates. 98 | 99 | Once we have learned these good weights, we want to use them to find 100 | which records are duplicates. But it turns out that doing this the naive 101 | way will usually not work, and :doc:`we'll have to do something 102 | smarter `. 103 | 104 | Active learning 105 | ~~~~~~~~~~~~~~~ 106 | 107 | In order to learn those weights, Dedupe needs example pairs with labels. 108 | Most of the time, we will need people to supply those labels. 109 | 110 | But the whole point of Dedupe is to save people's time, and that 111 | includes making good use of your labeling time, so we use an approach 112 | called Active Learning. 113 | 114 | Basically, Dedupe keeps track of a bunch of unlabeled pairs and whether 115 | 116 | 1. the current learned blocking rules would cover the pairs 117 | 2. the current learned classifier would predict that the pairs are 118 | duplicates or are distinct 119 | 120 | We maintain a set of the pairs where there is disagreement: that is, 121 | pairs which the classifier believes are duplicates but which are not 122 | covered by the current blocking rules, and the pairs which the 123 | classifier believes are distinct but which are blocked together. 124 | 125 | Dedupe picks, at random from this disagreement set, a pair of records 126 | and asks the user to decide. Once it gets this label, it relearns the 127 | weights and blocking rules. We then recalculate the disagreement set. 128 | 129 | Other field distances 130 | ~~~~~~~~~~~~~~~~~~~~~ 131 | 132 | We have implemented a number of field distance measures. See :doc:`the 133 | details about variables `. 134 | 135 | 136 | -------------------------------------------------------------------------------- /docs/how-it-works/Special-Cases.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Special Cases 3 | ============= 4 | 5 | The process we have been describing is for the most general case--when 6 | you have a dataset where an arbitrary number of records can all refer to 7 | the same entity. 8 | 9 | There are certain special cases where we can make more assumptions about 10 | how records can be linked, which, if true, make the problem much simpler. 11 | 12 | One important case we call Record Linkage. Say you have two datasets and 13 | you want to find the records in each dataset that refer to the same 14 | thing.
If you can assume that each dataset, individually, is unique, 15 | then this puts a big constraint on how records can match. If this 16 | uniqueness assumption holds, then (A) two records can only refer to the 17 | same entity if they are from different datasets and (B) no other record 18 | can match either of those two records. 19 | 20 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. dedupe documentation master file, created by 2 | sphinx-quickstart on Thu Apr 10 11:27:59 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ================ 7 | Dedupe |release| 8 | ================ 9 | 10 | *dedupe is a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data.* 11 | 12 | If you're looking for the documentation for the Dedupe.io Web API, you can find that here: https://apidocs.dedupe.io/ 13 | 14 | **dedupe** will help you: 15 | 16 | * **remove duplicate entries** from a spreadsheet of names and addresses 17 | * **link a list** with customer information to another with order history, even without unique customer id's 18 | * take a database of campaign contributions and **figure out which ones were made by the same person**, even if the names were entered slightly differently for each record 19 | 20 | dedupe takes in human training data and comes up with the best rules for your dataset to quickly and automatically find similar records, even with very large databases. 21 | 22 | Important links 23 | =============== 24 | 25 | * Documentation: https://docs.dedupe.io/ 26 | * Repository: https://github.com/dedupeio/dedupe 27 | * Issues: https://github.com/dedupeio/dedupe/issues 28 | * Mailing list: https://groups.google.com/forum/#!forum/open-source-deduplication 29 | * Examples: https://github.com/dedupeio/dedupe-examples 30 | * IRC channel, `#dedupe on irc.freenode.net `__ 31 | 32 | Tools built with dedupe 33 | ======================= 34 | 35 | `Dedupe.io `__ 36 | A full service web service powered by dedupe for de-duplicating and find matches in your messy data. It provides an easy-to-use interface and provides cluster review and automation, as well as advanced record linkage, continuous matching and API integrations. `See the product page `__ and the `launch blog post `__. 37 | 38 | `csvdedupe `__ 39 | Command line tool for de-duplicating and `linking `__ CSV files. Read about it on `Source Knight-Mozilla OpenNews `__. 40 | 41 | Contents 42 | ======== 43 | 44 | .. toctree:: 45 | :maxdepth: 1 46 | 47 | API-documentation 48 | Variable-definition 49 | Examples 50 | how-it-works/How-it-works 51 | Troubleshooting 52 | Bibliography 53 | 54 | 55 | Features 56 | ======== 57 | 58 | * **machine learning** - reads in human labeled data to automatically create optimum weights and blocking rules 59 | * **runs on a laptop** - makes intelligent comparisons so you don't need a powerful server to run it 60 | * **built as a library** - so it can be integrated in to your applications or import scripts 61 | * **extensible** - supports adding custom data types, string comparators and blocking rules 62 | * **open source** - anyone can use, modify or add to it 63 | 64 | Installation 65 | ============ 66 | 67 | .. 
code-block:: bash 68 | 69 | pip install dedupe 70 | 71 | Errors / Bugs 72 | ============= 73 | 74 | If something is not behaving intuitively, it is a bug, and should be 75 | reported. `Report it here `__ 76 | 77 | Contributing to dedupe 78 | ====================== 79 | 80 | Check out `dedupe `__ 81 | repo for how to contribute to the library. 82 | 83 | Check out `dedupe-examples 84 | `__ for how to contribute 85 | a useful example of using dedupe. 86 | 87 | Citing dedupe 88 | ============= 89 | 90 | If you use Dedupe in an academic work, please give this citation: 91 | 92 | Gregg, Forest and Derek Eder. 2015. Dedupe. https://github.com/dedupeio/dedupe. 93 | 94 | 95 | Indices and tables 96 | ================== 97 | 98 | * :ref:`genindex` 99 | 100 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.3.0 2 | sphinx-autodoc-typehints 3 | sphinx-rtd-theme>=0.5.1 4 | sphinxcontrib-htmlhelp 5 | sphinxcontrib-jsmath 6 | sphinxcontrib-serializinghtml 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dedupe" 3 | description = "A python library for accurate and scaleable data deduplication and entity-resolution" 4 | version = "3.0.3" 5 | readme = "README.md" 6 | requires-python = ">=3.8" 7 | license = {file = "LICENSE"} 8 | keywords = [] 9 | authors = [ 10 | { name = "Forest Gregg", email = "fgregg@datamade.us" }, 11 | ] 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Science/Research", 16 | "License :: OSI Approved :: MIT License", 17 | "Natural Language :: English", 18 | "Operating System :: MacOS :: MacOS X", 19 | "Operating System :: Microsoft :: Windows", 20 | "Operating System :: POSIX", 21 | "Programming Language :: Cython", 22 | "Programming Language :: Python :: 3", 23 | "Topic :: Software Development :: Libraries :: Python Modules", 24 | "Topic :: Scientific/Engineering", 25 | "Topic :: Scientific/Engineering :: Information Analysis", 26 | ] 27 | dependencies = [ 28 | "scikit-learn", 29 | "affinegap>=1.3", 30 | "categorical-distance>=1.9", 31 | "numpy>=1.20", 32 | "doublemetaphone", 33 | "highered>=0.2.0", 34 | "simplecosine>=1.2", 35 | "haversine>=0.4.1", 36 | "BTrees>=4.1.4", 37 | "zope.index", 38 | "dedupe_Levenshtein_search", 39 | ] 40 | 41 | [project.urls] 42 | Homepage = "https://github.com/dedupeio/dedupe" 43 | Issues = "https://github.com/dedupeio/dedupe/issues" 44 | Documentation = "https://docs.dedupe.io/en/latest/" 45 | Examples = "https://github.com/dedupeio/dedupe-examples" 46 | Twitter = "https://twitter.com/DedupeIo" 47 | Changelog = "https://github.com/dedupeio/dedupe/blob/main/CHANGELOG.md" 48 | MailingList = "https://groups.google.com/forum/#!forum/open-source-deduplication" 49 | 50 | 51 | [build-system] 52 | requires = ["setuptools", 53 | "wheel", 54 | "cython"] 55 | build-backend = "setuptools.build_meta" 56 | 57 | [tool.setuptools] 58 | packages = ["dedupe", "dedupe.variables"] 59 | 60 | [tool.setuptools.package-data] 61 | dedupe = ["py.typed"] 62 | 63 | [tool.mypy] 64 | plugins = "numpy.typing.mypy_plugin" 65 | files = ["dedupe"] 66 | show_error_codes = true 67 | ignore_missing_imports = true 68 | check_untyped_defs = true 69 | implicit_reexport = false 70 | 71 | [tool.pytest.ini_options] 
72 | minversion = "7.1" 73 | addopts = "--cov dedupe --cov-report xml" 74 | testpaths = ["tests", "dedupe"] 75 | 76 | [tool.isort] 77 | profile = "black" 78 | src_paths = ["dedupe", "tests", "benchmarks"] 79 | 80 | [tool.coverage.run] 81 | omit = ["dedupe/backport.py"] 82 | source = ["dedupe"] 83 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asv 2 | black 3 | coverage[toml] 4 | coveralls 5 | flake8 6 | mock 7 | mypy 8 | pytest 9 | pytest-cov 10 | virtualenv 11 | isort 12 | pre-commit 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import Extension, setup 3 | except ImportError: 4 | raise ImportError( 5 | "setuptools module required, please go to https://pypi.python.org/pypi/setuptools and follow the instructions for installing setuptools" 6 | ) 7 | 8 | from Cython.Build import cythonize 9 | 10 | setup( 11 | ext_modules=cythonize([Extension("dedupe.cpredicates", ["dedupe/cpredicates.pyx"])]) 12 | ) 13 | -------------------------------------------------------------------------------- /tests/duplicateCluster_memory_case.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import dedupe.core 4 | import dedupe.dedupe # noqa: F401 5 | 6 | # simulated_candidates = (((1, {'name': 'asdffdsa'}), (2, {'name': 'fdsaasdf'})) 7 | # for _ in xrange(10**6)) 8 | 9 | # data_model = {"fields": {"name": {"type": "String", "weight": -1.0}}, 10 | # "bias": 1.0} 11 | # threshold = 0 12 | 13 | # dupes = dedupe.core.scoreDuplicates(simulated_candidates, 14 | # data_model, 15 | # 0) 16 | 17 | # simulated_candidates = (((1, {'name': 'asdffdsa'}), (2, {'name': 'fdsaasdf'})) 18 | # for _ in xrange(10**7)) 19 | 20 | 21 | # deduper = dedupe.dedupe.Dedupe({"name": {"type": "String", "weight": -1.0}}) 22 | # clusters = deduper.duplicateClusters(simulated_candidates, 0, 0) 23 | 24 | 25 | def candidates_gen(): 26 | candidate_set = set() 27 | for _ in range(10**5): 28 | block = [((random.randint(0, 1000), "a"), (random.randint(0, 1000), "b"))] 29 | for candidate in block: 30 | pair_ids = (candidate[0][0], candidate[1][0]) 31 | if pair_ids not in candidate_set: 32 | yield candidate 33 | candidate_set.add(pair_ids) 34 | del candidate_set 35 | 36 | 37 | @profile # noqa: F821 38 | def generator_test(): 39 | a = sum(candidate[0][0] for candidate in candidates_gen()) 40 | print(a) 41 | 42 | 43 | generator_test() 44 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unittest 3 | import warnings 4 | from collections import OrderedDict 5 | 6 | import dedupe.api 7 | 8 | 9 | def icfi(x): 10 | return list(itertools.chain.from_iterable(x)) 11 | 12 | 13 | DATA_SAMPLE = [ 14 | ({"age": "27", "name": "Kyle"}, {"age": "50", "name": "Bob"}), 15 | ({"age": "27", "name": "Kyle"}, {"age": "35", "name": "William"}), 16 | ({"age": "10", "name": "Sue"}, {"age": "35", "name": "William"}), 17 | ({"age": "27", "name": "Kyle"}, {"age": "20", "name": "Jimmy"}), 18 | ({"age": "75", "name": "Charlie"}, {"age": "21", "name": "Jimbo"}), 19 | ] 20 | 21 | data_dict = OrderedDict( 22 | ( 23 | (0, {"name": "Bob", "age": "51"}), 24 | (1, {"name": 
"Linda", "age": "50"}), 25 | (2, {"name": "Gene", "age": "12"}), 26 | (3, {"name": "Tina", "age": "15"}), 27 | (4, {"name": "Bob B.", "age": "51"}), 28 | (5, {"name": "bob belcher", "age": "51"}), 29 | (6, {"name": "linda ", "age": "50"}), 30 | ) 31 | ) 32 | 33 | data_dict_2 = OrderedDict( 34 | ( 35 | (7, {"name": "BOB", "age": "51"}), 36 | (8, {"name": "LINDA", "age": "50"}), 37 | (9, {"name": "GENE", "age": "12"}), 38 | (10, {"name": "TINA", "age": "15"}), 39 | (11, {"name": "BOB B.", "age": "51"}), 40 | (12, {"name": "BOB BELCHER", "age": "51"}), 41 | (13, {"name": "LINDA ", "age": "50"}), 42 | ) 43 | ) 44 | 45 | 46 | class ActiveMatch(unittest.TestCase): 47 | def setUp(self): 48 | self.field_definition = [ 49 | dedupe.variables.String("name"), 50 | dedupe.variables.String("age"), 51 | ] 52 | 53 | def test_initialize_fields(self): 54 | self.assertRaises(TypeError, dedupe.api.ActiveMatching) 55 | 56 | with self.assertRaises(ValueError): 57 | dedupe.api.ActiveMatching( 58 | [], 59 | ) 60 | 61 | with self.assertRaises(ValueError): 62 | dedupe.api.ActiveMatching([{"field": "name", "type": "String"}]) 63 | 64 | with self.assertRaises(ValueError): 65 | dedupe.api.ActiveMatching( 66 | [dedupe.variables.Custom("name", comparator=lambda x, y: 1)], 67 | ) 68 | 69 | with self.assertRaises(ValueError): 70 | dedupe.api.ActiveMatching( 71 | [ 72 | dedupe.variables.Custom("name", comparator=lambda x, y: 1), 73 | dedupe.variables.Custom("age", comparator=lambda x, y: 1), 74 | ], 75 | ) 76 | 77 | dedupe.api.ActiveMatching( 78 | [ 79 | dedupe.variables.Custom("name", comparator=lambda x, y: 1), 80 | dedupe.variables.String("age"), 81 | ], 82 | ) 83 | 84 | def test_check_record(self): 85 | matcher = dedupe.api.ActiveMatching(self.field_definition) 86 | 87 | self.assertRaises(ValueError, matcher._checkRecordPair, ()) 88 | self.assertRaises(ValueError, matcher._checkRecordPair, (1, 2)) 89 | self.assertRaises(ValueError, matcher._checkRecordPair, (1, 2, 3)) 90 | self.assertRaises(ValueError, matcher._checkRecordPair, ({}, {})) 91 | 92 | matcher._checkRecordPair( 93 | ({"name": "Frank", "age": "72"}, {"name": "Bob", "age": "27"}) 94 | ) 95 | 96 | def test_markPair(self): 97 | from collections import OrderedDict 98 | 99 | good_training_pairs = OrderedDict( 100 | (("match", DATA_SAMPLE[3:5]), ("distinct", DATA_SAMPLE[0:3])) 101 | ) 102 | bad_training_pairs = {"non_dupes": DATA_SAMPLE[0:3], "match": DATA_SAMPLE[3:5]} 103 | 104 | matcher = dedupe.api.ActiveMatching(self.field_definition) 105 | 106 | self.assertRaises(ValueError, matcher.mark_pairs, bad_training_pairs) 107 | 108 | matcher.mark_pairs(good_training_pairs) 109 | 110 | with warnings.catch_warnings(record=True) as w: 111 | warnings.simplefilter("always") 112 | matcher.mark_pairs({"match": [], "distinct": []}) 113 | assert len(w) == 1 114 | assert str(w[-1].message) == "Didn't return any labeled record pairs" 115 | 116 | 117 | if __name__ == "__main__": 118 | unittest.main() 119 | -------------------------------------------------------------------------------- /tests/test_blocking.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from collections import defaultdict 3 | 4 | import dedupe 5 | 6 | 7 | class BlockingTest(unittest.TestCase): 8 | def setUp(self): 9 | field_definition = [{"field": "name", "type": "String"}] 10 | self.data_model = dedupe.Dedupe(field_definition).data_model 11 | self.training_pairs = { 12 | "match": [ 13 | ({"name": "Bob", "age": "50"}, {"name": "Bob", "age": "75"}), 14 
| ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"}), 15 | ], 16 | "distinct": [ 17 | ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"}), 18 | ({"name": "Willy", "age": "35"}, {"name": "William", "age": "35"}), 19 | ({"name": "William", "age": "36"}, {"name": "William", "age": "35"}), 20 | ], 21 | } 22 | 23 | self.training = self.training_pairs["match"] + self.training_pairs["distinct"] 24 | self.training_records = [] 25 | for pair in self.training: 26 | for record in pair: 27 | if record not in self.training_records: 28 | self.training_records.append(record) 29 | 30 | self.simple = lambda x: {str(k) for k in x if "CompoundPredicate" not in str(k)} 31 | 32 | 33 | class TfidfTest(unittest.TestCase): 34 | def setUp(self): 35 | self.data_d = { 36 | 100: {"name": "Bob", "age": "50", "dataset": 0}, 37 | 105: {"name": "Charlie", "age": "75", "dataset": 1}, 38 | 110: {"name": "Meredith", "age": "40", "dataset": 1}, 39 | 115: {"name": "Sue", "age": "10", "dataset": 0}, 40 | 120: {"name": "Jimbo", "age": "21", "dataset": 0}, 41 | 125: {"name": "Jimbo", "age": "21", "dataset": 0}, 42 | 130: {"name": "Willy", "age": "35", "dataset": 0}, 43 | 135: {"name": "Willy", "age": "35", "dataset": 1}, 44 | 140: {"name": "Martha", "age": "19", "dataset": 1}, 45 | 145: {"name": "Kyle", "age": "27", "dataset": 0}, 46 | } 47 | 48 | def test_unconstrained_inverted_index(self): 49 | blocker = dedupe.blocking.Fingerprinter( 50 | [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")] 51 | ) 52 | 53 | blocker.index({record["name"] for record in self.data_d.values()}, "name") 54 | 55 | blocks = defaultdict(set) 56 | 57 | for block_key, record_id in blocker(self.data_d.items()): 58 | blocks[block_key].add(record_id) 59 | 60 | blocks = {frozenset(block) for block in blocks.values() if len(block) > 1} 61 | 62 | assert blocks == {frozenset([120, 125]), frozenset([130, 135])} 63 | 64 | 65 | class TfIndexUnindex(unittest.TestCase): 66 | def setUp(self): 67 | data_d = { 68 | 100: {"name": "Bob", "age": "50", "dataset": 0}, 69 | 105: {"name": "Charlie", "age": "75", "dataset": 1}, 70 | 110: {"name": "Meredith", "age": "40", "dataset": 1}, 71 | 115: {"name": "Sue", "age": "10", "dataset": 0}, 72 | 120: {"name": "Jimbo", "age": "21", "dataset": 0}, 73 | 125: {"name": "Jimbo", "age": "21", "dataset": 0}, 74 | 130: {"name": "Willy", "age": "35", "dataset": 0}, 75 | 135: {"name": "Willy", "age": "35", "dataset": 1}, 76 | 140: {"name": "Martha", "age": "19", "dataset": 1}, 77 | 145: {"name": "Kyle", "age": "27", "dataset": 0}, 78 | } 79 | 80 | self.blocker = dedupe.blocking.Fingerprinter( 81 | [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")] 82 | ) 83 | 84 | self.records_1 = { 85 | record_id: record 86 | for record_id, record in data_d.items() 87 | if record["dataset"] == 0 88 | } 89 | 90 | self.fields_2 = { 91 | record_id: record["name"] 92 | for record_id, record in data_d.items() 93 | if record["dataset"] == 1 94 | } 95 | 96 | def test_index(self): 97 | self.blocker.index(set(self.fields_2.values()), "name") 98 | 99 | blocks = defaultdict(set) 100 | 101 | for block_key, record_id in self.blocker(self.records_1.items()): 102 | blocks[block_key].add(record_id) 103 | 104 | assert list(blocks.items())[0][1] == {130} 105 | 106 | def test_doubled_index(self): 107 | self.blocker.index(self.fields_2.values(), "name") 108 | self.blocker.index(self.fields_2.values(), "name") 109 | 110 | blocks = defaultdict(set) 111 | 112 | for block_key, record_id in self.blocker(self.records_1.items()): 113 | 
blocks[block_key].add(record_id) 114 | 115 | result = list(blocks.items()) 116 | 117 | assert len(result) == 1 118 | 119 | assert result[0][1] == {130} 120 | 121 | def test_unindex(self): 122 | self.blocker.index(self.fields_2.values(), "name") 123 | self.blocker.unindex(self.fields_2.values(), "name") 124 | 125 | blocks = defaultdict(set) 126 | 127 | for block_key, record_id in self.blocker(self.records_1.items()): 128 | blocks[block_key].add(record_id) 129 | 130 | assert len(blocks.values()) == 0 131 | 132 | 133 | if __name__ == "__main__": 134 | unittest.main() 135 | -------------------------------------------------------------------------------- /tests/test_canonical.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import dedupe.canonical 4 | 5 | 6 | class CanonicalizationTest(unittest.TestCase): 7 | def test_get_centroid(self): 8 | from affinegap import normalizedAffineGapDistance as comparator 9 | 10 | attributeList = [ 11 | "mary crane center", 12 | "mary crane center north", 13 | "mary crane league - mary crane - west", 14 | "mary crane league mary crane center (east)", 15 | "mary crane league mary crane center (north)", 16 | "mary crane league mary crane center (west)", 17 | "mary crane league - mary crane - east", 18 | "mary crane family and day care center", 19 | "mary crane west", 20 | "mary crane center east", 21 | "mary crane league mary crane center (east)", 22 | "mary crane league mary crane center (north)", 23 | "mary crane league mary crane center (west)", 24 | "mary crane league", 25 | "mary crane", 26 | "mary crane east 0-3", 27 | "mary crane north", 28 | "mary crane north 0-3", 29 | "mary crane league - mary crane - west", 30 | "mary crane league - mary crane - north", 31 | "mary crane league - mary crane - east", 32 | "mary crane league - mary crane - west", 33 | "mary crane league - mary crane - north", 34 | "mary crane league - mary crane - east", 35 | ] 36 | 37 | centroid = dedupe.canonical.getCentroid(attributeList, comparator) 38 | assert centroid == "mary crane" 39 | 40 | def test_get_canonical_rep(self): 41 | record_list = [ 42 | {"name": "mary crane", "address": "123 main st", "zip": "12345"}, 43 | {"name": "mary crane east", "address": "123 main street", "zip": ""}, 44 | {"name": "mary crane west", "address": "123 man st", "zip": ""}, 45 | ] 46 | 47 | rep = dedupe.canonical.getCanonicalRep(record_list) 48 | assert rep == { 49 | "name": "mary crane", 50 | "address": "123 main street", 51 | "zip": "12345", 52 | } 53 | 54 | rep = dedupe.canonical.getCanonicalRep(record_list[0:2]) 55 | assert rep == {"name": "mary crane", "address": "123 main st", "zip": "12345"} 56 | 57 | rep = dedupe.canonical.getCanonicalRep(record_list[0:1]) 58 | assert rep == {"name": "mary crane", "address": "123 main st", "zip": "12345"} 59 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import numpy 5 | import scipy.special 6 | 7 | import dedupe 8 | 9 | 10 | class MockClassifier: 11 | def __init__(self): 12 | self.weight = 0 13 | self.bias = 0 14 | 15 | def predict_proba(self, examples): 16 | return scipy.special.expit(examples * self.weight + self.bias) 17 | 18 | 19 | class ScoreDuplicates(unittest.TestCase): 20 | def setUp(self): 21 | random.seed(123) 22 | 23 | long_string = 
"asa;sasdfjasdio;fio;asdnfasdvnvao;asduifvnavjasdfasdfasfasasdfasdfasdfasdfasdfsdfasgnuavpidcvaspdivnaspdivninasduinguipghauipsdfnvaspfighapsdifnasdifnasdpighuignpaguinpgiasidfjasdfjsdofgiongag" # noqa: E501 24 | 25 | self.records = iter( 26 | [ 27 | ( 28 | (long_string, {"name": "Margret", "age": "32"}), 29 | ("2", {"name": "Marga", "age": "33"}), 30 | ), 31 | ( 32 | ("2", {"name": "Marga", "age": "33"}), 33 | ("3", {"name": "Maria", "age": "19"}), 34 | ), 35 | ( 36 | ("4", {"name": "Maria", "age": "19"}), 37 | ("5", {"name": "Monica", "age": "39"}), 38 | ), 39 | ( 40 | ("6", {"name": "Monica", "age": "39"}), 41 | ("7", {"name": "Mira", "age": "47"}), 42 | ), 43 | ( 44 | ("8", {"name": "Mira", "age": "47"}), 45 | ("9", {"name": "Mona", "age": "9"}), 46 | ), 47 | ] 48 | ) 49 | 50 | deduper = dedupe.Dedupe([dedupe.variables.String("name")]) 51 | self.data_model = deduper.data_model 52 | self.classifier = MockClassifier() 53 | 54 | self.classifier.weight = -1.0302742719650269 55 | self.classifier.bias = 4.76 56 | 57 | score_dtype = [("pairs", " 0.001: 165 | return False 166 | else: 167 | return True 168 | 169 | def test_hierarchical(self): 170 | hierarchical = dedupe.clustering.cluster 171 | assert self.clusterEquals(list(hierarchical(self.dupes, 1)), []) 172 | 173 | assert self.clusterEquals( 174 | list(hierarchical(self.dupes, 0.5)), 175 | [ 176 | ((1, 2, 3), (0.778, 0.860, 0.778)), 177 | ((4, 5), (0.720, 0.720)), 178 | ((10, 11), (0.899, 0.899)), 179 | ], 180 | ) 181 | 182 | print(hierarchical(self.dupes, 0.0)) 183 | assert self.clusterEquals( 184 | list(hierarchical(self.dupes, 0)), 185 | [ 186 | ((1, 2, 3, 4, 5), (0.526, 0.564, 0.542, 0.320, 0.623)), 187 | ((10, 11), (0.899, 0.899)), 188 | ], 189 | ) 190 | 191 | assert list(hierarchical(self.str_dupes, 1)) == [] 192 | assert list(zip(*hierarchical(self.str_dupes, 0.5)))[0] == ( 193 | (b"1", b"2", b"3"), 194 | (b"4", b"5"), 195 | ) 196 | assert list(zip(*hierarchical(self.str_dupes, 0)))[0] == ( 197 | (b"1", b"2", b"3", b"4", b"5"), 198 | ) 199 | 200 | def test_greedy_matching(self): 201 | greedyMatch = dedupe.clustering.greedyMatching 202 | 203 | bipartite_dupes = numpy.array( 204 | list(self.bipartite_dupes), dtype=[("ids", int, 2), ("score", float)] 205 | ) 206 | 207 | assert list(greedyMatch(bipartite_dupes)) == [ 208 | ((4, 6), 0.96), 209 | ((2, 7), 0.72), 210 | ((3, 8), 0.65), 211 | ((1, 5), 0.1), 212 | ] 213 | 214 | def test_gazette_matching(self): 215 | gazetteMatch = dedupe.clustering.gazetteMatching 216 | blocked_dupes = itertools.groupby(self.bipartite_dupes, key=lambda x: x[0][0]) 217 | 218 | def to_numpy(x): 219 | return numpy.array(x, dtype=[("ids", int, 2), ("score", float)]) 220 | 221 | blocked_dupes = [to_numpy(list(block)) for _, block in blocked_dupes] 222 | 223 | target = [ 224 | (((1, 6), 0.72), ((1, 8), 0.6)), 225 | (((2, 7), 0.72), ((2, 8), 0.3)), 226 | (((3, 6), 0.72), ((3, 8), 0.65)), 227 | (((4, 6), 0.96), ((4, 5), 0.63)), 228 | (((5, 8), 0.24),), 229 | ] 230 | 231 | assert [ 232 | tuple((tuple(pair), score) for pair, score in each.tolist()) 233 | for each in gazetteMatch(blocked_dupes, n_matches=2) 234 | ] == target 235 | 236 | 237 | if __name__ == "__main__": 238 | unittest.main() 239 | -------------------------------------------------------------------------------- /tests/test_exists.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy 4 | 5 | from dedupe.variables.exists import ExistsType 6 | 7 | 8 | class TestExists(unittest.TestCase): 
9 | def test_comparator(self): 10 | var = ExistsType("foo") 11 | assert numpy.array_equal(var.comparator(None, None), [0, 0]) 12 | assert numpy.array_equal(var.comparator(1, 1), [1, 0]) 13 | assert numpy.array_equal(var.comparator(1, 0), [0, 1]) 14 | 15 | def test_len_higher_vars(self): 16 | # The len > 1 is necessary for the correct processing in datamodel.py 17 | var = ExistsType("foo") 18 | assert len(var) > 1 19 | assert len(var.higher_vars) > 1 20 | assert len(var) == len(var.higher_vars) 21 | -------------------------------------------------------------------------------- /tests/test_labeler.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import pytest 5 | 6 | import dedupe 7 | from dedupe import datamodel, labeler 8 | from dedupe._typing import RecordDictPair 9 | 10 | SAMPLE = { 11 | 1: {"name": "Meredith", "age": "40"}, 12 | 2: {"name": "Sue", "age": "10"}, 13 | 3: {"name": "Willy", "age": "35"}, 14 | 4: {"name": "William", "age": "35"}, 15 | 5: {"name": "Jimmy", "age": "20"}, 16 | 6: {"name": "Jimbo", "age": "21"}, 17 | } 18 | 19 | 20 | def freeze_record_pair(record_pair: RecordDictPair): 21 | rec1, rec2 = record_pair 22 | return (frozenset(rec1.items()), frozenset(rec2.items())) 23 | 24 | 25 | class ActiveLearningTest(unittest.TestCase): 26 | def setUp(self): 27 | self.data_model = datamodel.DataModel( 28 | [dedupe.variables.String("name"), dedupe.variables.String("age")] 29 | ) 30 | 31 | def test_AL(self): 32 | random.seed(1111111111110) 33 | # Even with a random seed, the order of the following seems to be random, 34 | # so we shouldn't test for exact order. 35 | EXPECTED_CANDIDATES = [ 36 | ({"name": "Willy", "age": "35"}, {"name": "William", "age": "35"}), 37 | ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"}), 38 | ({"name": "Willy", "age": "35"}, {"name": "Jimmy", "age": "20"}), 39 | ({"name": "William", "age": "35"}, {"name": "Jimmy", "age": "20"}), 40 | ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"}), 41 | ({"name": "Meredith", "age": "40"}, {"name": "Jimmy", "age": "20"}), 42 | ({"name": "Sue", "age": "10"}, {"name": "Jimmy", "age": "20"}), 43 | ({"name": "Willy", "age": "35"}, {"name": "Jimbo", "age": "21"}), 44 | ({"name": "William", "age": "35"}, {"name": "Jimbo", "age": "21"}), 45 | ] 46 | EXPECTED_CANDIDATES = {freeze_record_pair(pair) for pair in EXPECTED_CANDIDATES} 47 | active_learner = labeler.DedupeDisagreementLearner( 48 | self.data_model.predicates, self.data_model.distances, SAMPLE, [] 49 | ) 50 | actual_candidates = set() 51 | for i in range(len(EXPECTED_CANDIDATES), 0, -1): 52 | assert len(active_learner) == i 53 | record_pair = freeze_record_pair(active_learner.pop()) 54 | actual_candidates.add(record_pair) 55 | assert actual_candidates == EXPECTED_CANDIDATES 56 | with pytest.raises(IndexError): 57 | active_learner.pop() 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /tests/test_memory.sh: -------------------------------------------------------------------------------- 1 | valgrind --tool=massif --suppressions=/usr/share/doc/python26-devel-2.6.8/valgrind-python.supp --massif-out-file=out.txt --depth=1 python2.6 tests/test_affine_memory.py 2 | ms_print out.txt | less 3 | -------------------------------------------------------------------------------- /tests/test_predicates.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from dedupe import predicates 4 | 5 | 6 | class TestPuncStrip(unittest.TestCase): 7 | def test_sevenchar(self): 8 | s1 = predicates.StringPredicate(predicates.sameSevenCharStartPredicate, "foo") 9 | assert s1({"foo": "fo,18v*1vaad80"}) == s1({"foo": "fo18v1vaad80"}) 10 | 11 | def test_set(self): 12 | s1 = predicates.SimplePredicate(predicates.wholeSetPredicate, "foo") 13 | colors = {"red", "blue", "green"} 14 | assert s1({"foo": colors}) == {str(colors)} 15 | 16 | 17 | class TestMetaphone(unittest.TestCase): 18 | def test_metaphone_token(self): 19 | block_val = predicates.metaphoneToken("9301 S. State St. ") 20 | assert block_val == {"STT", "S", "ST"} 21 | 22 | 23 | class TestWholeSet(unittest.TestCase): 24 | def setUp(self): 25 | self.s1 = {"red", "blue", "green"} 26 | 27 | def test_full_set(self): 28 | block_val = predicates.wholeSetPredicate(self.s1) 29 | self.assertEqual(block_val, {str(self.s1)}) 30 | 31 | 32 | class TestSetElement(unittest.TestCase): 33 | def setUp(self): 34 | self.s1 = {"red", "blue", "green"} 35 | 36 | def test_long_set(self): 37 | block_val = predicates.commonSetElementPredicate(self.s1) 38 | self.assertEqual(set(block_val), {"blue", "green", "red"}) 39 | 40 | def test_empty_set(self): 41 | block_val = predicates.commonSetElementPredicate(set()) 42 | self.assertEqual(block_val, set()) 43 | 44 | def test_first_last(self): 45 | block_val = predicates.lastSetElementPredicate(self.s1) 46 | assert block_val == {"red"} 47 | block_val = predicates.firstSetElementPredicate(self.s1) 48 | assert block_val == {"blue"} 49 | 50 | def test_magnitude(self): 51 | block_val = predicates.magnitudeOfCardinality(self.s1) 52 | assert block_val == {"0"} 53 | 54 | block_val = predicates.magnitudeOfCardinality(()) 55 | assert block_val == set() 56 | 57 | 58 | class TestLatLongGrid(unittest.TestCase): 59 | def setUp(self): 60 | self.latlong1 = (42.535, -5.012) 61 | 62 | def test_precise_latlong(self): 63 | block_val = predicates.latLongGridPredicate(self.latlong1) 64 | assert block_val == {"(42.5, -5.0)"} 65 | block_val = predicates.latLongGridPredicate((0, 0)) 66 | assert block_val == set() 67 | 68 | 69 | class TestAlpaNumeric(unittest.TestCase): 70 | def test_alphanumeric(self): 71 | assert predicates.alphaNumericPredicate("a1") == {"a1"} 72 | assert predicates.alphaNumericPredicate("1a") == {"1a"} 73 | assert predicates.alphaNumericPredicate("a1b") == {"a1b"} 74 | assert predicates.alphaNumericPredicate("1 a") == {"1"} 75 | assert predicates.alphaNumericPredicate("a1 b1") == {"a1", "b1"} 76 | assert predicates.alphaNumericPredicate("asdf") == set() 77 | assert predicates.alphaNumericPredicate("1") == {"1"} 78 | assert predicates.alphaNumericPredicate("a_1") == {"1"} 79 | assert predicates.alphaNumericPredicate("a$1") == {"1"} 80 | assert predicates.alphaNumericPredicate("a 1") == {"1"} 81 | assert predicates.alphaNumericPredicate("773-555-1676") == { 82 | "773", 83 | "555", 84 | "1676", 85 | } 86 | 87 | 88 | class TestNumericPredicates(unittest.TestCase): 89 | def test_order_of_magnitude(self): 90 | assert predicates.orderOfMagnitude(10) == {"1"} 91 | assert predicates.orderOfMagnitude(9) == {"1"} 92 | assert predicates.orderOfMagnitude(2) == {"0"} 93 | assert predicates.orderOfMagnitude(-2) == set() 94 | 95 | def test_round_to_1(self): 96 | assert predicates.roundTo1(22315) == {"20000"} 97 | assert predicates.roundTo1(-22315) == {"-20000"} 98 | 99 | 100 | class 
TestCompoundPredicate(unittest.TestCase): 101 | def test_escapes_colon(self): 102 | """ 103 | Regression test for issue #836 104 | """ 105 | predicate_1 = predicates.SimplePredicate( 106 | predicates.commonSetElementPredicate, "col_1" 107 | ) 108 | predicate_2 = predicates.SimplePredicate( 109 | predicates.commonSetElementPredicate, "col_2" 110 | ) 111 | record = {"col_1": ["foo:", "foo"], "col_2": [":bar", "bar"]} 112 | 113 | block_val = predicates.CompoundPredicate([predicate_1, predicate_2])(record) 114 | assert len(set(block_val)) == 4 115 | assert block_val == {"foo\\::\\:bar", "foo\\::bar", "foo:\\:bar", "foo:bar"} 116 | 117 | def test_escapes_escaped_colon(self): 118 | """ 119 | Regression test for issue #836 120 | """ 121 | predicate_1 = predicates.SimplePredicate( 122 | predicates.commonSetElementPredicate, "col_1" 123 | ) 124 | predicate_2 = predicates.SimplePredicate( 125 | predicates.commonSetElementPredicate, "col_2" 126 | ) 127 | record = {"col_1": ["foo\\:", "foo"], "col_2": ["\\:bar", "bar"]} 128 | 129 | block_val = predicates.CompoundPredicate([predicate_1, predicate_2])(record) 130 | assert len(set(block_val)) == 4 131 | assert block_val == { 132 | "foo\\\\::\\\\:bar", 133 | "foo\\\\::bar", 134 | "foo:\\\\:bar", 135 | "foo:bar", 136 | } 137 | 138 | 139 | if __name__ == "__main__": 140 | unittest.main() 141 | -------------------------------------------------------------------------------- /tests/test_price.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from dedupe.variables.price import PriceType 4 | 5 | 6 | class TestPrice(unittest.TestCase): 7 | def test_comparator(self): 8 | assert PriceType.comparator(1, 10) == 1 9 | assert PriceType.comparator(10, 1) == 1 10 | -------------------------------------------------------------------------------- /tests/test_serializer.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import sys 4 | import unittest 5 | 6 | import dedupe 7 | 8 | 9 | class SerializerTest(unittest.TestCase): 10 | def test_writeTraining(self): 11 | if sys.version < "3": 12 | from StringIO import StringIO 13 | 14 | output = StringIO() 15 | encoded_file = codecs.EncodedFile( 16 | output, data_encoding="utf8", file_encoding="ascii" 17 | ) 18 | else: 19 | from io import StringIO 20 | 21 | encoded_file = StringIO() 22 | 23 | training_pairs = { 24 | "distinct": [ 25 | [ 26 | { 27 | "bar": frozenset(["barë"]), 28 | "baz": (1, 2), 29 | "bang": (1, 2), 30 | "foo": "baz", 31 | }, 32 | {"foo": "baz"}, 33 | ] 34 | ], 35 | "match": [], 36 | } 37 | 38 | json.dump(training_pairs, encoded_file, cls=dedupe.serializer.TupleEncoder) 39 | 40 | encoded_file.seek(0) 41 | 42 | loaded_training_pairs = json.load( 43 | encoded_file, object_hook=dedupe.serializer._from_json 44 | ) 45 | 46 | assert loaded_training_pairs["distinct"][0][0] == dict( 47 | training_pairs["distinct"][0][0] 48 | ) 49 | 50 | assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"], frozenset) 51 | assert isinstance(loaded_training_pairs["distinct"][0][0]["baz"], tuple) 52 | 53 | deduper = dedupe.Dedupe([dedupe.variables.String("foo")]) 54 | deduper.classifier.cv = False 55 | 56 | encoded_file.seek(0) 57 | 58 | deduper._read_training(encoded_file) 59 | print(deduper.training_pairs) 60 | print(training_pairs) 61 | assert deduper.training_pairs == training_pairs 62 | 63 | encoded_file.close() 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest.main() 68 | 
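The serializer test above drives dedupe.serializer.TupleEncoder and dedupe.serializer._from_json only through an in-memory buffer. The sketch below replays the same round trip outside the test harness; the record fields ("tags", "span") are invented for illustration, and only the encoder class, the object hook, and the tuple/frozenset behavior come from the test itself.

import json
from io import StringIO

import dedupe

# Hypothetical labeled pairs; the frozenset and tuple values mirror the
# container types that test_serializer.py asserts survive the round trip.
training_pairs = {
    "match": [],
    "distinct": [
        (
            {"name": "bob belcher", "tags": frozenset(["owner"]), "span": (1, 2)},
            {"name": "linda belcher", "tags": frozenset(), "span": (3, 4)},
        )
    ],
}

buffer = StringIO()
json.dump(training_pairs, buffer, cls=dedupe.serializer.TupleEncoder)
buffer.seek(0)

restored = json.load(buffer, object_hook=dedupe.serializer._from_json)
pair = restored["distinct"][0]

# As in the test, the nested container types come back intact.
assert isinstance(pair[0]["tags"], frozenset)
assert isinstance(pair[0]["span"], tuple)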
-------------------------------------------------------------------------------- /tests/test_tfidf.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import dedupe 4 | 5 | 6 | class ParsingTest(unittest.TestCase): 7 | def setUp(self): 8 | self.index = dedupe.tfidf.TfIdfIndex() 9 | 10 | def test_keywords(self): 11 | self.index.index(("AND", "OR", "EOF", "NOT")) 12 | self.index._index.initSearch() 13 | assert self.index.search(("AND", "OR", "EOF", "NOT"))[0] == 1 14 | 15 | def test_keywords_title(self): 16 | self.index.index(("And", "Or", "Eof", "Not")) 17 | self.index._index.initSearch() 18 | assert self.index.search(("And", "Or", "Eof", "Not"))[0] == 1 19 | 20 | def test_empty_search(self): 21 | self.index._index.initSearch() 22 | assert self.index.search(()) == [] 23 | 24 | def test_wildcards(self): 25 | self.index.index((r"f\o",)) 26 | self.index.index(("f*",)) 27 | self.index._index.initSearch() 28 | assert len(self.index.search(("f*",))) == 1 29 | 30 | 31 | if __name__ == "__main__": 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /tests/test_training.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import dedupe 4 | import dedupe.branch_and_bound as branch_and_bound 5 | import dedupe.training as training 6 | 7 | 8 | class TrainingTest(unittest.TestCase): 9 | def setUp(self): 10 | field_definition = [dedupe.variables.String("name")] 11 | self.data_model = dedupe.Dedupe(field_definition).data_model 12 | self.training_pairs = { 13 | "match": [ 14 | ({"name": "Bob", "age": "50"}, {"name": "Bob", "age": "75"}), 15 | ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"}), 16 | ], 17 | "distinct": [ 18 | ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"}), 19 | ({"name": "Willy", "age": "35"}, {"name": "William", "age": "35"}), 20 | ({"name": "William", "age": "36"}, {"name": "William", "age": "35"}), 21 | ], 22 | } 23 | 24 | self.training = self.training_pairs["match"] + self.training_pairs["distinct"] 25 | self.training_records = [] 26 | for pair in self.training: 27 | for record in pair: 28 | if record not in self.training_records: 29 | self.training_records.append(record) 30 | 31 | self.simple = lambda x: {str(k) for k in x if "CompoundPredicate" not in str(k)} 32 | 33 | self.block_learner = training.BlockLearner 34 | self.block_learner.blocker = dedupe.blocking.Fingerprinter( 35 | self.data_model.predicates 36 | ) 37 | self.block_learner.blocker.index_all( 38 | {i: x for i, x in enumerate(self.training_records)} 39 | ) 40 | 41 | def test_dedupe_coverage(self): 42 | coverage = self.block_learner.cover(self.block_learner, self.training) 43 | assert self.simple(coverage.keys()).issuperset( 44 | { 45 | "SimplePredicate: (tokenFieldPredicate, name)", 46 | "SimplePredicate: (commonSixGram, name)", 47 | "TfidfTextCanopyPredicate: (0.4, name)", 48 | "SimplePredicate: (sortedAcronym, name)", 49 | "SimplePredicate: (sameThreeCharStartPredicate, name)", 50 | "TfidfTextCanopyPredicate: (0.2, name)", 51 | "SimplePredicate: (sameFiveCharStartPredicate, name)", 52 | "TfidfTextCanopyPredicate: (0.6, name)", 53 | "SimplePredicate: (wholeFieldPredicate, name)", 54 | "TfidfTextCanopyPredicate: (0.8, name)", 55 | "SimplePredicate: (commonFourGram, name)", 56 | "SimplePredicate: (firstTokenPredicate, name)", 57 | "SimplePredicate: (sameSevenCharStartPredicate, name)", 58 | } 59 | ) 60 | 61 | def 
test_uncovered_by(self): 62 | before = {1: frozenset({1, 2, 3}), 2: frozenset({1, 2}), 3: frozenset({3})} 63 | after = {1: frozenset({1, 2}), 2: frozenset({1, 2})} 64 | 65 | before_copy = before.copy() 66 | 67 | assert branch_and_bound._uncovered_by(before, frozenset()) == before 68 | assert branch_and_bound._uncovered_by(before, frozenset({3})) == after 69 | assert before == before_copy 70 | 71 | def test_covered_pairs(self): 72 | p1 = lambda x, target=None: frozenset((1,)) # noqa: E731 73 | 74 | self.block_learner.blocker.predicates = (p1,) 75 | cover = self.block_learner.cover(self.block_learner, [("a", "b")] * 2) 76 | 77 | assert cover[p1] == {0, 1} 78 | 79 | 80 | if __name__ == "__main__": 81 | unittest.main() 82 | --------------------------------------------------------------------------------
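Read together, test_blocking.py and test_training.py above exercise one fingerprinting flow: index a field's values, stream records through a dedupe.blocking.Fingerprinter, and group the (block_key, record_id) pairs it emits into candidate blocks. The sketch below repeats that flow with invented toy records; the predicate and Fingerprinter calls are the ones that appear in TfidfTest.

from collections import defaultdict

import dedupe

# Toy records invented for illustration only.
data = {
    1: {"name": "bob belcher"},
    2: {"name": "bob b."},
    3: {"name": "linda belcher"},
}

fingerprinter = dedupe.blocking.Fingerprinter(
    [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")]
)
fingerprinter.index({record["name"] for record in data.values()}, "name")

blocks = defaultdict(set)
for block_key, record_id in fingerprinter(data.items()):
    blocks[block_key].add(record_id)

# Only blocks with more than one member produce candidate comparisons.
candidate_blocks = [ids for ids in blocks.values() if len(ids) > 1]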