├── .flake8 ├── .git-blame-ignore-revs ├── .github ├── dependabot.yml ├── scripts │ └── asv_markdown.py └── workflows │ ├── benchmark-bot.yml │ ├── codeql-analysis.yml │ ├── lock.yml │ └── pythonpackage.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── THANKS.md ├── benchmarks ├── asv.conf.json ├── benchmarks │ ├── __init__.py │ ├── canonical.py │ ├── canonical_gazetteer.py │ ├── canonical_matching.py │ ├── common.py │ └── datasets │ │ ├── restaurant-1.csv │ │ ├── restaurant-2.csv │ │ ├── restaurant-nophone-training.csv │ │ └── restaurant-nophone.csv └── setup.py ├── dedupe ├── __init__.py ├── _typing.py ├── api.py ├── backport.py ├── blocking.py ├── branch_and_bound.py ├── canonical.py ├── canopy_index.py ├── clustering.py ├── convenience.py ├── core.py ├── cpredicates.pyx ├── datamodel.py ├── index.py ├── labeler.py ├── levenshtein.py ├── predicate_functions.py ├── predicates.py ├── py.typed ├── serializer.py ├── tfidf.py ├── training.py └── variables │ ├── __init__.py │ ├── base.py │ ├── categorical_type.py │ ├── exact.py │ ├── exists.py │ ├── interaction.py │ ├── latlong.py │ ├── price.py │ ├── set.py │ └── string.py ├── docs ├── API-documentation.rst ├── Bibliography.rst ├── Examples.rst ├── Makefile ├── Troubleshooting.rst ├── Variable-definition.rst ├── _static │ ├── css │ │ ├── bootstrap.css │ │ └── custom.css │ ├── images │ │ ├── dedupeio-logo-reversed.png │ │ └── dedupeio-logo.png │ └── js │ │ ├── bootstrap.min.js │ │ └── jquery.min.js ├── _templates │ └── layout.html ├── conf.py ├── how-it-works │ ├── Choosing-a-good-threshold.rst │ ├── Grouping-duplicates.rst │ ├── How-it-works.rst │ ├── Making-smart-comparisons.rst │ ├── Matching-records.rst │ └── Special-Cases.rst ├── index.rst └── requirements.txt ├── pyproject.toml ├── requirements.txt ├── setup.py └── tests ├── duplicateCluster_memory_case.py ├── test_api.py ├── test_blocking.py ├── test_canonical.py ├── test_core.py ├── test_cpredicates.py ├── test_dedupe.py ├── test_exists.py ├── test_labeler.py ├── test_memory.sh ├── test_predicate_functions.py ├── test_predicates.py ├── test_price.py ├── test_serializer.py ├── test_tfidf.py └── test_training.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=160 3 | extend-ignore = E203 -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # .git-blame-ignore-revs 2 | # Blacken 3 | c28cd1363f9fcf3bb9c7769615e02bfc08ba45b1 4 | 9e01ccf2e7eacabe0cd1ee16c5158ba417104897 5 | 442edec76a27f7d76f01c89de7327c35cbb898d7 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | ignore: 17 | # Optional: Official actions have moving tags like v1; 18 | # if you use those, you don't need updates. 19 | - dependency-name: "actions/*" 20 | -------------------------------------------------------------------------------- /.github/scripts/asv_markdown.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def format(element): 5 | if is_float(element): 6 | f = float(element) 7 | 8 | return "{0:.3}".format(f) 9 | 10 | else: 11 | return element 12 | 13 | 14 | def is_float(element): 15 | try: 16 | float(element) 17 | except ValueError: 18 | return False 19 | else: 20 | return True 21 | 22 | 23 | def to_markdown(data): 24 | preamble = """# {tests} ([diff](https://github.com/dedupeio/dedupe/compare/{base_commit}...{head_commit})): 25 | | | before | after | ratio | benchmark | 26 | |- |-: |-: |-: |-|\n""".format( 27 | **data 28 | ) 29 | 30 | full_table = preamble + "\n".join( 31 | "|" + "|".join(row) + "|" for row in data["comparisons"] 32 | ) 33 | 34 | return full_table 35 | 36 | 37 | def parse(asv_input): 38 | result = re.match( 39 | r"^\n(?P.*?):\n\n before after ratio\n \[(?P.+)\] \[(?P.+)\]\n <(?P.+)> <(?P.+)> *\n(?P.*)", 40 | asv_input, 41 | re.DOTALL, 42 | ) 43 | 44 | test_details = result.groupdict() 45 | 46 | raw_comparisons = test_details.pop("raw_comparisons").splitlines() 47 | comparisons = ( 48 | [row[:2].strip()] + row[2:].split(maxsplit=3) for row in raw_comparisons 49 | ) 50 | test_details["comparisons"] = [ 51 | [indicator, format(value_a), format(value_b), ratio, test] 52 | for indicator, value_a, value_b, ratio, test in comparisons 53 | ] 54 | return test_details 55 | 56 | 57 | if __name__ == "__main__": 58 | import sys 59 | 60 | print("hello", file=sys.stderr) 61 | asv_input = sys.stdin.read() 62 | print(asv_input, file=sys.stderr) 63 | 64 | print(to_markdown(parse(asv_input))) 65 | -------------------------------------------------------------------------------- /.github/workflows/benchmark-bot.yml: -------------------------------------------------------------------------------- 1 | # from https://github.com/pandas-dev/pandas/blob/d42a148cd83e06b5e5ef1fb6424e337d5b5efaa5/.github/workflows/asv-bot.yml 2 | name: "Benchmark Bot" 3 | 4 | on: 5 | issue_comment: # Pull requests are also issues 6 | types: 7 | - created 8 | 9 | env: 10 | COMMENT: ${{github.event.comment.body}} 11 | 12 | jobs: 13 | benchmarks: 14 | name: "Run benchmarks" 15 | if: startsWith(github.event.comment.body, '@benchmark') 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | shell: bash -el {0} 20 | 21 | concurrency: 22 | # Set concurrency to prevent abuse(full runs are ~5.5 hours !!!) 
23 | # each user can only run one concurrent benchmark bot at a time 24 | # We don't cancel in progress jobs, but if you want to benchmark multiple PRs, 25 | # you're gonna have to wait 26 | group: ${{ github.actor }}-benchmarks 27 | cancel-in-progress: false 28 | 29 | steps: 30 | - name: Install hub 31 | run: sudo apt-get install -y hub 32 | 33 | - name: Setup git 34 | uses: actions/checkout@v3 35 | 36 | # Since this was triggered by a comment, not a PR, 37 | # the `actions/checkout` action will pull 38 | # the default branch (AKA main). We need to checkout the PR branch. 39 | # From https://github.com/actions/checkout/issues/331#issuecomment-925405415 40 | - name: Checkout Pull Request 41 | run: | 42 | 43 | hub pr checkout ${{ github.event.issue.number }} 44 | echo "Checked out SHA:" 45 | git log -1 --format='%H' 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 48 | 49 | - name: Set up Python 3.10 50 | uses: actions/setup-python@v3 51 | with: 52 | python-version: "3.10" 53 | 54 | - name: Install dependencies 55 | run: | 56 | pip install --upgrade pip 57 | pip install -r requirements.txt 58 | pip install . 59 | 60 | - name: Run benchmarks 61 | id: bench 62 | continue-on-error: true # This is a fake failure, asv will exit code 1 for regressions 63 | run: | 64 | # extracting the regex, see https://stackoverflow.com/a/36798723 65 | REGEX=$(echo "$COMMENT" | sed -n "s/^@benchmark\s\+-b\s*\(\S*\).*$/\1/p") 66 | if [ -z "$REGEX" ]; then 67 | BENCHMARKS="" 68 | else 69 | BENCHMARKS="-b $REGEX" 70 | fi 71 | cd benchmarks 72 | asv check -E existing 73 | git remote add upstream https://github.com/dedupeio/dedupe.git 74 | git fetch upstream 75 | asv machine --yes 76 | asv continuous --show-stderr -f 1.1 $BENCHMARKS upstream/main HEAD | cat 77 | echo 'BENCH_OUTPUT<> $GITHUB_ENV 78 | asv compare -f 1.1 upstream/main HEAD | python ../.github/scripts/asv_markdown.py >> $GITHUB_ENV 79 | echo 'EOF' >> $GITHUB_ENV 80 | echo "REGEX=$REGEX" >> $GITHUB_ENV 81 | 82 | - name: Add comment with results 83 | uses: actions/github-script@v6 84 | env: 85 | BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} 86 | REGEX: ${{env.REGEX}} 87 | with: 88 | script: | 89 | const ENV_VARS = process.env 90 | const run_url = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` 91 | github.rest.issues.createComment({ 92 | issue_number: context.issue.number, 93 | owner: context.repo.owner, 94 | repo: context.repo.repo, 95 | body: ENV_VARS["BENCH_OUTPUT"] + '\n\n[(logs)](' + run_url + ')' 96 | }) 97 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '17 21 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v3 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v3 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v3 71 | -------------------------------------------------------------------------------- /.github/workflows/lock.yml: -------------------------------------------------------------------------------- 1 | name: 'Lock Threads' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 1/7 * *' 6 | workflow_dispatch: 7 | 8 | permissions: 9 | issues: write 10 | 11 | concurrency: 12 | group: lock 13 | 14 | jobs: 15 | action: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: dessant/lock-threads@v5 19 | with: 20 | process-only: 'issues' 21 | issue-inactive-days: '14' 22 | log-output: true 23 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | with: 12 | python-version: "3.12" 13 | - name: Install dependencies 14 | run: | 15 | pip install --upgrade pip 16 | pip install . 17 | pip install -r requirements.txt 18 | - name: flake8 19 | run: flake8 dedupe tests benchmarks/benchmarks 20 | - name: isort 21 | if: always() 22 | run: isort --check-only . 23 | - name: black 24 | if: always() 25 | run: black . 
--check 26 | - name: mypy 27 | if: always() 28 | run: mypy 29 | test: 30 | timeout-minutes: 40 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | os: [windows-latest, macos-latest, ubuntu-latest] 36 | python-version: [3.8, 3.9, "3.10", "3.11", "3.12"] 37 | 38 | steps: 39 | - uses: actions/checkout@v2 40 | - name: Set up Python ${{ matrix.python-version }} 41 | uses: actions/setup-python@v2 42 | with: 43 | python-version: ${{ matrix.python-version }} 44 | - name: Install dependencies 45 | run: | 46 | pip install --upgrade pip 47 | pip install -e . 48 | - name: Install test dependencies 49 | run: pip install -r requirements.txt 50 | - name: pytest 51 | run: pytest 52 | - name: Code Coverage 53 | uses: codecov/codecov-action@v4 54 | env: 55 | OS: ${{ matrix.os }} 56 | PYTHON: '3.7' 57 | with: 58 | env_vars: OS,PYTHON 59 | - name: Integration tests 60 | # Do everything twice: The first time is training and generates settings, 61 | # the second time it tests using a static settings file. 62 | run: | 63 | python -m pip install ./benchmarks 64 | python benchmarks/benchmarks/canonical.py 65 | python benchmarks/benchmarks/canonical.py 66 | python benchmarks/benchmarks/canonical_matching.py 67 | python benchmarks/benchmarks/canonical_matching.py 68 | python benchmarks/benchmarks/canonical_gazetteer.py 69 | python benchmarks/benchmarks/canonical_gazetteer.py 70 | settings_file_persists: 71 | runs-on: ubuntu-latest 72 | steps: 73 | - name: checkout main 74 | uses: actions/checkout@v2 75 | with: 76 | ref: main 77 | - uses: actions/setup-python@v2 78 | - name: Install dependencies 79 | run: | 80 | pip install --upgrade pip 81 | pip install . 82 | python -m pip install ./benchmarks 83 | - name: Run on canonical on main 84 | run: python benchmarks/benchmarks/canonical.py 85 | - name: checkout this PR branch 86 | uses: actions/checkout@v2 87 | with: 88 | clean: false 89 | - name: Install any new dependencies 90 | run: pip install . 
91 | - name: Run on canonical with setting file created on main 92 | run: python benchmarks/benchmarks/canonical.py 93 | wheels: 94 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 95 | needs: [test, lint, settings_file_persists] 96 | name: Build wheels on ${{ matrix.os }} 97 | runs-on: ${{ matrix.os }} 98 | strategy: 99 | matrix: 100 | os: [windows-latest, macos-latest, ubuntu-latest] 101 | steps: 102 | - uses: actions/checkout@v2 103 | - uses: actions/setup-python@v2 104 | - name: Build wheels 105 | uses: pypa/cibuildwheel@v2.21.3 106 | env: 107 | CIBW_ARCHS_MACOS: x86_64 arm64 universal2 108 | - name: Build sdist 109 | run: | 110 | pip install build 111 | python -m build --sdist 112 | - name: Publish wheels to PyPI 113 | env: 114 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 115 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 116 | run: | 117 | pip install twine 118 | twine upload --skip-existing wheelhouse/*.whl 119 | twine upload dist/* 120 | continue-on-error: true 121 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .coverage* 3 | htmlcov 4 | cpredicates.c 5 | *.code-workspace 6 | libdistance-0.2.1 7 | build 8 | _build 9 | *.pyc 10 | logfile 11 | *.*~ 12 | *.o 13 | *.so 14 | *.py.* 15 | *.*gz 16 | *.html 17 | .#* 18 | *.*# 19 | *.json 20 | examples/output/*.* 21 | examples/csv_example/csv_example_output.csv 22 | *output.csv 23 | examples/mysql_example/*.txt* 24 | *.db 25 | kernprof.py 26 | possible_classifiers 27 | .DS_Store 28 | mysql.cnf 29 | *settings 30 | *.egg-info 31 | ENV 32 | dist 33 | src/*.c 34 | 35 | .coverage 36 | htmlcov 37 | 38 | !benchmarks/asv.conf.json 39 | benchmarks/.asv/* 40 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.4.2 4 | hooks: 5 | - id: black 6 | - repo: https://github.com/pycqa/isort 7 | rev: 5.13.2 8 | hooks: 9 | - id: isort 10 | name: isort (python) 11 | - repo: https://github.com/pycqa/flake8 12 | rev: "7.1.0" 13 | hooks: 14 | - id: flake8 15 | args: [--config=.flake8] 16 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 2 | 3 | # Required 4 | version: 2 5 | 6 | # Set the OS, Python version and other tools you might need 7 | build: 8 | os: ubuntu-22.04 9 | tools: 10 | python: "3.12" 11 | 12 | # Build documentation in the docs/ directory with Sphinx 13 | sphinx: 14 | configuration: docs/conf.py 15 | 16 | # Build documentation with MkDocs 17 | #mkdocs: 18 | # configuration: mkdocs.yml 19 | 20 | # Optionally build your docs in additional formats such as PDF and ePub 21 | formats: all 22 | 23 | # Optionally set the version of Python and requirements required to build your docs 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt 27 | - method: pip 28 | path: . 
29 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 3.0.2 2 | - Fixed regression in Exists predicate 3 | 4 | # 3.0.1 5 | - Fixed regression in Exists predicate 6 | 7 | 8 | # 3.0.0 9 | - Development in python packaging made supporting the previous namespace approach for 10 | variable plugins untenable. Since we had to redo the way we defined the data model, 11 | we took the opportunity to explicity instantiate variable objects. 12 | 13 | # 2.0.6 14 | - fixed bug that was preventing learning of index predicates in Dedupe mode 15 | 16 | # 2.0.3 17 | - Improved memory performance of connected components 18 | 19 | 20 | # 2.0 21 | 22 | - Python 3 only 23 | - Static typing and type Hints 24 | - Incorporate sqlite to extend normal API to millions of records 25 | - Multiprocessing enabled for Windows 26 | - Multiprocessing mode changed to spawn for Mac OS X 27 | - Moved from CamelCase to lowercase_with_underscore for method names. 28 | - Dropped ability to save indices in save settings. 29 | - Moved from Deduper.match -> Dedupe.partition, RecordLink.match -> RecordLink.join, Gazetteer.match -> Gazetteer.search 30 | - Renamed Matching.blocker -> Matching.fingerprinter 31 | - Moved to autodoc for documentation 32 | - Dropped threshold methods 33 | - matchBlocks has been replaced by score, which takes pairs of records not blocks 34 | 35 | # 1.10.0 36 | - Dropped python 2.7 support 37 | 38 | # 1.9.4 39 | - Cleaned up block learning 40 | 41 | # 1.9.3 42 | - Improved performance of connected components algorithm with very large components 43 | - Fixed pickling unpickling bug of Index predicate classes 44 | 45 | # 1.9.0 46 | - Implemented a disagreement based active labeler to improve blocking recall 47 | 48 | # 1.8.2 49 | - removed shelve-backed persistence in blocking data in favor of an improved in-memory implementation 50 | 51 | # 1.8.0 52 | - matchBlocks is not a generator; match is now optionally a generator. If the 53 | generator option is turned of for the Gazette match is lazy 54 | 55 | # 1.7.8 56 | - Speed up blocking, on our way to 3-predicates 57 | 58 | # 1.7.5 59 | - Significantly reduced memory footprint during connected_components 60 | 61 | # 1.7.3 62 | - Significantly reduced memory footprint during scoreDuplicates 63 | 64 | # 1.7.2 65 | - Improper release 66 | 67 | # 1.7.1 68 | - TempShelve class that addresses various bugs related to cleaning up tempoary shelves 69 | 70 | # 1.7.0 71 | - Added `target` argument to blocker and predicates for changing the behavior 72 | of the predicates for the target and source dataset if we are linking. 
73 | 74 | # 1.6.8 75 | - Use file-backed blocking with dbm, dramatically increases size of data that can be handled without special programming 76 | 77 | # 1.6.7 78 | - Reduce memory footprint of matching 79 | 80 | # 1.6.0 81 | - Simplify .train method 82 | 83 | # 1.5.5 84 | - Levenshtein search based index predicates thanks to @mattandahalfew 85 | 86 | # 1.5.0 87 | - simplified the sample API, this might be a breaking change for some 88 | - the active learner interface is now more modular to allow for a different learner 89 | - random sampling of pairs has been improved for linking case and 90 | dedupe case, h/t to @MarkusShepherd 91 | 92 | ## 1.4.15 93 | - frozendicts have finally been removed 94 | - first N char predicates return their entire length if length is less 95 | than N, instead of nothing 96 | - crossvalidation is skipped in active learning if using default rlr learner 97 | 98 | ## 1.4.5 99 | - Block indexes can now be persisted by using the index=True argument 100 | in the writeSettings method 101 | 102 | ## 1.4.1 103 | - Now uses C version of double metaphone for speed 104 | - Much faster compounding of blocks in block learning 105 | 106 | ## 1.4.0 107 | - Block learning now tries to minimize the total number of comparisons 108 | not just the comparisons of distinct records. This decouples makes 109 | block learning from learning classifier learning. This change has 110 | requires new, different arguments to the train method. 111 | 112 | ## 1.3.8 113 | - Console labeler now shows fields in the order they are defined in 114 | the data model. The labeler also reports number of labeled examples 115 | - `pud` argument added to the `train` method. Proportion of uncovered 116 | dupes. This deprecates `uncovered_dupes` argument 117 | 118 | ## 1.3.0 119 | - If we have enough training data, consider Compound predicates of length 3 in addition to predicates of length 2 120 | 121 | ## 1.1.1 122 | - None now treated as missing data indicator. Warnings for deprecations of older types of missing data indicators 123 | 124 | ## 1.1.0 125 | Features 126 | - Handle FuzzyCategoricalType in datamodel 127 | 128 | ## 1.0.0 129 | Features 130 | - Speed up learning 131 | - Parallelize sampling 132 | - Optional [CRF Edit Distance](https://dedupe.readthedocs.io/en/latest/Variable-definition.html#optional-edit-distance) 133 | 134 | ## 0.8.0 135 | Support for Python 3.4 added. Support for Python 2.6 dropped. 136 | 137 | Features 138 | - Windows OS supported 139 | - train method has argument for not considering index predicates 140 | - TfIDFNGram Index Predicate added (for shorter string) 141 | - SuffixArray Predicate 142 | - Double Metaphone Predicates 143 | - Predicates for numbers, OrderOfMagnitude, Round 144 | - Set Predicate OrderOfCardinality 145 | - Final, learned predicates list will now often be smaller without 146 | loss of coverage 147 | - Variables refactored to support external extensions like 148 | https://github.com/datamade/dedupe-variable-address 149 | - Categorical distance, regularized logistic regression, affine gap 150 | distance, canonicalization have been turned into separate libraries. 
151 | - Simplejson is now dependency 152 | 153 | ## 0.7.5 154 | Features 155 | - Individual record cluster membership scores 156 | - New predicates 157 | - New Exists Variable Type 158 | 159 | Bug Fixes 160 | - Latlong predicate fixed 161 | - Set TFIDF canopy working properly 162 | 163 | ## 0.7.4 164 | Features 165 | - Sampling methods now use blocked sampling 166 | 167 | ## 0.7.0 168 | Version 0.7.0 is backwards compatible, except for the match method of Gazetteer class 169 | 170 | Features 171 | - new index, unindex, and match methods in Gazetter Matching. Useful for 172 | streaming matching 173 | 174 | ## 0.6.0 175 | Version 0.6.0 is *not* backwards compatible. 176 | 177 | Features : 178 | - new Text, ShortString, and exact string types 179 | - multiple variables can be defined on same field 180 | - new Gazette linker for matching dirty records against a master list 181 | - performance improvements, particularly in memory usage 182 | - canonicalize function in dedupe.convenience for creating a canonical representation of a cluster of records 183 | - tons of bugfixes 184 | 185 | API breaks 186 | - when initializing an ActiveMatching object, `variable_definition` replaces `field_definition` and is a list of dictionaries instead of a dictionary. See the documentation for details 187 | - also when initializing a Matching object, `num_processes` has been replaced by `num_cores`, which now defaults to the 188 | number of cpus on the machine 189 | - when initializing a StaticMatching object, `settings_file` is now expected to be a file object not a string. The `readTraining`, `writeTraining`, `writeSettings` methods also all now expect file objects 190 | 191 | 192 | ## 0.5 193 | Version 0.5 is *not* backwards compatible. 194 | 195 | Features : 196 | 197 | - Special case code for linking two datasets that, individually are unique 198 | - Parallel processing using python standard library multiprocessing 199 | - Much faster canopy creation using zope.index 200 | - Asynchronous active learning methods 201 | 202 | API breaks : 203 | - `duplicateClusters` has been removed, it has been replaced by 204 | `match` and `matchBlocks` 205 | - `goodThreshold` has been removed, it has been replaced by 206 | `threshold` and `thresholdBlocks` 207 | - the meaning of `train` has changed. To train from training file use `readTraining`. To use console labeling, pass a dedupe instance to the `consoleLabel` function 208 | - The convenience function dataSample has been removed. It has been replaced by 209 | the `sample` methods 210 | - It is no longer necessary to pass `frozendicts` to `Matching` classes 211 | - `blockingFunction` has been removed and been replaced by the `blocker` method 212 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Gregg" 5 | given-names: "Forest" 6 | - family-names: "Eder" 7 | given-names: "Derek" 8 | title: "dedupe" 9 | version: 2.0.11 10 | date-released: 2022-01-27 11 | url: "https://github.com/dedupeio/dedupe" 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at info@datamade.us. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Reporting issues 2 | 3 | When reporting issues please include as much detail as possible about your 4 | operating system, dedupe version and python version. Whenever possible, please 5 | also include a brief, self-contained code example that demonstrates the problem. 6 | 7 | If dedupe is raising an exception, please paste a [full traceback](https://en.wikipedia.org/wiki/Stack_trace). 8 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | * Forest Gregg 2 | * Derek Eder 3 | * Nikit Saraf 4 | * Mark Huberty 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Forest Gregg, Derek Eder, DataMade and Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include dedupe/cpredicates.pyx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dedupe Python Library 2 | 3 | [![Tests Passing](https://github.com/dedupeio/dedupe/workflows/tests/badge.svg)](https://github.com/dedupeio/dedupe/actions?query=workflow%3Atests)[![codecov](https://codecov.io/gh/dedupeio/dedupe/branch/main/graph/badge.svg?token=aauKUrTEgh)](https://codecov.io/gh/dedupeio/dedupe) 4 | 5 | _dedupe is a python library that uses machine learning to perform fuzzy matching, deduplication and entity resolution quickly on structured data._ 6 | 7 | __dedupe__ will help you: 8 | 9 | * __remove duplicate entries__ from a spreadsheet of names and addresses 10 | * __link a list__ with customer information to another with order history, even without unique customer IDs 11 | * take a database of campaign contributions and __figure out which ones were made by the same person__, even if the names were entered slightly differently for each record 12 | 13 | dedupe takes in human training data and comes up with the best rules for your dataset to quickly and automatically find similar records, even with very large databases. 14 | 15 | ## Important links 16 | * Documentation: https://docs.dedupe.io/ 17 | * Repository: https://github.com/dedupeio/dedupe 18 | * Issues: https://github.com/dedupeio/dedupe/issues 19 | * Mailing list: https://groups.google.com/forum/#!forum/open-source-deduplication 20 | * Examples: https://github.com/dedupeio/dedupe-examples 21 | 22 | ## dedupe library consulting 23 | 24 | If you or your organization would like professional assistance in working with the dedupe library, Dedupe.io LLC offers consulting services. [Read more about pricing and available services here](https://dedupe.io/pricing/#consulting). 25 | 26 | ## Tools built with dedupe 27 | 28 | ### [Dedupe.io](https://dedupe.io/) 29 | A cloud service powered by the dedupe library for de-duplicating and finding matches in your data. It provides a step-by-step wizard for uploading your data, setting up a model, training, clustering and reviewing the results. 30 | 31 | [Dedupe.io](https://dedupe.io/) also supports record linkage across data sources and continuous matching and training through an [API](https://apidocs.dedupe.io/en/latest/). 32 | 33 | For more, see the [Dedupe.io product site](https://dedupe.io/), [tutorials on how to use it](https://dedupe.io/tutorial/intro-to-dedupe-io.html), and [differences between it and the dedupe library](https://dedupe.io/documentation/should-i-use-dedupeio-or-the-dedupe-python-library.html). 34 | 35 | Dedupe is well adopted by the Python community. Check out this [blogpost](https://medium.com/district-data-labs/basics-of-entity-resolution-with-python-and-dedupe-bc87440b64d4), 36 | a YouTube video on how to use [Dedupe with Python](https://youtu.be/McsTWXeURhA) and a Youtube video on how to apply [Dedupe at scale using Spark](https://youtu.be/q9HPUYmiwjE?t=2704). 37 | 38 | 39 | ### [csvdedupe](https://github.com/dedupeio/csvdedupe) 40 | Command line tool for de-duplicating and [linking](https://github.com/dedupeio/csvdedupe#csvlink-usage) CSV files. Read about it on [Source Knight-Mozilla OpenNews](https://source.opennews.org/en-US/articles/introducing-cvsdedupe/). 
41 | 42 | ## Installation 43 | 44 | ### Using dedupe 45 | 46 | If you only want to use dedupe, install it this way: 47 | 48 | ```bash 49 | pip install dedupe 50 | ``` 51 | 52 | Familiarize yourself with [dedupe's API](https://docs.dedupe.io/en/latest/API-documentation.html), and get started on your project. Need inspiration? Have a look at [some examples](https://github.com/dedupeio/dedupe-examples). 53 | 54 | ### Developing dedupe 55 | 56 | We recommend using [virtualenv](http://virtualenv.readthedocs.org/en/latest/virtualenv.html) and [virtualenvwrapper](http://virtualenvwrapper.readthedocs.org/en/latest/install.html) for working in a virtualized development environment. [Read how to set up virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/). 57 | 58 | Once you have virtualenvwrapper set up, 59 | 60 | ```bash 61 | mkvirtualenv dedupe 62 | git clone https://github.com/dedupeio/dedupe.git 63 | cd dedupe 64 | pip install -e . --config-settings editable_mode=compat 65 | pip install -r requirements.txt 66 | ``` 67 | 68 | If these tests pass, then everything should have been installed correctly! 69 | 70 | ```bash 71 | pytest 72 | ``` 73 | 74 | Afterwards, whenever you want to work on dedupe, 75 | 76 | ```bash 77 | workon dedupe 78 | ``` 79 | 80 | ## Testing 81 | Unit tests of core dedupe functions 82 | ```bash 83 | pytest 84 | ``` 85 | 86 | #### Test using canonical dataset from Bilenko's research 87 | 88 | Using Deduplication 89 | ```bash 90 | python -m pip install -e ./benchmarks 91 | python benchmarks/benchmarks/canonical.py 92 | ``` 93 | 94 | Using Record Linkage 95 | ```bash 96 | python -m pip install -e ./benchmarks 97 | python benchmarks/benchmarks/canonical_matching.py 98 | ``` 99 | 100 | 101 | ## Team 102 | 103 | * Forest Gregg, DataMade 104 | * Derek Eder, DataMade 105 | 106 | ## Credits 107 | 108 | Dedupe is based on Mikhail Yuryevich Bilenko's Ph.D. dissertation: [*Learnable Similarity Functions and their Application to Record Linkage and Clustering*](http://www.cs.utexas.edu/~ml/papers/marlin-dissertation-06.pdf). 109 | 110 | ## Errors / Bugs 111 | 112 | If something is not behaving intuitively, it is a bug, and should be reported. 113 | [Report it here](https://github.com/dedupeio/dedupe/issues) 114 | 115 | 116 | ## Note on Patches/Pull Requests 117 | 118 | * Fork the project. 119 | * Make your feature addition or bug fix. 120 | * Send us a pull request. Bonus points for topic branches. 121 | 122 | ## Copyright 123 | 124 | Copyright (c) 2022 Forest Gregg and Derek Eder. Released under the [MIT License](https://github.com/dedupeio/dedupe/blob/main/LICENSE). 125 | 126 | Third-party copyright in this distribution is noted where applicable. 127 | 128 | ## Citing Dedupe 129 | If you use Dedupe in an academic work, please give this citation: 130 | 131 | Forest Gregg and Derek Eder. 2022. Dedupe. https://github.com/dedupeio/dedupe. 
132 | -------------------------------------------------------------------------------- /THANKS.md: -------------------------------------------------------------------------------- 1 | # Thanks To 2 | 3 | * Jon Markel for the Illinois campaign contributions data used in the mysql_example, which he got from 4 | the [Illinois State Board of Elections](http://www.elections.il.gov/) 5 | 6 | * [Daniel Müllner](http://math.stanford.edu/~muellner/) for his wonderful [fastcluster](http://math.stanford.edu/~muellner/fastcluster.html) library and the many changes he made at our request 7 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/benchmarks/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | from itertools import combinations 5 | 6 | import dedupe 7 | from benchmarks import common 8 | 9 | 10 | def make_report(data, clustering): 11 | true_dupes = common.get_true_dupes(data) 12 | predicted_dupes = set() 13 | for cluser_id, _ in clustering: 14 | for pair in combinations(cluser_id, 2): 15 | predicted_dupes.add(frozenset(pair)) 16 | 17 | return common.Report.from_scores(true_dupes, predicted_dupes) 18 | 19 | 20 | class Canonical: 21 | settings_file = common.DATASETS_DIR / "canonical_learned_settings" 22 | data_file = common.DATASETS_DIR / "restaurant-nophone-training.csv" 23 | 24 | def setup(self): 25 | self.data = common.load_data(self.data_file) 26 | training_pairs = dedupe.training_data_dedupe(self.data, "unique_id", 5000) 27 | self.training_pairs_filelike = io.StringIO() 28 | dedupe.serializer.write_training(training_pairs, self.training_pairs_filelike) 29 | self.training_pairs_filelike.seek(0) 30 | 31 | def make_report(self, clustering): 32 | return make_report(self.data, clustering) 33 | 34 | def run(self, use_settings=False): 35 | deduper: dedupe.StaticDedupe | dedupe.Dedupe 36 | 37 | if use_settings and os.path.exists(self.settings_file): 38 | with open(self.settings_file, "rb") as f: 39 | deduper = dedupe.StaticDedupe(f) 40 | 41 | else: 42 | variables = [ 43 | dedupe.variables.String("name"), 44 | dedupe.variables.Exact("name"), 45 | dedupe.variables.String("address"), 46 | dedupe.variables.ShortString("cuisine", has_missing=True), 47 | dedupe.variables.ShortString("city"), 48 | ] 49 | 50 | deduper = dedupe.Dedupe(variables, num_cores=5) 51 | deduper.prepare_training( 52 | self.data, training_file=self.training_pairs_filelike, sample_size=10000 53 | ) 54 | deduper.train(index_predicates=True) 55 | with open(self.settings_file, "wb") as f: 56 | deduper.write_settings(f) 57 | 58 | return deduper.partition(self.data, threshold=0.5) 59 | 60 | def time_run(self): 61 | return self.run() 62 | 63 | def peakmem_run(self): 64 | return self.run() 65 | 66 | def track_precision(self): 67 | return self.make_report(self.run()).precision 68 | 69 | def track_recall(self): 70 | return self.make_report(self.run()).recall 71 | 72 | 73 | def cli(): 74 | common.configure_logging() 75 | 76 | can = Canonical() 77 | can.setup() 78 | 79 | t0 = time.time() 80 | clustering = can.run(use_settings=True) 81 | elapsed = time.time() - t0 82 | 83 | 
print(can.make_report(clustering)) 84 | print(f"ran in {elapsed} seconds") 85 | 86 | 87 | if __name__ == "__main__": 88 | cli() 89 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical_gazetteer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import dedupe 5 | from benchmarks import canonical_matching, common 6 | 7 | 8 | def make_report(data, clustering): 9 | true_dupes = canonical_matching.get_true_dupes(data) 10 | predicted_dupes = { 11 | frozenset([a, b]) for a, result in clustering for b, score in result 12 | } 13 | return common.Report.from_scores(true_dupes, predicted_dupes) 14 | 15 | 16 | class Gazetteer(canonical_matching.Matching): 17 | settings_file = common.DATASETS_DIR / "canonical_gazetteer_learned_settings" 18 | data_1_file = common.DATASETS_DIR / "restaurant-1.csv" 19 | data_2_file = common.DATASETS_DIR / "restaurant-2.csv" 20 | 21 | params = [None] # placholder 22 | 23 | def make_report(self, clustering): 24 | return make_report(self.data, clustering) 25 | 26 | def run(self, kwargs, use_settings=False): 27 | data_1, data_2 = self.data 28 | gazetteer: dedupe.StaticGazetteer | dedupe.Gazetteer 29 | 30 | if use_settings and os.path.exists(self.settings_file): 31 | with open(self.settings_file, "rb") as f: 32 | gazetteer = dedupe.StaticGazetteer(f) 33 | else: 34 | variables = [ 35 | dedupe.variables.String("name"), 36 | dedupe.variables.String("address"), 37 | dedupe.variables.String("cuisine"), 38 | dedupe.variables.String("city"), 39 | ] 40 | 41 | gazetteer = dedupe.Gazetteer(variables) 42 | gazetteer.prepare_training( 43 | data_1, 44 | data_2, 45 | training_file=self.training_pairs_filelike, 46 | sample_size=10000, 47 | ) 48 | gazetteer.train() 49 | 50 | with open(self.settings_file, "wb") as f: 51 | gazetteer.write_settings(f) 52 | 53 | gazetteer.index(data_2) 54 | gazetteer.unindex(data_2) 55 | gazetteer.index(data_2) 56 | 57 | return gazetteer.search(data_1, n_matches=1, generator=True) 58 | 59 | 60 | def cli(): 61 | common.configure_logging() 62 | 63 | gaz = Gazetteer() 64 | gaz.setup(None) 65 | 66 | t0 = time.time() 67 | clustering = gaz.run(None, use_settings=True) 68 | elapsed = time.time() - t0 69 | 70 | print(gaz.make_report(clustering)) 71 | print(f"ran in {elapsed} seconds") 72 | 73 | 74 | if __name__ == "__main__": 75 | cli() 76 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical_matching.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | 5 | import dedupe 6 | from benchmarks import common 7 | 8 | 9 | def get_true_dupes(data): 10 | data_1, data_2 = data 11 | all_data = data_1.copy() 12 | all_data.update(data_2) 13 | return common.get_true_dupes(all_data) 14 | 15 | 16 | def make_report(data, clustering): 17 | true_dupes = get_true_dupes(data) 18 | predicted_dupes = {frozenset(pair) for pair, _ in clustering} 19 | return common.Report.from_scores(true_dupes, predicted_dupes) 20 | 21 | 22 | class Matching: 23 | settings_file = common.DATASETS_DIR / "canonical_data_matching_learned_settings" 24 | data_1_file = common.DATASETS_DIR / "restaurant-1.csv" 25 | data_2_file = common.DATASETS_DIR / "restaurant-2.csv" 26 | 27 | params = [ 28 | {"threshold": 0.5}, 29 | {"threshold": 0.5, "constraint": "many-to-one"}, 30 | ] 31 | param_names = ["kwargs"] 32 | 33 | def setup(self, kwargs): 34 | data_1 
= common.load_data(self.data_1_file) 35 | data_2 = common.load_data(self.data_2_file) 36 | 37 | self.data = (data_1, data_2) 38 | training_pairs = dedupe.training_data_link(data_1, data_2, "unique_id", 5000) 39 | self.training_pairs_filelike = io.StringIO() 40 | dedupe.serializer.write_training(training_pairs, self.training_pairs_filelike) 41 | self.training_pairs_filelike.seek(0) 42 | 43 | def run(self, kwargs, use_settings=False): 44 | data_1, data_2 = self.data 45 | deduper: dedupe.StaticRecordLink | dedupe.RecordLink 46 | 47 | if use_settings and os.path.exists(self.settings_file): 48 | with open(self.settings_file, "rb") as f: 49 | deduper = dedupe.StaticRecordLink(f) 50 | else: 51 | variables = [ 52 | dedupe.variables.String("name"), 53 | dedupe.variables.String("address"), 54 | dedupe.variables.String("cuisine"), 55 | dedupe.variables.String("city"), 56 | ] 57 | deduper = dedupe.RecordLink(variables) 58 | deduper.prepare_training( 59 | data_1, 60 | data_2, 61 | training_file=self.training_pairs_filelike, 62 | sample_size=10000, 63 | ) 64 | deduper.train() 65 | with open(self.settings_file, "wb") as f: 66 | deduper.write_settings(f) 67 | 68 | return deduper.join(data_1, data_2, **kwargs) 69 | 70 | def make_report(self, clustering): 71 | return make_report(self.data, clustering) 72 | 73 | def time_run(self, kwargs): 74 | return self.run(kwargs) 75 | 76 | def peakmem_run(self, kwargs): 77 | return self.run(kwargs) 78 | 79 | def track_precision(self, kwargs): 80 | return self.make_report(self.run(kwargs)).precision 81 | 82 | def track_recall(self, kwargs): 83 | return self.make_report(self.run(kwargs)).recall 84 | 85 | 86 | def cli(): 87 | common.configure_logging() 88 | 89 | m = Matching() 90 | for kwargs in m.params: 91 | m.setup(kwargs) 92 | print() 93 | print(f"running with kwargs: {kwargs}") 94 | t0 = time.time() 95 | clustering = m.run(kwargs=kwargs, use_settings=True) 96 | elapsed = time.time() - t0 97 | 98 | print(m.make_report(clustering)) 99 | print(f"ran in {elapsed} seconds") 100 | 101 | 102 | if __name__ == "__main__": 103 | cli() 104 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/common.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import optparse 4 | import re 5 | from dataclasses import dataclass 6 | from itertools import groupby 7 | from pathlib import Path 8 | 9 | DATASETS_DIR = Path(__file__).parent / "datasets" 10 | 11 | 12 | def pre_process(column): 13 | column = re.sub(" +", " ", column) 14 | column = re.sub("\n", " ", column) 15 | column = column.strip().strip('"').strip("'").lower() 16 | if not column: 17 | column = None 18 | return column 19 | 20 | 21 | def load_data(pathlike): 22 | data_d = {} 23 | with open(pathlike) as f: 24 | reader = csv.DictReader(f) 25 | for i, row in enumerate(reader): 26 | clean_row = {k: pre_process(v) for (k, v) in row.items()} 27 | data_d[str(pathlike) + str(i)] = clean_row 28 | 29 | return data_d 30 | 31 | 32 | def configure_logging() -> None: 33 | optp = optparse.OptionParser() 34 | optp.add_option( 35 | "-v", 36 | "--verbose", 37 | dest="verbose", 38 | action="count", 39 | help="Increase verbosity (specify multiple times for more)", 40 | ) 41 | opts, _ = optp.parse_args() 42 | log_level = logging.WARNING 43 | if opts.verbose: 44 | if opts.verbose == 1: 45 | log_level = logging.INFO 46 | elif opts.verbose >= 2: 47 | log_level = logging.DEBUG 48 | logging.basicConfig(level=log_level) 49 | 50 | 51 | 
def get_true_dupes(data: dict) -> set: 52 | duplicates = set() 53 | for _, pair in groupby( 54 | sorted(data.items(), key=lambda x: x[1]["unique_id"]), 55 | key=lambda x: x[1]["unique_id"], 56 | ): 57 | pair_l = list(pair) 58 | if len(pair_l) == 2: 59 | a, b = pair_l 60 | duplicates.add(frozenset((a[0], b[0]))) 61 | return duplicates 62 | 63 | 64 | @dataclass 65 | class Report: 66 | # TODO add more and replace calculations with sklearn 67 | n_true: int 68 | n_found: int 69 | precision: float 70 | recall: float 71 | 72 | @classmethod 73 | def from_scores(cls, true_dupes: set, found_dupes: set): 74 | true_positives = found_dupes.intersection(true_dupes) 75 | 76 | n_true = len(true_dupes) 77 | n_found = len(found_dupes) 78 | precision = len(true_positives) / n_found 79 | recall = len(true_positives) / n_true 80 | 81 | return cls(n_true, n_found, precision, recall) 82 | -------------------------------------------------------------------------------- /benchmarks/setup.py: -------------------------------------------------------------------------------- 1 | # Dummy file to allow editable installs 2 | from setuptools import find_packages, setup 3 | 4 | if __name__ == "__main__": 5 | setup( 6 | name="benchmarks", 7 | packages=find_packages(), 8 | package_data={ 9 | # If any package contains *.txt or *.json files, include them: 10 | "": ["*.csv"], 11 | # And include any files found in the 'mypackage/data' directory: 12 | "benchmarks": ["datasets/*"], 13 | }, 14 | ) 15 | -------------------------------------------------------------------------------- /dedupe/__init__.py: -------------------------------------------------------------------------------- 1 | from dedupe.api import ( # noqa: F401 2 | Dedupe, 3 | Gazetteer, 4 | RecordLink, 5 | StaticDedupe, 6 | StaticGazetteer, 7 | StaticRecordLink, 8 | ) 9 | from dedupe.convenience import ( # noqa: F401 10 | canonicalize, 11 | console_label, 12 | training_data_dedupe, 13 | training_data_link, 14 | ) 15 | from dedupe.serializer import read_training, write_training # noqa: F401 16 | 17 | __all__ = [ 18 | "Dedupe", 19 | "Gazetteer", 20 | "RecordLink", 21 | "StaticDedupe", 22 | "StaticGazetteer", 23 | "StaticRecordLink", 24 | "canonicalize", 25 | "console_label", 26 | "training_data_dedupe", 27 | "training_data_link", 28 | "read_training", 29 | "write_training", 30 | ] 31 | -------------------------------------------------------------------------------- /dedupe/_typing.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import ( 3 | TYPE_CHECKING, 4 | Any, 5 | Callable, 6 | Dict, 7 | FrozenSet, 8 | Iterable, 9 | Iterator, 10 | List, 11 | Literal, 12 | Mapping, 13 | MutableSequence, 14 | Protocol, 15 | Sequence, 16 | Tuple, 17 | Type, 18 | TypedDict, 19 | Union, 20 | runtime_checkable, 21 | ) 22 | 23 | import numpy 24 | import numpy.typing 25 | 26 | if TYPE_CHECKING: 27 | from dedupe.predicates import Predicate 28 | 29 | 30 | RecordDict = Mapping[str, Any] 31 | RecordID = Union[int, str] 32 | RecordIDDType = Union[Type[int], Tuple[Type[str], Literal[256]]] 33 | RecordIDPair = Union[Tuple[int, int], Tuple[str, str]] 34 | RecordInt = Tuple[int, RecordDict] 35 | RecordStr = Tuple[str, RecordDict] 36 | Record = Union[RecordInt, RecordStr] 37 | RecordPairInt = Tuple[RecordInt, RecordInt] 38 | RecordPairStr = Tuple[RecordStr, RecordStr] 39 | RecordPairs = Union[Iterator[RecordPairInt], Iterator[RecordPairStr]] 40 | BlockInt = List[RecordPairInt] 41 | BlockStr = List[RecordPairStr] 42 | Block = 
Union[RecordPairInt, RecordPairStr] 43 | BlocksInt = Iterator[BlockInt] 44 | BlocksStr = Iterator[BlockStr] 45 | Blocks = Union[BlocksInt, BlocksStr] 46 | ClusterInt = Tuple[ 47 | Tuple[int, ...], Union[numpy.typing.NDArray[numpy.float64], Tuple[float, ...]] 48 | ] 49 | ClusterStr = Tuple[ 50 | Tuple[str, ...], Union[numpy.typing.NDArray[numpy.float64], Tuple[float, ...]] 51 | ] 52 | ClustersInt = Iterable[ClusterInt] 53 | ClustersStr = Iterable[ClusterStr] 54 | Clusters = Union[ClustersInt, ClustersStr] 55 | 56 | DataInt = Mapping[int, RecordDict] 57 | DataStr = Mapping[str, RecordDict] 58 | Data = Union[DataInt, DataStr] 59 | 60 | RecordDictPair = Tuple[RecordDict, RecordDict] 61 | RecordDictPairs = List[RecordDictPair] 62 | ArrayLinks = Iterable[numpy.ndarray] 63 | TupleLinksInt = Iterable[Tuple[Tuple[int, int], float]] 64 | TupleLinksStr = Iterable[Tuple[Tuple[str, str], float]] 65 | TupleLinks = Union[TupleLinksInt, TupleLinksStr] 66 | Links = Union[ArrayLinks, TupleLinks] 67 | LookupResultsInt = Iterable[Tuple[int, Tuple[Tuple[int, float], ...]]] 68 | LookupResultsStr = Iterable[Tuple[str, Tuple[Tuple[str, float], ...]]] 69 | LookupResults = Union[LookupResultsInt, LookupResultsStr] 70 | JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"] 71 | Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]] 72 | CustomComparator = Callable[[Any, Any], Union[int, float]] 73 | Scores = Union[numpy.memmap, numpy.ndarray] 74 | Labels = List[Literal[0, 1]] 75 | LabelsLike = Iterable[Literal[0, 1]] 76 | Cover = Dict["Predicate", FrozenSet[int]] 77 | ComparisonCoverInt = Dict["Predicate", FrozenSet[Tuple[int, int]]] 78 | ComparisonCoverStr = Dict["Predicate", FrozenSet[Tuple[str, str]]] 79 | ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr] 80 | PredicateFunction = Callable[[Any], FrozenSet[str]] 81 | 82 | 83 | class TrainingData(TypedDict): 84 | match: MutableSequence[RecordDictPair] 85 | distinct: MutableSequence[RecordDictPair] 86 | 87 | 88 | # Takes pairs of records and generates a (n_samples X n_features) array 89 | FeaturizerFunction = Callable[ 90 | [Sequence[RecordDictPair]], numpy.typing.NDArray[numpy.float64] 91 | ] 92 | 93 | 94 | class Classifier(Protocol): 95 | """Takes an array of pairwise distances and computes the likelihood they are a pair.""" 96 | 97 | def fit(self, X: numpy.typing.NDArray[numpy.float64], y: LabelsLike) -> None: ... 98 | 99 | def predict_proba( 100 | self, X: numpy.typing.NDArray[numpy.float64] 101 | ) -> numpy.typing.NDArray[numpy.float64]: ... 102 | 103 | 104 | class ClosableJoinable(Protocol): 105 | def close(self) -> None: ... 106 | 107 | def join(self) -> None: ... 108 | 109 | 110 | class Variable(Protocol): 111 | name: str 112 | predicates: List["Predicate"] 113 | has_missing: bool 114 | 115 | def __len__(self) -> int: ... 
116 | 117 | 118 | @runtime_checkable 119 | class FieldVariable(Variable, Protocol): 120 | field: str 121 | comparator: Comparator 122 | 123 | 124 | class InteractionVariable(Variable, Protocol): 125 | interaction_fields: List[str] 126 | 127 | 128 | MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable] 129 | 130 | PathLike = Union[str, os.PathLike] 131 | -------------------------------------------------------------------------------- /dedupe/backport.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | if platform.system() == "Darwin": 4 | import multiprocessing 5 | 6 | ctx = multiprocessing.get_context("spawn") 7 | Queue = ctx.Queue 8 | Process = ctx.Process 9 | Pool = ctx.Pool 10 | SimpleQueue = ctx.SimpleQueue 11 | Lock = ctx.Lock 12 | RLock = ctx.RLock 13 | else: 14 | from multiprocessing import ( # type: ignore # noqa 15 | Lock, 16 | Pool, 17 | Process, 18 | Queue, 19 | RLock, 20 | SimpleQueue, 21 | ) 22 | -------------------------------------------------------------------------------- /dedupe/blocking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from __future__ import annotations 3 | 4 | import logging 5 | import time 6 | from collections import defaultdict 7 | from typing import TYPE_CHECKING 8 | 9 | if TYPE_CHECKING: 10 | from typing import ( 11 | Any, 12 | Callable, 13 | DefaultDict, 14 | Generator, 15 | Iterable, 16 | List, 17 | Sequence, 18 | Union, 19 | ) 20 | 21 | import dedupe.predicates 22 | from dedupe._typing import Data, Record, RecordID 23 | from dedupe.index import Index 24 | 25 | Docs = Union[Iterable[str], Iterable[Iterable[str]]] 26 | IndexList = DefaultDict[str, List[dedupe.predicates.IndexPredicate]] 27 | 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | def index_list() -> IndexList: 33 | return defaultdict(list) 34 | 35 | 36 | class Fingerprinter: 37 | """Takes in a record and returns all blocks that record belongs to""" 38 | 39 | def __init__(self, predicates: Iterable[dedupe.predicates.Predicate]) -> None: 40 | self.predicates = predicates 41 | 42 | self.index_fields: dict[str, IndexList] 43 | self.index_fields = defaultdict(index_list) 44 | """ 45 | A dictionary of all the fingerprinter methods that use an 46 | index of data field values. The keys are the field names, 47 | which can be useful to know for indexing the data. 48 | """ 49 | 50 | self.index_predicates = [] 51 | 52 | for full_predicate in predicates: 53 | for predicate in full_predicate: 54 | if hasattr(predicate, "index"): 55 | self.index_fields[predicate.field][predicate.type].append(predicate) 56 | self.index_predicates.append(predicate) 57 | 58 | def __call__( 59 | self, records: Iterable[Record], target: bool = False 60 | ) -> Generator[tuple[str, RecordID]]: 61 | """ 62 | Generate the predicates for records. Yields tuples of (predicate, 63 | record_id). 64 | 65 | Args: 66 | records: A sequence of tuples of (record_id, 67 | record_dict). Can often be created by 68 | `data_dict.items()`. 69 | target: Indicates whether the data should be treated as 70 | the target data. This effects the behavior of 71 | search predicates. If `target` is set to 72 | `True`, an search predicate will return the 73 | value itself. If `target` is set to `False` the 74 | search predicate will return all possible 75 | values within the specified search distance. 
76 | 77 | Let's say we have a 78 | `LevenshteinSearchPredicate` with an associated 79 | distance of `1` on a `"name"` field; and we 80 | have a record like `{"name": "thomas"}`. If the 81 | `target` is set to `True` then the predicate 82 | will return `"thomas"`. If `target` is set to 83 | `False`, then the blocker could return 84 | `"thomas"`, `"tomas"`, and `"thoms"`. By using 85 | the `target` argument on one of your datasets, 86 | you will dramatically reduce the total number 87 | of comparisons without a loss of accuracy. 88 | 89 | .. code:: python 90 | 91 | > data = [(1, {'name' : 'bob'}), (2, {'name' : 'suzanne'})] 92 | > blocked_ids = deduper.fingerprinter(data) 93 | > print list(blocked_ids) 94 | [('foo:1', 1), ..., ('bar:1', 100)] 95 | 96 | """ 97 | 98 | start_time = time.perf_counter() 99 | predicates = [ 100 | (":" + str(i), predicate) for i, predicate in enumerate(self.predicates) 101 | ] 102 | 103 | for i, record in enumerate(records): 104 | record_id, instance = record 105 | 106 | for pred_id, predicate in predicates: 107 | block_keys = predicate(instance, target=target) 108 | for block_key in block_keys: 109 | yield block_key + pred_id, record_id 110 | 111 | if i and i % 10000 == 0: 112 | logger.info( 113 | "%(iteration)d, %(elapsed)f2 seconds", 114 | {"iteration": i, "elapsed": time.perf_counter() - start_time}, 115 | ) 116 | 117 | def reset_indices(self) -> None: 118 | """ 119 | Fingerprinter indices can take up a lot of memory. If you are 120 | done with blocking, the method will reset the indices to free up. 121 | If you need to block again, the data will need to be re-indexed. 122 | """ 123 | for predicate in self.index_predicates: 124 | predicate.reset() 125 | 126 | def index(self, docs: Docs, field: str) -> None: 127 | """ 128 | Add docs to the indices used by fingerprinters. 129 | 130 | Some fingerprinter methods depend upon having an index of 131 | values that a field may have in the data. This method adds 132 | those values to the index. If you don't have any fingerprinter 133 | methods that use an index, this method will do nothing. 134 | 135 | Args: 136 | docs: an iterator of values from your data to index. While 137 | not required, it is recommended that docs be a unique 138 | set of of those values. Indexing can be an expensive 139 | operation. 140 | field: fieldname or key associated with the values you are 141 | indexing 142 | 143 | """ 144 | indices = extractIndices(self.index_fields[field]) 145 | 146 | for doc in docs: 147 | if doc: 148 | for _, index, preprocess in indices: 149 | index.index(preprocess(doc)) 150 | 151 | for index_type, index, _ in indices: 152 | index.initSearch() 153 | 154 | for predicate in self.index_fields[field][index_type]: 155 | logger.debug("Canopy: %s", str(predicate)) 156 | predicate.index = index 157 | predicate.bust_cache() 158 | 159 | def unindex(self, docs: Docs, field: str) -> None: 160 | """Remove docs from indices used by fingerprinters 161 | 162 | Args: 163 | docs: an iterator of values from your data to remove. While 164 | not required, it is recommended that docs be a unique 165 | set of of those values. Indexing can be an expensive 166 | operation. 
167 | field: fieldname or key associated with the values you are 168 | unindexing 169 | """ 170 | 171 | indices = extractIndices(self.index_fields[field]) 172 | 173 | for doc in docs: 174 | if doc: 175 | for _, index, preprocess in indices: 176 | try: 177 | index.unindex(preprocess(doc)) 178 | except KeyError: 179 | pass 180 | 181 | for index_type, index, _ in indices: 182 | index.initSearch() 183 | 184 | for predicate in self.index_fields[field][index_type]: 185 | logger.debug("Canopy: %s", str(predicate)) 186 | predicate.index = index 187 | predicate.bust_cache() 188 | 189 | def index_all(self, data: Data) -> None: 190 | for field in self.index_fields: 191 | unique_fields = {record[field] for record in data.values() if record[field]} 192 | self.index(unique_fields, field) 193 | 194 | 195 | def extractIndices( 196 | index_fields: IndexList, 197 | ) -> Sequence[tuple[str, Index, Callable[[Any], Any]]]: 198 | indices = [] 199 | for index_type, predicates in index_fields.items(): 200 | predicate = predicates[0] 201 | index = predicate.index 202 | preprocess = predicate.preprocess 203 | if predicate.index is None: 204 | index = predicate.initIndex() 205 | assert index is not None 206 | indices.append((index_type, index, preprocess)) 207 | 208 | return indices 209 | -------------------------------------------------------------------------------- /dedupe/branch_and_bound.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | from typing import Any, Collection, Iterable, Mapping, Tuple 5 | 6 | from ._typing import Cover 7 | from .predicates import Predicate 8 | 9 | Partial = Tuple[Predicate, ...] 10 | 11 | 12 | def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int: 13 | return len(frozenset.union(*dupe_cover.values())) if dupe_cover else 0 14 | 15 | 16 | def _remove_dominated(coverage: Cover, dominator: Predicate) -> Cover: 17 | dominant_cover = coverage[dominator] 18 | 19 | return { 20 | pred: cover 21 | for pred, cover in coverage.items() 22 | if not (dominator.cover_count <= pred.cover_count and dominant_cover >= cover) 23 | } 24 | 25 | 26 | def _uncovered_by( 27 | coverage: Mapping[Any, frozenset[int]], covered: frozenset[int] 28 | ) -> dict[Any, frozenset[int]]: 29 | return { 30 | pred: still_uncovered 31 | for pred, uncovered in coverage.items() 32 | if (still_uncovered := uncovered - covered) 33 | } 34 | 35 | 36 | def _order_by( 37 | candidates: Mapping[Predicate, Collection[Any]], p: Predicate 38 | ) -> tuple[int, float]: 39 | return (len(candidates[p]), -p.cover_count) 40 | 41 | 42 | def _score(partial: Iterable[Predicate]) -> float: 43 | return sum(p.cover_count for p in partial) 44 | 45 | 46 | def search(original_cover: Cover, target: int, calls: int) -> Partial: 47 | def _covered(partial: Partial) -> int: 48 | return ( 49 | len(frozenset.union(*(original_cover[p] for p in partial))) 50 | if partial 51 | else 0 52 | ) 53 | 54 | cheapest_score = float("inf") 55 | cheapest: Partial = () 56 | 57 | start: tuple[Cover, Partial] = (original_cover, ()) 58 | to_explore = [start] 59 | 60 | while to_explore and calls: 61 | candidates, partial = to_explore.pop() 62 | 63 | covered = _covered(partial) 64 | score = _score(partial) 65 | 66 | if covered < target: 67 | window = cheapest_score - score 68 | candidates = { 69 | p: cover for p, cover in candidates.items() if p.cover_count < window 70 | } 71 | 72 | reachable = _reachable(candidates) + covered 73 | 74 | if candidates and reachable >= 
target: 75 | order_by = functools.partial(_order_by, candidates) 76 | best = max(candidates, key=order_by) 77 | 78 | reduced = _remove_dominated(candidates, best) 79 | to_explore.append((reduced, partial)) 80 | 81 | remaining = _uncovered_by(candidates, candidates[best]) 82 | to_explore.append((remaining, partial + (best,))) 83 | 84 | elif score < cheapest_score: 85 | cheapest = partial 86 | cheapest_score = score 87 | 88 | calls -= 1 89 | 90 | return cheapest 91 | -------------------------------------------------------------------------------- /dedupe/canonical.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Sequence 2 | 3 | import numpy 4 | import numpy.typing 5 | from affinegap import normalizedAffineGapDistance as affine 6 | 7 | from dedupe._typing import Comparator, RecordDict 8 | 9 | 10 | def getCentroid(attribute_variants: Sequence[str], comparator: Comparator) -> str: 11 | """ 12 | Takes in a list of attribute values for a field, 13 | evaluates the centroid using the comparator, 14 | & returns the centroid (i.e. the 'best' value for the field) 15 | """ 16 | 17 | n = len(attribute_variants) 18 | 19 | distance_matrix = numpy.zeros([n, n]) 20 | 21 | # populate distance matrix by looping through elements of matrix triangle 22 | for i in range(0, n): 23 | for j in range(0, i): 24 | distance = comparator(attribute_variants[i], attribute_variants[j]) 25 | distance_matrix[i, j] = distance_matrix[j, i] = distance 26 | 27 | average_distance = distance_matrix.mean(0) 28 | 29 | # there can be ties for minimum, average distance string 30 | min_dist_indices: numpy.typing.NDArray[numpy.int_] 31 | min_dist_indices = numpy.where(average_distance == average_distance.min())[0] 32 | 33 | if len(min_dist_indices) > 1: 34 | centroid = breakCentroidTie(attribute_variants, min_dist_indices) 35 | else: 36 | centroid_index = min_dist_indices[0] 37 | centroid = attribute_variants[centroid_index] 38 | 39 | return centroid 40 | 41 | 42 | def breakCentroidTie( 43 | attribute_variants: Sequence[str], 44 | min_dist_indices: numpy.typing.NDArray[numpy.int_], 45 | ) -> str: 46 | """ 47 | Finds centroid when there are multiple values w/ min avg distance 48 | (e.g. 
any dupe cluster of 2) right now this selects the first 49 | among a set of ties, but can be modified to break ties in strings 50 | by selecting the longest string 51 | 52 | """ 53 | return attribute_variants[min_dist_indices[0]] 54 | 55 | 56 | def getCanonicalRep(record_cluster: Sequence[RecordDict]) -> Mapping[str, str]: 57 | """ 58 | Given a list of records within a duplicate cluster, constructs a 59 | canonical representation of the cluster by finding canonical 60 | values for each field 61 | 62 | """ 63 | canonical_rep = {} 64 | 65 | keys = record_cluster[0].keys() 66 | 67 | for key in keys: 68 | key_values = [] 69 | for record in record_cluster: 70 | # assume non-empty values always better than empty value 71 | # for canonical record 72 | if record.get(key): 73 | key_values.append(record[key]) 74 | if key_values: 75 | canonical_rep[key] = getCentroid(key_values, affine) 76 | else: 77 | canonical_rep[key] = "" 78 | 79 | return canonical_rep 80 | -------------------------------------------------------------------------------- /dedupe/canopy_index.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import math 5 | from typing import Iterable 6 | 7 | import numpy 8 | from BTrees.Length import Length 9 | from zope.index.text.cosineindex import CosineIndex 10 | from zope.index.text.lexicon import Lexicon 11 | from zope.index.text.setops import mass_weightedUnion 12 | from zope.index.text.textindex import TextIndex 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class CanopyIndex(TextIndex): # pragma: no cover 18 | def __init__(self) -> None: 19 | lexicon = CanopyLexicon() 20 | self.index = CosineIndex(lexicon) 21 | self.lexicon = lexicon 22 | 23 | def initSearch(self) -> None: 24 | N = len(self.index._docweight) 25 | threshold = int(max(1000, N * 0.05)) 26 | 27 | stop_words = [] 28 | self._wids_dict = {} 29 | 30 | bucket = self.index.family.IF.Bucket 31 | for wid, docs in self.index._wordinfo.items(): 32 | if len(docs) > threshold: 33 | stop_words.append(wid) 34 | continue 35 | 36 | if isinstance(docs, dict): 37 | docs = bucket(docs) 38 | self.index._wordinfo[wid] = docs 39 | 40 | idf = numpy.log1p(N / len(docs)) 41 | term = self.lexicon._words[wid] 42 | 43 | self._wids_dict[term] = (wid, idf) 44 | 45 | for wid in stop_words: 46 | word = self.lexicon._words.pop(wid) 47 | del self.lexicon._wids[word] 48 | logger.info(f"Removing stop word {word}") 49 | del self.index._wordinfo[wid] 50 | 51 | def apply( 52 | self, 53 | query_list: Iterable[str], 54 | threshold: float, 55 | start: int = 0, 56 | count: int | None = None, 57 | ) -> list[tuple[float, int]]: 58 | _wids_dict = self._wids_dict 59 | _wordinfo = self.index._wordinfo 60 | l_pow = float.__pow__ 61 | 62 | L = [] 63 | qw = 0.0 64 | 65 | for term in query_list: 66 | wid, weight = _wids_dict.get(term, (None, None)) 67 | if wid is None: 68 | continue 69 | docs = _wordinfo[wid] 70 | L.append((docs, weight)) 71 | qw += l_pow(weight, 2) 72 | 73 | results = mass_weightedUnion(L) 74 | 75 | qw = math.sqrt(qw) 76 | filtered_results: list[tuple[float, int]] = results.byValue(qw * threshold) 77 | 78 | return filtered_results 79 | 80 | 81 | class CanopyLexicon(Lexicon): # pragma: no cover 82 | def sourceToWordIds(self, last: list | None = None) -> list[int]: 83 | if last is None: 84 | last = [] 85 | if not isinstance(self.wordCount, Length): # type: ignore[has-type] 86 | self.wordCount = Length(self.wordCount()) # type: ignore[has-type] 87 
| self.wordCount._p_deactivate() 88 | return list(map(self._getWordIdCreate, last)) 89 | -------------------------------------------------------------------------------- /dedupe/cpredicates.pyx: -------------------------------------------------------------------------------- 1 | # cython: c_string_type=unicode, c_string_encoding=utf8, infertypes=True, language_level=3 2 | 3 | cpdef list ngrams(basestring field, int n): 4 | """ngrams returns all contiguous sequences of n characters 5 | of a given field. 6 | 7 | :param field: the string to be sequenced 8 | :param n: the number of characters to be included in each gram 9 | 10 | usage: 11 | >>> from dedupe.dedupe.predicated import ngrams 12 | >>> ngrams("deduplicate", 3) 13 | ['ded', 'edu', 'dup', 'upl', 'pli', 'lic', 'ica', 'cat', 'ate'] 14 | """ 15 | cdef unicode ufield = _ustring(field) 16 | 17 | cdef int i 18 | cdef int n_char = len(ufield) 19 | cdef int n_grams = n_char - n + 1 20 | cdef list grams = [ufield[i:i+n] for i in range(n_grams)] 21 | return grams 22 | 23 | 24 | cpdef frozenset unique_ngrams(basestring field, int n): 25 | """unique_ngrams returns all contiguous unique sequences of n characters 26 | of a given field. 27 | 28 | :param field: the string to be sequenced 29 | :param n: the number of characters to be included in each gram 30 | 31 | usage: 32 | >>> from dedupe.dedupe.predicated import unique_ngrams 33 | >>> unique_ngrams("mississippi", 2) 34 | {"mi", "is", "ss", "si", "ip", "pp", "pi"} 35 | """ 36 | cdef unicode ufield = _ustring(field) 37 | 38 | cdef int i 39 | cdef int n_char = len(ufield) 40 | cdef int n_grams = n_char - n + 1 41 | cdef set grams = {ufield[i:i+n] for i in range(n_grams)} 42 | return frozenset(grams) 43 | 44 | 45 | cpdef frozenset initials(basestring field, int n): 46 | """returns a tuple containing the first n chars of a field. 47 | The whole field is returned if n is greater than the field length. 48 | 49 | :param field: the string 50 | :type n: int 51 | 52 | usage: 53 | >>> initials("dedupe", 7) 54 | ('dedupe', ) 55 | >>> initials("deduplication", 7) 56 | ('dedupli', ) 57 | """ 58 | cdef unicode ufield = _ustring(field) 59 | 60 | return frozenset((ufield[:n],)) 61 | 62 | 63 | cdef unicode _ustring(basestring s): 64 | if type(s) is unicode: 65 | # fast path for most common case(s) 66 | return s 67 | else : # safe because of basestring 68 | return s 69 | -------------------------------------------------------------------------------- /dedupe/datamodel.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copyreg 4 | import types 5 | from collections.abc import Mapping 6 | from typing import TYPE_CHECKING, cast 7 | 8 | import numpy 9 | 10 | from dedupe._typing import FieldVariable 11 | from dedupe.variables.interaction import InteractionType 12 | 13 | if TYPE_CHECKING: 14 | from typing import Collection, Generator, Iterable, Sequence 15 | 16 | from dedupe._typing import ( 17 | Comparator, 18 | InteractionVariable, 19 | RecordDict, 20 | RecordDictPair, 21 | Variable, 22 | ) 23 | from dedupe.predicates import Predicate 24 | 25 | 26 | class DataModel: 27 | version = 2 28 | 29 | def __init__(self, variable_definitions: Collection[Variable]): 30 | for item in variable_definitions: 31 | if isinstance(item, Mapping): 32 | raise ValueError( 33 | "It looks like you are trying to use a variable definition " 34 | "composed of dictionaries. dedupe 3.0 uses variable objects " 35 | 'directly. 
So instead of [{"field": "name", "type": "String"}] ' 36 | 'we now do [dedupe.variables.String("name")].' 37 | ) 38 | 39 | variable_definitions = list(variable_definitions) 40 | if not variable_definitions: 41 | raise ValueError("The variable definitions cannot be empty") 42 | if not any(variable.predicates for variable in variable_definitions): 43 | raise ValueError( 44 | "At least one of the variable types needs to be a type" 45 | "other than 'Custom'. 'Custom' types have no associated" 46 | "blocking rules" 47 | ) 48 | 49 | # This is a protocol check, not a class inheritance check 50 | self.field_variables: list[FieldVariable] = [ 51 | variable 52 | for variable in variable_definitions 53 | if isinstance(variable, FieldVariable) 54 | ] 55 | 56 | # we need to keep track of ordering of variables because in 57 | # order to calculate derived fields like interaction and missing 58 | # data fields. 59 | columns: list[Variable] = [] 60 | for variable in self.field_variables: 61 | if len(variable) == 1: 62 | columns.append(variable) 63 | elif len(variable) > 1: 64 | assert hasattr(variable, "higher_vars") 65 | columns.extend(variable.higher_vars) 66 | 67 | self._derived_start = len(columns) 68 | 69 | # i'm not really satisfied with how we are dealing with interactions 70 | # here. seems like there should be a cleaner path, but i don't see it 71 | # today 72 | columns += interactions(variable_definitions, self.field_variables) 73 | 74 | self._missing_field_indices = missing_field_indices(columns) 75 | self._interaction_indices = interaction_indices(columns) 76 | 77 | self._len = len(columns) + len(self._missing_field_indices) 78 | 79 | def __len__(self) -> int: 80 | return self._len 81 | 82 | # Changing this from a property to just a normal attribute causes 83 | # pickling problems, because we are removing static methods from 84 | # their class context. 
This could be fixed by defining comparators 85 | # outside of classes in fieldclasses 86 | @property 87 | def _field_comparators( 88 | self, 89 | ) -> Generator[tuple[str, Comparator, int, int]]: 90 | start = 0 91 | stop = 0 92 | for var in self.field_variables: 93 | stop = start + len(var) 94 | comparator = cast("Comparator", var.comparator) 95 | yield (var.field, comparator, start, stop) 96 | start = stop 97 | 98 | @property 99 | def predicates(self) -> set[Predicate]: 100 | predicates = set() 101 | for var in self.field_variables: 102 | for predicate in var.predicates: 103 | predicates.add(predicate) 104 | return predicates 105 | 106 | def distances( 107 | self, record_pairs: Sequence[RecordDictPair] 108 | ) -> numpy.typing.NDArray[numpy.float64]: 109 | num_records = len(record_pairs) 110 | 111 | distances = numpy.empty((num_records, len(self)), "f4") 112 | 113 | for i, (record_1, record_2) in enumerate(record_pairs): 114 | for field, compare, start, stop in self._field_comparators: 115 | if record_1[field] is not None and record_2[field] is not None: 116 | distances[i, start:stop] = compare(record_1[field], record_2[field]) 117 | elif hasattr(compare, "missing"): 118 | distances[i, start:stop] = compare(record_1[field], record_2[field]) 119 | else: 120 | distances[i, start:stop] = numpy.nan 121 | 122 | distances = self._add_derived_distances(distances) 123 | 124 | return distances 125 | 126 | def _add_derived_distances( 127 | self, distances: numpy.typing.NDArray[numpy.float64] 128 | ) -> numpy.typing.NDArray[numpy.float64]: 129 | current_column = self._derived_start 130 | 131 | for indices in self._interaction_indices: 132 | distances[:, current_column] = numpy.prod(distances[:, indices], axis=1) 133 | current_column += 1 134 | 135 | is_missing = numpy.isnan(distances[:, :current_column]) 136 | 137 | distances[:, :current_column][is_missing] = 0 138 | 139 | if self._missing_field_indices: 140 | distances[:, current_column:] = ( 141 | 1 - is_missing[:, self._missing_field_indices] 142 | ) 143 | 144 | return distances 145 | 146 | def check(self, record: RecordDict) -> None: 147 | for field, _, _, _ in self._field_comparators: 148 | if field not in record: 149 | raise ValueError( 150 | "Records do not line up with data model. 
" 151 | "The field '%s' is in data_model but not " 152 | "in a record" % field 153 | ) 154 | 155 | def __getstate__(self): 156 | d = self.__dict__ 157 | d["object_version"] = self.version 158 | return d 159 | 160 | def __setstate__(self, d): 161 | version = d.pop("object_version", None) 162 | if version is None and "_variables" in d: 163 | d["_len"] = len(d.pop("_variables")) 164 | d["primary_variables"] = d.pop("primary_fields") 165 | elif version == 1: 166 | d["field_variables"] = d.pop("primary_variables") 167 | 168 | self.__dict__ = d 169 | 170 | 171 | def interactions( 172 | variables: Iterable[Variable], primary_variables: Iterable[FieldVariable] 173 | ) -> list[InteractionVariable]: 174 | field_d = {field.name: field for field in primary_variables} 175 | 176 | interactions: list[InteractionVariable] = [] 177 | for variable in variables: 178 | if isinstance(variable, InteractionType): 179 | variable.expandInteractions(field_d) 180 | interactions.extend(variable.higher_vars) 181 | return interactions 182 | 183 | 184 | def missing_field_indices(variables: list[Variable]) -> list[int]: 185 | return [i for i, var in enumerate(variables) if var.has_missing] 186 | 187 | 188 | def interaction_indices(variables: list[Variable]) -> list[list[int]]: 189 | var_names = [var.name for var in variables] 190 | indices = [] 191 | for var in variables: 192 | if hasattr(var, "interaction_fields"): 193 | interaction_indices = [var_names.index(f) for f in var.interaction_fields] 194 | indices.append(interaction_indices) 195 | return indices 196 | 197 | 198 | def reduce_method(m): 199 | return (getattr, (m.__self__, m.__func__.__name__)) 200 | 201 | 202 | copyreg.pickle(types.MethodType, reduce_method) 203 | -------------------------------------------------------------------------------- /dedupe/index.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from typing import MutableMapping, Tuple 8 | 9 | Doc = Tuple[str, ...] 
10 | 11 | 12 | class Index(ABC): 13 | _doc_to_id: MutableMapping[Doc, int] 14 | 15 | @abstractmethod 16 | def __init__(self) -> None: # pragma: no cover 17 | pass 18 | 19 | @abstractmethod 20 | def index(self, doc: Doc) -> None: # pragma: no cover 21 | pass 22 | 23 | @abstractmethod 24 | def unindex(self, doc: Doc) -> None: # pragma: no cover 25 | pass 26 | 27 | @abstractmethod # pragma: no cover 28 | def search(self, doc: Doc, threshold: int | float = 0) -> list[int]: 29 | pass 30 | 31 | @abstractmethod 32 | def initSearch(self) -> None: # pragma: no cover 33 | pass 34 | -------------------------------------------------------------------------------- /dedupe/levenshtein.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import Levenshtein_search 4 | 5 | from .core import Enumerator 6 | from .index import Index 7 | 8 | 9 | class LevenshteinIndex(Index): 10 | _doc_to_id: Dict[str, int] # type: ignore[assignment] 11 | 12 | def __init__(self) -> None: 13 | self.index_key = Levenshtein_search.populate_wordset(-1, []) 14 | self._doc_to_id = Enumerator(start=1) 15 | 16 | def index(self, doc: str) -> None: # type: ignore[override] 17 | if doc not in self._doc_to_id: 18 | self._doc_to_id[doc] 19 | Levenshtein_search.add_string(self.index_key, doc) 20 | 21 | def unindex(self, doc: str) -> None: # type: ignore[override] 22 | del self._doc_to_id[doc] 23 | Levenshtein_search.clear_wordset(self.index_key) 24 | self.index_key = Levenshtein_search.populate_wordset(-1, list(self._doc_to_id)) 25 | 26 | def initSearch(self) -> None: 27 | pass 28 | 29 | def search(self, doc: str, threshold: int = 0) -> List[int]: # type: ignore[override] 30 | matching_docs = Levenshtein_search.lookup(self.index_key, doc, threshold) 31 | if matching_docs: 32 | return [self._doc_to_id[match] for match, _, _ in matching_docs] 33 | else: 34 | return [] 35 | 36 | def __del__(self) -> None: 37 | Levenshtein_search.clear_wordset(self.index_key) 38 | -------------------------------------------------------------------------------- /dedupe/predicate_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | from itertools import chain 3 | from math import copysign, floor, log10 4 | from typing import Any, FrozenSet, Sequence, Tuple, Union 5 | 6 | from doublemetaphone import doublemetaphone 7 | 8 | # This allows to import all cpredicate functions from this module. 
9 | from dedupe.cpredicates import initials, ngrams, unique_ngrams # noqa: F401 10 | 11 | words = re.compile(r"[\w']+").findall 12 | integers = re.compile(r"\d+").findall 13 | start_word = re.compile(r"^([\w']+)").match 14 | two_start_words = re.compile(r"^([\w']+\W+[\w']+)").match 15 | start_integer = re.compile(r"^(\d+)").match 16 | alpha_numeric = re.compile(r"(?=[a-zA-Z]*\d)[a-zA-Z\d]+").findall 17 | 18 | 19 | def wholeFieldPredicate(field: str) -> FrozenSet[str]: 20 | """return the whole field as a string""" 21 | return frozenset((str(field),)) 22 | 23 | 24 | def tokenFieldPredicate(field: str) -> FrozenSet[str]: 25 | """returns the tokens""" 26 | return frozenset(words(field)) 27 | 28 | 29 | def firstTokenPredicate(field: str) -> FrozenSet[str]: 30 | first_token = start_word(field) 31 | if first_token: 32 | return frozenset(first_token.groups()) 33 | else: 34 | return frozenset() 35 | 36 | 37 | def firstTwoTokensPredicate(field: str) -> FrozenSet[str]: 38 | first_two_tokens = two_start_words(field) 39 | if first_two_tokens: 40 | return frozenset(first_two_tokens.groups()) 41 | else: 42 | return frozenset() 43 | 44 | 45 | def commonIntegerPredicate(field: str) -> FrozenSet[str]: 46 | """return any integers""" 47 | 48 | # `str(int(i))` removes leading zeros, e.g. `str(int("0001")) = "1"` 49 | return frozenset(str(int(i)) for i in integers(field)) 50 | 51 | 52 | def alphaNumericPredicate(field: str) -> FrozenSet[str]: 53 | return frozenset(alpha_numeric(field)) 54 | 55 | 56 | def nearIntegersPredicate(field: str) -> FrozenSet[str]: 57 | """for any integer N in field return the integers N-1, N and N+1""" 58 | string_ints = integers(field) 59 | near_ints = set() 60 | for s in string_ints: 61 | num = int(s) 62 | near_ints.add(str(num - 1)) 63 | near_ints.add(str(num)) 64 | near_ints.add(str(num + 1)) 65 | 66 | return frozenset(near_ints) 67 | 68 | 69 | def hundredIntegerPredicate(field: str) -> FrozenSet[str]: 70 | return frozenset(str(int(i))[:-2] + "00" for i in integers(field)) 71 | 72 | 73 | def hundredIntegersOddPredicate(field: str) -> FrozenSet[str]: 74 | return frozenset(str(int(i))[:-2] + "0" + str(int(i) % 2) for i in integers(field)) 75 | 76 | 77 | def firstIntegerPredicate(field: str) -> FrozenSet[str]: 78 | first_token = start_integer(field) 79 | if first_token: 80 | return frozenset(first_token.groups()) 81 | else: 82 | return frozenset() 83 | 84 | 85 | def ngramsTokens(field: Sequence[Any], n: int) -> FrozenSet[str]: 86 | grams = set() 87 | n_tokens = len(field) 88 | for i in range(n_tokens): 89 | for j in range(i + n, min(n_tokens, i + n) + 1): 90 | grams.add(" ".join(str(tok) for tok in field[i:j])) 91 | return frozenset(grams) 92 | 93 | 94 | def commonTwoTokens(field: str) -> FrozenSet[str]: 95 | return ngramsTokens(field.split(), 2) 96 | 97 | 98 | def commonThreeTokens(field: str) -> FrozenSet[str]: 99 | return ngramsTokens(field.split(), 3) 100 | 101 | 102 | def fingerprint(field: str) -> FrozenSet[str]: 103 | return frozenset(("".join(sorted(field.split())),)) 104 | 105 | 106 | def oneGramFingerprint(field: str) -> FrozenSet[str]: 107 | return frozenset(("".join(sorted({*field.replace(" ", "")})),)) 108 | 109 | 110 | def twoGramFingerprint(field: str) -> FrozenSet[str]: 111 | if len(field) > 1: 112 | return frozenset(("".join(sorted(unique_ngrams(field.replace(" ", ""), 2))),)) 113 | else: 114 | return frozenset() 115 | 116 | 117 | def commonFourGram(field: str) -> FrozenSet[str]: 118 | """return 4-grams""" 119 | return frozenset(unique_ngrams(field.replace(" ", ""), 
4)) 120 | 121 | 122 | def commonSixGram(field: str) -> FrozenSet[str]: 123 | """return 6-grams""" 124 | return frozenset(unique_ngrams(field.replace(" ", ""), 6)) 125 | 126 | 127 | def sameThreeCharStartPredicate(field: str) -> FrozenSet[str]: 128 | """return first three characters""" 129 | return frozenset(initials(field.replace(" ", ""), 3)) 130 | 131 | 132 | def sameFiveCharStartPredicate(field: str) -> FrozenSet[str]: 133 | """return first five characters""" 134 | return frozenset(initials(field.replace(" ", ""), 5)) 135 | 136 | 137 | def sameSevenCharStartPredicate(field: str) -> FrozenSet[str]: 138 | """return first seven characters""" 139 | return frozenset(initials(field.replace(" ", ""), 7)) 140 | 141 | 142 | def suffixArray(field: str) -> FrozenSet[str]: 143 | n = len(field) - 4 144 | if n > 0: 145 | return frozenset(field[i:] for i in range(0, n)) 146 | else: 147 | return frozenset() 148 | 149 | 150 | def sortedAcronym(field: str) -> FrozenSet[str]: 151 | return frozenset(("".join(sorted(each[0] for each in field.split())),)) 152 | 153 | 154 | def doubleMetaphone(field: str) -> FrozenSet[str]: 155 | return frozenset(metaphone for metaphone in doublemetaphone(field) if metaphone) 156 | 157 | 158 | def metaphoneToken(field: str) -> FrozenSet[str]: 159 | return frozenset( 160 | metaphone_token 161 | for metaphone_token in chain( 162 | *(doublemetaphone(token) for token in field.split()) 163 | ) 164 | if metaphone_token 165 | ) 166 | 167 | 168 | def wholeSetPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 169 | return frozenset((str(field_set),)) 170 | 171 | 172 | def commonSetElementPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 173 | """return set as individual elements""" 174 | 175 | return frozenset(str(item) for item in field_set) 176 | 177 | 178 | def commonTwoElementsPredicate(field: Sequence[Any]) -> FrozenSet[str]: 179 | return ngramsTokens(sorted(field), 2) 180 | 181 | 182 | def commonThreeElementsPredicate(field: Sequence[Any]) -> FrozenSet[str]: 183 | return ngramsTokens(sorted(field), 3) 184 | 185 | 186 | def lastSetElementPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 187 | return frozenset((str(max(field_set)),)) 188 | 189 | 190 | def firstSetElementPredicate(field_set: Sequence[Any]) -> FrozenSet[str]: 191 | return frozenset((str(min(field_set)),)) 192 | 193 | 194 | def magnitudeOfCardinality(field_set: Sequence[Any]) -> FrozenSet[str]: 195 | return orderOfMagnitude(len(field_set)) 196 | 197 | 198 | def latLongGridPredicate(field: Tuple[float], digits: int = 1) -> FrozenSet[str]: 199 | """ 200 | Given a lat / long pair, return the grid coordinates at the 201 | nearest base value. e.g., (42.3, -5.4) returns a grid at 0.1 202 | degree resolution of 0.1 degrees of latitude ~ 7km, so this is 203 | effectively a 14km lat grid. This is imprecise for longitude, 204 | since 1 degree of longitude is 0km at the poles, and up to 111km 205 | at the equator. But it should be reasonably precise given some 206 | prior logical block (e.g., country). 
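    Illustrative example (hypothetical coordinates, with the default digits=1):

        >>> latLongGridPredicate((42.37, -87.91))
        frozenset({'(42.4, -87.9)'})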
207 | """ 208 | if any(field): 209 | return frozenset((str(tuple(round(dim, digits) for dim in field)),)) 210 | else: 211 | return frozenset() 212 | 213 | 214 | def orderOfMagnitude(field: Union[int, float]) -> FrozenSet[str]: 215 | if field > 0: 216 | return frozenset((str(int(round(log10(field)))),)) 217 | else: 218 | return frozenset() 219 | 220 | 221 | # Thanks to http://stackoverflow.com/questions/3410976/how-to-round-a-number-to-significant-figures-in-python 222 | def roundTo1(field: float) -> FrozenSet[str]: 223 | abs_num = abs(field) 224 | order = int(floor(log10(abs_num))) 225 | rounded = round(abs_num, -order) 226 | return frozenset((str(int(copysign(rounded, field))),)) 227 | -------------------------------------------------------------------------------- /dedupe/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/dedupe/py.typed -------------------------------------------------------------------------------- /dedupe/serializer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Iterator, TextIO 3 | 4 | from dedupe._typing import TrainingData 5 | 6 | 7 | def _from_json(json_object: Any) -> Any: 8 | if "__class__" in json_object: 9 | if json_object["__class__"] == "frozenset": 10 | return frozenset(json_object["__value__"]) 11 | if json_object["__class__"] == "tuple": 12 | return tuple(json_object["__value__"]) 13 | return json_object 14 | 15 | 16 | def hint_tuples(item: Any) -> Any: 17 | if isinstance(item, tuple): 18 | return {"__class__": "tuple", "__value__": [hint_tuples(e) for e in item]} 19 | if isinstance(item, list): 20 | return [hint_tuples(e) for e in item] 21 | if isinstance(item, dict): 22 | return {key: hint_tuples(value) for key, value in item.items()} 23 | else: 24 | return item 25 | 26 | 27 | class TupleEncoder(json.JSONEncoder): 28 | def encode(self, obj: Any) -> Any: 29 | return super().encode(hint_tuples(obj)) 30 | 31 | def iterencode(self, obj: Any, _one_shot: bool = False) -> Iterator[str]: 32 | return super().iterencode(hint_tuples(obj)) 33 | 34 | def default(self, python_object: Any) -> Any: 35 | if isinstance(python_object, frozenset): 36 | return {"__class__": "frozenset", "__value__": list(python_object)} 37 | return super().default(python_object) 38 | 39 | 40 | def read_training(training_file: TextIO) -> Any: 41 | """ 42 | Read training from previously built training data file object 43 | 44 | Args: 45 | training_file: file object containing the training data 46 | 47 | Returns: 48 | A dictionary with two keys, `match` and `distinct`. See the inverse, 49 | :func:`write_training`. 50 | """ 51 | return json.load(training_file, object_hook=_from_json) 52 | 53 | 54 | def write_training(labeled_pairs: TrainingData, file_obj: TextIO) -> None: 55 | """ 56 | Write a JSON file that contains labeled examples 57 | 58 | Args: 59 | labeled_pairs: A dictionary with two keys, `match` and `distinct`. 60 | The values are lists that can contain pairs of records 61 | file_obj: file object to write training data to 62 | 63 | .. 
code:: python 64 | 65 | examples = { 66 | "match": [ 67 | ({'name' : 'Georgie Porgie'}, {'name' : 'George Porgie'}), 68 | ], 69 | "distinct": [ 70 | ({'name' : 'Georgie Porgie'}, {'name' : 'Georgette Porgette'}), 71 | ], 72 | } 73 | with open('training.json', 'w') as f: 74 | dedupe.write_training(examples, f) 75 | 76 | """ 77 | json.dump(labeled_pairs, file_obj, cls=TupleEncoder, ensure_ascii=True) 78 | -------------------------------------------------------------------------------- /dedupe/tfidf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import logging 3 | from typing import List, Tuple 4 | 5 | from dedupe.canopy_index import CanopyIndex 6 | from dedupe.core import Enumerator 7 | from dedupe.index import Index 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | Doc = Tuple[str, ...] 12 | 13 | 14 | class TfIdfIndex(Index): 15 | def __init__(self) -> None: 16 | self._index = CanopyIndex() 17 | self._doc_to_id = Enumerator(start=1) 18 | self._parseTerms = self._index.lexicon.parseTerms 19 | 20 | def index(self, doc: Doc) -> None: 21 | if doc not in self._doc_to_id: 22 | i = self._doc_to_id[doc] 23 | self._index.index_doc(i, doc) 24 | 25 | def unindex(self, doc) -> None: 26 | i = self._doc_to_id.pop(doc) 27 | self._index.unindex_doc(i) 28 | self.initSearch() 29 | 30 | def initSearch(self) -> None: 31 | self._index.initSearch() 32 | 33 | def search(self, doc: Doc, threshold: float = 0) -> List[int]: 34 | query_list = self._parseTerms(doc) 35 | 36 | if query_list: 37 | results = [ 38 | center for score, center in self._index.apply(query_list, threshold) 39 | ] 40 | else: 41 | results = [] 42 | 43 | return results 44 | -------------------------------------------------------------------------------- /dedupe/variables/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CustomType as Custom 2 | from .categorical_type import CategoricalType as Categorical 3 | from .exact import ExactType as Exact 4 | from .exists import ExistsType as Exists 5 | from .interaction import InteractionType as Interaction 6 | from .latlong import LatLongType as LatLong 7 | from .price import PriceType as Price 8 | from .set import SetType as Set 9 | from .string import ShortStringType as ShortString 10 | from .string import StringType as String 11 | from .string import TextType as Text 12 | 13 | __all__ = [ 14 | "Custom", 15 | "Categorical", 16 | "Exact", 17 | "Exists", 18 | "Interaction", 19 | "LatLong", 20 | "Price", 21 | "Set", 22 | "ShortString", 23 | "String", 24 | "Text", 25 | ] 26 | -------------------------------------------------------------------------------- /dedupe/variables/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from dedupe import predicates 6 | 7 | if TYPE_CHECKING: 8 | from typing import Any, ClassVar, Iterable, Sequence 9 | 10 | from dedupe._typing import Comparator, CustomComparator, PredicateFunction 11 | from dedupe._typing import Variable as VariableProtocol 12 | 13 | 14 | class Variable: 15 | name: str 16 | type: ClassVar[str] 17 | predicates: list[predicates.Predicate] 18 | higher_vars: Sequence[VariableProtocol] 19 | 20 | def __len__(self) -> int: 21 | return 1 22 | 23 | def __repr__(self) -> str: 24 | return self.name 25 | 26 | def __hash__(self) -> int: 27 | return hash(self.name) 28 | 29 | def __eq__(self, other: Any) -> bool: 
30 | other_name: str = other.name 31 | return self.name == other_name 32 | 33 | def __init__(self, has_missing: bool = False): 34 | self.has_missing = has_missing 35 | 36 | def __getstate__(self) -> dict[str, Any]: 37 | odict = self.__dict__.copy() 38 | odict["predicates"] = None 39 | 40 | return odict 41 | 42 | 43 | class DerivedType(Variable): 44 | type = "Derived" 45 | 46 | def __init__(self, name: str, var_type: str, **kwargs): 47 | self.name = f"({str(name)}: {str(var_type)})" 48 | super().__init__(**kwargs) 49 | 50 | 51 | class FieldType(Variable): 52 | _index_thresholds: Sequence[float] = [] 53 | _index_predicates: Sequence[type[predicates.IndexPredicate]] = [] 54 | _predicate_functions: Sequence[PredicateFunction] = () 55 | _Predicate: type[predicates.SimplePredicate] = predicates.SimplePredicate 56 | comparator: Comparator 57 | 58 | def __init__(self, field: str, name: str | None = None, has_missing: bool = False): 59 | self.field = field 60 | 61 | if name is None: 62 | self.name = f"({self.field}: {self.type})" 63 | else: 64 | self.name = name 65 | 66 | self.predicates = [ 67 | self._Predicate(pred, self.field) for pred in self._predicate_functions 68 | ] 69 | 70 | self.predicates += indexPredicates( 71 | self._index_predicates, self._index_thresholds, self.field 72 | ) 73 | 74 | self.has_missing = has_missing 75 | if self.has_missing: 76 | exists_pred = predicates.ExistsPredicate(self.field) 77 | self.predicates.append(exists_pred) 78 | 79 | 80 | class CustomType(FieldType): 81 | type = "Custom" 82 | 83 | def __init__( 84 | self, 85 | field: str, 86 | comparator: CustomComparator, 87 | name: str | None = None, 88 | **kwargs, 89 | ): 90 | super().__init__(field, **kwargs) 91 | 92 | if comparator is None: 93 | raise ValueError( 94 | "You must define a comparator function for the Custom class" 95 | ) 96 | else: 97 | self.comparator = comparator 98 | 99 | if name is None: 100 | self.name = f"({self.field}: {self.type}, {self.comparator.__name__})" 101 | else: 102 | self.name = name 103 | 104 | 105 | def indexPredicates( 106 | predicates: Iterable[type[predicates.IndexPredicate]], 107 | thresholds: Sequence[float], 108 | field: str, 109 | ) -> list[predicates.IndexPredicate]: 110 | index_predicates = [] 111 | for predicate in predicates: 112 | for threshold in thresholds: 113 | index_predicates.append(predicate(threshold, field)) 114 | 115 | return index_predicates 116 | -------------------------------------------------------------------------------- /dedupe/variables/categorical_type.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Sequence 4 | 5 | from categorical import CategoricalComparator 6 | 7 | from dedupe import predicates 8 | from dedupe._typing import PredicateFunction 9 | from dedupe.variables.base import DerivedType, FieldType 10 | 11 | 12 | class CategoricalType(FieldType): 13 | type = "Categorical" 14 | _predicate_functions: list[PredicateFunction] = [predicates.wholeFieldPredicate] 15 | 16 | def __init__(self, field: str, categories: Sequence[str], **kwargs): 17 | super().__init__(field, **kwargs) 18 | 19 | self.comparator = CategoricalComparator(categories) # type: ignore[assignment] 20 | 21 | self.higher_vars = [] 22 | for higher_var in self.comparator.dummy_names: # type: ignore[attr-defined] 23 | dummy_var = DerivedType(higher_var, "Dummy", has_missing=False) 24 | self.higher_vars.append(dummy_var) 25 | 26 | def __len__(self) -> int: 27 | return 
len(self.higher_vars) 28 | -------------------------------------------------------------------------------- /dedupe/variables/exact.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from dedupe import predicates 4 | from dedupe.variables.base import FieldType 5 | 6 | 7 | class ExactType(FieldType): 8 | _predicate_functions = [predicates.wholeFieldPredicate] 9 | type = "Exact" 10 | 11 | @staticmethod 12 | def comparator(field_1: Any, field_2: Any) -> int: 13 | if field_1 == field_2: 14 | return 1 15 | else: 16 | return 0 17 | -------------------------------------------------------------------------------- /dedupe/variables/exists.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from categorical import CategoricalComparator 6 | 7 | from dedupe._typing import PredicateFunction 8 | from dedupe.variables.base import DerivedType, FieldType 9 | 10 | 11 | class ExistsType(FieldType): 12 | type = "Exists" 13 | _predicate_functions: list[PredicateFunction] = [] 14 | 15 | def __init__(self, field: str, **kwargs): 16 | super().__init__(field, **kwargs) 17 | 18 | self.cat_comparator = CategoricalComparator([0, 1]) 19 | 20 | self.higher_vars = [] 21 | for higher_var in self.cat_comparator.dummy_names: 22 | dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing) 23 | self.higher_vars.append(dummy_var) 24 | 25 | def comparator(self, field_1: Any, field_2: Any) -> list[int]: 26 | if field_1 and field_2: 27 | return self.cat_comparator(1, 1) 28 | elif field_1 or field_2: 29 | return self.cat_comparator(0, 1) 30 | else: 31 | return self.cat_comparator(0, 0) 32 | 33 | def __len__(self) -> int: 34 | return len(self.higher_vars) 35 | 36 | # This flag tells fieldDistances in dedupe.core to pass 37 | # missing values (None) into the comparator 38 | comparator.missing = True # type: ignore 39 | -------------------------------------------------------------------------------- /dedupe/variables/interaction.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import itertools 4 | from typing import Mapping 5 | 6 | from dedupe._typing import FieldVariable, InteractionVariable 7 | from dedupe.variables.base import Variable 8 | 9 | 10 | class InteractionType(Variable): 11 | type = "Interaction" 12 | higher_vars: list[InteractionVariable] 13 | 14 | def __init__(self, *args: str, **kwargs): 15 | self.interactions = list(args) 16 | 17 | self.name = "(Interaction: %s)" % str(self.interactions) 18 | self.interaction_fields = self.interactions 19 | 20 | super().__init__(**kwargs) 21 | 22 | def expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None: 23 | self.interaction_fields = self.atomicInteractions( 24 | self.interactions, field_model 25 | ) 26 | for field in self.interaction_fields: 27 | if field_model[field].has_missing: 28 | self.has_missing = True 29 | 30 | self.categorical(field_model) 31 | 32 | def categorical(self, field_model: Mapping[str, FieldVariable]) -> None: 33 | categoricals = [ 34 | field 35 | for field in self.interaction_fields 36 | if hasattr(field_model[field], "higher_vars") 37 | ] 38 | noncategoricals = [ 39 | field 40 | for field in self.interaction_fields 41 | if not hasattr(field_model[field], "higher_vars") 42 | ] 43 | 44 | dummies = [field_model[field].higher_vars for field in categoricals] # type: 
ignore[attr-defined] 45 | 46 | self.higher_vars = [] 47 | for combo in itertools.product(*dummies): 48 | var_names = [field.name for field in combo] + noncategoricals 49 | higher_var = InteractionType(*var_names, has_missing=self.has_missing) 50 | self.higher_vars.append(higher_var) 51 | 52 | def atomicInteractions( 53 | self, interactions: list[str], field_model: Mapping[str, FieldVariable] 54 | ) -> list[str]: 55 | atomic_interactions = [] 56 | 57 | for field in interactions: 58 | try: 59 | field_model[field] 60 | except KeyError: 61 | raise KeyError( 62 | "The interaction variable %s is " 63 | "not a named variable in the variable " 64 | "definition" % field 65 | ) 66 | 67 | if hasattr(field_model[field], "interaction_fields"): 68 | sub_interactions = field_model[field].interaction_fields # type: ignore[attr-defined] 69 | atoms = self.atomicInteractions(sub_interactions, field_model) 70 | atomic_interactions.extend(atoms) 71 | else: 72 | atomic_interactions.append(field) 73 | 74 | return atomic_interactions 75 | -------------------------------------------------------------------------------- /dedupe/variables/latlong.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from math import sqrt 4 | 5 | from haversine import haversine 6 | 7 | from dedupe import predicates 8 | from dedupe.variables.base import FieldType 9 | 10 | 11 | class LatLongType(FieldType): 12 | type = "LatLong" 13 | 14 | _predicate_functions = [predicates.latLongGridPredicate] 15 | 16 | @staticmethod 17 | def comparator(x: tuple[float, float], y: tuple[float, float]) -> float: 18 | return sqrt(haversine(x, y)) 19 | -------------------------------------------------------------------------------- /dedupe/variables/price.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy 4 | 5 | from dedupe import predicates 6 | from dedupe.variables.base import FieldType 7 | 8 | 9 | class PriceType(FieldType): 10 | _predicate_functions = [ 11 | predicates.orderOfMagnitude, 12 | predicates.wholeFieldPredicate, 13 | predicates.roundTo1, 14 | ] 15 | type = "Price" 16 | 17 | @staticmethod 18 | def comparator(price_1: int | float, price_2: int | float) -> float: 19 | if price_1 <= 0: 20 | return numpy.nan 21 | elif price_2 <= 0: 22 | return numpy.nan 23 | else: 24 | return abs(numpy.log10(price_1) - numpy.log10(price_2)) 25 | -------------------------------------------------------------------------------- /dedupe/variables/set.py: -------------------------------------------------------------------------------- 1 | from typing import Collection, Iterable, Optional 2 | 3 | from simplecosine.cosine import CosineSetSimilarity 4 | 5 | from dedupe import predicates 6 | from dedupe.variables.base import FieldType 7 | 8 | 9 | class SetType(FieldType): 10 | type = "Set" 11 | 12 | _predicate_functions = ( 13 | predicates.wholeSetPredicate, 14 | predicates.commonSetElementPredicate, 15 | predicates.lastSetElementPredicate, 16 | predicates.commonTwoElementsPredicate, 17 | predicates.commonThreeElementsPredicate, 18 | predicates.magnitudeOfCardinality, 19 | predicates.firstSetElementPredicate, 20 | ) 21 | 22 | _index_predicates = ( 23 | predicates.TfidfSetSearchPredicate, 24 | predicates.TfidfSetCanopyPredicate, 25 | ) 26 | _index_thresholds = (0.2, 0.4, 0.6, 0.8) 27 | 28 | def __init__( 29 | self, field: str, corpus: Optional[Iterable[Collection[str]]] = None, **kwargs 30 | ): 31 | 
super().__init__(field, **kwargs) 32 | 33 | if corpus is None: 34 | corpus = [] 35 | 36 | self.comparator = CosineSetSimilarity(corpus) # type: ignore[assignment] 37 | -------------------------------------------------------------------------------- /dedupe/variables/string.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Optional, Sequence, Type 2 | 3 | from affinegap import normalizedAffineGapDistance as affineGap 4 | from highered import CRFEditDistance 5 | from simplecosine.cosine import CosineTextSimilarity 6 | 7 | from dedupe import predicates 8 | from dedupe._typing import PredicateFunction 9 | from dedupe.variables.base import FieldType, indexPredicates 10 | 11 | crfEd = CRFEditDistance() 12 | 13 | base_predicates = ( 14 | predicates.wholeFieldPredicate, 15 | predicates.firstTokenPredicate, 16 | predicates.firstTwoTokensPredicate, 17 | predicates.commonIntegerPredicate, 18 | predicates.nearIntegersPredicate, 19 | predicates.firstIntegerPredicate, 20 | predicates.hundredIntegerPredicate, 21 | predicates.hundredIntegersOddPredicate, 22 | predicates.alphaNumericPredicate, 23 | predicates.sameThreeCharStartPredicate, 24 | predicates.sameFiveCharStartPredicate, 25 | predicates.sameSevenCharStartPredicate, 26 | predicates.commonTwoTokens, 27 | predicates.commonThreeTokens, 28 | predicates.fingerprint, 29 | predicates.oneGramFingerprint, 30 | predicates.twoGramFingerprint, 31 | predicates.sortedAcronym, 32 | ) 33 | 34 | 35 | class BaseStringType(FieldType): 36 | _Predicate = predicates.StringPredicate 37 | _predicate_functions: Sequence[PredicateFunction] = () 38 | 39 | def __init__(self, *args, **kwargs): 40 | super().__init__(*args, **kwargs) 41 | 42 | self.predicates += indexPredicates( 43 | ( 44 | predicates.LevenshteinCanopyPredicate, 45 | predicates.LevenshteinSearchPredicate, 46 | ), 47 | (1, 2, 3, 4), 48 | self.field, 49 | ) 50 | 51 | 52 | class ShortStringType(BaseStringType): 53 | type = "ShortString" 54 | 55 | _predicate_functions = base_predicates + ( 56 | predicates.commonFourGram, 57 | predicates.commonSixGram, 58 | predicates.tokenFieldPredicate, 59 | predicates.suffixArray, 60 | predicates.doubleMetaphone, 61 | predicates.metaphoneToken, 62 | ) 63 | 64 | _index_predicates: Sequence[Type[predicates.IndexPredicate]] = [ 65 | predicates.TfidfNGramCanopyPredicate, 66 | predicates.TfidfNGramSearchPredicate, 67 | ] 68 | _index_thresholds = (0.2, 0.4, 0.6, 0.8) 69 | 70 | def __init__( 71 | self, field: str, name: Optional[str] = None, crf: bool = False, **kwargs 72 | ): 73 | super().__init__(field, name=name, **kwargs) 74 | 75 | if crf: 76 | self.comparator = crfEd # type: ignore[assignment] 77 | else: 78 | self.comparator = affineGap # type: ignore[assignment] 79 | 80 | 81 | class StringType(ShortStringType): 82 | type = "String" 83 | 84 | _index_predicates = [ 85 | predicates.TfidfNGramCanopyPredicate, 86 | predicates.TfidfNGramSearchPredicate, 87 | predicates.TfidfTextCanopyPredicate, 88 | predicates.TfidfTextSearchPredicate, 89 | ] 90 | 91 | 92 | class TextType(BaseStringType): 93 | type = "Text" 94 | 95 | _predicate_functions = base_predicates 96 | 97 | _index_predicates = [ 98 | predicates.TfidfTextCanopyPredicate, 99 | predicates.TfidfTextSearchPredicate, 100 | ] 101 | _index_thresholds = (0.2, 0.4, 0.6, 0.8) 102 | 103 | def __init__(self, field: str, corpus: Optional[Iterable[str]] = None, **kwargs): 104 | super().__init__(field, **kwargs) 105 | 106 | if corpus is None: 107 | corpus = [] 108 | 109 | 
self.comparator = CosineTextSimilarity(corpus) # type: ignore[assignment] 110 | -------------------------------------------------------------------------------- /docs/API-documentation.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | Library Documentation 3 | ===================== 4 | 5 | :class:`Dedupe` Objects 6 | ----------------------- 7 | .. autoclass:: dedupe.Dedupe 8 | 9 | .. code:: python 10 | 11 | # initialize from a defined set of fields 12 | variables = [ 13 | dedupe.variables.String("Site name"), 14 | dedupe.variables.String("Address"), 15 | dedupe.variables.String("Zip", has_missing=True), 16 | dedupe.variables.String("Phone", has_missing=True), 17 | ] 18 | deduper = dedupe.Dedupe(variables) 19 | 20 | .. automethod:: prepare_training 21 | .. automethod:: uncertain_pairs 22 | .. automethod:: mark_pairs 23 | .. automethod:: train 24 | .. automethod:: write_training 25 | .. automethod:: write_settings 26 | .. automethod:: cleanup_training 27 | .. automethod:: partition 28 | 29 | 30 | 31 | :class:`StaticDedupe` Objects 32 | ----------------------------- 33 | .. autoclass:: dedupe.StaticDedupe 34 | 35 | .. code:: python 36 | 37 | with open('learned_settings', 'rb') as f: 38 | matcher = StaticDedupe(f) 39 | 40 | .. automethod:: partition 41 | 42 | 43 | :class:`RecordLink` Objects 44 | --------------------------- 45 | .. autoclass:: dedupe.RecordLink 46 | 47 | .. code:: python 48 | 49 | # initialize from a defined set of fields 50 | variables = [ 51 | dedupe.variables.String("Site name"), 52 | dedupe.variables.String("Address"), 53 | dedupe.variables.String("Zip", has_missing=True), 54 | dedupe.variables.String("Phone", has_missing=True), 55 | ] 56 | deduper = dedupe.RecordLink(variables) 57 | 58 | .. automethod:: prepare_training 59 | .. automethod:: uncertain_pairs 60 | .. automethod:: mark_pairs 61 | .. automethod:: train 62 | .. automethod:: write_training 63 | .. automethod:: write_settings 64 | .. automethod:: cleanup_training 65 | .. automethod:: join 66 | 67 | 68 | :class:`StaticRecordLink` Objects 69 | --------------------------------- 70 | .. autoclass:: dedupe.StaticRecordLink 71 | 72 | .. code:: python 73 | 74 | with open('learned_settings', 'rb') as f: 75 | matcher = StaticRecordLink(f) 76 | 77 | .. automethod:: join 78 | 79 | 80 | :class:`Gazetteer` Objects 81 | -------------------------- 82 | .. autoclass:: dedupe.Gazetteer 83 | 84 | .. code:: python 85 | 86 | # initialize from a defined set of fields 87 | variables = [ 88 | dedupe.variables.String("Site name"), 89 | dedupe.variables.String("Address"), 90 | dedupe.variables.String("Zip", has_missing=True), 91 | dedupe.variables.String("Phone", has_missing=True), 92 | ] 93 | matcher = dedupe.Gazetteer(variables) 94 | 95 | .. automethod:: prepare_training 96 | .. automethod:: uncertain_pairs 97 | .. automethod:: mark_pairs 98 | .. automethod:: train 99 | .. automethod:: write_training 100 | .. automethod:: write_settings 101 | .. automethod:: cleanup_training 102 | .. automethod:: index 103 | .. automethod:: unindex 104 | .. automethod:: search 105 | 106 | 107 | :class:`StaticGazetteer` Objects 108 | -------------------------------- 109 | .. autoclass:: dedupe.StaticGazetteer 110 | 111 | .. code:: python 112 | 113 | with open('learned_settings', 'rb') as f: 114 | matcher = StaticGazetteer(f) 115 | 116 | .. automethod:: index 117 | .. automethod:: unindex 118 | .. automethod:: search 119 | .. automethod:: blocks 120 | .. automethod:: score 121 | .. 
automethod:: many_to_n 122 | 123 | Lower Level Classes and Methods 124 | ------------------------------- 125 | 126 | With the methods documented above, you can work with data into the 127 | millions of records. However, if are working with larger data you 128 | may not be able to load all your data into memory. You'll need 129 | to interact with some of the lower level classes and methods. 130 | 131 | .. seealso:: The `PostgreSQL `_ and `MySQL `_ examples use these lower level classes and methods. 132 | 133 | Dedupe and StaticDedupe 134 | *********************** 135 | 136 | .. currentmodule:: dedupe 137 | 138 | .. class:: Dedupe 139 | :noindex: 140 | 141 | .. attribute:: fingerprinter 142 | 143 | Instance of :class:`dedupe.blocking.Fingerprinter` class if 144 | the :func:`train` has been run, else `None`. 145 | 146 | .. automethod:: pairs 147 | .. automethod:: score 148 | .. automethod:: cluster 149 | 150 | .. class:: StaticDedupe 151 | :noindex: 152 | 153 | .. attribute:: fingerprinter 154 | 155 | Instance of :class:`dedupe.blocking.Fingerprinter` class 156 | 157 | .. method:: pairs(data) 158 | 159 | Same as :func:`dedupe.Dedupe.pairs` 160 | 161 | .. method:: score(pairs) 162 | 163 | Same as :func:`dedupe.Dedupe.score` 164 | 165 | .. method:: cluster(scores, threshold=0.5) 166 | 167 | Same as :func:`dedupe.Dedupe.cluster` 168 | 169 | 170 | RecordLink and StaticRecordLink 171 | ******************************* 172 | 173 | .. class:: RecordLink 174 | :noindex: 175 | 176 | .. attribute:: fingerprinter 177 | 178 | Instance of :class:`dedupe.blocking.Fingerprinter` class if 179 | the :func:`train` has been run, else `None`. 180 | 181 | .. automethod:: pairs 182 | .. automethod:: score 183 | .. automethod:: one_to_one 184 | .. automethod:: many_to_one 185 | 186 | .. class:: StaticRecordLink 187 | :noindex: 188 | 189 | .. attribute:: fingerprinter 190 | 191 | Instance of :class:`dedupe.blocking.Fingerprinter` class 192 | 193 | .. method:: pairs(data_1, data_2) 194 | 195 | Same as :func:`dedupe.RecordLink.pairs` 196 | 197 | .. method:: score(pairs) 198 | 199 | Same as :func:`dedupe.RecordLink.score` 200 | 201 | .. method:: one_to_one(scores, threshold=0.0) 202 | 203 | Same as :func:`dedupe.RecordLink.one_to_one` 204 | 205 | .. method:: many_to_one(scores, threshold=0.0) 206 | 207 | Same as :func:`dedupe.RecordLink.many_to_one` 208 | 209 | 210 | Gazetteer and StaticGazetteer 211 | ***************************** 212 | 213 | .. class:: Gazetteer 214 | :noindex: 215 | 216 | .. attribute:: fingerprinter 217 | 218 | Instance of :class:`dedupe.blocking.Fingerprinter` class if 219 | the :func:`train` has been run, else `None`. 220 | 221 | .. automethod:: blocks 222 | .. automethod:: score 223 | .. automethod:: many_to_n 224 | 225 | .. class:: StaticGazeteer 226 | :noindex: 227 | 228 | .. attribute:: fingerprinter 229 | 230 | Instance of :class:`dedupe.blocking.Fingerprinter` class 231 | 232 | .. method:: blocks(data) 233 | 234 | Same as :func:`dedupe.Gazetteer.blocks` 235 | 236 | .. method:: score(blocks) 237 | 238 | Same as :func:`dedupe.Gazetteer.score` 239 | 240 | .. method:: many_to_n(score_blocks, threshold=0.0, n_matches=1) 241 | 242 | Same as :func:`dedupe.Gazetteer.many_to_n` 243 | 244 | :class:`Fingerprinter` Objects 245 | ****************************** 246 | .. autoclass:: dedupe.blocking.Fingerprinter 247 | 248 | .. automethod:: __call__ 249 | .. autoattribute:: index_fields 250 | .. automethod:: index 251 | .. automethod:: unindex 252 | .. 
automethod:: reset_indices 253 | 254 | 255 | Convenience Functions 256 | --------------------- 257 | 258 | .. autofunction:: dedupe.console_label 259 | .. autofunction:: dedupe.training_data_dedupe 260 | .. autofunction:: dedupe.training_data_link 261 | .. autofunction:: dedupe.canonicalize 262 | .. autofunction:: dedupe.read_training 263 | .. autofunction:: dedupe.write_training 264 | -------------------------------------------------------------------------------- /docs/Bibliography.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Bibliography 3 | ============ 4 | 5 | - http://research.microsoft.com/apps/pubs/default.aspx?id=153478 6 | - http://cs.anu.edu.au/~Peter.Christen/data-matching-book-2012.html 7 | - http://www.umiacs.umd.edu/~getoor/Tutorials/ER\_VLDB2012.pdf 8 | 9 | New School 10 | ---------- 11 | - Steorts, Rebecca C., Rob Hall and Stephen Fienberg. "A Bayesian Approach to Record Linkage and De-duplication" December 2013. http://arxiv.org/abs/1312.4645 12 | 13 | Very beautiful work. Records are matched to latent individuals. O(N) 14 | running time. Unsupervised, but everything hinges on tuning 15 | hyperparameters. This work only contemplates categorical variables. 16 | 17 | 18 | To Read 19 | ------- 20 | - Domingos and Domingos Multi-relational record linkage. http://homes.cs.washington.edu/~pedrod/papers/mrdm04.pdf 21 | - An Entity Based Model for Coreference Resolution http://cs.tulane.edu/~aculotta/pubs/wick09entity.pdf 22 | 23 | -------------------------------------------------------------------------------- /docs/Examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | Dedupe is a library and not a stand-alone command line tool. To 6 | demonstrate its usage, we have come up with a few example recipes for 7 | different sized datasets for you to try out. 8 | 9 | You can view and download the source code for these examples in the 10 | `examples repo `__. 11 | 12 | Or, you can view annotated, "walkthrough" versions online: 13 | 14 | * `Small data deduplication `__ 15 | * `Record Linkage `__ 16 | * `Gazetter example `__ 17 | * `MySQL example `__ 18 | * `Postgres big dedupe example `__ 19 | * `Patent Author Disambiguation `__ -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = -W 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/dedupe.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/dedupe.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/dedupe" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/dedupe" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/Troubleshooting.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | Troubleshooting 3 | *************** 4 | 5 | So you've tried to apply dedupe to your dataset, but you're having some problems. 6 | Once you understand :ref:`how dedupe works `, and you've taken 7 | a look at some of the :doc:`examples`, then this troubleshoooting 8 | guide is your next step. 9 | 10 | Memory Considerations 11 | ===================== 12 | 13 | The top two likely memory bottlenecks, in order of likelihood, are: 14 | 15 | 1. 
Building the index predicates for blocking. If this is a problem, 16 | you can try turning off index blocking rules (and just use predicate 17 | blocking rules) by setting ``index_predicates=False`` in 18 | :meth:`dedupe.Dedupe.train`. 19 | 20 | 2. During `cluster()`. After scoring, we have to compare all the pairwise scores 21 | and build the clusters. dedupe runs a connected-components algorithm to 22 | determine where to begin the clustering, and this is currently done in 23 | memory using python dicts, so it can take substantial memory. 24 | There isn't currently a way to avoid this except to just use fewer records. 25 | 26 | Time Considerations 27 | =================== 28 | 29 | The slowest part of dedupe is probably blocking. A big part of this is building 30 | the index predicates, so the easiest fix is to set `index_predicates=False` 31 | in :meth:`dedupe.Dedupe.train`. 32 | 33 | Blocking could also be slow if dedupe has to apply too many or too complex 34 | blocking rules. You can fix this by reducing the number of blocking rules dedupe has 35 | to learn to cover all the true positives. Either reduce the `recall` parameter 36 | in :meth:`dedupe.Dedupe.train`, or, similarly, just use fewer positive examples 37 | during training. 38 | 39 | Note that you are making a choice here between speed and recall. The less blocking 40 | you do, the faster you go, but the more likely you are to fail to block true positives 41 | together. 42 | 43 | This part of dedupe is still single-threaded and could probably benefit 44 | from parallelization or other code strategies, 45 | although attempts so far haven't proved promising. 46 | 47 | 48 | Improving Accuracy 49 | ================== 50 | 51 | - Inspect your results and see if you can find any patterns: Does dedupe 52 | not seem to be paying enough attention to some detail? 53 | 54 | - Inspect the pairs given to you during :func:`dedupe.console_label`. These 55 | are pairs that dedupe is most confused about. Are these actually confusing 56 | pairs? If so, then great, dedupe is doing about as well as you could expect. 57 | If the pair is obviously a duplicate or obviously not a duplicate, then this 58 | means there is some signal that you should help dedupe to find. 59 | 60 | - Read up on the theory behind each of the variable types. Some of them 61 | are going to work better depending on the situation, so try to understand 62 | them as well as you can. 63 | 64 | - Add other variables. For instance, try treating a field as both a `String` 65 | and as a `Text` variable. If this doesn't cut it, add your own custom 66 | variable that emphasizes the feature that you're really looking for. 67 | For instance, if you have a list of last names, you might want "Smith" 68 | to score well with "Smith-Johnson" (someone got married?). None of the 69 | built-in variables will handle this well, so write your own comparator. 70 | 71 | - Add `Interaction` variables. For instance, if both the "last name" and 72 | "street address" fields score very well, then this is almost a guarantee 73 | that these two records refer to the same person. An `Interaction` variable 74 | can emphasize this to the learner. 75 | 76 | Extending Dedupe 77 | ================ 78 | 79 | If the built-in variables don't cut it, you can write your own variables. 80 | 81 | Take a look at the separately maintained `optional variables 82 | `__ 83 | for examples of how to write your own custom variable types with 84 | your custom comparators and predicates.
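As a starting point, here is a minimal, hypothetical sketch of the kind of custom
comparator described above -- one that lets "Smith" score well against
"Smith-Johnson". It uses the ``dedupe.variables.Custom`` variable, which accepts any
two-argument comparator; the token-overlap scoring itself is only an illustration,
not a recommended metric, and the field names are made up for the example.

.. code:: python

    import dedupe


    def surname_comparator(field_1, field_2):
        # Lower numbers mean "more similar", matching dedupe's
        # distance-style comparators.
        tokens_1 = set(field_1.lower().replace("-", " ").split())
        tokens_2 = set(field_2.lower().replace("-", " ").split())
        if not tokens_1 or not tokens_2:
            return 0.5  # treat empty fields as uninformative (an assumption)
        overlap = len(tokens_1 & tokens_2) / min(len(tokens_1), len(tokens_2))
        return 1.0 - overlap  # full overlap -> distance of 0


    variables = [
        dedupe.variables.Custom("last name", comparator=surname_comparator),
        dedupe.variables.String("address"),
    ]
    deduper = dedupe.Dedupe(variables)

With this comparator, ``surname_comparator("Smith", "Smith-Johnson")`` returns
``0.0``, so the learner can pick up on hyphenated-surname matches that a plain
string distance tends to penalize.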
-------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | 2 | /* ========== */ 3 | /* Typography */ 4 | /* ========== */ 5 | 6 | body { 7 | font-size: 16px; 8 | font-family: 'Montserrat', 'Arial', sans-serif; 9 | color: #555; 10 | } 11 | 12 | h1, h2, h3, h4, h5, h6 { 13 | font-family: 'Roboto', 'Arial', sans-serif; 14 | } 15 | 16 | h1, h1 a, h2, h2 a, h3, h3 a { 17 | color: #666; 18 | font-weight: bold; 19 | } 20 | 21 | /* Override some of the built-in RTD fonts */ 22 | h1, h2, h3, h4, h5, h6, legend, 23 | .rst-content .toctree-wrapper p.caption { 24 | font-family: 'Montserrat', 'Arial', sans-serif; 25 | } 26 | 27 | h1 a:hover, h2 a:hover, h3 a:hover { 28 | color: #333; 29 | } 30 | 31 | a, a:visited, .wy-menu-vertical a { color: #F26F80;} 32 | a:hover { color: #F26F80;} 33 | a:active, a:focus { outline: 0;} 34 | 35 | .navbar-nav > li > a, 36 | .navbar-brand { 37 | height: 40px; 38 | } 39 | 40 | .navbar-nav > li > a { 41 | padding-top: 10.5px; 42 | line-height: 20.57px; 43 | } 44 | 45 | .navbar { 46 | min-height: 40px; 47 | } 48 | 49 | .navbar-fixed-top .navbar-collapse { 50 | padding-right: 15px; 51 | } 52 | 53 | .navbar-default .navbar-nav > li > a { 54 | color: #555; 55 | } 56 | 57 | .navbar-logo { 58 | height: 15px; 59 | margin: 13px; 60 | } 61 | 62 | .nav > li { 63 | font-size: 0.9em; 64 | margin-right: 1.1px; 65 | } 66 | 67 | .navbar-default .navbar-nav > li > a:hover { 68 | color: #F26F80; 69 | background-color: transparent; 70 | } 71 | 72 | .navbar-brand { 73 | font-size: 17px; 74 | padding: 11px 15px; 75 | } 76 | 77 | /* ================ */ 78 | /* Navs and footers */ 79 | /* ================ */ 80 | 81 | .footer { 82 | background-color: #CAD8E7; 83 | padding: 20px 0; 84 | margin-top: 20px; 85 | font-size: 0.9em; 86 | } 87 | 88 | .footer a { 89 | color: #29446F; 90 | } 91 | 92 | /* Remove Bootstrap navbar stuff */ 93 | .navbar > .container .navbar-brand, 94 | .navbar > .container-fluid .navbar-brand { 95 | margin-left: 0; 96 | } 97 | 98 | /* Pad headers so that the navbar doesn't headbutt them */ 99 | /* (Thx to Chris Coyer on CSS-Tricks for the elegant sol'n) */ 100 | /* https://css-tricks.com/hash-tag-links-padding/ */ 101 | h1:before, h2:before, h3:before, 102 | h4:before, h4:before, h6:before { 103 | display: block; 104 | content: " "; 105 | margin-top: -55px; 106 | height: 55px; 107 | opacity: 0; 108 | pointer-events: none; 109 | } 110 | 111 | /* =================== */ 112 | /* RTD theme overrides */ 113 | /* =================== */ 114 | 115 | /* Remove sidebar background colors */ 116 | .wy-nav-side, .wy-side-nav-search { 117 | background: #fcfcfc; 118 | } 119 | 120 | /* Remove dark colors from the background */ 121 | .wy-body-for-nav { 122 | background: #fcfcfc; 123 | background-image: none; 124 | } 125 | 126 | /* Give the nav a little bit of padding */ 127 | .wy-nav-side { 128 | padding-left: 10px; 129 | top: 52px; 130 | } 131 | 132 | /* Restyle mobile nav */ 133 | .wy-nav-top { 134 | background-color: #f8f8f8; 135 | border-bottom: 1px solid #e7e7e7; 136 | } 137 | 138 | /* Give the hamburger menu styles like Bootstrap */ 139 | .wy-nav-top i { 140 | padding: 5px 10px; 141 | margin-top: 8px; 142 | margin-bottom: 8px; 143 | background-color: transparent; 144 | background-image: none; 145 | border: 1px solid #7777; 146 | border-radius: 4px; 147 | border-color: #ddd; 148 | color: #777; 149 | } 150 | 151 | .wy-nav-top i:hover { 152 | 
background-color: #ddd; 153 | } 154 | 155 | /* Pad the content just a lil */ 156 | .wy-nav-content { 157 | margin-top: 52px; 158 | margin-left: auto; 159 | margin-right: auto; 160 | } 161 | 162 | /* Sidebar text should always be dark */ 163 | .wy-side-nav-search, 164 | .wy-side-nav-search > a, 165 | .wy-side-nav-search .wy-dropdown > a, 166 | .wy-side-nav-search > div.version { 167 | color: #777777 168 | } 169 | 170 | /* Hover styles for the menu links */ 171 | .wy-menu > ul > li > a:hover, 172 | .wy-menu > ul > li > a:focus { 173 | background-color: #EFEFEF; 174 | color: #F26F80; 175 | font-weight: bold; 176 | } 177 | 178 | /*Make the search bar less garish */ 179 | .wy-side-nav-search input[type="text"] { 180 | border-color: #fcfcfc; /* hacky override for the gross border */ 181 | border-radius: 5px; 182 | } 183 | 184 | /* Right-align the sidebar text */ 185 | .wy-menu-vertical { 186 | text-align: right; 187 | } 188 | 189 | /* Version box in the lower left-hand corner */ 190 | .rst-versions { 191 | background: #ababab; 192 | border-top: none; 193 | } 194 | 195 | .rst-versions .rst-current-version { 196 | background-color: #7d7d7d; 197 | color: #ffffff; 198 | } 199 | 200 | .rst-versions .rst-other-versions { 201 | color: #4e4e4e; 202 | } 203 | 204 | /* ============= */ 205 | /* Media queries */ 206 | /* ============= */ 207 | 208 | /* Remove Dedupe.io navbar at small breakpoints */ 209 | @media (max-width: 768px) { 210 | 211 | #dedupe-nav { 212 | display: none; 213 | } 214 | 215 | .wy-nav-side { 216 | top: 0; 217 | } 218 | 219 | .wy-nav-content { 220 | margin-top: 0; 221 | } 222 | 223 | .wy-nav-top a { 224 | text-decoration: none; 225 | } 226 | } 227 | 228 | /* Get rid of mysterious dark void that appears on big screens */ 229 | @media screen and (min-width: 1400px) { 230 | .wy-nav-content-wrap { 231 | background: #fcfcfc; 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /docs/_static/images/dedupeio-logo-reversed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/docs/_static/images/dedupeio-logo-reversed.png -------------------------------------------------------------------------------- /docs/_static/images/dedupeio-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/54ecfe77d41390da66899596834a2bde3712c966/docs/_static/images/dedupeio-logo.png -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {% block extrahead %} 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 18 | {% endblock %} 19 | 20 | {% block extrabody %} 21 | 22 | 45 | {% endblock %} 46 | -------------------------------------------------------------------------------- /docs/how-it-works/Choosing-a-good-threshold.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Choosing a Good Threshold 3 | ========================= 4 | 5 | Dedupe can predict the *probability* that a pair of records are 6 | duplicates. So, how should we decide that a pair of records really are 7 | duplicates? 8 | 9 | To answer this question we need to know something about Precision and 10 | Recall. 
Why don't you check out the `Wikipedia 11 | page `__ and come 12 | back here. 13 | 14 | There's always a trade-off between precision and recall. That's okay. As 15 | long as we know how much we care about precision vs. recall, `we can 16 | define an F-score `__ that will 17 | let us find a threshold for deciding when records are duplicates *that 18 | is optimal for our priorities*. 19 | 20 | Typically, the way that we find that threshold is by looking at the true 21 | precision and recall of some data where we know the true labels - 22 | where we know the real duplicates. However, we will only get a good 23 | threshold if the labeled examples are representative of the data we are 24 | trying to classify. 25 | 26 | So here's the problem - the labeled examples that we make with Dedupe 27 | are not at all representative, and that's by design. In the active 28 | learning step, we are not trying to find the most representative data 29 | examples. We're trying to find the ones that will teach us the most. 30 | 31 | The approach we take here is to take a random sample of blocked data, 32 | and then calculate the pairwise probability that records will be 33 | duplicates within each block. From these probabilities we can calculate 34 | the expected number of duplicate and distinct pairs, so we can 35 | calculate the expected precision and recall. 36 | 37 | -------------------------------------------------------------------------------- /docs/how-it-works/Grouping-duplicates.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Grouping Duplicates 3 | =================== 4 | 5 | Once we have calculated the probability that pairs of records are 6 | duplicates or not, we still have a kind of thorny problem because it's 7 | not just pairs of records that can be duplicates. Three, four, or thousands 8 | of records could all refer to the same entity (person, organization, ice 9 | cream flavor, etc.) but we only have pairwise measures. 10 | 11 | Let's say we have measured the following pairwise probabilities between 12 | records A, B, and C. 13 | 14 | :: 15 | 16 | A -- 0.6 -- B -- 0.6 -- C 17 | 18 | The probability that A and B are duplicates is 60%, the probability that 19 | B and C are duplicates is 60%, but what is the probability that A and C 20 | are duplicates? 21 | 22 | Let's say that everything is going perfectly and we can say there's a 23 | 36% probability that A and C are duplicates. We'd probably want to say 24 | that A and C should not be considered duplicates. 25 | 26 | Okay, then should we say that A and B are a duplicate pair and C is a 27 | distinct record, or that A is the distinct record and that B and C are 28 | duplicates? 29 | 30 | Well... this is a thorny problem, and we tried solving it a few 31 | different ways. In the end, we found that **hierarchical clustering with 32 | centroid linkage** gave us the best results. What this algorithm does is 33 | say that all points within some distance of the centroid are part of the 34 | same group. In this example, B would be the centroid - and A, B, and C 35 | would all be put in the same group. 36 | 37 | Unfortunately, a more principled answer does not exist because the 38 | estimated pairwise probabilities are not transitive. 39 | 40 | Clustering the groups depends on us setting a threshold for group 41 | membership -- the distance of the points to the centroid.
Depending on how 42 | we choose that threshold, we'll get very different groups, and we will 43 | want to choose this threshold wisely. 44 | 45 | In recent years, there has been some very exciting research that 46 | solves the problem of turning pairwise distances into clusters by 47 | avoiding making pairwise comparisons altogether. Unfortunately, these 48 | developments are not compatible with Dedupe's pairwise approach. See 49 | `Michael Wick, et al., 2012. "A Discriminative Hierarchical Model for Fast Coreference at Large Scale" `__ 50 | and `Rebecca C. Steorts, et al., 2013. "A Bayesian Approach to Graphical Record Linkage and De-duplication" `__. 51 | 52 | -------------------------------------------------------------------------------- /docs/how-it-works/How-it-works.rst: -------------------------------------------------------------------------------- 1 | .. _how-it-works-label: 2 | 3 | ############ 4 | How it works 5 | ############ 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | Matching-records 11 | Making-smart-comparisons 12 | Grouping-duplicates 13 | Choosing-a-good-threshold 14 | Special-Cases 15 | 16 | 17 | **Problems with real-world data** 18 | 19 | Journalists, academics, and businesses work hard to get big masses of 20 | data to learn about what people or organizations are doing. 21 | Unfortunately, once we get the data, we often can't answer our questions 22 | because we can't tell who is who. 23 | 24 | In much real-world data, we do not have a way of absolutely deciding 25 | whether two records, say ``John Smith`` and ``J. Smith``, are referring 26 | to the same person. If these were records of campaign contribution data, 27 | did a ``John Smith`` give two donations or did ``John Smith`` and maybe 28 | ``Jane Smith`` give one contribution apiece? 29 | 30 | People are pretty good at making these calls, if they have enough 31 | information. For example, I would be pretty confident that the following 32 | two records are about the same person. 33 | 34 | :: 35 | 36 | first name | last name | address | phone | 37 | -------------------------------------------------------------- 38 | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 39 | Robert | Roberts | 1600 Pensylvannia Avenue | | 40 | 41 | If we have to decide which records in our data are about the same person 42 | or organization, then we could just go through by hand, compare every 43 | record, and decide which records are about the same entity. 44 | 45 | This is very, very boring and can take a **long** time. Dedupe is a 46 | software library that can make these decisions about whether records are 47 | about the same thing about as well as a person can, but quickly. 48 | -------------------------------------------------------------------------------- /docs/how-it-works/Making-smart-comparisons.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Making Smart Comparisons 3 | ======================== 4 | 5 | Say we have a magic function that takes in a pair of records and always 6 | returns ``False`` if the pair of records are distinct and ``True`` if the 7 | pair of records refer to the same person or organization. 8 | 9 | Let's say that this function was pretty slow. It always took one second 10 | to return. 11 | 12 | How long would it take to de-duplicate a thousand records? 13 | 14 | Within a dataset of a thousand records, there are :math:`\frac{1{,}000 15 | \times 999}{2} = 499{,}500` unique pairs of records.
If we 16 | compared all of them using our magic function, it would take six days. 17 | 18 | But one second is a **long** time. Let's say we sped it up so that we 19 | can make 10,000 comparisons per second. Now we can get through our 20 | thousand-record-long dataset in less than a minute. 21 | 22 | Feeling good about our super-fast comparison function, let's take on a 23 | dataset of 100,000 records. Now there are 24 | :math:`\frac{100{,}000 \times 99{,}999}{2} = 4{,}999{,}950{,}000` unique possible 25 | pairs. If we compare all of them with our super-fast comparison function, 26 | it will take six days again. 27 | 28 | If we want to work with moderately sized data, we have to find a way of 29 | making fewer comparisons. 30 | 31 | Duplicates are rare 32 | ------------------- 33 | 34 | In real-world data, nearly all possible pairs of records are not 35 | duplicates. 36 | 37 | In this four-record example below, only two pairs of records are 38 | duplicates--(1, 2) and (3, 4), while there are four unique 39 | pairs of records that are not duplicates--(1,3), (1,4), (2,3), and (2,4). 40 | Typically, as the size of the dataset grows, the fraction of pairs of records 41 | that are duplicates gets very small very quickly. 42 | 43 | +-------------+-----------+--------------------------+--------------+----------+ 44 | | first name | last name | address | phone | record_id| 45 | +=============+===========+==========================+==============+==========+ 46 | | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 1 | 47 | +-------------+-----------+--------------------------+--------------+----------+ 48 | | Robert | Roberts | 1600 Pensylvannia Avenue | | 2 | 49 | +-------------+-----------+--------------------------+--------------+----------+ 50 | | steve | Jones | 123 Cowabunga Lane | 555-0000 | 3 | 51 | +-------------+-----------+--------------------------+--------------+----------+ 52 | | Stephen | Janes | 123 Cawabunga Ln | 444-555-0000 | 4 | 53 | +-------------+-----------+--------------------------+--------------+----------+ 54 | 55 | 56 | If we could only compare records that were true duplicates, we wouldn't 57 | run into the explosion of comparisons. Of course, if we already knew where 58 | the true duplicates were, we wouldn't need to compare any individual 59 | records. Unfortunately we don't, but we do quite well if we just compare 60 | records that are somewhat similar. 61 | 62 | Blocking 63 | -------- 64 | 65 | Duplicate records almost always share *something* in common. If we 66 | define groups of data that share something and only compare the records 67 | in that group, or *block*, then we can dramatically reduce the number of 68 | comparisons we will make. If we define these blocks well, then we will make 69 | very few comparisons and still have confidence that we will compare records 70 | that truly are duplicates. 71 | 72 | This task is called blocking, and we approach it in two ways: predicate 73 | blocks and index blocks. 74 | 75 | Predicate blocks 76 | ~~~~~~~~~~~~~~~~ 77 | 78 | A predicate block is a bundle of records that all share a feature -- a 79 | feature produced by a simple function called a predicate. 80 | 81 | Predicate functions take in a record field, and output a set of features 82 | for that field. These features could be "the first 3 characters of the 83 | field," "every word in the field," and so on. Records that share the 84 | same feature become part of a block. 85 | 86 | Let's take an example.
Let's use a "first 3 character" predicate on 87 | the **address field** below: 88 | 89 | +-------------+-----------+--------------------------+--------------+----------+ 90 | | first name | last name | address | phone | record_id| 91 | +=============+===========+==========================+==============+==========+ 92 | | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 1 | 93 | +-------------+-----------+--------------------------+--------------+----------+ 94 | | Robert | Roberts | 1600 Pensylvannia Avenue | | 2 | 95 | +-------------+-----------+--------------------------+--------------+----------+ 96 | | steve | Jones | 123 Cowabunga Lane | 555-0000 | 3 | 97 | +-------------+-----------+--------------------------+--------------+----------+ 98 | | Stephen | Janes | 123 Cawabunga Ln | 444-555-0000 | 4 | 99 | +-------------+-----------+--------------------------+--------------+----------+ 100 | 101 | That leaves us with two blocks - the '160' block, which contains records 102 | 1 and 2, and the '123' block, which contains records 3 and 4. 103 | 104 | :: 105 | 106 | {'160' : (1,2) # tuple of record_ids 107 | '123' : (3,4) 108 | } 109 | 110 | Again, we're applying the "first three characters" predicate function to the 111 | address field in our data; the function outputs the following features -- 112 | 160, 160, 123, 123 -- and then we group together the records that have 113 | identical features into "blocks". 114 | 115 | Other simple predicates Dedupe uses include: 116 | 117 | * whole field 118 | * token field 119 | * common integer 120 | * same three char start 121 | * same five char start 122 | * same seven char start 123 | * near integers 124 | * common four gram 125 | * common six gram 126 | 127 | .. _index-blocks-label: 128 | 129 | Index Blocks 130 | ~~~~~~~~~~~~ 131 | 132 | Dedupe also uses another way of producing blocks: searching an 133 | index. First, we create a special data structure, like an `inverted 134 | index `__, that lets us 135 | quickly find records similar to target records. We populate the index 136 | with all the unique values that appear in the field. 137 | 138 | When blocking, for each record we search the index for values similar to 139 | the record's field. We block together records that share at least one 140 | common search result. 141 | 142 | Index predicates require building an index from all the unique values 143 | in a field. This can take substantial time and memory. Index 144 | predicates are also usually slower than predicate blocking. 145 | 146 | Combining blocking rules 147 | ------------------------ 148 | 149 | If it's good to define blocks of records that share the same 'city' 150 | field, it might be even better to block records that share *both* the 151 | 'city' field *and* the 'zip code' field. Dedupe tries these cross-field 152 | blocks. These combination blocks are called disjunctive blocks. 153 | 154 | Learning good blocking rules for given data 155 | ------------------------------------------- 156 | 157 | Dedupe comes with a long set of predicates, and when these are 158 | combined Dedupe can have hundreds of possible blocking rules to choose 159 | from. We will want to find a small set of these rules that covers 160 | every labeled duplicate pair but minimizes the total number of pairs 161 | dedupe will have to compare. 162 | 163 | We approach this problem by using greedy algorithms, particularly 164 | `Chvatal's Greedy Set-Cover 165 | algorithm `__.
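To make that last step concrete, here is a simplified sketch (not dedupe's actual
rule learner) of a greedy set-cover pass: it keeps picking whichever candidate
blocking rule covers the most still-uncovered labeled duplicate pairs. The rule
names and pair sets below are invented purely for illustration.

.. code:: python

    def greedy_cover(candidate_rules, duplicate_pairs):
        # candidate_rules maps a rule name to the set of labeled
        # duplicate pairs that the rule blocks together.
        uncovered = set(duplicate_pairs)
        chosen = []
        while uncovered:
            # Pick the rule that covers the most remaining pairs.
            best_rule = max(
                candidate_rules,
                key=lambda rule: len(candidate_rules[rule] & uncovered),
            )
            newly_covered = candidate_rules[best_rule] & uncovered
            if not newly_covered:
                break  # remaining pairs can't be covered by any rule
            chosen.append(best_rule)
            uncovered -= newly_covered
        return chosen


    rules = {
        "first 3 chars of address": {(1, 2), (3, 4)},
        "whole name field": {(1, 2)},
        "common integer in phone": {(3, 4), (5, 6)},
    }
    print(greedy_cover(rules, {(1, 2), (3, 4), (5, 6)}))
    # ['first 3 chars of address', 'common integer in phone']

The real learner also weighs how many *total* pairs each rule would generate, so
that the chosen rules cover the labeled duplicates without producing an enormous
number of comparisons.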
166 | 167 | -------------------------------------------------------------------------------- /docs/how-it-works/Matching-records.rst: -------------------------------------------------------------------------------- 1 | **************** 2 | Matching Records 3 | **************** 4 | 5 | If you look at the following two records, you might think it's pretty 6 | clear that they are about the same person. 7 | 8 | :: 9 | 10 | first name | last name | address | phone | 11 | -------------------------------------------------------------- 12 | bob | roberts | 1600 pennsylvania ave. | 555-0123 | 13 | Robert | Roberts | 1600 Pensylvannia Avenue | | 14 | 15 | However, I bet it would be pretty hard for you to explicitly write down 16 | all the reasons why you think these records are about the same Mr. 17 | Roberts. 18 | 19 | Record similarity 20 | ----------------- 21 | 22 | One way that people have approached this problem is by saying that 23 | records that are more similar are more likely to be duplicates. That's a 24 | good first step, but then we have to precisely define what we mean for 25 | two records to be similar. 26 | 27 | The default way that we do this in Dedupe is to use what's called a 28 | string metric. A string metric is a way of taking two strings and 29 | returning a number that is low if the strings are similar and high if 30 | they are dissimilar. One famous string metric is called the Hamming 31 | distance. It counts the number of substitutions that must be made to 32 | turn one string into another. For example, ``roberts`` and ``Roberts`` 33 | would have a Hamming distance of 1 because we have to substitute ``r`` for 34 | ``R`` in order to turn ``roberts`` into ``Roberts``. 35 | 36 | There are lots of different string metrics, and we actually use a metric 37 | called the `Affine Gap Distance `__, which is a 38 | variation on the Hamming distance. 39 | 40 | Record by record or field by field 41 | ---------------------------------- 42 | 43 | When we are calculating whether two records are similar, we could treat 44 | each record as if it were one long string. 45 | 46 | :: 47 | 48 | record_distance = string_distance('bob roberts 1600 pennsylvania ave. 555-0123', 49 | 'Robert Roberts 1600 Pensylvannia Avenue') 50 | 51 | Alternatively, we could compare field by field: 52 | 53 | :: 54 | 55 | record_distance = (string_distance('bob', 'Robert') 56 | + string_distance('roberts', 'Roberts') 57 | + string_distance('1600 pennsylvania ave.', '1600 Pensylvannia Avenue') 58 | + string_distance('555-0123', '')) 59 | 60 | The major advantage of comparing field by field is that we don't have to 61 | treat each field's string distance equally. Maybe we think that it's really 62 | important that the last names and addresses are similar but it's not as 63 | important that the first names and phone numbers are close. We can express 64 | that importance with numeric weights, e.g. 65 | 66 | :: 67 | 68 | record_distance = (0.5 * string_distance('bob', 'Robert') 69 | + 2.0 * string_distance('roberts', 'Roberts') 70 | + 2.0 * string_distance('1600 pennsylvania ave.', '1600 Pensylvannia Avenue') 71 | + 0.5 * string_distance('555-0123', '')) 72 | 73 | Setting weights and making decisions 74 | ------------------------------------ 75 | 76 | Say we set our record\_distance to be this weighted sum of field 77 | distances, just as we had above. Let's say we calculated the 78 | record\_distance and we found that it was the beautiful number **8**. 79 | 80 | That number, by itself, is not that helpful.
Ultimately, we are trying 81 | to decide whether a pair of records are duplicates, and I'm not sure 82 | what decision I should make if I see an 8. Does an 8 mean that the pair 83 | of records are really similar or really far apart, likely or unlikely to 84 | be duplicates? We'd like to define the record distances so that we can 85 | look at the number and decide whether it's a duplicate. 86 | 87 | Also, I really would rather not have to set the weights by hand every 88 | time. It can be very tricky to know which fields are going to matter and 89 | even if I know that some fields are more important, I'm not sure how to 90 | quantify it (is it 2 times more important or 1.3 times)? 91 | 92 | Fortunately, we can solve both problems with a technique called 93 | regularized logistic regression. If we supply pairs of records that we 94 | label as either being duplicates or distinct, then Dedupe will learn a 95 | set of weights such that the record distance can easily be transformed 96 | into our best estimate of the probability that a pair of records are 97 | duplicates. 98 | 99 | Once we have learned these good weights, we want to use them to find 100 | which records are duplicates. But it turns out that doing this the naive 101 | way will usually not work, and :doc:`we'll have to do something 102 | smarter `. 103 | 104 | Active learning 105 | ~~~~~~~~~~~~~~~ 106 | 107 | In order to learn those weights, Dedupe needs example pairs with labels. 108 | Most of the time, we will need people to supply those labels. 109 | 110 | But the whole point of Dedupe is to save people's time, and that 111 | includes making good use of your labeling time, so we use an approach 112 | called Active Learning. 113 | 114 | Basically, Dedupe keeps track of a bunch of unlabeled pairs and whether 115 | 116 | 1. the current learned blocking rules would cover the pairs 117 | 2. the current learned classifier would predict that the pairs are 118 | duplicates or are distinct 119 | 120 | We maintain a set of the pairs where there is disagreement: that is, 121 | pairs which the classifier believes are duplicates but which are not 122 | covered by the current blocking rules, and the pairs which the 123 | classifier believes are distinct but which are blocked together. 124 | 125 | Dedupe picks, at random from this disagreement set, a pair of records 126 | and asks the user to decide. Once it gets this label, it relearns the 127 | weights and blocking rules. We then recalculate the disagreement set. 128 | 129 | Other field distances 130 | ~~~~~~~~~~~~~~~~~~~~~ 131 | 132 | We have implemented a number of field distance measures. See :doc:`the 133 | details about variables `. 134 | 135 | 136 | -------------------------------------------------------------------------------- /docs/how-it-works/Special-Cases.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Special Cases 3 | ============= 4 | 5 | The process we have been describing is for the most general case--when 6 | you have a dataset where an arbitrary number of records can all refer to 7 | the same entity. 8 | 9 | There are certain special cases where we can make more assumptions about 10 | how records can be linked, which, if true, make the problem much simpler. 11 | 12 | One important case we call Record Linkage. Say you have two datasets and 13 | you want to find the records in each dataset that refer to the same 14 | thing.
If you can assume that each dataset, individually, is unique, 15 | then this puts a big constraint on how records can match. If this 16 | uniqueness assumption holds, then (A) two records can only refer to the 17 | same entity if they are from different datasets and (B) no other record 18 | can match either of those two records. 19 | 20 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. dedupe documentation master file, created by 2 | sphinx-quickstart on Thu Apr 10 11:27:59 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ================ 7 | Dedupe |release| 8 | ================ 9 | 10 | *dedupe is a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data.* 11 | 12 | If you're looking for the documentation for the Dedupe.io Web API, you can find that here: https://apidocs.dedupe.io/ 13 | 14 | **dedupe** will help you: 15 | 16 | * **remove duplicate entries** from a spreadsheet of names and addresses 17 | * **link a list** with customer information to another with order history, even without unique customer id's 18 | * take a database of campaign contributions and **figure out which ones were made by the same person**, even if the names were entered slightly differently for each record 19 | 20 | dedupe takes in human training data and comes up with the best rules for your dataset to quickly and automatically find similar records, even with very large databases. 21 | 22 | Important links 23 | =============== 24 | 25 | * Documentation: https://docs.dedupe.io/ 26 | * Repository: https://github.com/dedupeio/dedupe 27 | * Issues: https://github.com/dedupeio/dedupe/issues 28 | * Mailing list: https://groups.google.com/forum/#!forum/open-source-deduplication 29 | * Examples: https://github.com/dedupeio/dedupe-examples 30 | * IRC channel, `#dedupe on irc.freenode.net `__ 31 | 32 | Tools built with dedupe 33 | ======================= 34 | 35 | `Dedupe.io `__ 36 | A full service web service powered by dedupe for de-duplicating and find matches in your messy data. It provides an easy-to-use interface and provides cluster review and automation, as well as advanced record linkage, continuous matching and API integrations. `See the product page `__ and the `launch blog post `__. 37 | 38 | `csvdedupe `__ 39 | Command line tool for de-duplicating and `linking `__ CSV files. Read about it on `Source Knight-Mozilla OpenNews `__. 40 | 41 | Contents 42 | ======== 43 | 44 | .. toctree:: 45 | :maxdepth: 1 46 | 47 | API-documentation 48 | Variable-definition 49 | Examples 50 | how-it-works/How-it-works 51 | Troubleshooting 52 | Bibliography 53 | 54 | 55 | Features 56 | ======== 57 | 58 | * **machine learning** - reads in human labeled data to automatically create optimum weights and blocking rules 59 | * **runs on a laptop** - makes intelligent comparisons so you don't need a powerful server to run it 60 | * **built as a library** - so it can be integrated in to your applications or import scripts 61 | * **extensible** - supports adding custom data types, string comparators and blocking rules 62 | * **open source** - anyone can use, modify or add to it 63 | 64 | Installation 65 | ============ 66 | 67 | .. 
code-block:: bash 68 | 69 | pip install dedupe 70 | 71 | Errors / Bugs 72 | ============= 73 | 74 | If something is not behaving intuitively, it is a bug, and should be 75 | reported. `Report it here `__ 76 | 77 | Contributing to dedupe 78 | ====================== 79 | 80 | Check out `dedupe `__ 81 | repo for how to contribute to the library. 82 | 83 | Check out `dedupe-examples 84 | `__ for how to contribute 85 | a useful example of using dedupe. 86 | 87 | Citing dedupe 88 | ============= 89 | 90 | If you use Dedupe in an academic work, please give this citation: 91 | 92 | Gregg, Forest and Derek Eder. 2015. Dedupe. https://github.com/dedupeio/dedupe. 93 | 94 | 95 | Indices and tables 96 | ================== 97 | 98 | * :ref:`genindex` 99 | 100 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.3.0 2 | sphinx-autodoc-typehints 3 | sphinx-rtd-theme>=0.5.1 4 | sphinxcontrib-htmlhelp 5 | sphinxcontrib-jsmath 6 | sphinxcontrib-serializinghtml 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dedupe" 3 | description = "A python library for accurate and scaleable data deduplication and entity-resolution" 4 | version = "3.0.3" 5 | readme = "README.md" 6 | requires-python = ">=3.8" 7 | license = {file = "LICENSE"} 8 | keywords = [] 9 | authors = [ 10 | { name = "Forest Gregg", email = "fgregg@datamade.us" }, 11 | ] 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Science/Research", 16 | "License :: OSI Approved :: MIT License", 17 | "Natural Language :: English", 18 | "Operating System :: MacOS :: MacOS X", 19 | "Operating System :: Microsoft :: Windows", 20 | "Operating System :: POSIX", 21 | "Programming Language :: Cython", 22 | "Programming Language :: Python :: 3", 23 | "Topic :: Software Development :: Libraries :: Python Modules", 24 | "Topic :: Scientific/Engineering", 25 | "Topic :: Scientific/Engineering :: Information Analysis", 26 | ] 27 | dependencies = [ 28 | "scikit-learn", 29 | "affinegap>=1.3", 30 | "categorical-distance>=1.9", 31 | "numpy>=1.20", 32 | "doublemetaphone", 33 | "highered>=0.2.0", 34 | "simplecosine>=1.2", 35 | "haversine>=0.4.1", 36 | "BTrees>=4.1.4", 37 | "zope.index", 38 | "dedupe_Levenshtein_search", 39 | ] 40 | 41 | [project.urls] 42 | Homepage = "https://github.com/dedupeio/dedupe" 43 | Issues = "https://github.com/dedupeio/dedupe/issues" 44 | Documentation = "https://docs.dedupe.io/en/latest/" 45 | Examples = "https://github.com/dedupeio/dedupe-examples" 46 | Twitter = "https://twitter.com/DedupeIo" 47 | Changelog = "https://github.com/dedupeio/dedupe/blob/main/CHANGELOG.md" 48 | MailingList = "https://groups.google.com/forum/#!forum/open-source-deduplication" 49 | 50 | 51 | [build-system] 52 | requires = ["setuptools", 53 | "wheel", 54 | "cython"] 55 | build-backend = "setuptools.build_meta" 56 | 57 | [tool.setuptools] 58 | packages = ["dedupe", "dedupe.variables"] 59 | 60 | [tool.setuptools.package-data] 61 | dedupe = ["py.typed"] 62 | 63 | [tool.mypy] 64 | plugins = "numpy.typing.mypy_plugin" 65 | files = ["dedupe"] 66 | show_error_codes = true 67 | ignore_missing_imports = true 68 | check_untyped_defs = true 69 | implicit_reexport = false 70 | 71 | [tool.pytest.ini_options] 
72 | minversion = "7.1" 73 | addopts = "--cov dedupe --cov-report xml" 74 | testpaths = ["tests", "dedupe"] 75 | 76 | [tool.isort] 77 | profile = "black" 78 | src_paths = ["dedupe", "tests", "benchmarks"] 79 | 80 | [tool.coverage.run] 81 | omit = ["dedupe/backport.py"] 82 | source = ["dedupe"] 83 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asv 2 | black 3 | coverage[toml] 4 | coveralls 5 | flake8 6 | mock 7 | mypy 8 | pytest 9 | pytest-cov 10 | virtualenv 11 | isort 12 | pre-commit 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import Extension, setup 3 | except ImportError: 4 | raise ImportError( 5 | "setuptools module required, please go to https://pypi.python.org/pypi/setuptools and follow the instructions for installing setuptools" 6 | ) 7 | 8 | from Cython.Build import cythonize 9 | 10 | setup( 11 | ext_modules=cythonize([Extension("dedupe.cpredicates", ["dedupe/cpredicates.pyx"])]) 12 | ) 13 | -------------------------------------------------------------------------------- /tests/duplicateCluster_memory_case.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import dedupe.core 4 | import dedupe.dedupe # noqa: F401 5 | 6 | # simulated_candidates = (((1, {'name': 'asdffdsa'}), (2, {'name': 'fdsaasdf'})) 7 | # for _ in xrange(10**6)) 8 | 9 | # data_model = {"fields": {"name": {"type": "String", "weight": -1.0}}, 10 | # "bias": 1.0} 11 | # threshold = 0 12 | 13 | # dupes = dedupe.core.scoreDuplicates(simulated_candidates, 14 | # data_model, 15 | # 0) 16 | 17 | # simulated_candidates = (((1, {'name': 'asdffdsa'}), (2, {'name': 'fdsaasdf'})) 18 | # for _ in xrange(10**7)) 19 | 20 | 21 | # deduper = dedupe.dedupe.Dedupe({"name": {"type": "String", "weight": -1.0}}) 22 | # clusters = deduper.duplicateClusters(simulated_candidates, 0, 0) 23 | 24 | 25 | def candidates_gen(): 26 | candidate_set = set() 27 | for _ in range(10**5): 28 | block = [((random.randint(0, 1000), "a"), (random.randint(0, 1000), "b"))] 29 | for candidate in block: 30 | pair_ids = (candidate[0][0], candidate[1][0]) 31 | if pair_ids not in candidate_set: 32 | yield candidate 33 | candidate_set.add(pair_ids) 34 | del candidate_set 35 | 36 | 37 | @profile # noqa: F821 38 | def generator_test(): 39 | a = sum(candidate[0][0] for candidate in candidates_gen()) 40 | print(a) 41 | 42 | 43 | generator_test() 44 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unittest 3 | import warnings 4 | from collections import OrderedDict 5 | 6 | import dedupe.api 7 | 8 | 9 | def icfi(x): 10 | return list(itertools.chain.from_iterable(x)) 11 | 12 | 13 | DATA_SAMPLE = [ 14 | ({"age": "27", "name": "Kyle"}, {"age": "50", "name": "Bob"}), 15 | ({"age": "27", "name": "Kyle"}, {"age": "35", "name": "William"}), 16 | ({"age": "10", "name": "Sue"}, {"age": "35", "name": "William"}), 17 | ({"age": "27", "name": "Kyle"}, {"age": "20", "name": "Jimmy"}), 18 | ({"age": "75", "name": "Charlie"}, {"age": "21", "name": "Jimbo"}), 19 | ] 20 | 21 | data_dict = OrderedDict( 22 | ( 23 | (0, {"name": "Bob", "age": "51"}), 24 | (1, {"name": 
"Linda", "age": "50"}), 25 | (2, {"name": "Gene", "age": "12"}), 26 | (3, {"name": "Tina", "age": "15"}), 27 | (4, {"name": "Bob B.", "age": "51"}), 28 | (5, {"name": "bob belcher", "age": "51"}), 29 | (6, {"name": "linda ", "age": "50"}), 30 | ) 31 | ) 32 | 33 | data_dict_2 = OrderedDict( 34 | ( 35 | (7, {"name": "BOB", "age": "51"}), 36 | (8, {"name": "LINDA", "age": "50"}), 37 | (9, {"name": "GENE", "age": "12"}), 38 | (10, {"name": "TINA", "age": "15"}), 39 | (11, {"name": "BOB B.", "age": "51"}), 40 | (12, {"name": "BOB BELCHER", "age": "51"}), 41 | (13, {"name": "LINDA ", "age": "50"}), 42 | ) 43 | ) 44 | 45 | 46 | class ActiveMatch(unittest.TestCase): 47 | def setUp(self): 48 | self.field_definition = [ 49 | dedupe.variables.String("name"), 50 | dedupe.variables.String("age"), 51 | ] 52 | 53 | def test_initialize_fields(self): 54 | self.assertRaises(TypeError, dedupe.api.ActiveMatching) 55 | 56 | with self.assertRaises(ValueError): 57 | dedupe.api.ActiveMatching( 58 | [], 59 | ) 60 | 61 | with self.assertRaises(ValueError): 62 | dedupe.api.ActiveMatching([{"field": "name", "type": "String"}]) 63 | 64 | with self.assertRaises(ValueError): 65 | dedupe.api.ActiveMatching( 66 | [dedupe.variables.Custom("name", comparator=lambda x, y: 1)], 67 | ) 68 | 69 | with self.assertRaises(ValueError): 70 | dedupe.api.ActiveMatching( 71 | [ 72 | dedupe.variables.Custom("name", comparator=lambda x, y: 1), 73 | dedupe.variables.Custom("age", comparator=lambda x, y: 1), 74 | ], 75 | ) 76 | 77 | dedupe.api.ActiveMatching( 78 | [ 79 | dedupe.variables.Custom("name", comparator=lambda x, y: 1), 80 | dedupe.variables.String("age"), 81 | ], 82 | ) 83 | 84 | def test_check_record(self): 85 | matcher = dedupe.api.ActiveMatching(self.field_definition) 86 | 87 | self.assertRaises(ValueError, matcher._checkRecordPair, ()) 88 | self.assertRaises(ValueError, matcher._checkRecordPair, (1, 2)) 89 | self.assertRaises(ValueError, matcher._checkRecordPair, (1, 2, 3)) 90 | self.assertRaises(ValueError, matcher._checkRecordPair, ({}, {})) 91 | 92 | matcher._checkRecordPair( 93 | ({"name": "Frank", "age": "72"}, {"name": "Bob", "age": "27"}) 94 | ) 95 | 96 | def test_markPair(self): 97 | from collections import OrderedDict 98 | 99 | good_training_pairs = OrderedDict( 100 | (("match", DATA_SAMPLE[3:5]), ("distinct", DATA_SAMPLE[0:3])) 101 | ) 102 | bad_training_pairs = {"non_dupes": DATA_SAMPLE[0:3], "match": DATA_SAMPLE[3:5]} 103 | 104 | matcher = dedupe.api.ActiveMatching(self.field_definition) 105 | 106 | self.assertRaises(ValueError, matcher.mark_pairs, bad_training_pairs) 107 | 108 | matcher.mark_pairs(good_training_pairs) 109 | 110 | with warnings.catch_warnings(record=True) as w: 111 | warnings.simplefilter("always") 112 | matcher.mark_pairs({"match": [], "distinct": []}) 113 | assert len(w) == 1 114 | assert str(w[-1].message) == "Didn't return any labeled record pairs" 115 | 116 | 117 | if __name__ == "__main__": 118 | unittest.main() 119 | -------------------------------------------------------------------------------- /tests/test_blocking.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from collections import defaultdict 3 | 4 | import dedupe 5 | 6 | 7 | class BlockingTest(unittest.TestCase): 8 | def setUp(self): 9 | field_definition = [{"field": "name", "type": "String"}] 10 | self.data_model = dedupe.Dedupe(field_definition).data_model 11 | self.training_pairs = { 12 | "match": [ 13 | ({"name": "Bob", "age": "50"}, {"name": "Bob", "age": "75"}), 14 
| ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"}), 15 | ], 16 | "distinct": [ 17 | ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"}), 18 | ({"name": "Willy", "age": "35"}, {"name": "William", "age": "35"}), 19 | ({"name": "William", "age": "36"}, {"name": "William", "age": "35"}), 20 | ], 21 | } 22 | 23 | self.training = self.training_pairs["match"] + self.training_pairs["distinct"] 24 | self.training_records = [] 25 | for pair in self.training: 26 | for record in pair: 27 | if record not in self.training_records: 28 | self.training_records.append(record) 29 | 30 | self.simple = lambda x: {str(k) for k in x if "CompoundPredicate" not in str(k)} 31 | 32 | 33 | class TfidfTest(unittest.TestCase): 34 | def setUp(self): 35 | self.data_d = { 36 | 100: {"name": "Bob", "age": "50", "dataset": 0}, 37 | 105: {"name": "Charlie", "age": "75", "dataset": 1}, 38 | 110: {"name": "Meredith", "age": "40", "dataset": 1}, 39 | 115: {"name": "Sue", "age": "10", "dataset": 0}, 40 | 120: {"name": "Jimbo", "age": "21", "dataset": 0}, 41 | 125: {"name": "Jimbo", "age": "21", "dataset": 0}, 42 | 130: {"name": "Willy", "age": "35", "dataset": 0}, 43 | 135: {"name": "Willy", "age": "35", "dataset": 1}, 44 | 140: {"name": "Martha", "age": "19", "dataset": 1}, 45 | 145: {"name": "Kyle", "age": "27", "dataset": 0}, 46 | } 47 | 48 | def test_unconstrained_inverted_index(self): 49 | blocker = dedupe.blocking.Fingerprinter( 50 | [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")] 51 | ) 52 | 53 | blocker.index({record["name"] for record in self.data_d.values()}, "name") 54 | 55 | blocks = defaultdict(set) 56 | 57 | for block_key, record_id in blocker(self.data_d.items()): 58 | blocks[block_key].add(record_id) 59 | 60 | blocks = {frozenset(block) for block in blocks.values() if len(block) > 1} 61 | 62 | assert blocks == {frozenset([120, 125]), frozenset([130, 135])} 63 | 64 | 65 | class TfIndexUnindex(unittest.TestCase): 66 | def setUp(self): 67 | data_d = { 68 | 100: {"name": "Bob", "age": "50", "dataset": 0}, 69 | 105: {"name": "Charlie", "age": "75", "dataset": 1}, 70 | 110: {"name": "Meredith", "age": "40", "dataset": 1}, 71 | 115: {"name": "Sue", "age": "10", "dataset": 0}, 72 | 120: {"name": "Jimbo", "age": "21", "dataset": 0}, 73 | 125: {"name": "Jimbo", "age": "21", "dataset": 0}, 74 | 130: {"name": "Willy", "age": "35", "dataset": 0}, 75 | 135: {"name": "Willy", "age": "35", "dataset": 1}, 76 | 140: {"name": "Martha", "age": "19", "dataset": 1}, 77 | 145: {"name": "Kyle", "age": "27", "dataset": 0}, 78 | } 79 | 80 | self.blocker = dedupe.blocking.Fingerprinter( 81 | [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")] 82 | ) 83 | 84 | self.records_1 = { 85 | record_id: record 86 | for record_id, record in data_d.items() 87 | if record["dataset"] == 0 88 | } 89 | 90 | self.fields_2 = { 91 | record_id: record["name"] 92 | for record_id, record in data_d.items() 93 | if record["dataset"] == 1 94 | } 95 | 96 | def test_index(self): 97 | self.blocker.index(set(self.fields_2.values()), "name") 98 | 99 | blocks = defaultdict(set) 100 | 101 | for block_key, record_id in self.blocker(self.records_1.items()): 102 | blocks[block_key].add(record_id) 103 | 104 | assert list(blocks.items())[0][1] == {130} 105 | 106 | def test_doubled_index(self): 107 | self.blocker.index(self.fields_2.values(), "name") 108 | self.blocker.index(self.fields_2.values(), "name") 109 | 110 | blocks = defaultdict(set) 111 | 112 | for block_key, record_id in self.blocker(self.records_1.items()): 113 | 
blocks[block_key].add(record_id) 114 | 115 | result = list(blocks.items()) 116 | 117 | assert len(result) == 1 118 | 119 | assert result[0][1] == {130} 120 | 121 | def test_unindex(self): 122 | self.blocker.index(self.fields_2.values(), "name") 123 | self.blocker.unindex(self.fields_2.values(), "name") 124 | 125 | blocks = defaultdict(set) 126 | 127 | for block_key, record_id in self.blocker(self.records_1.items()): 128 | blocks[block_key].add(record_id) 129 | 130 | assert len(blocks.values()) == 0 131 | 132 | 133 | if __name__ == "__main__": 134 | unittest.main() 135 | -------------------------------------------------------------------------------- /tests/test_canonical.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import dedupe.canonical 4 | 5 | 6 | class CanonicalizationTest(unittest.TestCase): 7 | def test_get_centroid(self): 8 | from affinegap import normalizedAffineGapDistance as comparator 9 | 10 | attributeList = [ 11 | "mary crane center", 12 | "mary crane center north", 13 | "mary crane league - mary crane - west", 14 | "mary crane league mary crane center (east)", 15 | "mary crane league mary crane center (north)", 16 | "mary crane league mary crane center (west)", 17 | "mary crane league - mary crane - east", 18 | "mary crane family and day care center", 19 | "mary crane west", 20 | "mary crane center east", 21 | "mary crane league mary crane center (east)", 22 | "mary crane league mary crane center (north)", 23 | "mary crane league mary crane center (west)", 24 | "mary crane league", 25 | "mary crane", 26 | "mary crane east 0-3", 27 | "mary crane north", 28 | "mary crane north 0-3", 29 | "mary crane league - mary crane - west", 30 | "mary crane league - mary crane - north", 31 | "mary crane league - mary crane - east", 32 | "mary crane league - mary crane - west", 33 | "mary crane league - mary crane - north", 34 | "mary crane league - mary crane - east", 35 | ] 36 | 37 | centroid = dedupe.canonical.getCentroid(attributeList, comparator) 38 | assert centroid == "mary crane" 39 | 40 | def test_get_canonical_rep(self): 41 | record_list = [ 42 | {"name": "mary crane", "address": "123 main st", "zip": "12345"}, 43 | {"name": "mary crane east", "address": "123 main street", "zip": ""}, 44 | {"name": "mary crane west", "address": "123 man st", "zip": ""}, 45 | ] 46 | 47 | rep = dedupe.canonical.getCanonicalRep(record_list) 48 | assert rep == { 49 | "name": "mary crane", 50 | "address": "123 main street", 51 | "zip": "12345", 52 | } 53 | 54 | rep = dedupe.canonical.getCanonicalRep(record_list[0:2]) 55 | assert rep == {"name": "mary crane", "address": "123 main st", "zip": "12345"} 56 | 57 | rep = dedupe.canonical.getCanonicalRep(record_list[0:1]) 58 | assert rep == {"name": "mary crane", "address": "123 main st", "zip": "12345"} 59 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import numpy 5 | import scipy.special 6 | 7 | import dedupe 8 | 9 | 10 | class MockClassifier: 11 | def __init__(self): 12 | self.weight = 0 13 | self.bias = 0 14 | 15 | def predict_proba(self, examples): 16 | return scipy.special.expit(examples * self.weight + self.bias) 17 | 18 | 19 | class ScoreDuplicates(unittest.TestCase): 20 | def setUp(self): 21 | random.seed(123) 22 | 23 | long_string = 
"asa;sasdfjasdio;fio;asdnfasdvnvao;asduifvnavjasdfasdfasfasasdfasdfasdfasdfasdfsdfasgnuavpidcvaspdivnaspdivninasduinguipghauipsdfnvaspfighapsdifnasdifnasdpighuignpaguinpgiasidfjasdfjsdofgiongag" # noqa: E501 24 | 25 | self.records = iter( 26 | [ 27 | ( 28 | (long_string, {"name": "Margret", "age": "32"}), 29 | ("2", {"name": "Marga", "age": "33"}), 30 | ), 31 | ( 32 | ("2", {"name": "Marga", "age": "33"}), 33 | ("3", {"name": "Maria", "age": "19"}), 34 | ), 35 | ( 36 | ("4", {"name": "Maria", "age": "19"}), 37 | ("5", {"name": "Monica", "age": "39"}), 38 | ), 39 | ( 40 | ("6", {"name": "Monica", "age": "39"}), 41 | ("7", {"name": "Mira", "age": "47"}), 42 | ), 43 | ( 44 | ("8", {"name": "Mira", "age": "47"}), 45 | ("9", {"name": "Mona", "age": "9"}), 46 | ), 47 | ] 48 | ) 49 | 50 | deduper = dedupe.Dedupe([dedupe.variables.String("name")]) 51 | self.data_model = deduper.data_model 52 | self.classifier = MockClassifier() 53 | 54 | self.classifier.weight = -1.0302742719650269 55 | self.classifier.bias = 4.76 56 | 57 | score_dtype = [("pairs", " 0.001: 165 | return False 166 | else: 167 | return True 168 | 169 | def test_hierarchical(self): 170 | hierarchical = dedupe.clustering.cluster 171 | assert self.clusterEquals(list(hierarchical(self.dupes, 1)), []) 172 | 173 | assert self.clusterEquals( 174 | list(hierarchical(self.dupes, 0.5)), 175 | [ 176 | ((1, 2, 3), (0.778, 0.860, 0.778)), 177 | ((4, 5), (0.720, 0.720)), 178 | ((10, 11), (0.899, 0.899)), 179 | ], 180 | ) 181 | 182 | print(hierarchical(self.dupes, 0.0)) 183 | assert self.clusterEquals( 184 | list(hierarchical(self.dupes, 0)), 185 | [ 186 | ((1, 2, 3, 4, 5), (0.526, 0.564, 0.542, 0.320, 0.623)), 187 | ((10, 11), (0.899, 0.899)), 188 | ], 189 | ) 190 | 191 | assert list(hierarchical(self.str_dupes, 1)) == [] 192 | assert list(zip(*hierarchical(self.str_dupes, 0.5)))[0] == ( 193 | (b"1", b"2", b"3"), 194 | (b"4", b"5"), 195 | ) 196 | assert list(zip(*hierarchical(self.str_dupes, 0)))[0] == ( 197 | (b"1", b"2", b"3", b"4", b"5"), 198 | ) 199 | 200 | def test_greedy_matching(self): 201 | greedyMatch = dedupe.clustering.greedyMatching 202 | 203 | bipartite_dupes = numpy.array( 204 | list(self.bipartite_dupes), dtype=[("ids", int, 2), ("score", float)] 205 | ) 206 | 207 | assert list(greedyMatch(bipartite_dupes)) == [ 208 | ((4, 6), 0.96), 209 | ((2, 7), 0.72), 210 | ((3, 8), 0.65), 211 | ((1, 5), 0.1), 212 | ] 213 | 214 | def test_gazette_matching(self): 215 | gazetteMatch = dedupe.clustering.gazetteMatching 216 | blocked_dupes = itertools.groupby(self.bipartite_dupes, key=lambda x: x[0][0]) 217 | 218 | def to_numpy(x): 219 | return numpy.array(x, dtype=[("ids", int, 2), ("score", float)]) 220 | 221 | blocked_dupes = [to_numpy(list(block)) for _, block in blocked_dupes] 222 | 223 | target = [ 224 | (((1, 6), 0.72), ((1, 8), 0.6)), 225 | (((2, 7), 0.72), ((2, 8), 0.3)), 226 | (((3, 6), 0.72), ((3, 8), 0.65)), 227 | (((4, 6), 0.96), ((4, 5), 0.63)), 228 | (((5, 8), 0.24),), 229 | ] 230 | 231 | assert [ 232 | tuple((tuple(pair), score) for pair, score in each.tolist()) 233 | for each in gazetteMatch(blocked_dupes, n_matches=2) 234 | ] == target 235 | 236 | 237 | if __name__ == "__main__": 238 | unittest.main() 239 | -------------------------------------------------------------------------------- /tests/test_exists.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy 4 | 5 | from dedupe.variables.exists import ExistsType 6 | 7 | 8 | class TestExists(unittest.TestCase): 
9 | def test_comparator(self): 10 | var = ExistsType("foo") 11 | assert numpy.array_equal(var.comparator(None, None), [0, 0]) 12 | assert numpy.array_equal(var.comparator(1, 1), [1, 0]) 13 | assert numpy.array_equal(var.comparator(1, 0), [0, 1]) 14 | 15 | def test_len_higher_vars(self): 16 | # The len > 1 is necessary for the correct processing in datamodel.py 17 | var = ExistsType("foo") 18 | assert len(var) > 1 19 | assert len(var.higher_vars) > 1 20 | assert len(var) == len(var.higher_vars) 21 | -------------------------------------------------------------------------------- /tests/test_labeler.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import pytest 5 | 6 | import dedupe 7 | from dedupe import datamodel, labeler 8 | from dedupe._typing import RecordDictPair 9 | 10 | SAMPLE = { 11 | 1: {"name": "Meredith", "age": "40"}, 12 | 2: {"name": "Sue", "age": "10"}, 13 | 3: {"name": "Willy", "age": "35"}, 14 | 4: {"name": "William", "age": "35"}, 15 | 5: {"name": "Jimmy", "age": "20"}, 16 | 6: {"name": "Jimbo", "age": "21"}, 17 | } 18 | 19 | 20 | def freeze_record_pair(record_pair: RecordDictPair): 21 | rec1, rec2 = record_pair 22 | return (frozenset(rec1.items()), frozenset(rec2.items())) 23 | 24 | 25 | class ActiveLearningTest(unittest.TestCase): 26 | def setUp(self): 27 | self.data_model = datamodel.DataModel( 28 | [dedupe.variables.String("name"), dedupe.variables.String("age")] 29 | ) 30 | 31 | def test_AL(self): 32 | random.seed(1111111111110) 33 | # Even with a random seed, the order of the following seems to be random, 34 | # so we shouldn't test for exact order. 35 | EXPECTED_CANDIDATES = [ 36 | ({"name": "Willy", "age": "35"}, {"name": "William", "age": "35"}), 37 | ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"}), 38 | ({"name": "Willy", "age": "35"}, {"name": "Jimmy", "age": "20"}), 39 | ({"name": "William", "age": "35"}, {"name": "Jimmy", "age": "20"}), 40 | ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"}), 41 | ({"name": "Meredith", "age": "40"}, {"name": "Jimmy", "age": "20"}), 42 | ({"name": "Sue", "age": "10"}, {"name": "Jimmy", "age": "20"}), 43 | ({"name": "Willy", "age": "35"}, {"name": "Jimbo", "age": "21"}), 44 | ({"name": "William", "age": "35"}, {"name": "Jimbo", "age": "21"}), 45 | ] 46 | EXPECTED_CANDIDATES = {freeze_record_pair(pair) for pair in EXPECTED_CANDIDATES} 47 | active_learner = labeler.DedupeDisagreementLearner( 48 | self.data_model.predicates, self.data_model.distances, SAMPLE, [] 49 | ) 50 | actual_candidates = set() 51 | for i in range(len(EXPECTED_CANDIDATES), 0, -1): 52 | assert len(active_learner) == i 53 | record_pair = freeze_record_pair(active_learner.pop()) 54 | actual_candidates.add(record_pair) 55 | assert actual_candidates == EXPECTED_CANDIDATES 56 | with pytest.raises(IndexError): 57 | active_learner.pop() 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /tests/test_memory.sh: -------------------------------------------------------------------------------- 1 | valgrind --tool=massif --suppressions=/usr/share/doc/python26-devel-2.6.8/valgrind-python.supp --massif-out-file=out.txt --depth=1 python2.6 tests/test_affine_memory.py 2 | ms_print out.txt | less 3 | -------------------------------------------------------------------------------- /tests/test_predicates.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from dedupe import predicates 4 | 5 | 6 | class TestPuncStrip(unittest.TestCase): 7 | def test_sevenchar(self): 8 | s1 = predicates.StringPredicate(predicates.sameSevenCharStartPredicate, "foo") 9 | assert s1({"foo": "fo,18v*1vaad80"}) == s1({"foo": "fo18v1vaad80"}) 10 | 11 | def test_set(self): 12 | s1 = predicates.SimplePredicate(predicates.wholeSetPredicate, "foo") 13 | colors = {"red", "blue", "green"} 14 | assert s1({"foo": colors}) == {str(colors)} 15 | 16 | 17 | class TestMetaphone(unittest.TestCase): 18 | def test_metaphone_token(self): 19 | block_val = predicates.metaphoneToken("9301 S. State St. ") 20 | assert block_val == {"STT", "S", "ST"} 21 | 22 | 23 | class TestWholeSet(unittest.TestCase): 24 | def setUp(self): 25 | self.s1 = {"red", "blue", "green"} 26 | 27 | def test_full_set(self): 28 | block_val = predicates.wholeSetPredicate(self.s1) 29 | self.assertEqual(block_val, {str(self.s1)}) 30 | 31 | 32 | class TestSetElement(unittest.TestCase): 33 | def setUp(self): 34 | self.s1 = {"red", "blue", "green"} 35 | 36 | def test_long_set(self): 37 | block_val = predicates.commonSetElementPredicate(self.s1) 38 | self.assertEqual(set(block_val), {"blue", "green", "red"}) 39 | 40 | def test_empty_set(self): 41 | block_val = predicates.commonSetElementPredicate(set()) 42 | self.assertEqual(block_val, set()) 43 | 44 | def test_first_last(self): 45 | block_val = predicates.lastSetElementPredicate(self.s1) 46 | assert block_val == {"red"} 47 | block_val = predicates.firstSetElementPredicate(self.s1) 48 | assert block_val == {"blue"} 49 | 50 | def test_magnitude(self): 51 | block_val = predicates.magnitudeOfCardinality(self.s1) 52 | assert block_val == {"0"} 53 | 54 | block_val = predicates.magnitudeOfCardinality(()) 55 | assert block_val == set() 56 | 57 | 58 | class TestLatLongGrid(unittest.TestCase): 59 | def setUp(self): 60 | self.latlong1 = (42.535, -5.012) 61 | 62 | def test_precise_latlong(self): 63 | block_val = predicates.latLongGridPredicate(self.latlong1) 64 | assert block_val == {"(42.5, -5.0)"} 65 | block_val = predicates.latLongGridPredicate((0, 0)) 66 | assert block_val == set() 67 | 68 | 69 | class TestAlpaNumeric(unittest.TestCase): 70 | def test_alphanumeric(self): 71 | assert predicates.alphaNumericPredicate("a1") == {"a1"} 72 | assert predicates.alphaNumericPredicate("1a") == {"1a"} 73 | assert predicates.alphaNumericPredicate("a1b") == {"a1b"} 74 | assert predicates.alphaNumericPredicate("1 a") == {"1"} 75 | assert predicates.alphaNumericPredicate("a1 b1") == {"a1", "b1"} 76 | assert predicates.alphaNumericPredicate("asdf") == set() 77 | assert predicates.alphaNumericPredicate("1") == {"1"} 78 | assert predicates.alphaNumericPredicate("a_1") == {"1"} 79 | assert predicates.alphaNumericPredicate("a$1") == {"1"} 80 | assert predicates.alphaNumericPredicate("a 1") == {"1"} 81 | assert predicates.alphaNumericPredicate("773-555-1676") == { 82 | "773", 83 | "555", 84 | "1676", 85 | } 86 | 87 | 88 | class TestNumericPredicates(unittest.TestCase): 89 | def test_order_of_magnitude(self): 90 | assert predicates.orderOfMagnitude(10) == {"1"} 91 | assert predicates.orderOfMagnitude(9) == {"1"} 92 | assert predicates.orderOfMagnitude(2) == {"0"} 93 | assert predicates.orderOfMagnitude(-2) == set() 94 | 95 | def test_round_to_1(self): 96 | assert predicates.roundTo1(22315) == {"20000"} 97 | assert predicates.roundTo1(-22315) == {"-20000"} 98 | 99 | 100 | class 
TestCompoundPredicate(unittest.TestCase): 101 | def test_escapes_colon(self): 102 | """ 103 | Regression test for issue #836 104 | """ 105 | predicate_1 = predicates.SimplePredicate( 106 | predicates.commonSetElementPredicate, "col_1" 107 | ) 108 | predicate_2 = predicates.SimplePredicate( 109 | predicates.commonSetElementPredicate, "col_2" 110 | ) 111 | record = {"col_1": ["foo:", "foo"], "col_2": [":bar", "bar"]} 112 | 113 | block_val = predicates.CompoundPredicate([predicate_1, predicate_2])(record) 114 | assert len(set(block_val)) == 4 115 | assert block_val == {"foo\\::\\:bar", "foo\\::bar", "foo:\\:bar", "foo:bar"} 116 | 117 | def test_escapes_escaped_colon(self): 118 | """ 119 | Regression test for issue #836 120 | """ 121 | predicate_1 = predicates.SimplePredicate( 122 | predicates.commonSetElementPredicate, "col_1" 123 | ) 124 | predicate_2 = predicates.SimplePredicate( 125 | predicates.commonSetElementPredicate, "col_2" 126 | ) 127 | record = {"col_1": ["foo\\:", "foo"], "col_2": ["\\:bar", "bar"]} 128 | 129 | block_val = predicates.CompoundPredicate([predicate_1, predicate_2])(record) 130 | assert len(set(block_val)) == 4 131 | assert block_val == { 132 | "foo\\\\::\\\\:bar", 133 | "foo\\\\::bar", 134 | "foo:\\\\:bar", 135 | "foo:bar", 136 | } 137 | 138 | 139 | if __name__ == "__main__": 140 | unittest.main() 141 | -------------------------------------------------------------------------------- /tests/test_price.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from dedupe.variables.price import PriceType 4 | 5 | 6 | class TestPrice(unittest.TestCase): 7 | def test_comparator(self): 8 | assert PriceType.comparator(1, 10) == 1 9 | assert PriceType.comparator(10, 1) == 1 10 | -------------------------------------------------------------------------------- /tests/test_serializer.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import sys 4 | import unittest 5 | 6 | import dedupe 7 | 8 | 9 | class SerializerTest(unittest.TestCase): 10 | def test_writeTraining(self): 11 | if sys.version < "3": 12 | from StringIO import StringIO 13 | 14 | output = StringIO() 15 | encoded_file = codecs.EncodedFile( 16 | output, data_encoding="utf8", file_encoding="ascii" 17 | ) 18 | else: 19 | from io import StringIO 20 | 21 | encoded_file = StringIO() 22 | 23 | training_pairs = { 24 | "distinct": [ 25 | [ 26 | { 27 | "bar": frozenset(["barë"]), 28 | "baz": (1, 2), 29 | "bang": (1, 2), 30 | "foo": "baz", 31 | }, 32 | {"foo": "baz"}, 33 | ] 34 | ], 35 | "match": [], 36 | } 37 | 38 | json.dump(training_pairs, encoded_file, cls=dedupe.serializer.TupleEncoder) 39 | 40 | encoded_file.seek(0) 41 | 42 | loaded_training_pairs = json.load( 43 | encoded_file, object_hook=dedupe.serializer._from_json 44 | ) 45 | 46 | assert loaded_training_pairs["distinct"][0][0] == dict( 47 | training_pairs["distinct"][0][0] 48 | ) 49 | 50 | assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"], frozenset) 51 | assert isinstance(loaded_training_pairs["distinct"][0][0]["baz"], tuple) 52 | 53 | deduper = dedupe.Dedupe([dedupe.variables.String("foo")]) 54 | deduper.classifier.cv = False 55 | 56 | encoded_file.seek(0) 57 | 58 | deduper._read_training(encoded_file) 59 | print(deduper.training_pairs) 60 | print(training_pairs) 61 | assert deduper.training_pairs == training_pairs 62 | 63 | encoded_file.close() 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest.main() 68 | 
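The serializer test above drives dedupe.serializer.TupleEncoder and dedupe.serializer._from_json only through an in-memory buffer. The sketch below replays the same round trip outside the test harness; the record fields ("tags", "span") are invented for illustration, and only the encoder class, the object hook, and the tuple/frozenset behavior come from the test itself.

import json
from io import StringIO

import dedupe

# Hypothetical labeled pairs; the frozenset and tuple values mirror the
# container types that test_serializer.py asserts survive the round trip.
training_pairs = {
    "match": [],
    "distinct": [
        (
            {"name": "bob belcher", "tags": frozenset(["owner"]), "span": (1, 2)},
            {"name": "linda belcher", "tags": frozenset(), "span": (3, 4)},
        )
    ],
}

buffer = StringIO()
json.dump(training_pairs, buffer, cls=dedupe.serializer.TupleEncoder)
buffer.seek(0)

restored = json.load(buffer, object_hook=dedupe.serializer._from_json)
pair = restored["distinct"][0]

# As in the test, the nested container types come back intact.
assert isinstance(pair[0]["tags"], frozenset)
assert isinstance(pair[0]["span"], tuple)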
-------------------------------------------------------------------------------- /tests/test_tfidf.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import dedupe 4 | 5 | 6 | class ParsingTest(unittest.TestCase): 7 | def setUp(self): 8 | self.index = dedupe.tfidf.TfIdfIndex() 9 | 10 | def test_keywords(self): 11 | self.index.index(("AND", "OR", "EOF", "NOT")) 12 | self.index._index.initSearch() 13 | assert self.index.search(("AND", "OR", "EOF", "NOT"))[0] == 1 14 | 15 | def test_keywords_title(self): 16 | self.index.index(("And", "Or", "Eof", "Not")) 17 | self.index._index.initSearch() 18 | assert self.index.search(("And", "Or", "Eof", "Not"))[0] == 1 19 | 20 | def test_empty_search(self): 21 | self.index._index.initSearch() 22 | assert self.index.search(()) == [] 23 | 24 | def test_wildcards(self): 25 | self.index.index((r"f\o",)) 26 | self.index.index(("f*",)) 27 | self.index._index.initSearch() 28 | assert len(self.index.search(("f*",))) == 1 29 | 30 | 31 | if __name__ == "__main__": 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /tests/test_training.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import dedupe 4 | import dedupe.branch_and_bound as branch_and_bound 5 | import dedupe.training as training 6 | 7 | 8 | class TrainingTest(unittest.TestCase): 9 | def setUp(self): 10 | field_definition = [dedupe.variables.String("name")] 11 | self.data_model = dedupe.Dedupe(field_definition).data_model 12 | self.training_pairs = { 13 | "match": [ 14 | ({"name": "Bob", "age": "50"}, {"name": "Bob", "age": "75"}), 15 | ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"}), 16 | ], 17 | "distinct": [ 18 | ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"}), 19 | ({"name": "Willy", "age": "35"}, {"name": "William", "age": "35"}), 20 | ({"name": "William", "age": "36"}, {"name": "William", "age": "35"}), 21 | ], 22 | } 23 | 24 | self.training = self.training_pairs["match"] + self.training_pairs["distinct"] 25 | self.training_records = [] 26 | for pair in self.training: 27 | for record in pair: 28 | if record not in self.training_records: 29 | self.training_records.append(record) 30 | 31 | self.simple = lambda x: {str(k) for k in x if "CompoundPredicate" not in str(k)} 32 | 33 | self.block_learner = training.BlockLearner 34 | self.block_learner.blocker = dedupe.blocking.Fingerprinter( 35 | self.data_model.predicates 36 | ) 37 | self.block_learner.blocker.index_all( 38 | {i: x for i, x in enumerate(self.training_records)} 39 | ) 40 | 41 | def test_dedupe_coverage(self): 42 | coverage = self.block_learner.cover(self.block_learner, self.training) 43 | assert self.simple(coverage.keys()).issuperset( 44 | { 45 | "SimplePredicate: (tokenFieldPredicate, name)", 46 | "SimplePredicate: (commonSixGram, name)", 47 | "TfidfTextCanopyPredicate: (0.4, name)", 48 | "SimplePredicate: (sortedAcronym, name)", 49 | "SimplePredicate: (sameThreeCharStartPredicate, name)", 50 | "TfidfTextCanopyPredicate: (0.2, name)", 51 | "SimplePredicate: (sameFiveCharStartPredicate, name)", 52 | "TfidfTextCanopyPredicate: (0.6, name)", 53 | "SimplePredicate: (wholeFieldPredicate, name)", 54 | "TfidfTextCanopyPredicate: (0.8, name)", 55 | "SimplePredicate: (commonFourGram, name)", 56 | "SimplePredicate: (firstTokenPredicate, name)", 57 | "SimplePredicate: (sameSevenCharStartPredicate, name)", 58 | } 59 | ) 60 | 61 | def 
test_uncovered_by(self): 62 | before = {1: frozenset({1, 2, 3}), 2: frozenset({1, 2}), 3: frozenset({3})} 63 | after = {1: frozenset({1, 2}), 2: frozenset({1, 2})} 64 | 65 | before_copy = before.copy() 66 | 67 | assert branch_and_bound._uncovered_by(before, frozenset()) == before 68 | assert branch_and_bound._uncovered_by(before, frozenset({3})) == after 69 | assert before == before_copy 70 | 71 | def test_covered_pairs(self): 72 | p1 = lambda x, target=None: frozenset((1,)) # noqa: E731 73 | 74 | self.block_learner.blocker.predicates = (p1,) 75 | cover = self.block_learner.cover(self.block_learner, [("a", "b")] * 2) 76 | 77 | assert cover[p1] == {0, 1} 78 | 79 | 80 | if __name__ == "__main__": 81 | unittest.main() 82 | --------------------------------------------------------------------------------
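Read together, test_blocking.py and test_training.py above exercise one fingerprinting flow: index a field's values, stream records through a dedupe.blocking.Fingerprinter, and group the (block_key, record_id) pairs it emits into candidate blocks. The sketch below repeats that flow with invented toy records; the predicate and Fingerprinter calls are the ones that appear in TfidfTest.

from collections import defaultdict

import dedupe

# Toy records invented for illustration only.
data = {
    1: {"name": "bob belcher"},
    2: {"name": "bob b."},
    3: {"name": "linda belcher"},
}

fingerprinter = dedupe.blocking.Fingerprinter(
    [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")]
)
fingerprinter.index({record["name"] for record in data.values()}, "name")

blocks = defaultdict(set)
for block_key, record_id in fingerprinter(data.items()):
    blocks[block_key].add(record_id)

# Only blocks with more than one member produce candidate comparisons.
candidate_blocks = [ids for ids in blocks.values() if len(ids) > 1]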