├── .flake8 ├── .git-blame-ignore-revs ├── .github ├── dependabot.yml ├── scripts │ └── asv_markdown.py └── workflows │ ├── benchmark-bot.yml │ ├── codeql-analysis.yml │ ├── lock.yml │ └── pythonpackage.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── THANKS.md ├── benchmarks ├── asv.conf.json ├── benchmarks │ ├── __init__.py │ ├── canonical.py │ ├── canonical_gazetteer.py │ ├── canonical_matching.py │ ├── common.py │ └── datasets │ │ ├── restaurant-1.csv │ │ ├── restaurant-2.csv │ │ ├── restaurant-nophone-training.csv │ │ └── restaurant-nophone.csv └── setup.py ├── dedupe ├── __init__.py ├── _typing.py ├── api.py ├── backport.py ├── blocking.py ├── branch_and_bound.py ├── canonical.py ├── canopy_index.py ├── clustering.py ├── convenience.py ├── core.py ├── cpredicates.pyx ├── datamodel.py ├── index.py ├── labeler.py ├── levenshtein.py ├── predicate_functions.py ├── predicates.py ├── py.typed ├── serializer.py ├── tfidf.py ├── training.py └── variables │ ├── __init__.py │ ├── base.py │ ├── categorical_type.py │ ├── exact.py │ ├── exists.py │ ├── interaction.py │ ├── latlong.py │ ├── price.py │ ├── set.py │ └── string.py ├── docs ├── API-documentation.rst ├── Bibliography.rst ├── Examples.rst ├── Makefile ├── Troubleshooting.rst ├── Variable-definition.rst ├── _static │ ├── css │ │ ├── bootstrap.css │ │ └── custom.css │ ├── images │ │ ├── dedupeio-logo-reversed.png │ │ └── dedupeio-logo.png │ └── js │ │ ├── bootstrap.min.js │ │ └── jquery.min.js ├── _templates │ └── layout.html ├── conf.py ├── how-it-works │ ├── Choosing-a-good-threshold.rst │ ├── Grouping-duplicates.rst │ ├── How-it-works.rst │ ├── Making-smart-comparisons.rst │ ├── Matching-records.rst │ └── Special-Cases.rst ├── index.rst └── requirements.txt ├── pyproject.toml ├── requirements.txt ├── setup.py └── tests ├── duplicateCluster_memory_case.py ├── test_api.py ├── test_blocking.py ├── test_canonical.py ├── test_core.py ├── test_cpredicates.py ├── test_dedupe.py ├── test_exists.py ├── test_labeler.py ├── test_memory.sh ├── test_predicate_functions.py ├── test_predicates.py ├── test_price.py ├── test_serializer.py ├── test_tfidf.py └── test_training.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=160 3 | extend-ignore = E203 -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.git-blame-ignore-revs -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.github/dependabot.yml -------------------------------------------------------------------------------- /.github/scripts/asv_markdown.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.github/scripts/asv_markdown.py -------------------------------------------------------------------------------- /.github/workflows/benchmark-bot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.github/workflows/benchmark-bot.yml -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.github/workflows/codeql-analysis.yml -------------------------------------------------------------------------------- /.github/workflows/lock.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.github/workflows/lock.yml -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.github/workflows/pythonpackage.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/.readthedocs.yml -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/CITATION.cff -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/CONTRIBUTORS.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include dedupe/cpredicates.pyx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/README.md -------------------------------------------------------------------------------- /THANKS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/THANKS.md -------------------------------------------------------------------------------- /benchmarks/asv.conf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/asv.conf.json -------------------------------------------------------------------------------- /benchmarks/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/canonical.py -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical_gazetteer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/canonical_gazetteer.py -------------------------------------------------------------------------------- /benchmarks/benchmarks/canonical_matching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/canonical_matching.py -------------------------------------------------------------------------------- /benchmarks/benchmarks/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/common.py -------------------------------------------------------------------------------- /benchmarks/benchmarks/datasets/restaurant-1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/datasets/restaurant-1.csv -------------------------------------------------------------------------------- /benchmarks/benchmarks/datasets/restaurant-2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/datasets/restaurant-2.csv -------------------------------------------------------------------------------- /benchmarks/benchmarks/datasets/restaurant-nophone-training.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/datasets/restaurant-nophone-training.csv -------------------------------------------------------------------------------- /benchmarks/benchmarks/datasets/restaurant-nophone.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/benchmarks/datasets/restaurant-nophone.csv -------------------------------------------------------------------------------- /benchmarks/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/benchmarks/setup.py -------------------------------------------------------------------------------- /dedupe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/__init__.py -------------------------------------------------------------------------------- /dedupe/_typing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/_typing.py -------------------------------------------------------------------------------- /dedupe/api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/api.py -------------------------------------------------------------------------------- /dedupe/backport.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/backport.py -------------------------------------------------------------------------------- /dedupe/blocking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/blocking.py -------------------------------------------------------------------------------- /dedupe/branch_and_bound.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/branch_and_bound.py -------------------------------------------------------------------------------- /dedupe/canonical.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/canonical.py -------------------------------------------------------------------------------- /dedupe/canopy_index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/canopy_index.py -------------------------------------------------------------------------------- /dedupe/clustering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/clustering.py -------------------------------------------------------------------------------- /dedupe/convenience.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/convenience.py -------------------------------------------------------------------------------- /dedupe/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/core.py -------------------------------------------------------------------------------- /dedupe/cpredicates.pyx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/cpredicates.pyx -------------------------------------------------------------------------------- /dedupe/datamodel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/datamodel.py -------------------------------------------------------------------------------- /dedupe/index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/index.py -------------------------------------------------------------------------------- /dedupe/labeler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/labeler.py -------------------------------------------------------------------------------- /dedupe/levenshtein.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/levenshtein.py -------------------------------------------------------------------------------- /dedupe/predicate_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/predicate_functions.py -------------------------------------------------------------------------------- /dedupe/predicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/predicates.py -------------------------------------------------------------------------------- /dedupe/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dedupe/serializer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/serializer.py -------------------------------------------------------------------------------- /dedupe/tfidf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/tfidf.py -------------------------------------------------------------------------------- /dedupe/training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/training.py -------------------------------------------------------------------------------- /dedupe/variables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/__init__.py -------------------------------------------------------------------------------- /dedupe/variables/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/base.py -------------------------------------------------------------------------------- /dedupe/variables/categorical_type.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/categorical_type.py -------------------------------------------------------------------------------- /dedupe/variables/exact.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/exact.py -------------------------------------------------------------------------------- /dedupe/variables/exists.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/exists.py -------------------------------------------------------------------------------- /dedupe/variables/interaction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/interaction.py -------------------------------------------------------------------------------- /dedupe/variables/latlong.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/latlong.py -------------------------------------------------------------------------------- /dedupe/variables/price.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/price.py -------------------------------------------------------------------------------- /dedupe/variables/set.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/set.py -------------------------------------------------------------------------------- /dedupe/variables/string.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/dedupe/variables/string.py -------------------------------------------------------------------------------- /docs/API-documentation.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/API-documentation.rst -------------------------------------------------------------------------------- /docs/Bibliography.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/Bibliography.rst -------------------------------------------------------------------------------- /docs/Examples.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/Examples.rst -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/Troubleshooting.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/Troubleshooting.rst -------------------------------------------------------------------------------- /docs/Variable-definition.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/Variable-definition.rst -------------------------------------------------------------------------------- /docs/_static/css/bootstrap.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/_static/css/bootstrap.css -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/_static/css/custom.css -------------------------------------------------------------------------------- /docs/_static/images/dedupeio-logo-reversed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/_static/images/dedupeio-logo-reversed.png -------------------------------------------------------------------------------- /docs/_static/images/dedupeio-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/_static/images/dedupeio-logo.png -------------------------------------------------------------------------------- /docs/_static/js/bootstrap.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/_static/js/bootstrap.min.js -------------------------------------------------------------------------------- /docs/_static/js/jquery.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/_static/js/jquery.min.js -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/_templates/layout.html -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/conf.py -------------------------------------------------------------------------------- /docs/how-it-works/Choosing-a-good-threshold.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/how-it-works/Choosing-a-good-threshold.rst -------------------------------------------------------------------------------- /docs/how-it-works/Grouping-duplicates.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/how-it-works/Grouping-duplicates.rst -------------------------------------------------------------------------------- /docs/how-it-works/How-it-works.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/how-it-works/How-it-works.rst -------------------------------------------------------------------------------- /docs/how-it-works/Making-smart-comparisons.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/how-it-works/Making-smart-comparisons.rst -------------------------------------------------------------------------------- /docs/how-it-works/Matching-records.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/how-it-works/Matching-records.rst -------------------------------------------------------------------------------- /docs/how-it-works/Special-Cases.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/how-it-works/Special-Cases.rst -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/index.rst -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/docs/requirements.txt -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/setup.py -------------------------------------------------------------------------------- /tests/duplicateCluster_memory_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/duplicateCluster_memory_case.py -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_api.py -------------------------------------------------------------------------------- /tests/test_blocking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_blocking.py -------------------------------------------------------------------------------- /tests/test_canonical.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_canonical.py -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_core.py -------------------------------------------------------------------------------- /tests/test_cpredicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_cpredicates.py -------------------------------------------------------------------------------- /tests/test_dedupe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_dedupe.py -------------------------------------------------------------------------------- /tests/test_exists.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_exists.py -------------------------------------------------------------------------------- /tests/test_labeler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_labeler.py -------------------------------------------------------------------------------- /tests/test_memory.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_memory.sh -------------------------------------------------------------------------------- /tests/test_predicate_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_predicate_functions.py -------------------------------------------------------------------------------- /tests/test_predicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_predicates.py -------------------------------------------------------------------------------- /tests/test_price.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_price.py -------------------------------------------------------------------------------- /tests/test_serializer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_serializer.py -------------------------------------------------------------------------------- /tests/test_tfidf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_tfidf.py -------------------------------------------------------------------------------- /tests/test_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dedupeio/dedupe/HEAD/tests/test_training.py --------------------------------------------------------------------------------