├── .github └── workflows │ └── testing.yml ├── .gitignore ├── CHANGES.txt ├── LICENSE ├── LICENSES ├── CLOUDPICKLE_LICENSE ├── JOBLIB_LICENSE ├── PANDAS_LICENSE ├── PYPARSING_LICENSE ├── PYPRIND_LICENSE ├── PY_STRINGMATCHING_LICENSE ├── PY_STRINGSIMJOIN_LICENSE ├── SCIKIT_LEARN_LICENSE └── SIX_LICENSE ├── MANIFEST.in ├── README.rst ├── asv.conf.json ├── benchmarks ├── __init__.py ├── benchmark_attr_equiv_blocker.py ├── benchmark_blackbox_blocker.py ├── benchmark_debugblocker.py ├── benchmark_down_sample_sampler.py ├── benchmark_overlap_blocker.py ├── benchmark_rule_based_blocker.py └── benchmark_sn_blocker.py ├── build.bat ├── build.sh ├── build_tools ├── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── cythonize.py ├── move-conda-package.py └── requirements_dev.txt ├── conda.recipe ├── build.bat ├── build.sh └── meta.yaml ├── continuous-integration └── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── cythonize.dat ├── docs ├── Makefile ├── conf.py ├── conf_copy.py ├── contributing.rst ├── how_to_guide.rst ├── index.rst ├── make.bat ├── make_copy.sh ├── themes │ └── nature_with_gtoc │ │ ├── layout.html │ │ ├── static │ │ └── nature.css_t │ │ └── theme.conf └── user_manual │ ├── api │ ├── actual_commands.rst │ ├── adding_features.rst │ ├── blocking.rst │ ├── combiner.rst │ ├── combining_blocker_outputs.rst │ ├── creating_the_features_automatically.rst │ ├── creating_the_features_manually.rst │ ├── dask_commands.rst │ ├── data_exploration.rst │ ├── debugging_blocker_output.rst │ ├── debugging_matcher.rst │ ├── downsampling.rst │ ├── evaluating_the_matching_output.rst │ ├── extracting_feature_vectors.rst │ ├── handling_features.rst │ ├── handling_metadata.rst │ ├── imputing_missing_values.rst │ ├── labeling.rst │ ├── loading_and_saving_objects.rst │ ├── matching.rst │ ├── overview.rst │ ├── reading_and_writing_data.rst │ ├── sampling.rst │ ├── selecting_matcher.rst │ ├── splitting_data_into_train_test.rst │ ├── supported_matchers.rst │ ├── supported_similarity_functions.rst │ ├── supported_tokenizers.rst │ ├── triggers.rst │ └── tuners.rst │ ├── blocker_hierarchy.png │ ├── blocking.rst │ ├── create_feats_for_blocking.rst │ ├── create_feats_for_matching.rst │ ├── data_exploration.rst │ ├── datastructures.rst │ ├── debugging_blocking.rst │ ├── debugging_matcher.rst │ ├── down_sampling.rst │ ├── evaluate_matching.rst │ ├── example-blocking-matching.png │ ├── example-dev-stage.png │ ├── example-match-two-tables.png │ ├── example-prod-stage.png │ ├── example-workflow.png │ ├── extract_feat_vecs.rst │ ├── guides.rst │ ├── imputing_missing_values.rst │ ├── installation.rst │ ├── labeling.rst │ ├── matchercombiner.rst │ ├── matching.rst │ ├── misc.rst │ ├── overview.rst │ ├── profiling.rst │ ├── read_csv_files.rst │ ├── sampling.rst │ ├── select_best_matcher.rst │ ├── split_train_test.rst │ ├── steps_supp_em_workflows.rst │ ├── triggers.rst │ └── whatisnew.rst ├── notebooks ├── .ipynb_checkpoints │ ├── Untitled-checkpoint.ipynb │ ├── test_addfeature_py2-checkpoint.ipynb │ ├── test_attr_equiv_blocker-checkpoint.ipynb │ ├── test_autogenfeature_py3-checkpoint.ipynb │ ├── test_blackboxfunction-checkpoint.ipynb │ ├── test_blocker_combiner-checkpoint.ipynb │ ├── test_catalog-checkpoint.ipynb │ ├── test_combine_ids-checkpoint.ipynb │ ├── test_debug_matcher-checkpoint.ipynb │ ├── test_evaluation-checkpoint.ipynb │ ├── test_feature-checkpoint.ipynb │ ├── test_feature_add_features-checkpoint.ipynb │ ├── test_feature_attributeutils-checkpoint.ipynb │ ├── 
test_feature_extract_featurevecs-checkpoint.ipynb │ ├── test_feature_parse_string-checkpoint.ipynb │ ├── test_io-checkpoint.ipynb │ ├── test_kitchen-checkpoint.ipynb │ ├── test_labeling-checkpoint.ipynb │ ├── test_load_save-checkpoint.ipynb │ ├── test_overlapblocker-checkpoint.ipynb │ ├── test_projection-checkpoint.ipynb │ ├── test_rulebased_blocker-checkpoint.ipynb │ ├── test_sampling-checkpoint.ipynb │ └── test_trtst_split-checkpoint.ipynb ├── guides │ ├── .ipynb_checkpoints │ │ ├── Adding Features to Feature Table-checkpoint.ipynb │ │ ├── Combining Multiple Blockers-checkpoint.ipynb │ │ ├── Debugging Blocker Output-checkpoint.ipynb │ │ ├── Down Sampling-checkpoint.ipynb │ │ ├── Editing and Generate Features for Blocking Manually-checkpoint.ipynb │ │ ├── Evaluating the Selected Matcher-checkpoint.ipynb │ │ ├── Generating Features for Blocking Manually-checkpoint.ipynb │ │ ├── Performing Blocking Using Blackbox Blocker-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Attr. Equivalence Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Overlap Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Rule-Based Blocking-checkpoint.ipynb │ │ ├── Reading CSV Files from Disk-checkpoint.ipynb │ │ ├── Reading the CSV Files from Disk-checkpoint.ipynb │ │ ├── Removing Features From Feature Table-checkpoint.ipynb │ │ ├── Sampling and Labeling-checkpoint.ipynb │ │ └── Selecting the Best Learning Matcher-checkpoint.ipynb │ ├── end_to_end_em_guides │ │ ├── .ipynb_checkpoints │ │ │ ├── Basic EM Workflow DBLP ACM-checkpoint.ipynb │ │ │ ├── Basic EM Workflow Restaurants - 1-checkpoint.ipynb │ │ │ ├── Basic EM Workflow Restaurants - 2-checkpoint.ipynb │ │ │ ├── Basic EM Workflow Restaurants - 3-checkpoint.ipynb │ │ │ └── Basic EM Workflow-checkpoint.ipynb │ │ ├── Basic EM Workflow DBLP ACM.ipynb │ │ ├── Basic EM Workflow Restaurants - 1.html │ │ ├── Basic EM Workflow Restaurants - 1.ipynb │ │ ├── Basic EM Workflow Restaurants - 2.html │ │ ├── Basic EM Workflow Restaurants - 2.ipynb │ │ ├── Basic EM Workflow Restaurants - 3.html │ │ ├── Basic EM Workflow Restaurants - 3.ipynb │ │ ├── Basic EM Workflow.html │ │ ├── Basic EM Workflow.ipynb │ │ └── helper_functions.py │ └── step_wise_em_guides │ │ ├── .ipynb_checkpoints │ │ ├── Adding Features to Feature Table-checkpoint.ipynb │ │ ├── Combining Multiple Blockers-checkpoint.ipynb │ │ ├── Data Exploration-checkpoint.ipynb │ │ ├── Data Profiling-checkpoint.ipynb │ │ ├── Debugging Blocker Output-checkpoint.ipynb │ │ ├── Down Sampling-checkpoint.ipynb │ │ ├── Editing and Generating Features Manually-checkpoint.ipynb │ │ ├── Evaluating the Selected Matcher-checkpoint.ipynb │ │ ├── Generating Features Manually-checkpoint.ipynb │ │ ├── Performing Blocking Using Blackbox Blocker-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Attr. 
Equivalence Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Overlap Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Sorted Neighborhood Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Rule-Based Blocking-checkpoint.ipynb │ │ ├── Performing Matching Using a ML Matcher-checkpoint.ipynb │ │ ├── Performing Matching with a Rule-Based Matcher-checkpoint.ipynb │ │ ├── Reading CSV Files from Disk-checkpoint.ipynb │ │ ├── Removing Features From Feature Table-checkpoint.ipynb │ │ ├── Sampling and Labeling-checkpoint.ipynb │ │ ├── Selecting the Best Learning Matcher-checkpoint.ipynb │ │ └── Using Match Triggers to Improve Results-checkpoint.ipynb │ │ ├── Adding Features to Feature Table.ipynb │ │ ├── Combining Multiple Blockers.ipynb │ │ ├── Data Exploration.ipynb │ │ ├── Data Profiling.ipynb │ │ ├── Debugging Blocker Output.ipynb │ │ ├── Down Sampling.ipynb │ │ ├── Editing and Generating Features Manually.ipynb │ │ ├── Evaluating the Selected Matcher.ipynb │ │ ├── Generating Features Manually.ipynb │ │ ├── Performing Blocking Using Blackbox Blocker.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Attr. Equivalence Blocker).ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Overlap Blocker).ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Sorted Neighborhood Blocker).ipynb │ │ ├── Performing Blocking Using Rule-Based Blocking.ipynb │ │ ├── Performing Matching Using a ML Matcher.ipynb │ │ ├── Performing Matching with a Rule-Based Matcher.ipynb │ │ ├── Reading CSV Files from Disk.ipynb │ │ ├── Removing Features From Feature Table.ipynb │ │ ├── Sampling and Labeling.ipynb │ │ ├── Selecting the Best Learning Matcher.ipynb │ │ └── Using Match Triggers to Improve Results.ipynb └── vldb_demo │ ├── .ipynb_checkpoints │ ├── Demo_notebook_v6-checkpoint.ipynb │ └── demo-checkpoint.ipynb │ ├── README │ ├── acm_demo.csv │ ├── dblp_demo.csv │ ├── demo.ipynb │ ├── labeled_data_demo.csv │ └── profiler.py ├── py_entitymatching ├── __init__.py ├── blocker │ ├── __init__.py │ ├── attr_equiv_blocker.py │ ├── black_box_blocker.py │ ├── blocker.py │ ├── overlap_blocker.py │ ├── rule_based_blocker.py │ └── sn_blocker.py ├── blockercombiner │ ├── __init__.py │ └── blockercombiner.py ├── catalog │ ├── __init__.py │ ├── catalog.py │ └── catalog_manager.py ├── dask │ ├── __init__.py │ ├── dask_attr_equiv_blocker.py │ ├── dask_black_box_blocker.py │ ├── dask_down_sample.py │ ├── dask_dtmatcher.py │ ├── dask_extract_features.py │ ├── dask_logregmatcher.py │ ├── dask_nbmatcher.py │ ├── dask_overlap_blocker.py │ ├── dask_rfmatcher.py │ ├── dask_rule_based_blocker.py │ ├── dask_svm_matcher.py │ ├── dask_xgboost_matcher.py │ ├── daskmlmatcher.py │ └── utils.py ├── datasets │ ├── ACM.csv │ ├── DBLP.csv │ ├── acm_demo.csv │ ├── acm_demo.metadata │ ├── dblp_acm_demo_labels.csv │ ├── dblp_demo.csv │ ├── end-to-end │ │ ├── Demo_notebook_v6.ipynb │ │ ├── acm_demo.csv │ │ ├── acm_demo.metadata │ │ ├── dblp_demo.csv │ │ ├── dblp_demo.metadata │ │ ├── labeled_data_demo.csv │ │ ├── profiler.py │ │ └── restaurants │ │ │ ├── fodors.csv │ │ │ ├── lbl_restnt_wf1.csv │ │ │ ├── lbl_restnt_wf1.metadata │ │ │ ├── match_fodors_zagats_more_attrs.csv │ │ │ ├── match_fodors_zagats_more_attrs.metadata │ │ │ ├── matches_fodors_zagats.csv │ │ │ └── zagats.csv │ ├── final_matches.csv │ ├── labeled_data_demo.csv │ ├── person_table_A.csv │ ├── person_table_A.metadata │ ├── person_table_B.csv │ ├── person_table_B.metadata │ └── tableC.csv ├── debugblocker │ ├── 
GenerateRecomLists.cpp │ ├── GenerateRecomLists.h │ ├── OriginalTopkPlain.cpp │ ├── PrefixEvent.cpp │ ├── PrefixEvent.h │ ├── TopPair.cpp │ ├── TopPair.h │ ├── TopkHeader.cpp │ ├── TopkHeader.h │ ├── __init__.py │ ├── backup_debugblocker.py │ ├── debugblocker.py │ ├── debugblocker_cython.cpp │ └── debugblocker_cython.pyx ├── debugmatcher │ ├── __init__.py │ ├── debug_decisiontree_matcher.py │ ├── debug_gui_decisiontree_matcher.py │ ├── debug_gui_randomforest_matcher.py │ ├── debug_gui_utils.py │ └── debug_randomforest_matcher.py ├── evaluation │ ├── __init__.py │ └── evaluation.py ├── experimental │ └── __init__.py ├── explorer │ ├── __init__.py │ ├── openrefine │ │ ├── __init__.py │ │ └── openrefine_wrapper.py │ └── pandastable │ │ ├── __init__.py │ │ └── pandastable_wrapper.py ├── feature │ ├── __init__.py │ ├── addfeatures.py │ ├── attributeutils.py │ ├── autofeaturegen.py │ ├── extractfeatures.py │ ├── simfunctions.py │ └── tokenizers.py ├── gui │ ├── __init__.py │ ├── debug_gui_base.py │ ├── gui_utils.py │ └── table_gui.py ├── io │ ├── __init__.py │ ├── parsers.py │ └── pickles.py ├── labeler │ ├── __init__.py │ └── labeler.py ├── matcher │ ├── __init__.py │ ├── booleanrulematcher.py │ ├── dtmatcher.py │ ├── ensemblematcher.py │ ├── linregmatcher.py │ ├── logregmatcher.py │ ├── matcher.py │ ├── matcherutils.py │ ├── mlmatcher.py │ ├── nbmatcher.py │ ├── rfmatcher.py │ ├── rulematcher.py │ ├── svmmatcher.py │ └── xgboostmatcher.py ├── matchercombiner │ ├── __init__.py │ └── matchercombiner.py ├── matcherselector │ ├── __init__.py │ ├── mlmatchercombinerselection.py │ └── mlmatcherselection.py ├── sampler │ ├── __init__.py │ ├── down_sample.py │ └── single_table.py ├── tests │ ├── __init__.py │ ├── _test_debug_matcher_dt.py │ ├── _test_debug_matcher_rf.py │ ├── _test_matcherselector_mlmatcherselection_xg.py │ ├── test_attr_equiv_blocker.py │ ├── test_black_box_blocker.py │ ├── test_blockercombiner.py │ ├── test_catalog.py │ ├── test_datasets │ │ ├── A.csv │ │ ├── A.metadata │ │ ├── B.csv │ │ ├── C.csv │ │ ├── C.metadata │ │ ├── C1.csv │ │ ├── C1.metadata │ │ ├── D.csv │ │ ├── D.metadata │ │ ├── blocker │ │ │ ├── table_A_wi_missing_vals.csv │ │ │ ├── table_A_wi_missing_vals.metadata │ │ │ ├── table_B_wi_missing_vals.csv │ │ │ └── table_B_wi_missing_vals.metadata │ │ ├── blockercombiner │ │ │ ├── C1.csv │ │ │ ├── C1.metadata │ │ │ ├── C1_ex_1.csv │ │ │ ├── C1_ex_1.metadata │ │ │ ├── C2.csv │ │ │ ├── C2.metadata │ │ │ ├── C2_ex_1.csv │ │ │ ├── C2_ex_1.metadata │ │ │ ├── C3.csv │ │ │ ├── C3.metadata │ │ │ ├── C3_ex_2.csv │ │ │ ├── C3_ex_2.metadata │ │ │ ├── C4_ex_1.csv │ │ │ ├── C4_ex_1.metadata │ │ │ ├── C4_ex_2.csv │ │ │ ├── C4_ex_2.metadata │ │ │ ├── C_ex_1.csv │ │ │ ├── C_ex_1.metadata │ │ │ ├── C_ex_2.csv │ │ │ ├── C_ex_2.metadata │ │ │ ├── C_ex_4.csv │ │ │ └── C_ex_4.metadata │ │ ├── catalog │ │ │ ├── A.metadata │ │ │ ├── A_dupid.csv │ │ │ ├── A_inv_fk.csv │ │ │ └── A_mvals.csv │ │ ├── debugblocker │ │ │ ├── test_debugblocker_13.metadata │ │ │ ├── test_debugblocker_13_out.csv │ │ │ ├── test_debugblocker_cand.csv │ │ │ ├── test_debugblocker_ltable.csv │ │ │ ├── test_debugblocker_rtable.csv │ │ │ ├── test_get_tokenized_table_1.txt │ │ │ ├── test_get_tokenized_table_2.txt │ │ │ ├── test_topk_sim_join_1_A.txt │ │ │ ├── test_topk_sim_join_1_B.txt │ │ │ └── test_topk_sim_join_1_C.txt │ │ ├── io │ │ │ ├── A.csv │ │ │ ├── A.mdx │ │ │ ├── A_dupid.csv │ │ │ ├── A_key_zipcode.csv │ │ │ ├── A_key_zipcode.metadata │ │ │ ├── A_md_wrongformat.csv │ │ │ ├── A_md_wrongformat.metadata │ │ │ ├── 
A_mvals.csv │ │ │ ├── C_partialmeta.csv │ │ │ ├── C_partialmeta.metadata │ │ │ ├── InvalidMetadata1.csv │ │ │ ├── InvalidMetadata1.metadata │ │ │ ├── InvalidMetadata2.csv │ │ │ ├── InvalidMetadata2.metadata │ │ │ ├── expected_A.metadata │ │ │ └── expected_C.metadata │ │ ├── matcherselector │ │ │ ├── ACM_demo.csv │ │ │ ├── DBLP_demo.csv │ │ │ ├── feat_vecs.csv │ │ │ └── feat_vecs.metadata │ │ ├── restA.csv │ │ ├── restB.csv │ │ └── sandbox │ │ │ ├── A.pkl │ │ │ └── A.pklmetadata │ ├── test_debugblocker.py │ ├── test_evaluation.py │ ├── test_feature_addfeatures.py │ ├── test_feature_attributeutils.py │ ├── test_feature_autofeaturegen.py │ ├── test_feature_extractfeaturevecs.py │ ├── test_feature_simfunctions.py │ ├── test_feature_tokenizers.py │ ├── test_io_import_export.py │ ├── test_io_load_save.py │ ├── test_labeler.py │ ├── test_match_trigger.py │ ├── test_matcher_ml_matcher.py │ ├── test_matcherselector_mlmatcherselection.py │ ├── test_overlap_blocker.py │ ├── test_rule_based_blocker.py │ ├── test_rule_based_matcher.py │ ├── test_sampler_down_sample.py │ ├── test_sampler_single_table.py │ ├── test_validation_helper.py │ └── utils.py ├── triggers │ ├── __init__.py │ └── matchtrigger.py ├── tuner │ ├── __init__.py │ ├── tuner_down_sample.py │ └── tuner_overlap_blocker.py └── utils │ ├── __init__.py │ ├── catalog_helper.py │ ├── generic_helper.py │ ├── pandas_helper.py │ ├── stop_words.txt │ └── validation_helper.py ├── requirements.txt ├── requirements.yml └── setup.py /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies and run with a variety of Python versions 2 | 3 | name: Python package 4 | 5 | on: 6 | - push 7 | - pull_request 8 | 9 | jobs: 10 | build: 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 16 | os: ["ubuntu-latest", "windows-latest", "macos-latest"] 17 | runs-on: ${{ matrix.os }} 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install numpy PyQt5 Cython 31 | pip install -r requirements.txt 32 | - name: Install package 33 | run: | 34 | python setup.py build_ext --inplace 35 | - name: Run tests 36 | run: | 37 | python -m unittest -v 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # temp dir 62 | scratch/ 63 | 64 | # idea files 65 | .idea/ 66 | # Created by .ignore support plugin (hsz.mobi) 67 | 68 | py_entitymatching/datasets/msd_reduced.csv 69 | py_entitymatching/tests/test_datasets/sandbox/* 70 | py_entitymatching/datasets/example_datasets 71 | cover/ 72 | results/ 73 | html/ 74 | 75 | *.dot 76 | *.png 77 | *.pkl 78 | *.swp 79 | 80 | notebooks/how-to-guides/bdata/ 81 | garage/ 82 | 83 | .DS_Store 84 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | v0.4.2 - 2/7/2024 2 | * Fixed various errors in setup.py when installing via pip 3 | * Adjusted setuptools.setup project name to match the name on PyPI 4 | 5 | v0.4.1 - 3/18/2023 6 | * Dropped support for Python 3.6. 7 | * Added support for Python 3.10 and 3.11. 8 | * Switched from Nose to vanilla Unittest. 9 | * Replaced Travis and Appveyor CI testing with GitHub Actions. 10 | 11 | v0.4.0 - 11/20/2020 12 | * Dropped support for Python 2 and 3.5. 13 | * To support Python 3.8, updated the function 14 | py_entitymatching.matcher.matcherutils.impute_table() to use current scikit-learn's 15 | SimpleImputer; see issue #127. 16 | 17 | v0.3.3 - 10/19/2020 18 | * Started tracking release changes in CHANGES.txt. 19 | * Made minor updates to Pandas usage to avoid depending on downgraded versions. 20 | * Added stricter scikit-learn dependency requirements to preserve the old Imputer API; this will change in a future release. 21 | * Users can now provide metadata for blackbox features (e.g., left_attribute, right_attribute). 22 | * This is the last version of py_entitymatching that will support Python 2 and Python 3.5. 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of py_entitymatching nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/CLOUDPICKLE_LICENSE: -------------------------------------------------------------------------------- 1 | This module was extracted from the `cloud` package, developed by 2 | PiCloud, Inc. 3 | 4 | Copyright (c) 2015, Cloudpickle contributors. 5 | Copyright (c) 2012, Regents of the University of California. 6 | Copyright (c) 2009 PiCloud, Inc. http://www.picloud.com. 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions 11 | are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the University of California, Berkeley nor the 18 | names of its contributors may be used to endorse or promote 19 | products derived from this software without specific prior written 20 | permission. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 28 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 30 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 31 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | 34 | -------------------------------------------------------------------------------- /LICENSES/JOBLIB_LICENSE: -------------------------------------------------------------------------------- 1 | joblib is BSD-licenced (3 clause): 2 | 3 | This software is OSI Certified Open Source Software. 4 | OSI Certified is a certification mark of the Open Source Initiative. 5 | 6 | Copyright (c) 2009-2011, joblib developpers 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | * Redistributions of source code must retain the above copyright notice, 13 | this list of conditions and the following disclaimer. 
14 | 15 | * Redistributions in binary form must reproduce the above copyright notice, 16 | this list of conditions and the following disclaimer in the documentation 17 | and/or other materials provided with the distribution. 18 | 19 | * Neither the name of Gael Varoquaux. nor the names of other joblib 20 | contributors may be used to endorse or promote products derived from 21 | this software without specific prior written permission. 22 | 23 | This software is provided by the copyright holders and contributors 24 | "as is" and any express or implied warranties, including, but not 25 | limited to, the implied warranties of merchantability and fitness for 26 | a particular purpose are disclaimed. In no event shall the copyright 27 | owner or contributors be liable for any direct, indirect, incidental, 28 | special, exemplary, or consequential damages (including, but not 29 | limited to, procurement of substitute goods or services; loss of use, 30 | data, or profits; or business interruption) however caused and on any 31 | theory of liability, whether in contract, strict liability, or tort 32 | (including negligence or otherwise) arising in any way out of the use 33 | of this software, even if advised of the possibility of such 34 | damage. 35 | -------------------------------------------------------------------------------- /LICENSES/PYPARSING_LICENSE: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining 2 | a copy of this software and associated documentation files (the 3 | "Software"), to deal in the Software without restriction, including 4 | without limitation the rights to use, copy, modify, merge, publish, 5 | distribute, sublicense, and/or sell copies of the Software, and to 6 | permit persons to whom the Software is furnished to do so, subject to 7 | the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be 10 | included in all copies or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 13 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 14 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 15 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 16 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 17 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 18 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /LICENSES/PYPRIND_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2016, Sebastian Raschka 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of biopandas nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /LICENSES/PY_STRINGMATCHING_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of py_stringmatching nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/PY_STRINGSIMJOIN_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | * Neither the name of py_stringsimjoin nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/SCIKIT_LEARN_LICENSE: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2007–2016 The scikit-learn developers. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the Scikit-learn Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 
32 | -------------------------------------------------------------------------------- /LICENSES/SIX_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2016 Benjamin Peterson 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | include README.rst 3 | include setup.py 4 | include py_entitymatching/datasets/*.csv 5 | recursive-include py_entitymatching/tests *.csv 6 | 7 | 8 | graft py_entitymatching 9 | 10 | global-exclude *.so 11 | global-exclude *.pyd 12 | global-exclude *.pyc 13 | global-exclude *~ 14 | global-exclude \#* 15 | global-exclude .git* 16 | global-exclude .DS_Store 17 | global-exclude *.png 18 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | py_entitymatching 2 | ================= 3 | 4 | This project seeks to build a Python software package to match entities 5 | between two tables using supervised learning. This problem is often 6 | referred to as entity matching (EM). Given two tables A and B, the goal of 7 | EM is to discover the tuple pairs between the two tables that refer to the 8 | same real-world entities. There are two main steps involved in entity matching: 9 | blocking and matching. The blocking step aims to remove obvious non-matching 10 | tuple pairs and reduce the set of pairs considered for matching. Entity matching in 11 | practice involves many more steps than just blocking and matching. While performing EM, 12 | users often execute many steps, e.g., exploring, cleaning, debugging, sampling, 13 | estimating accuracy, etc. Current EM systems, however, do not cover the entire 14 | EM pipeline, providing support only for a few steps (e.g., blocking, matching) while 15 | ignoring less well-known yet equally critical steps (e.g., debugging, sampling). 16 | This package seeks to support all the steps involved in the EM pipeline. 17 | 18 | The package is free, open-source, and BSD-licensed. 
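
Shown below is a minimal sketch of such an end-to-end workflow using this package's API. The CSV paths, key columns, and attribute names (e.g., 'zipcode', 'name') are illustrative, not part of any shipped dataset::

    import py_entitymatching as em

    # Read the two tables and tell py_entitymatching their key attributes.
    A = em.read_csv_metadata('tableA.csv', key='ID')
    B = em.read_csv_metadata('tableB.csv', key='ID')

    # Blocking: keep only the tuple pairs that agree on zipcode.
    ab = em.AttrEquivalenceBlocker()
    C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                        l_output_attrs=['name'], r_output_attrs=['name'])

    # Sample candidate pairs and label them (opens a labeling GUI).
    S = em.sample_table(C, 450)
    G = em.label_table(S, 'gold')

    # Generate features and convert the labeled pairs to feature vectors.
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    H = em.extract_feature_vecs(G, feature_table=F, attrs_after='gold')

    # Train a decision-tree matcher, predict on held-out pairs, and evaluate.
    IJ = em.split_train_test(H, train_proportion=0.7)
    dt = em.DTMatcher()
    dt.fit(table=IJ['train'],
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
           target_attr='gold')
    P = dt.predict(table=IJ['test'],
                   exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
                   target_attr='predicted', append=True, inplace=False)
    em.print_eval_summary(em.eval_matches(P, 'gold', 'predicted'))
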
19 | 20 | Important links 21 | =============== 22 | 23 | * Project Homepage: https://sites.google.com/site/anhaidgroup/projects/magellan/py_entitymatching 24 | * Code repository: https://github.com/anhaidgroup/py_entitymatching 25 | * User Manual: http://anhaidgroup.github.io/py_entitymatching/v0.4.0/index.html 26 | * Guides: http://anhaidgroup.github.io/py_entitymatching/v0.4.0/user_manual/guides.html 27 | * How to Contribute: http://anhaidgroup.github.io/py_entitymatching/v0.4.0/contributing.html 28 | * Issue Tracker: https://github.com/anhaidgroup/py_entitymatching/issues 29 | 30 | Dependencies 31 | ============ 32 | 33 | The required dependencies to build the package are: 34 | 35 | * numpy 1.7.0 or higher. Tested on version 1.19.4. 36 | * pandas (provides data structures to store and manage tables). Tested on version 1.1.4. 37 | * scikit-learn 0.22 or higher (provides implementations for common machine learning algorithms). Tested on version 0.23.2. 38 | * joblib (provides multiprocessing capabilities). Tested on version 0.17.0. 39 | * py_stringsimjoin (provides implementations for string similarity joins). Tested on version 0.3.2. 40 | * py_stringmatching (provides a set of string tokenizers and string similarity functions). Tested on version 0.4.2. 41 | * cloudpickle (provides functions to serialize Python constructs). Tested on version 1.6.0. 42 | * pyprind (library to display progress indicators). Tested on version 2.9.8. 43 | * pyparsing (library to parse strings). Tested on version 2.4.7. 44 | * six (provides functions to write compatible code across Python 2 and 3). Tested on version 1.15.0. 45 | 46 | Platforms 47 | ========= 48 | 49 | py_entitymatching has been tested on Linux, OS X, and Windows. 50 | -------------------------------------------------------------------------------- /asv.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // The version of the config file format. Do not change, unless 3 | // you know what you are doing. 4 | "version": 1, 5 | 6 | // The name of the project being benchmarked 7 | "project": "magellan", 8 | 9 | // The project's homepage 10 | //"project_url": "https://github.com/anhaidgroup/magellan/", 11 | "project_url": "https://github.com/anhaidgroup/magellan/", 12 | 13 | // The URL or local path of the source code repository for the 14 | // project being benchmarked 15 | //"repo": "https://github.com/anhaidgroup/magellan.git", 16 | "repo": ".", 17 | 18 | // List of branches to benchmark. If not provided, defaults to "master" 19 | // (for git) or "tip" (for mercurial). 20 | "branches": ["master"], // for git 21 | // "branches": ["tip"], // for mercurial 22 | 23 | // The DVCS being used. If not set, it will be automatically 24 | // determined from "repo" by looking at the protocol in the URL 25 | // (if remote), or by looking for special directories, such as 26 | // ".git" (if local). 27 | // "dvcs": "git", 28 | 29 | // The tool to use to create environments. May be "conda", 30 | // "virtualenv" or other value depending on the plugins in use. 31 | // If missing or the empty string, the tool will be automatically 32 | // determined by looking for tools on the PATH environment 33 | // variable. 34 | "environment_type": "conda", 35 | 36 | // the base URL to show a commit for the project. 37 | "show_commit_url": "https://github.com/anhaidgroup/magellan/commit/", 38 | 39 | // The Pythons you'd like to test against. If not provided, defaults 40 | // to the current version of Python used to run `asv`. 
41 | "pythons": ["2.7"], 42 | //"pythons": ["2.7","3.3", "3.4", "3.5"], 43 | 44 | // The matrix of dependencies to test. Each key is the name of a 45 | // package (in PyPI) and the values are version numbers. An empty 46 | // list indicates to just test against the default (latest) 47 | // version. 48 | "matrix": { 49 | "numpy":[], 50 | "pyqt":[], 51 | "scipy":[], 52 | "pandas":[], 53 | "pyparsing":[], 54 | "six":[], 55 | "scikit-learn":[], 56 | "cloudpickle":[], 57 | "joblib": [], 58 | "pip+py_stringmatching": [], 59 | "pip+pyprind": [] 60 | }, 61 | 62 | // The directory (relative to the current directory) that benchmarks are 63 | // stored in. If not provided, defaults to "benchmarks" 64 | "benchmark_dir": "benchmarks" 65 | 66 | // The directory (relative to the current directory) to cache the Python 67 | // environments in. If not provided, defaults to "env" 68 | // "env_dir": "env", 69 | 70 | 71 | // The directory (relative to the current directory) that raw benchmark 72 | // results are stored in. If not provided, defaults to "results". 73 | // "results_dir": "results", 74 | 75 | // The directory (relative to the current directory) that the html tree 76 | // should be written to. If not provided, defaults to "html". 77 | // "html_dir": "html", 78 | 79 | // The number of characters to retain in the commit hashes. 80 | // "hash_length": 8, 81 | 82 | // `asv` will cache wheels of the recent builds in each 83 | // environment, making them faster to install next time. This is 84 | // number of builds to keep, per environment. 85 | // "wheel_cache_size": 0 86 | } 87 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" setup.py install --single-version-externally-managed --record=record.txt 2 | if errorlevel 1 exit 1 3 | 4 | :: Add more build steps here, if they are necessary. 5 | 6 | :: See 7 | :: http://docs.continuum.io/conda/build.html 8 | :: for a list of environment variables that are set during the build process. -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 4 | 5 | # Add more build steps here, if they are necessary. 6 | 7 | # See 8 | # http://docs.continuum.io/conda/build.html 9 | # for a list of environment variables that are set during the build process. 
10 | -------------------------------------------------------------------------------- /build_tools/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 11 | # $filename = "Miniconda3-3.8.3-Windows-" + $platform_suffix + ".exe" 12 | $url = $MINICONDA_URL + $filename 13 | 14 | $basedir = $pwd.Path + "\" 15 | $filepath = $basedir + $filename 16 | if (Test-Path $filename) { 17 | Write-Host "Reusing" $filepath 18 | return $filepath 19 | } 20 | 21 | # Download and retry up to 3 times in case of network transient errors. 22 | Write-Host "Downloading" $filename "from" $url 23 | $retry_attempts = 2 24 | for($i=0; $i -lt $retry_attempts; $i++){ 25 | try { 26 | $webclient.DownloadFile($url, $filepath) 27 | break 28 | } 29 | Catch [Exception]{ 30 | Start-Sleep 1 31 | } 32 | } 33 | if (Test-Path $filepath) { 34 | Write-Host "File saved at" $filepath 35 | } else { 36 | # Retry once to get the error message if any at the last try 37 | $webclient.DownloadFile($url, $filepath) 38 | } 39 | return $filepath 40 | } 41 | 42 | 43 | function InstallMiniconda ($python_version, $architecture, $python_home) { 44 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 45 | if (Test-Path $python_home) { 46 | Write-Host $python_home "already exists, skipping." 47 | return $false 48 | } 49 | if ($architecture -match "32") { 50 | $platform_suffix = "x86" 51 | } else { 52 | $platform_suffix = "x86_64" 53 | } 54 | 55 | $filepath = DownloadMiniconda $python_version $platform_suffix 56 | Write-Host "Installing" $filepath "to" $python_home 57 | $install_log = $python_home + ".log" 58 | $args = "/S /D=$python_home" 59 | Write-Host $filepath $args 60 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 61 | if (Test-Path $python_home) { 62 | Write-Host "Python $python_version ($architecture) installation complete" 63 | } else { 64 | Write-Host "Failed to install Python in $python_home" 65 | Get-Content -Path $install_log 66 | Exit 1 67 | } 68 | } 69 | 70 | 71 | function InstallCondaPackages ($python_home, $spec) { 72 | $conda_path = $python_home + "\Scripts\conda.exe" 73 | $args = "install --yes " + $spec 74 | Write-Host ("conda " + $args) 75 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 76 | } 77 | 78 | function UpdateConda ($python_home) { 79 | $conda_path = $python_home + "\Scripts\conda.exe" 80 | Write-Host "Updating conda..." 
81 | $args = "update --yes conda" 82 | Write-Host $conda_path $args 83 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 84 | } 85 | 86 | 87 | function main () { 88 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 89 | UpdateConda $env:PYTHON 90 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 91 | } 92 | 93 | main 94 | -------------------------------------------------------------------------------- /build_tools/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /build_tools/move-conda-package.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | import glob 5 | import shutil 6 | 7 | #try 8 | # from conda_build.config import config 9 | #except ImportError: 10 | from conda_build.config import Config # 03/03/2017: Updated based on the changes to conda_build.config 11 | config = Config() 12 | 13 | with open(os.path.join(sys.argv[1], 'meta.yaml')) as f: 14 | name = yaml.safe_load(f)['package']['name'] # safe_load: bare yaml.load requires an explicit Loader in modern PyYAML 15 | 16 | binary_package_glob = os.path.join(config.bldpkgs_dir, '{0}*.tar.bz2'.format(name)) 17 | binary_package = glob.glob(binary_package_glob)[0] 18 | 19 | shutil.move(binary_package, '.') 20 | -------------------------------------------------------------------------------- /build_tools/requirements_dev.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.7.0 2 | six 3 | Cython 4 | nose 5 | -------------------------------------------------------------------------------- /conda.recipe/build.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" setup.py install --single-version-externally-managed --record=record.txt 2 | if errorlevel 1 exit 1 3 | 4 | :: Add more build steps here, if they are necessary. 5 | 6 | :: See 7 | :: http://docs.continuum.io/conda/build.html 8 | :: for a list of environment variables that are set during the build process. 9 | -------------------------------------------------------------------------------- /conda.recipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 4 | 5 | # Add more build steps here, if they are necessary. 6 | 7 | # See 8 | # http://docs.continuum.io/conda/build.html 9 | # for a list of environment variables that are set during the build process. 
10 | -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: py_entitymatching 3 | version: "0.2.1" 4 | 5 | source: 6 | git_url: ../ 7 | 8 | 9 | requirements: 10 | build: 11 | - python 12 | - setuptools 13 | - py_stringsimjoin 14 | - cloudpickle 15 | - pyparsing 16 | - scikit-learn 17 | - pyqt 18 | - pandas-profiling 19 | - requests 20 | # - xgboost 21 | 22 | run: 23 | - python 24 | - py_stringsimjoin 25 | - cloudpickle 26 | - pyparsing 27 | - scikit-learn 28 | - pyqt 29 | - pandas-profiling 30 | - requests 31 | # - xgboost 32 | 33 | test: 34 | # Python imports 35 | imports: 36 | - py_entitymatching 37 | 38 | # commands: 39 | # You can put test commands to be run here. Use this to test that the 40 | # entry points work. 41 | 42 | 43 | # You can also put a file called run_test.py in the recipe that will be run 44 | # at test time. 45 | 46 | # requires: 47 | # Put any additional test requirements here. For example 48 | # - nose 49 | 50 | about: 51 | home: https://sites.google.com/site/anhaidgroup/projects/magellan/py_entitymatching 52 | license: BSD License 53 | summary: 'Python library for entity matching.' 54 | 55 | # See 56 | # http://docs.continuum.io/conda/build.html for 57 | # more information about meta.yaml 58 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | if ($python_version -match "3.4") { 11 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 12 | } else { 13 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 14 | } 15 | $url = $MINICONDA_URL + $filename 16 | 17 | $basedir = $pwd.Path + "\" 18 | $filepath = $basedir + $filename 19 | if (Test-Path $filename) { 20 | Write-Host "Reusing" $filepath 21 | return $filepath 22 | } 23 | 24 | # Download and retry up to 3 times in case of network transient errors. 25 | Write-Host "Downloading" $filename "from" $url 26 | $retry_attempts = 2 27 | for($i=0; $i -lt $retry_attempts; $i++){ 28 | try { 29 | $webclient.DownloadFile($url, $filepath) 30 | break 31 | } 32 | Catch [Exception]{ 33 | Start-Sleep 1 34 | } 35 | } 36 | if (Test-Path $filepath) { 37 | Write-Host "File saved at" $filepath 38 | } else { 39 | # Retry once to get the error message if any at the last try 40 | $webclient.DownloadFile($url, $filepath) 41 | } 42 | return $filepath 43 | } 44 | 45 | 46 | function InstallMiniconda ($python_version, $architecture, $python_home) { 47 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 48 | if (Test-Path $python_home) { 49 | Write-Host $python_home "already exists, skipping." 
50 | return $false 51 | } 52 | if ($architecture -match "32") { 53 | $platform_suffix = "x86" 54 | } else { 55 | $platform_suffix = "x86_64" 56 | } 57 | 58 | $filepath = DownloadMiniconda $python_version $platform_suffix 59 | Write-Host "Installing" $filepath "to" $python_home 60 | $install_log = $python_home + ".log" 61 | $args = "/S /D=$python_home" 62 | Write-Host $filepath $args 63 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 64 | if (Test-Path $python_home) { 65 | Write-Host "Python $python_version ($architecture) installation complete" 66 | } else { 67 | Write-Host "Failed to install Python in $python_home" 68 | Get-Content -Path $install_log 69 | Exit 1 70 | } 71 | } 72 | 73 | 74 | function InstallCondaPackages ($python_home, $spec) { 75 | $conda_path = $python_home + "\Scripts\conda.exe" 76 | $args = "install --yes " + $spec 77 | Write-Host ("conda " + $args) 78 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 79 | } 80 | 81 | function UpdateConda ($python_home) { 82 | $conda_path = $python_home + "\Scripts\conda.exe" 83 | Write-Host "Updating conda..." 84 | $args = "update --yes conda" 85 | Write-Host $conda_path $args 86 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 87 | } 88 | 89 | 90 | function main () { 91 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 92 | UpdateConda $env:PYTHON 93 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 94 | } 95 | 96 | main 97 | 98 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: To build extensions for 64 bit Python 3, we need to configure environment 2 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 3 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 4 | :: 5 | :: To build extensions for 64 bit Python 2, we need to configure environment 6 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 8 | :: 9 | :: 32 bit builds do not require specific environment configurations. 
10 | :: 11 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 12 | :: cmd interpreter, at least for (SDK v7.0) 13 | :: 14 | :: More details at: 15 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 16 | :: http://stackoverflow.com/a/13751649/163740 17 | :: 18 | :: Author: Olivier Grisel 19 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 20 | @ECHO OFF 21 | 22 | SET COMMAND_TO_RUN=%* 23 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 24 | 25 | SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" 26 | IF %MAJOR_PYTHON_VERSION% == "2" ( 27 | SET WINDOWS_SDK_VERSION="v7.0" 28 | ) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( 29 | SET WINDOWS_SDK_VERSION="v7.1" 30 | ) ELSE ( 31 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 32 | EXIT 1 33 | ) 34 | 35 | IF "%PYTHON_ARCH%"=="64" ( 36 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 37 | SET DISTUTILS_USE_SDK=1 38 | SET MSSdk=1 39 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 40 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 41 | ECHO Executing: %COMMAND_TO_RUN% 42 | call %COMMAND_TO_RUN% || EXIT 1 43 | ) ELSE ( 44 | ECHO Using default MSVC build environment for 32 bit architecture 45 | ECHO Executing: %COMMAND_TO_RUN% 46 | call %COMMAND_TO_RUN% || EXIT 1 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /cythonize.dat: -------------------------------------------------------------------------------- 1 | py_entitymatching/debugblocker/debugblocker_cython.pyx NA 5e568768d488850114e2748b2190a9f647c97d66 eeb6f78e85562d50de57173b94a39b7db5ecae0b 2 | -------------------------------------------------------------------------------- /docs/how_to_guide.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | How to Guide To Do Entity Matching 3 | ================================== 4 | 5 | The initial draft of the how to guide to do entity matching can be found `here. `_ 6 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | User Manual for py_entitymatching 3 | ================================= 4 | 5 | This document explains how to install, use, and contribute to the package. 6 | 7 | Contents 8 | ======== 9 | 10 | .. toctree:: 11 | :maxdepth: 3 12 | :caption: How To Guide 13 | 14 | how_to_guide 15 | 16 | .. toctree:: 17 | :maxdepth: 3 18 | :caption: User Manual 19 | 20 | user_manual/whatisnew 21 | user_manual/installation 22 | user_manual/overview 23 | user_manual/guides 24 | user_manual/datastructures 25 | user_manual/steps_supp_em_workflows 26 | user_manual/misc 27 | 28 | .. toctree:: 29 | :maxdepth: 3 30 | :caption: API Reference 31 | 32 | user_manual/api/overview 33 | user_manual/api/actual_commands 34 | 35 | .. 
toctree:: 36 | :maxdepth: 3 37 | :caption: How to Contribute 38 | 39 | contributing 40 | 41 | Indices and tables 42 | ================== 43 | 44 | * :ref:`genindex` 45 | * :ref:`modindex` 46 | * :ref:`search` 47 | -------------------------------------------------------------------------------- /docs/make_copy.sh: -------------------------------------------------------------------------------- 1 | cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs 2 | make clean html 3 | cd _build/html 4 | scp -r * pradap@trinity.cs.wisc.edu:~/public/html-www/magellan/user_manual/multi_page 5 | cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs 6 | make clean singlehtml 7 | cd _build/singlehtml 8 | scp -r * pradap@trinity.cs.wisc.edu:~/public/html-www/magellan/user_manual/single_page 9 | cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs 10 | -------------------------------------------------------------------------------- /docs/themes/nature_with_gtoc/layout.html: -------------------------------------------------------------------------------- 1 | {# 2 | 3 | Subset of agogo theme 4 | agogo/layout.html 5 | 6 | Sphinx layout template for the agogo theme, originally written 7 | by Andi Albrecht. 8 | 9 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 10 | :license: BSD, see LICENSE for details. 11 | #} 12 | {% extends "basic/layout.html" %} 13 | 14 | {%- block content %} 15 |
16 | 17 | 18 | 19 | {%- block sidebar1 %} 20 | {%- block sidebartoc %} 21 | {{ _('Table Of Contents') }} 22 | {{ toctree() }} 23 | {%- endblock %} 24 | {%- block sidebarsearch %} 25 | {{ _('Search') }} 26 | 27 | 33 | 34 | {{ _('Enter search terms or a module, class or function name.') }} 35 | 36 | 37 | 38 | {%- endblock %} 39 | {# possible location for sidebar #} {% endblock %} 40 | 41 | 42 | {%- block document %} 43 | 44 | {%- if render_sidebar %} 45 | 46 | {%- endif %} 47 | 48 | {% block body %} {% endblock %} 49 | 50 | {%- if render_sidebar %} 51 | 52 | {%- endif %} 53 | 54 | {%- endblock %} 55 | 56 | {%- block sidebar2 %} 57 | 58 | {% endblock %} 59 | 60 | 61 | 62 |
63 | {%- endblock %} 64 | 65 | {%- block footer %} 66 | 76 | Scroll To Top 77 | 97 | 108 | {% endblock %} -------------------------------------------------------------------------------- /docs/themes/nature_with_gtoc/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = basic 3 | stylesheet = nature.css 4 | pygments_style = tango 5 | 6 | [options] 7 | sidebarwidth = 270 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/actual_commands.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Commands in py_entitymatching 3 | ============================= 4 | 5 | .. toctree:: 6 | :maxdepth: 3 7 | 8 | reading_and_writing_data 9 | loading_and_saving_objects 10 | handling_metadata 11 | downsampling 12 | data_exploration 13 | blocking 14 | debugging_blocker_output 15 | combining_blocker_outputs 16 | sampling 17 | labeling 18 | handling_features 19 | matching 20 | debugging_matcher 21 | triggers 22 | evaluating_the_matching_output 23 | 24 | ===================== 25 | Experimental Commands 26 | ===================== 27 | .. toctree:: 28 | :maxdepth: 3 29 | 30 | dask_commands 31 | tuners 32 | combiner 33 | 34 | 35 | -------------------------------------------------------------------------------- /docs/user_manual/api/adding_features.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Adding Features to Feature Table 3 | ================================ 4 | .. autofunction:: py_entitymatching.get_feature_fn 5 | .. autofunction:: py_entitymatching.add_feature 6 | .. autofunction:: py_entitymatching.add_blackbox_feature 7 | -------------------------------------------------------------------------------- /docs/user_manual/api/blocking.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Blocking 3 | ======== 4 | .. autoclass:: py_entitymatching.AttrEquivalenceBlocker 5 | :members: 6 | .. autoclass:: py_entitymatching.OverlapBlocker 7 | :members: 8 | .. autoclass:: py_entitymatching.RuleBasedBlocker 9 | :members: 10 | .. autoclass:: py_entitymatching.BlackBoxBlocker 11 | :members: 12 | .. autoclass:: py_entitymatching.SortedNeighborhoodBlocker 13 | :members: 14 | -------------------------------------------------------------------------------- /docs/user_manual/api/combiner.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Matcher Combiner 3 | ================ 4 | .. autoclass:: py_entitymatching.matchercombiner.matchercombiner.MajorityVote 5 | :members: 6 | .. autoclass:: py_entitymatching.matchercombiner.matchercombiner.WeightedVote 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/combining_blocker_outputs.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Combining Blocker Outputs 3 | ========================= 4 | .. 
autofunction:: py_entitymatching.combine_blocker_outputs_via_union -------------------------------------------------------------------------------- /docs/user_manual/api/creating_the_features_automatically.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Creating the Features Automatically 3 | =================================== 4 | 5 | .. autofunction:: py_entitymatching.get_features_for_blocking 6 | .. autofunction:: py_entitymatching.get_features_for_matching 7 | 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/creating_the_features_manually.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Creating the Features Manually 3 | ============================== 4 | 5 | .. autofunction:: py_entitymatching.get_features 6 | .. autofunction:: py_entitymatching.get_attr_corres 7 | .. autofunction:: py_entitymatching.get_attr_types 8 | .. autofunction:: py_entitymatching.get_sim_funs_for_blocking 9 | .. autofunction:: py_entitymatching.get_sim_funs_for_matching 10 | .. autofunction:: py_entitymatching.get_tokenizers_for_blocking 11 | .. autofunction:: py_entitymatching.get_tokenizers_for_matching 12 | -------------------------------------------------------------------------------- /docs/user_manual/api/dask_commands.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Commands Implemented Using Dask 3 | =============================== 4 | 5 | Downsampling 6 | ------------ 7 | .. autofunction:: py_entitymatching.dask.dask_down_sample.dask_down_sample 8 | 9 | 10 | Blocking 11 | -------- 12 | .. autoclass:: py_entitymatching.dask.dask_attr_equiv_blocker.DaskAttrEquivalenceBlocker 13 | :members: 14 | 15 | .. autoclass:: py_entitymatching.dask.dask_overlap_blocker.DaskOverlapBlocker 16 | :members: 17 | 18 | .. autoclass:: py_entitymatching.dask.dask_rule_based_blocker.DaskRuleBasedBlocker 19 | :members: 20 | 21 | .. autoclass:: py_entitymatching.dask.dask_black_box_blocker.DaskBlackBoxBlocker 22 | :members: 23 | 24 | Extracting Feature Vectors 25 | -------------------------- 26 | .. autofunction:: py_entitymatching.dask.dask_extract_features.dask_extract_feature_vecs 27 | 28 | ML-Matchers 29 | ----------- 30 | .. autoclass:: py_entitymatching.dask.dask_dtmatcher.DaskDTMatcher 31 | :inherited-members: 32 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 33 | 34 | .. autoclass:: py_entitymatching.dask.dask_rfmatcher.DaskRFMatcher 35 | :inherited-members: 36 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 37 | 38 | .. autoclass:: py_entitymatching.dask.dask_svm_matcher.DaskSVMMatcher 39 | :inherited-members: 40 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 41 | 42 | .. autoclass:: py_entitymatching.dask.dask_nbmatcher.DaskNBMatcher 43 | :inherited-members: 44 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 45 | 46 | ..
autoclass:: py_entitymatching.dask.dask_logregmatcher.DaskLogRegMatcher 47 | :inherited-members: 48 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 49 | 50 | 51 | .. autoclass:: py_entitymatching.dask.dask_xgboost_matcher.DaskXGBoostMatcher 52 | :inherited-members: 53 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ -------------------------------------------------------------------------------- /docs/user_manual/api/data_exploration.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Data Exploration 3 | ================ 4 | .. autofunction:: py_entitymatching.data_explore_openrefine 5 | .. autofunction:: py_entitymatching.data_explore_pandastable 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/debugging_blocker_output.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Debugging Blocker Output 3 | ======================== 4 | .. autofunction:: py_entitymatching.debug_blocker 5 | .. autofunction:: py_entitymatching.backup_debug_blocker 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/debugging_matcher.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Debugging Matcher 3 | ================= 4 | .. autofunction:: py_entitymatching.vis_debug_dt 5 | .. autofunction:: py_entitymatching.vis_debug_rf 6 | .. autofunction:: py_entitymatching.debug_decisiontree_matcher 7 | .. autofunction:: py_entitymatching.debug_randomforest_matcher 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/downsampling.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Downsampling 3 | ============ 4 | .. autofunction:: py_entitymatching.down_sample -------------------------------------------------------------------------------- /docs/user_manual/api/evaluating_the_matching_output.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Evaluating the Matching Output 3 | =============================== 4 | 5 | .. autofunction:: py_entitymatching.eval_matches 6 | .. autofunction:: py_entitymatching.print_eval_summary 7 | .. autofunction:: py_entitymatching.get_false_positives_as_df 8 | .. autofunction:: py_entitymatching.get_false_negatives_as_df 9 | -------------------------------------------------------------------------------- /docs/user_manual/api/extracting_feature_vectors.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Extracting Feature Vectors 3 | ========================== 4 | .. autofunction:: py_entitymatching.extract_feature_vecs -------------------------------------------------------------------------------- /docs/user_manual/api/handling_features.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Handling Features 3 | ================= 4 | 5 | ..
toctree:: 6 | :maxdepth: 2 7 | 8 | creating_the_features_automatically 9 | creating_the_features_manually 10 | adding_features 11 | extracting_feature_vectors 12 | imputing_missing_values 13 | supported_similarity_functions 14 | supported_tokenizers -------------------------------------------------------------------------------- /docs/user_manual/api/handling_metadata.rst: -------------------------------------------------------------------------------- 1 | .. _label-handling-metadata: 2 | 3 | ================= 4 | Handling Metadata 5 | ================= 6 | .. autofunction:: py_entitymatching.get_catalog 7 | .. autofunction:: py_entitymatching.get_catalog_len 8 | .. autofunction:: py_entitymatching.del_catalog 9 | .. autofunction:: py_entitymatching.is_catalog_empty 10 | .. autofunction:: py_entitymatching.is_dfinfo_present 11 | .. autofunction:: py_entitymatching.is_property_present_for_df 12 | .. autofunction:: py_entitymatching.show_properties 13 | .. autofunction:: py_entitymatching.show_properties_for_id 14 | .. autofunction:: py_entitymatching.get_property 15 | .. autofunction:: py_entitymatching.set_property 16 | .. autofunction:: py_entitymatching.del_property 17 | .. autofunction:: py_entitymatching.copy_properties 18 | .. autofunction:: py_entitymatching.get_key 19 | .. autofunction:: py_entitymatching.set_key 20 | .. autofunction:: py_entitymatching.get_fk_ltable 21 | .. autofunction:: py_entitymatching.set_fk_ltable 22 | .. autofunction:: py_entitymatching.get_fk_rtable 23 | .. autofunction:: py_entitymatching.set_fk_rtable 24 | .. autofunction:: py_entitymatching.get_ltable 25 | .. autofunction:: py_entitymatching.set_ltable 26 | .. autofunction:: py_entitymatching.get_rtable 27 | .. autofunction:: py_entitymatching.set_rtable 28 | 29 | -------------------------------------------------------------------------------- /docs/user_manual/api/imputing_missing_values.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Imputing Missing Values 3 | ======================= 4 | .. autofunction:: py_entitymatching.impute_table -------------------------------------------------------------------------------- /docs/user_manual/api/labeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Labeling 3 | ======== 4 | .. autofunction:: py_entitymatching.label_table 5 | -------------------------------------------------------------------------------- /docs/user_manual/api/loading_and_saving_objects.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | Loading and Saving Objects 3 | ============================ 4 | .. autofunction:: py_entitymatching.load_table 5 | .. autofunction:: py_entitymatching.save_table 6 | .. autofunction:: py_entitymatching.load_object 7 | .. autofunction:: py_entitymatching.save_object 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/matching.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Matching 3 | ======== 4 | .. 
toctree:: 5 | 6 | splitting_data_into_train_test 7 | supported_matchers 8 | selecting_matcher -------------------------------------------------------------------------------- /docs/user_manual/api/overview.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Overview of Command Organization 3 | ================================ 4 | 5 | The commands are organized into two parts: first, the commands that the user will typically use to 6 | create an entity matching workflow; second, a set of experimental commands that may also prove 7 | useful when creating an entity matching workflow. Specifically, the experimental part includes commands such 8 | as Dask-based implementations of blockers and commands for combining predictions from a set of matchers. 9 | However, the experimental commands are not well tested, so use these commands at your own risk. 10 | 11 | -------------------------------------------------------------------------------- /docs/user_manual/api/reading_and_writing_data.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Reading and Writing Data 3 | ========================= 4 | .. autofunction:: py_entitymatching.read_csv_metadata 5 | .. autofunction:: py_entitymatching.to_csv_metadata 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/sampling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Sampling 3 | ======== 4 | .. autofunction:: py_entitymatching.sample_table -------------------------------------------------------------------------------- /docs/user_manual/api/selecting_matcher.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Selecting Matcher 3 | ================== 4 | .. autofunction:: py_entitymatching.select_matcher -------------------------------------------------------------------------------- /docs/user_manual/api/splitting_data_into_train_test.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | Splitting Data into Train and Test 3 | ================================== 4 | .. autofunction:: py_entitymatching.split_train_test -------------------------------------------------------------------------------- /docs/user_manual/api/supported_matchers.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Supported Matchers 3 | ================== 4 | 5 | ML Matchers 6 | =========== 7 | 8 | .. autoclass:: py_entitymatching.DTMatcher 9 | :inherited-members: 10 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 11 | 12 | .. autoclass:: py_entitymatching.RFMatcher 13 | :inherited-members: 14 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 15 | 16 | .. autoclass:: py_entitymatching.SVMMatcher 17 | :inherited-members: 18 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 19 | 20 | ..
autoclass:: py_entitymatching.NBMatcher 21 | :inherited-members: 22 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 23 | 24 | .. autoclass:: py_entitymatching.LinRegMatcher 25 | :inherited-members: 26 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 27 | 28 | 29 | .. autoclass:: py_entitymatching.LogRegMatcher 30 | :inherited-members: 31 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 32 | 33 | 34 | .. autoclass:: py_entitymatching.XGBoostMatcher 35 | :inherited-members: 36 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 37 | 38 | 39 | Rule-Based Matcher 40 | ================== 41 | 42 | .. autoclass:: py_entitymatching.BooleanRuleMatcher 43 | :inherited-members: 44 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 45 | -------------------------------------------------------------------------------- /docs/user_manual/api/supported_similarity_functions.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Supported Similarity Functions 3 | ============================== 4 | 5 | .. autofunction:: py_entitymatching.affine 6 | .. autofunction:: py_entitymatching.hamming_dist 7 | .. autofunction:: py_entitymatching.hamming_sim 8 | .. autofunction:: py_entitymatching.lev_dist 9 | .. autofunction:: py_entitymatching.lev_sim 10 | .. autofunction:: py_entitymatching.jaro 11 | .. autofunction:: py_entitymatching.jaro_winkler 12 | .. autofunction:: py_entitymatching.needleman_wunsch 13 | .. autofunction:: py_entitymatching.smith_waterman 14 | .. autofunction:: py_entitymatching.jaccard 15 | .. autofunction:: py_entitymatching.cosine 16 | .. autofunction:: py_entitymatching.overlap_coeff 17 | .. autofunction:: py_entitymatching.dice 18 | .. autofunction:: py_entitymatching.monge_elkan 19 | .. autofunction:: py_entitymatching.exact_match 20 | .. autofunction:: py_entitymatching.rel_diff 21 | .. autofunction:: py_entitymatching.abs_norm -------------------------------------------------------------------------------- /docs/user_manual/api/supported_tokenizers.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Supported Tokenizers 3 | ==================== 4 | .. autofunction:: py_entitymatching.tok_qgram 5 | .. autofunction:: py_entitymatching.tok_delim 6 | .. autofunction:: py_entitymatching.tok_wspace 7 | .. autofunction:: py_entitymatching.tok_alphabetic 8 | .. autofunction:: py_entitymatching.tok_alphanumeric -------------------------------------------------------------------------------- /docs/user_manual/api/triggers.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Triggers 3 | ======== 4 | ..
autoclass:: py_entitymatching.MatchTrigger 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/tuners.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | Tuners for the Dask-based Commands 3 | ================================== 4 | 5 | Downsampling 6 | ------------ 7 | .. autofunction:: py_entitymatching.tuner.tuner_down_sample.tuner_down_sample 8 | 9 | 10 | Overlap Blocker 11 | --------------- 12 | .. autofunction:: py_entitymatching.tuner.tuner_overlap_blocker.tuner_overlap_blocker 13 | -------------------------------------------------------------------------------- /docs/user_manual/blocker_hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/blocker_hierarchy.png -------------------------------------------------------------------------------- /docs/user_manual/create_feats_for_matching.rst: -------------------------------------------------------------------------------- 1 | .. _label-create-feats-matching: 2 | 3 | ============================== 4 | Creating Features for Matching 5 | ============================== 6 | If you have to use supervised learning-based matchers, then you cannot just operate on the 7 | labeled set of tuple pairs. You need to convert each tuple pair in the labeled set 8 | into a feature vector, which consists of a list of numerical/categorical features. To do 9 | this, first we need to create a set of features. 10 | 11 | There are two ways to create features: 12 | 13 | * Automatically create a set of features (then the user can remove or add some more). 14 | * Skip the automatic process and generate features manually. 15 | 16 | 17 | Creating the Features Manually 18 | ------------------------------ 19 | This is very similar to the manual feature creation process for blocking (see section 20 | :ref:`label-create-features-blocking`) except the features are created for 21 | matching purposes. 22 | In brief, you can execute the following sequence of commands in py_entitymatching 23 | to create the features manually: 24 | 25 | >>> match_t = em.get_tokenizers_for_matching() 26 | >>> match_s = em.get_sim_funs_for_matching() 27 | >>> atypes1 = em.get_attr_types(A) # don't need, if atypes1 exists from blocking step 28 | >>> atypes2 = em.get_attr_types(B) # don't need, if atypes2 exists from blocking step 29 | >>> match_c = em.get_attr_corres(A, B) 30 | >>> match_f = em.get_features(A, B, atypes1, atypes2, match_c, match_t, match_s) 31 | 32 | Further, you can add or delete features as we saw in section 33 | :ref:`label-add-remove-features`. 34 | 35 | Please refer to the API reference of :py:meth:`~py_entitymatching.get_tokenizers_for_matching` 36 | and :py:meth:`~py_entitymatching.get_sim_funs_for_matching` for more details. 37 | 38 | .. note:: Currently, py_entitymatching returns the same set of features for blocking and matching purposes. 39 | 40 | Creating the Features Automatically 41 | ----------------------------------- 42 | If you do not want to go through the hassle of creating the features manually, then 43 | you can generate the features automatically. This is very similar to the automatic 44 | feature creation process for blocking (see section :ref:`label-gen-feats-automatically`).
45 | 46 | In py_entitymatching, you can use `get_features_for_matching` to generate features 47 | for matching purposes automatically. An example of using `get_features_for_matching` is 48 | shown below: 49 | 50 | >>> match_f = em.get_features_for_matching(A, B) 51 | 52 | Similar to what we saw in section :ref:`label-gen-feats-automatically` for blocking, the 53 | command will set the following variables: `_match_t`, `_match_s`, `_atypes1`, `_atypes2`, `_match_c` 54 | and they can be accessed like this: 55 | 56 | >>> em._match_t 57 | >>> em._match_s 58 | >>> em._atypes1 59 | >>> em._atypes2 60 | >>> em._match_c 61 | 62 | You may want to examine these variables, modify them as appropriate, and then 63 | regenerate the set of features. 64 | Please refer to the API reference of :py:meth:`~py_entitymatching.get_features_for_matching` 65 | for more details. 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/user_manual/data_exploration.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Data Exploration 3 | ================ 4 | 5 | Data exploration is an important part of the entity matching workflow because it 6 | gives the user a chance to look at the actual data closely. Data exploration 7 | allows the user to inspect the individual records and features present in the 8 | table so that he or she can understand the important trends and relationships 9 | present in the data. A complete understanding of the data gives the user an 10 | advantage later on in the entity matching workflow. 11 | 12 | 13 | OpenRefine 14 | ---------- 15 | 16 | OpenRefine is a data exploration tool that is compatible with Python >= 2.7 or 17 | Python >= 3.4. More information about OpenRefine can be found at its GitHub page 18 | at https://github.com/OpenRefine/OpenRefine 19 | 20 | 21 | .. note:: 22 | OpenRefine is not included with py_entitymatching and must be downloaded and 23 | installed separately. The installation instructions can be found at 24 | https://github.com/OpenRefine/OpenRefine/wiki/Installation-Instructions 25 | 26 | Using OpenRefine 27 | ~~~~~~~~~~~~~~~~ 28 | 29 | Before using OpenRefine, you must launch the application to start an OpenRefine 30 | server. The steps for doing so are explained after the installation 31 | instructions at https://github.com/OpenRefine/OpenRefine/wiki/Installation-Instructions 32 | 33 | Once the application has created a server, copy the URL from the address bar of 34 | the OpenRefine browser (default is http://127.0.0.1:3333 ). Then the data can 35 | be explored as in the example below: 36 | 37 | 38 | >>> import py_entitymatching as em 39 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 40 | >>> p = em.data_explore_openrefine(A, name='Table') 41 | >>> # Save the project back to our dataframe 42 | >>> # Calling export_pandas_frame will automatically delete the OpenRefine project 43 | >>> df = p.export_pandas_frame() 44 | 45 | 46 | Pandastable 47 | ----------- 48 | Pandastable is a data exploration tool available for Python >= 3.4 that allows users 49 | to view and manipulate data. More information about pandastable can be found at 50 | https://github.com/dmnfarrell/pandastable 51 | 52 | .. note:: 53 | pandastable is not packaged along with py_entitymatching.
You can install 54 | pandastable using pip as shown below: 55 | 56 | $ pip install pandastable 57 | 58 | or conda as shown below: 59 | 60 | $ conda install -c dmnfarrell pandastable=0.7.1 61 | 62 | 63 | 64 | Using pandastable 65 | ~~~~~~~~~~~~~~~~~ 66 | 67 | 68 | Pandastable can easily be used with the wrappers included with py_entitymatching. 69 | The following example shows how: 70 | 71 | >>> # import py_entitymatching 72 | >>> import py_entitymatching as em 73 | >>> # Explore the data using pandastable 74 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 75 | >>> em.data_explore_pandastable(A) 76 | -------------------------------------------------------------------------------- /docs/user_manual/debugging_blocking.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Debugging Blocking 3 | ================== 4 | In a typical entity matching workflow, you will load in the two tables to 5 | match, sample them (if required) and use a blocker to remove obvious non-matches. 6 | But it is often not clear whether the blocker drops only non-matches or whether it 7 | also removes a lot of potential matches. 8 | 9 | In such cases, it is important to debug the output of the blocker. In 10 | py_entitymatching, the `debug_blocker` command can be used for that purpose. 11 | 12 | The `debug_blocker` command takes in two input tables A, B, and blocker output C, 13 | and returns a table D containing a set of tuple pairs that are 14 | potential matches and yet are not present in the blocker output 15 | C. Table D also contains a similarity measure computed for each reported 16 | tuple pair (as its second column). 17 | 18 | You can examine these potential matches in table D. If you 19 | find that many of them are indeed true matches, then that means the 20 | blocker may have removed too many true matches. In this case you 21 | may want to `relax` the blocker by modifying its parameters, or 22 | choose a different blocker. On the other hand, if you do not 23 | find many true matches in table D, then it could be the case that the 24 | blocker has done a good job and preserved all (or most) of 25 | the matches in the blocker output C. 26 | 27 | In `debug_blocker`, you can optionally specify attribute correspondences between 28 | the input tables A and B. If they are not specified, then the attribute correspondences 29 | will be a list of attribute pairs with the exact same names in A and B. 30 | 31 | The debugger will use only the attributes mentioned in these attribute 32 | correspondences to try to find potentially matching pairs and place 33 | those pairs into D. Thus, our recommendation is that (a) if the tables 34 | have identical schemas or share a lot of attributes with the same 35 | names, then do not specify the attribute correspondences; in this 36 | case the debugger will use all the attributes with the same name between the two 37 | schemas, and (b) otherwise, think about what attribute pairs you want to see the 38 | debugger use, then specify those as attribute correspondences.
39 | 40 | An example of using `debug_blocker` is shown below: 41 | 42 | >>> import py_entitymatching as em 43 | >>> ob = em.OverlapBlocker() 44 | >>> C = ob.block_tables(A, B, l_overlap_attr='title', r_overlap_attr='title', overlap_size=3) 45 | >>> corres = [('ID','ssn'), ('name', 'ename'), ('address', 'location'),('zipcode', 'zipcode')] 46 | >>> D = em.debug_blocker(C, A, B, attr_corres=corres) 47 | 48 | Please refer to the API reference of :py:meth:`~py_entitymatching.debug_blocker` 49 | for more details. 50 | 51 | The blocker debugger is implemented in Cython. In case this version of the 52 | command is not working properly, there is also a Python version of the command, 53 | called `backup_debug_blocker`, that can be used instead. Please refer 54 | to the API reference of :py:meth:`~py_entitymatching.backup_debug_blocker` for 55 | more details. 56 | 57 | -------------------------------------------------------------------------------- /docs/user_manual/down_sampling.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Down Sampling 3 | ============= 4 | Once the tables to be matched are read, they must be down sampled if the number of 5 | tuples in them is large (for example, 100K+ tuples). This is because working with 6 | large tables can be very time consuming (as any operation performed would have 7 | to process these large tables). 8 | 9 | Random sampling, however, does not work, because the sampled tables may end up sharing very 10 | few matches, especially if the number of matches between the 11 | input tables is small to begin with. 12 | 13 | In py_entitymatching, you can sample the input tables using the `down_sample` command. 14 | This command samples the input tables intelligently, ensuring a reasonable number of 15 | matches between them. 16 | 17 | If `A` and `B` are the input tables, then you can use the `down_sample` command as shown 18 | below: 19 | 20 | >>> sample_A, sample_B = em.down_sample(A, B, size=500, y_param=1) 21 | 22 | Conceptually, the command takes in two original input tables, `A`, `B` (and some parameters), 23 | and produces two sampled tables, `sample_A` and `sample_B`. 24 | Specifically, you must set the `size` to be the number of tuples that 25 | should be sampled from `B` (this will be the size of the `sample_B` table) and set the 26 | `y_param` to be the number of tuples to be selected from `A` (for each tuple in 27 | the `sample_B` table). The command internally uses a 28 | heuristic to ensure a reasonable number of matches between `sample_A` and `sample_B`. 29 | 30 | Please look at the API reference of :py:meth:`~py_entitymatching.down_sample` for more 31 | details. 32 | 33 | .. note:: Currently, the input tables must be loaded in memory before the user can down 34 | sample. 35 | 36 | 37 | -------------------------------------------------------------------------------- /docs/user_manual/evaluate_matching.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Evaluating the Matching Output 3 | ============================== 4 | Once you have predicted matches using an ML-based matcher, you would have to 5 | evaluate the matches. py_entitymatching supports the `eval_matches` command for that 6 | purpose.
7 | 8 | An example of using the `eval_matches` command is shown below: 9 | 10 | >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_after='gold_labels') 11 | >>> dt = em.DTMatcher() 12 | >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels') 13 | >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels') 14 | >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels') 15 | 16 | In the above, `eval_summary` is a dictionary containing accuracy numbers (such as 17 | precision, recall, F1, etc.) and the list of false positives/negatives. 18 | 19 | Please refer to the API reference of :py:meth:`~py_entitymatching.eval_matches` for 20 | more details. 21 | 22 | -------------------------------------------------------------------------------- /docs/user_manual/example-blocking-matching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-blocking-matching.png -------------------------------------------------------------------------------- /docs/user_manual/example-dev-stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-dev-stage.png -------------------------------------------------------------------------------- /docs/user_manual/example-match-two-tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-match-two-tables.png -------------------------------------------------------------------------------- /docs/user_manual/example-prod-stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-prod-stage.png -------------------------------------------------------------------------------- /docs/user_manual/example-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-workflow.png -------------------------------------------------------------------------------- /docs/user_manual/extract_feat_vecs.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Extracting Feature Vectors 3 | ========================== 4 | Once you have created a set of features, you use them to convert the labeled sample into feature 5 | vectors. In py_entitymatching, you can use `extract_feature_vecs` to convert 6 | the labeled sample into feature vectors using the features created 7 | (see section :ref:`label-create-feats-matching`).
8 | 9 | An example of using `extract_feature_vecs` is shown below: 10 | 11 | >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels']) 12 | 13 | Conceptually, the command takes the labeled data (`G`), applies the feature functions (in `match_f`) 14 | to each tuple in G to create a Dataframe, adds the `attrs_before` and `attrs_after` 15 | columns, updates the metadata and returns the resulting Dataframe. 16 | 17 | If there are one or several columns in the labeled data that contain the labels, then those need 18 | to be explicitly specified in `attrs_after` if you want them to be copied over. 19 | 20 | Please refer to the API reference of :py:meth:`~py_entitymatching.extract_feature_vecs` 21 | for more details. 22 | -------------------------------------------------------------------------------- /docs/user_manual/labeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Labeling 3 | ======== 4 | The command `label_table` can be used to label the samples (see section 5 | :ref:`label-sampling`). An example of using `label_table` is shown below: 6 | 7 | >>> G = em.label_table(S, label_column_name='gold_labels') 8 | 9 | The above command will first create a copy of the input table `S`, update 10 | the metadata, add a column with the 11 | specified column name (in the `label_column_name` parameter), fill it with 0 (i.e., non-matches), 12 | and open a GUI for you to update the labels. You must specify 0 for non-matches and 13 | 1 for matches. Once you close the GUI, the updated table will be returned. 14 | 15 | Please refer to the API reference of :py:meth:`~py_entitymatching.label_table` 16 | for more details. 17 | -------------------------------------------------------------------------------- /docs/user_manual/matchercombiner.rst: -------------------------------------------------------------------------------- 1 | ============================================ 2 | Combining Predictions from Multiple Matchers 3 | ============================================ 4 | In the matching step, if you use multiple matchers then you will have to combine the 5 | predictions from them to get a consolidated prediction. There are many different ways 6 | to combine these predictions such as weighted vote, majority vote, stacking, etc. 7 | Currently, py_entitymatching supports majority and weighted voting-based combining. 8 | These combiners are experimental and not tested. 9 | 10 | An example of using majority voting-based combining is shown below. 11 | 12 | >>> dt = DTMatcher() 13 | >>> rf = RFMatcher() 14 | >>> nb = NBMatcher() 15 | >>> dt.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') # H is training set containing feature vectors 16 | >>> dt.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='dt_predictions') # L is the test set for which we should get predictions.
17 | >>> rf.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 18 | >>> rf.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='rf_predictions') 19 | >>> nb.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 20 | >>> nb.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='nb_predictions') 21 | >>> mv_combiner = MajorityVote() 22 | >>> L['consol_predictions'] = mv_combiner.combine(L[['dt_predictions', 'rf_predictions', 'nb_predictions']]) 23 | 24 | Conceptually, given a list of predictions (from different matchers), the prediction that 25 | occurs most often is returned as the consolidated prediction. If there is no clear winning 26 | prediction (for example, 0 and 1 occurring an equal number of times), then 0 is returned. 27 | 28 | An example of using weighted voting-based combining is shown below. 29 | 30 | 31 | >>> dt = DTMatcher() 32 | >>> rf = RFMatcher() 33 | >>> nb = NBMatcher() 34 | >>> dt.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') # H is training set containing feature vectors 35 | >>> dt.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='dt_predictions') # L is the test set for which we should get predictions. 36 | >>> rf.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 37 | >>> rf.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='rf_predictions') 38 | >>> nb.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 39 | >>> nb.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='nb_predictions') 40 | >>> wv_combiner = WeightedVote(weights=[0.3, 0.2, 0.1], threshold=0.4) 41 | >>> L['consol_predictions'] = wv_combiner.combine(L[['dt_predictions', 42 | 'rf_predictions', 'nb_predictions']]) 43 | 44 | Conceptually, given a list of predictions, each prediction is given a 45 | weight; we compute a weighted sum of these predictions and compare the result to a 46 | threshold. If the result is greater than or equal to the threshold, then the 47 | consolidated prediction is returned as 1 (i.e., a match); otherwise it is returned as 0 (i.e., a non-match). 48 | -------------------------------------------------------------------------------- /docs/user_manual/misc.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Miscellaneous 3 | ============= 4 | This section covers some miscellaneous things in py_entitymatching. 5 | 6 | .. _label-csv-format: 7 | 8 | CSV Format 9 | ---------- 10 | The CSV format is selected because it’s well known and can be read by numerous external 11 | programs. Further, it can be easily inspected and edited by the users. 12 | You can read more about CSV formats `here `_. 13 | 14 | There are two common CSV formats that are used to store CSV files: one with attribute 15 | names in the first line, and one without. Both these formats are supported by py_entitymatching.
16 | 17 | An example of a CSV file with attribute names is shown below: 18 | :: 19 | 20 | ID, name, birth_year, hourly_wage, zipcode 21 | a1, Kevin Smith, 1989, 30, 94107 22 | a2, Michael Franklin, 1988, 27.5, 94122 23 | a3, William Bridge, 1988, 32, 94321 24 | 25 | An example of a CSV file without attribute names is shown below: 26 | 27 | :: 28 | 29 | a1, Kevin Smith, 1989, 30, 94107 30 | a2, Michael Franklin, 1988, 27.5, 94122 31 | a3, William Bridge, 1988, 32, 94321 32 | 33 | Metadata File Format 34 | -------------------- 35 | The CSV file can be accompanied by a metadata file containing the metadata information 36 | of the table. Typically, it contains information such as key, foreign key, etc. 37 | The metadata file is expected to have the same name as the CSV file but with a `.metadata` 38 | extension. For example, if the CSV file `table_A.csv` contains table A's data, then 39 | `table_A.metadata` will contain table A's metadata. So, the metadata is 40 | associated based on the names of the files. The metadata file contains key-value pairs, 41 | one per line, and each line starts with '#'. 42 | 43 | An example of a metadata file is shown below: 44 | 45 | :: 46 | 47 | #key=ID 48 | 49 | In the above, the pair key=ID states that ID is the key attribute. 50 | 51 | Writing a Dataframe to Disk Along With Its Metadata 52 | --------------------------------------------------- 53 | To write a Dataframe to disk along with its metadata, you can use the `to_csv_metadata` 54 | command in py_entitymatching. An example of using `to_csv_metadata` is shown below: 55 | 56 | >>> em.to_csv_metadata(A, './table_A.csv') 57 | 58 | The above command will first write the Dataframe pointed to by `A` to the `table_A.csv` file on 59 | disk (in CSV format); next it will write the metadata of table A stored in the Catalog 60 | to the `table_A.metadata` file on disk. 61 | 62 | Please refer to the API reference of :py:meth:`~py_entitymatching.to_csv_metadata` for 63 | more details. 64 | 65 | .. note:: Once the Dataframe is written to disk along with metadata, it can be read using the :py:meth:`~py_entitymatching.read_csv_metadata` command. 66 | 67 | 68 | Writing/Reading Other Types of py_entitymatching Objects 69 | ---------------------------------------------------------- 70 | After creating a blocker or feature table, it is desirable to have a 71 | way to persist the objects to disk for future use. py_entitymatching provides 72 | two commands for that purpose: `save_object` and `load_object`. 73 | 74 | An example of using `save_object` is shown below: 75 | 76 | >>> block_f = em.get_features_for_blocking(A, B) 77 | >>> rb = em.RuleBasedBlocker() 78 | >>> rb.add_rule(['name_name_lev(ltuple, rtuple) < 0.4'], block_f) 79 | >>> em.save_object(rb, './rule_based_blocker.pkl') 80 | 81 | `load_object` loads the stored object from disk. An example of using `load_object` is 82 | shown below: 83 | 84 | >>> rb = em.load_object('./rule_based_blocker.pkl') 85 | 86 | Please refer to the API reference of :py:meth:`~py_entitymatching.save_object` and 87 | :py:meth:`~py_entitymatching.load_object` for more details. 88 | -------------------------------------------------------------------------------- /docs/user_manual/profiling.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Profiling Data 3 | ============== 4 | Profiling data is used to help users get general information about their data.
5 | Before working with the data, it is useful for a user to have a high level 6 | understanding of the data because he or she will be able to take advantage of 7 | the general trends to successfully and efficiently complete the rest of 8 | the workflow. 9 | 10 | Data profiling specifically can show users important statistics such as type, 11 | uniqueness, missing values, quartile statistics, mean, mode, standard deviation, 12 | sum, median absolute deviation, coefficient of variation, kurtosis, and skewness. 13 | It can also display information to the user visually, such as in a histogram. 14 | 15 | We recommend using the Python package pandas-profiling because it is simple 16 | and easy to use. More information about the package can be found on the GitHub 17 | page at https://github.com/JosPolfliet/pandas-profiling 18 | 19 | 20 | Example Usage 21 | ------------- 22 | After reading a CSV file into a Dataframe, pandas-profiling shows the user a 23 | report containing useful profiling information. For example: 24 | 25 | 26 | >>> import pandas_profiling 27 | >>> # Read in csv file 28 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 29 | >>> # Use the profiler 30 | >>> pandas_profiling.ProfileReport(A) 31 | 32 | The user can also check to see if any variables are highly correlated: 33 | 34 | >>> # Read in csv file 35 | >>> import pandas_profiling 36 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 37 | >>> # Use the profiler 38 | >>> profile = pandas_profiling.ProfileReport(A) 39 | >>> # Check for rejected variables 40 | >>> rejected_variables = profile.get_rejected_variables(threshold=0.9) 41 | 42 | The report generated can also be saved into an HTML file: 43 | 44 | 45 | >>> import pandas_profiling 46 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 47 | >>> # Save report to a variable 48 | >>> profile = pandas_profiling.ProfileReport(A) 49 | >>> # Save report to an html file 50 | >>> profile.to_file(outputfile="/tmp/myoutputfile.html") 51 | 52 | For more information about pandas-profiling please go to the GitHub page 53 | at https://github.com/JosPolfliet/pandas-profiling -------------------------------------------------------------------------------- /docs/user_manual/read_csv_files.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Reading the CSV Files from Disk 3 | =============================== 4 | Currently, py_entitymatching only supports reading CSV files from disk. 5 | 6 | **The Minimal That You Should Do:** First, you must store the input tables as CSV files 7 | on disk. Please look at section :ref:`label-csv-format` to learn more 8 | about the CSV format. An example of a CSV file will look like this: 9 | 10 | :: 11 | 12 | ID, name, birth_year, hourly_wage, zipcode 13 | a1, Kevin Smith, 1989, 40, 94107 14 | a2, Michael Franklin, 1988, 27.5, 94122 15 | a3, William Bridge, 1988, 32, 94121 16 | 17 | Next, each table in py_entitymatching must have a key column.
If the table already 18 | has a key column, then you can read the CSV file and set the key column like this: 19 | 20 | :: 21 | 22 | # ID is the key column in table.csv 23 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 24 | 25 | If the table does not have a key column, then you can read the CSV file, add a 26 | key column and set the added key column like this: 27 | 28 | :: 29 | 30 | # Read the CSV file 31 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv') 32 | # Add a key column with name 'ID' 33 | >>> A['ID'] = range(0, len(A)) 34 | # Set 'ID' as the key column 35 | >>> em.set_key(A, 'ID') 36 | 37 | **If You Want to Read and Play Around More:** In general, the command 38 | :py:meth:`~py_entitymatching.read_csv_metadata` looks for a file (with the same file name 39 | as the `CSV` file) with a `.metadata` extension in the same directory containing the 40 | metadata. If the file containing the metadata information is not present, then 41 | :py:meth:`~py_entitymatching.read_csv_metadata` will proceed by just reading the CSV file 42 | as mentioned in the command. 43 | 44 | To update the metadata for a table using a metadata file, you must first manually create 45 | this file, specify the metadata for the table, and then call 46 | :py:meth:`~py_entitymatching.read_csv_metadata`. The command will automatically read the metadata from the 47 | file and update the Catalog. 48 | 49 | For example, if you read `table.csv` then :py:meth:`~py_entitymatching.read_csv_metadata` 50 | looks for the `table.metadata` file. The contents of `table.metadata` may look like this: 51 | :: 52 | 53 | #key=ID 54 | 55 | Each line in the file starts with `#`. The metadata is written as `key=value` pairs, 56 | one per line. The contents of the above file say that `ID` is the key attribute 57 | (for the table in the file `table.csv`). 58 | 59 | 60 | The table mentioned in the above example, along with the metadata file 61 | stored in the same directory, can be read as follows: 62 | 63 | >>> import py_entitymatching as em 64 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv') 65 | 66 | Once the table is read, you can check to see which 67 | attribute of the table is a key using the :py:meth:`~py_entitymatching.get_key` command as 68 | shown below: 69 | 70 | 71 | >>> em.get_key(A) 72 | 'ID' 73 | 74 | As you see, the key for the table is updated correctly as 'ID'. 75 | 76 | See :py:meth:`~py_entitymatching.read_csv_metadata` for more details. 77 | -------------------------------------------------------------------------------- /docs/user_manual/sampling.rst: -------------------------------------------------------------------------------- 1 | .. _label-sampling: 2 | 3 | ======== 4 | Sampling 5 | ======== 6 | If you have to use supervised learning-based matchers or evaluate matchers, you need to 7 | create labeled data. To create labeled data, first you need to take a sample of candidate set 8 | pairs and then label them. 9 | 10 | In *py_entitymatching*, you can use `sample_table` to get a sample. The command does 11 | uniform random sampling without replacement. An example of using `sample_table` is shown 12 | below: 13 | 14 | >>> S = em.sample_table(C, 100) 15 | 16 | The command will first create a copy of the input table, sample the specified number of 17 | tuple pairs from the copy, update the metadata and return the sampled table.
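Because the metadata is carried over, the sampled table can be passed directly to downstream commands such as `label_table`. A minimal sketch of this chaining is shown below (hedged: it assumes `C` is a candidate set produced by a blocker, so its key, foreign keys, and ltable/rtable entries already exist in the Catalog):

>>> S = em.sample_table(C, 100)
>>> em.show_properties(S) # key, fk_ltable, fk_rtable, ltable, rtable are copied over from C
>>> G = em.label_table(S, label_column_name='gold_labels')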
18 | 19 | 20 | For more details, please look at the API reference of :py:meth:`~py_entitymatching.sample_table` -------------------------------------------------------------------------------- /docs/user_manual/select_best_matcher.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Selecting an ML-Matcher 3 | ======================= 4 | Once you have created different concrete ML matchers, you have to choose one of 5 | them for matching purposes. There are many different criteria by which one can 6 | decide to choose a matcher, such as `akaike information criterion`, `bayesian information 7 | criterion`, `k-fold cross validation`, etc. Currently py_entitymatching supports 8 | k-fold cross validation; other approaches are left for future work. 9 | 10 | Conceptually, the command to select a matcher would take in the following inputs: 11 | 12 | * List of ML matchers. 13 | * Training data (feature vectors). 14 | * A column of labels that correspond to the feature vectors in the training data. 15 | * Number of folds. 16 | 17 | And it would produce the following output: 18 | 19 | * Selected matcher. 20 | * Statistics such as the mean accuracy of all input matchers. 21 | 22 | In py_entitymatching, the `select_matcher` command addresses the above needs. An 23 | example of using `select_matcher` is shown below: 24 | 25 | >>> dt = em.DTMatcher() 26 | >>> rf = em.RFMatcher() 27 | >>> result = em.select_matcher(matchers=[dt, rf], table=train, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], target_attr='gold_labels', k=5) 28 | 29 | In the above output, `result` is a dictionary containing three keys: (1) selected_matcher, 30 | (2) cv_stats, and (3) drill_down_cv_stats. `selected_matcher` is the selected ML-based matcher, 31 | `cv_stats` is a Dataframe which includes the average cross validation scores for each matcher 32 | and for each metric, and `drill_down_cv_stats` is a dictionary where each key is a metric that 33 | includes the cross validation statistics for each fold. 34 | 35 | Please refer to the API reference of :py:meth:`~py_entitymatching.select_matcher` for 36 | more details. 37 | -------------------------------------------------------------------------------- /docs/user_manual/split_train_test.rst: -------------------------------------------------------------------------------- 1 | ===================================================== 2 | Splitting Labeled Data into Training and Testing Sets 3 | ===================================================== 4 | While doing entity matching you will have to split data for 5 | multiple purposes. Some examples are: 6 | 7 | 1. Split labeled data into development and test sets. The development 8 | set is used to come up with the right features for the learning-based matcher, and 9 | the `test` set is used to evaluate the matcher. 10 | 11 | 2. Split feature vectors into train and test sets. The train 12 | set is used to train the learning-based matcher and the test set is used 13 | for evaluation. 14 | 15 | 16 | py_entitymatching provides the `split_train_test` command for the above need. 17 | An example of using `split_train_test` is shown below: 18 | 19 | >>> train_test = em.split_train_test(G, train_proportion=0.5) 20 | 21 | In the above, `split_train_test` returns a dictionary with two keys: train, and test. 22 | The value for the key `train` is a Dataframe containing tuples 23 | allocated from the input table based on train_proportion.
24 | Similarly, the value for the key `test` is a Dataframe containing 25 | tuples for evaluation. An example of getting the train and test Dataframes from the output 26 | of the `split_train_test` command is shown below: 27 | 28 | 29 | >>> devel_set = train_test['train'] 30 | >>> eval_set = train_test['test'] 31 | 32 | The right value for the train proportion depends on the 33 | context of its use. For instance, if the data is split for machine learning 34 | purposes, then the train proportion is typically larger than the 35 | test proportion. 36 | The most commonly used values of train_proportion are between 37 | 0.5 and 0.8. 38 | 39 | Please refer to the API reference of :py:meth:`~py_entitymatching.split_train_test` for 40 | more details. 41 | 42 | -------------------------------------------------------------------------------- /docs/user_manual/steps_supp_em_workflows.rst: -------------------------------------------------------------------------------- 1 | Steps of Supported EM Workflows 2 | =============================== 3 | .. toctree:: 4 | :maxdepth: 3 5 | 6 | 7 | read_csv_files 8 | down_sampling 9 | profiling 10 | data_exploration 11 | blocking 12 | create_feats_for_blocking 13 | debugging_blocking 14 | sampling 15 | labeling 16 | split_train_test 17 | create_feats_for_matching 18 | extract_feat_vecs 19 | imputing_missing_values 20 | matching 21 | select_best_matcher 22 | debugging_matcher 23 | matchercombiner 24 | triggers 25 | evaluate_matching 26 | -------------------------------------------------------------------------------- /docs/user_manual/whatisnew.rst: -------------------------------------------------------------------------------- 1 | What is New? 2 | ============ 3 | 4 | Compared to Version 0.3.3, the following changes are new: 5 | * Dropped support for Python 2 and Python 3.5. 6 | * To support Python 3.8, updated the function 7 | :code:`py_entitymatching.matcher.matcherutils.impute_table()` to use the current 8 | scikit-learn :code:`SimpleImputer`; see :ref:`Imputing Missing Values` for correct 9 | usage.
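10 | 11 | For reference, a minimal sketch of calling the updated function is shown below. It assumes 12 | `H` is a table of feature vectors whose metadata columns are `_id`, `ltable_id` and 13 | `rtable_id`; see the API reference of :code:`impute_table` for the exact defaults: 14 | 15 | >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean') 16 |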
10 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_addfeature_py2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_attr_equiv_blocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_autogenfeature_py3-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_blackboxfunction-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_blocker_combiner-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_catalog-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_combine_ids-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_debug_matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_add_features-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | 
"nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_attributeutils-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_extract_featurevecs-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_parse_string-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_io-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_kitchen-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_labeling-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_load_save-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_overlapblocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_projection-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_rulebased_blocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_sampling-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 
| "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_trtst_split-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Adding Features to Feature Table-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Combining Multiple Blockers-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Debugging Blocker Output-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Evaluating the Selected Matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Generating Features for Blocking Manually-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Blackbox Blocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Built-In Blockers (Attr. 
Equivalence Blocker)-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Built-In Blockers (Overlap Blocker)-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Rule-Based Blocking-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Sampling and Labeling-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Selecting the Best Learning Matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/end_to_end_em_guides/.ipynb_checkpoints/Basic EM Workflow DBLP ACM-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/end_to_end_em_guides/.ipynb_checkpoints/Basic EM Workflow-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/end_to_end_em_guides/helper_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import py_entitymatching as em 3 | def get_missing_rows_in_candset(C, L, c_keys, l_keys): 4 | """ 5 | Example usage: 6 | get_missing_rows_in_candset(C, L1, ['ltable_id', 'rtable_id'], ['fodors_id', 'zagats_id']) 7 | L1 is labeled data (with more attrs) 8 | """ 9 | C1 = C[c_keys] 10 | L1 = L[l_keys] 11 | 12 | d = dict() 13 | for t in C1.itertuples(index=False): 14 | d[(t[0], t[1])] = 1 15 | 16 | missing_tuples_in_C = [] 17 | for t in L1.itertuples(index=False): 18 | if (t[0], t[1]) not in d: 19 | missing_tuples_in_C.append(t) 20 | 21 | series_list = [] 22 | 23 | for t in missing_tuples_in_C: 24 | series_list.append((L[(L[l_keys[0]] == t[0]) & (L[l_keys[1]] == t[1])])) 25 | 26 | if len(series_list) == 0: 27 | print('There are no missing tuples') 28 | else: 29 | return pd.concat(series_list) 30 | 31 | def get_sampled_n_labeled_data(C, L, c_keys, l_keys, n, label_col, random_state=0): 32 | """ 33 | Example usage: 34 | get_sampled_n_labeled_data(C, 
L, ['ltable_id', 'rtable_id'], ['fodors_id', 'zagats_id'], 450, 'gold', random_state=0) 35 | L is labeled data (with more attrs) 36 | """ 37 | # C1 = C[c_keys] 38 | # L1 = L[l_keys] 39 | 40 | d = dict() 41 | for t in L[l_keys].itertuples(index=False): 42 | d[(t[0], t[1])] = 1 43 | 44 | diff_tuples_in_C = [] 45 | for t in C[c_keys].itertuples(index=False): 46 | if (t[0], t[1]) not in d: 47 | diff_tuples_in_C.append(t) 48 | 49 | series_list = [] 50 | 51 | for t in diff_tuples_in_C: 52 | series_list.append((C[(C[c_keys[0]] == t[0]) & (C[c_keys[1]] == t[1])])) 53 | 54 | if len(series_list) == 0: 55 | print('There are no diff tuples in C'); return None  # nothing to sample negatives from 56 | else: 57 | neg_tuples = pd.concat(series_list) 58 | 59 | # pos_tuples 60 | pos_tuples_in_C = [] 61 | for t in C[c_keys].itertuples(index=False): 62 | if (t[0], t[1]) in d: 63 | pos_tuples_in_C.append(t) 64 | 65 | 66 | series_list = [] 67 | 68 | for t in pos_tuples_in_C: 69 | series_list.append((C[(C[c_keys[0]] == t[0]) & (C[c_keys[1]] == t[1])])) 70 | 71 | if len(series_list) == 0: 72 | print('There are no pos tuples in C'); return None  # nothing to label as positive 73 | else: 74 | pos_tuples = pd.concat(series_list) 75 | 76 | neg_tuples = neg_tuples.sample(n-len(pos_tuples), random_state=random_state) 77 | 78 | pos_tuples[label_col] = 1 79 | neg_tuples[label_col] = 0 80 | concat_df = pd.concat([pos_tuples, neg_tuples], ignore_index=True) 81 | concat_df = concat_df.sample(frac=1).reset_index(drop=True) 82 | em.copy_properties(C, concat_df) 83 | return concat_df 84 | -------------------------------------------------------------------------------- /notebooks/guides/step_wise_em_guides/.ipynb_checkpoints/Performing Matching with a Rule-Based Matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/step_wise_em_guides/.ipynb_checkpoints/Using Match Triggers to Improve Results-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/vldb_demo/README: -------------------------------------------------------------------------------- 1 | 1. Prerequisites: Python 2.7 or Python 3.4+ 2 | 3 | 2. First, install the following packages: 4 | - py_entitymatching 5 | - seaborn 6 | - Jupyter notebook 7 | You can install the packages using conda like this: 8 | $ conda install py_entitymatching -c uwmagellan 9 | $ conda install seaborn jupyter 10 | 11 | 3. Next, launch the Jupyter notebook from the current directory and open demo.ipynb 12 | NOTE: demo.ipynb assumes that the data files (*.csv) and helper scripts (profiler.py) are present in the same directory (as demo.ipynb), 13 | so do not remove those files from the directory. 14 | 15 | 4. Now you can run the cells in demo.ipynb to recreate the demo scenario.
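16 | 17 | NOTE: For reference, a first cell of demo.ipynb might look like the following sketch; the 18 | CSV file name ('table_A.csv'), key column ('ID') and attribute ('name') are placeholders 19 | for the data files shipped in this directory: 20 | 21 | import py_entitymatching as em 22 | from profiler import profile_table 23 | A = em.read_csv_metadata('table_A.csv', key='ID')  # placeholder file name and key 24 | profile_table(A, 'name', plot=False)               # profile one attribute without plotting 25 |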
16 | -------------------------------------------------------------------------------- /notebooks/vldb_demo/profiler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | 5 | # def profile_table(df, attribute, plot=True): 6 | # out_df = pd.DataFrame(columns=['Property', 'Value']) 7 | # unique_values = pd.unique(df[attribute]) 8 | # num_missing = sum(pd.isnull(df[attribute])) 9 | # 10 | # if not plot: 11 | # out_df.set_value(0, 'Property', 'Num. Missing Values') 12 | # out_df.set_value(0, 'Value', num_missing) 13 | # out_df.set_value(1, 'Property', 'Num. Unique Values') 14 | # out_df.set_value(1, 'Value', len(unique_values)) 15 | # out_df.set_value(2, 'Property', 'List of Unique Values') 16 | # out_df.set_value(2, 'Value', sorted(list(unique_values))) 17 | # return out_df 18 | # else: 19 | # print('Number of unique values: %d' % len(unique_values)) 20 | # print('Number of missing values: %d' % num_missing) 21 | # print('\nUnique values: ') 22 | # print(sorted(list(unique_values))) 23 | # print('\nFrequency plot:\n') 24 | # 25 | # d = (pd.DataFrame(df[attribute].value_counts())) 26 | # d.sort_index(inplace=True) 27 | # ax = sns.barplot(x="index", y=attribute, data=( 28 | # pd.DataFrame(df[attribute].value_counts())).reset_index()) 29 | # ax.set(xlabel=attribute, ylabel='count') 30 | # ax.grid(b=True, which='major', color='w', linewidth=1.0) 31 | # ax.set_xticklabels(labels=d.index.values, rotation=90) 32 | # plt.show() 33 | 34 | 35 | def profile_table(df, attribute, plot=True): 36 | 37 | unique_values = pd.unique(df[attribute]) 38 | num_missing = sum(pd.isnull(df[attribute])) 39 | 40 | if not plot: 41 | return pd.DataFrame({'Property':['Num. Missing Values', 'Num. 
Unique Values', 'List of Unique Values'], 42 | 'Value':[num_missing, len(unique_values), sorted(list(unique_values))]}) 43 | else: 44 | print('Number of unique values: %d\nNumber of missing values: ' 45 | '%d\n\nUnique values:' % (len(unique_values), num_missing)) 46 | print(sorted(list(unique_values))) 47 | print('\nFrequency plot:\n') 48 | d = (pd.DataFrame(df[attribute].value_counts())) 49 | ax = sns.barplot(x="index", y=attribute, data=(d).reset_index()) 50 | ax.set(xlabel=attribute, ylabel='count') 51 | ax.grid(b=True, which='major', color='w', linewidth=1.0) 52 | ax.set_xticklabels(labels=d.sort_index().index.values, rotation=90) 53 | plt.show() 54 | -------------------------------------------------------------------------------- /py_entitymatching/blocker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/blocker/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/blockercombiner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/blockercombiner/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/catalog/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /py_entitymatching/dask/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/dask/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_dtmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Decision Tree learning-based matcher. 3 | """ 4 | # from py_entitymatching.matcher.mlmatcher import MLMatcher 5 | import logging 6 | 7 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 8 | from py_entitymatching.matcher.matcherutils import get_ts 9 | 10 | logger = logging.getLogger(__name__) 11 | from sklearn.tree import DecisionTreeClassifier 12 | 13 | 14 | class DaskDTMatcher(DaskMLMatcher): 15 | """ 16 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 17 | 18 | Decision Tree matcher. 19 | 20 | Args: 21 | *args,**kwargs: The arguments to scikit-learn's Decision Tree 22 | classifier. 23 | name (string): The name of this matcher (defaults to None). If the 24 | matcher name is None, the class automatically generates a string 25 | and assigns it as the name. 26 | 27 | 28 | """ 29 | 30 | def __init__(self, *args, **kwargs): 31 | logger.warning( 32 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 33 | 34 | super(DaskDTMatcher, self).__init__() 35 | # If the name is given, then pop it 36 | name = kwargs.pop('name', None) 37 | if name is None: 38 | # If the name of the matcher is not given, then create one. 39 | # Currently, we use a constant string + a random number.
40 | self.name = 'DecisionTree' + '_' + get_ts() 41 | else: 42 | # Set the name of the matcher, with the given name. 43 | self.name = name 44 | # Set the classifier to the scikit-learn classifier. 45 | self.clf = DecisionTreeClassifier(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_logregmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Logistic Regression classifier. 3 | """ 4 | import logging 5 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 6 | from sklearn.linear_model import LogisticRegression 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | logger = logging.getLogger(__name__) 9 | 10 | class DaskLogRegMatcher(DaskMLMatcher): 11 | """ 12 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 13 | 14 | Logistic Regression matcher. 15 | 16 | Args: 17 | *args,**kwargs: The arguments to scikit-learn's Logistic Regression 18 | classifier. 19 | name (string): The name of this matcher (defaults to None). If the 20 | matcher name is None, the class automatically generates a string 21 | and assigns it as the name. 22 | 23 | 24 | """ 25 | 26 | def __init__(self, *args, **kwargs): 27 | logger.warning( 28 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 29 | 30 | # If the name is given, then pop it 31 | name = kwargs.pop('name', None) 32 | if name is None: 33 | # If the name of the matcher is not given, then create one. 34 | # Currently, we use a constant string + a random number. 35 | self.name = 'LogisticRegression' + '_' + get_ts() 36 | else: 37 | # Set the name of the matcher, with the given name. 38 | self.name = name 39 | super(DaskLogRegMatcher, self).__init__() 40 | # Set the classifier to the scikit-learn classifier. 41 | self.clf = LogisticRegression(*args, **kwargs) 42 | self.clf.classes_ = [0, 1] -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_nbmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Naive Bayes classifier. 3 | """ 4 | 5 | import logging 6 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | 9 | from sklearn.naive_bayes import GaussianNB 10 | logger = logging.getLogger(__name__) 11 | class DaskNBMatcher(DaskMLMatcher): 12 | """ 13 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 14 | 15 | Naive Bayes matcher. 16 | 17 | Args: 18 | *args,**kwargs: The arguments to scikit-learn's Naive Bayes 19 | classifier. 20 | 21 | name (string): The name of this matcher (defaults to None). If the 22 | matcher name is None, the class automatically generates a string 23 | and assigns it as the name. 24 | 25 | 26 | """ 27 | def __init__(self, *args, **kwargs): 28 | logger.warning( 29 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 30 | # If the name is given, then pop it 31 | name = kwargs.pop('name', None) 32 | if name is None: 33 | # If the name of the matcher is not given, then create one. 34 | # Currently, we use a constant string + a random number. 35 | self.name = 'NaiveBayes'+ '_' + get_ts() 36 | else: 37 | # Set the name of the matcher, with the given name.
38 | self.name = name 39 | super(DaskNBMatcher, self).__init__() 40 | # Set the classifier to the scikit-learn classifier. 41 | self.clf = GaussianNB(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_rfmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Random Forest classifier. 3 | """ 4 | 5 | import logging 6 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | 9 | from sklearn.ensemble import RandomForestClassifier 10 | logger = logging.getLogger(__name__) 11 | 12 | class DaskRFMatcher(DaskMLMatcher): 13 | """ 14 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 15 | 16 | Random Forest matcher. 17 | 18 | Args: 19 | *args,**kwargs: The arguments to scikit-learn's Random Forest 20 | classifier. 21 | 22 | name (string): The name of this matcher (defaults to None). If the 23 | matcher name is None, the class automatically generates a string 24 | and assigns it as the name. 25 | 26 | 27 | """ 28 | 29 | def __init__(self, *args, **kwargs): 30 | logger.warning( 31 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 32 | 33 | super(DaskRFMatcher, self).__init__() 34 | # If the name is given, then pop it 35 | name = kwargs.pop('name', None) 36 | if name is None: 37 | # If the name of the matcher is not given, then create one. 38 | # Currently, we use a constant string + a random number. 39 | self.name = 'RandomForest' + '_' + get_ts() 40 | else: 41 | # Set the name of the matcher, with the given name. 42 | self.name = name 43 | # Set the classifier to the scikit-learn classifier. 44 | self.clf = RandomForestClassifier(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_svm_matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for SVM classifier. 3 | 4 | """ 5 | import logging 6 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | 9 | from sklearn.svm import SVC 10 | logger = logging.getLogger(__name__) 11 | 12 | class DaskSVMMatcher(DaskMLMatcher): 13 | """ 14 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 15 | 16 | SVM matcher. 17 | 18 | Args: 19 | *args,**kwargs: The arguments to scikit-learn's SVM 20 | classifier. 21 | name (string): The name of this matcher (defaults to None). If the 22 | matcher name is None, the class automatically generates a string 23 | and assigns it as the name. 24 | 25 | 26 | """ 27 | 28 | def __init__(self, *args, **kwargs): 29 | logger.warning( 30 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 31 | 32 | super(DaskSVMMatcher, self).__init__() 33 | # If the name is given, then pop it 34 | name = kwargs.pop('name', None) 35 | if name is None: 36 | # If the name of the matcher is not given, then create one. 37 | # Currently, we use a constant string + a random number. 38 | self.name = 'SVM' + '_' + get_ts() 39 | else: 40 | # Set the name of the matcher, with the given name. 41 | self.name = name 42 | # Set the classifier to the scikit-learn classifier.
43 | self.clf = SVC(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_xgboost_matcher.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 3 | from py_entitymatching.matcher.matcherutils import get_ts 4 | 5 | logger = logging.getLogger(__name__) 6 | try: 7 | from xgboost.sklearn import XGBClassifier 8 | except ImportError: 9 | raise ImportError('Check if xgboost library is installed. You can install xgboost ' 10 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 11 | 12 | 13 | class DaskXGBoostMatcher(DaskMLMatcher): 14 | """ 15 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 16 | 17 | XGBoost matcher. 18 | 19 | Args: 20 | *args,**kwargs: The arguments to XGBoost 21 | classifier. 22 | name (string): The name of this matcher (defaults to None). If the 23 | matcher name is None, the class automatically generates a string 24 | and assigns it as the name. 25 | 26 | 27 | """ 28 | 29 | def __init__(self, *args, **kwargs): 30 | logger.warning( 31 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 32 | 33 | super(DaskXGBoostMatcher, self).__init__() 34 | # If the name is given, then pop it 35 | name = kwargs.pop('name', None) 36 | if name is None: 37 | # If the name of the matcher is not given, then create one. 38 | # Currently, we use a constant string + a random number. 39 | self.name = 'xgboost' + '_' + get_ts() 40 | else: 41 | # Set the name of the matcher, with the given name. 42 | self.name = name 43 | # Set the classifier to the scikit-learn classifier. 44 | try: 45 | from xgboost.sklearn import XGBClassifier 46 | except ImportError: 47 | raise ImportError( 48 | 'Check if xgboost library is installed.
You can install xgboost ' 49 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 50 | self.clf = XGBClassifier(*args, **kwargs) 51 | -------------------------------------------------------------------------------- /py_entitymatching/dask/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | 3 | def validate_chunks(n): 4 | if n == 0: 5 | raise AssertionError('The number of chunks cannot be 0 ') 6 | elif n <= -2: 7 | raise AssertionError('The number of chunks should be -1 or > 0') 8 | 9 | def get_num_partitions(given_partitions, n): 10 | if given_partitions == -1: 11 | return multiprocessing.cpu_count() 12 | elif given_partitions > n: 13 | return n 14 | else: 15 | return given_partitions 16 | 17 | def get_num_cores(): 18 | return multiprocessing.cpu_count() 19 | 20 | def wrap(object): 21 | return object -------------------------------------------------------------------------------- /py_entitymatching/datasets/acm_demo.metadata: -------------------------------------------------------------------------------- 1 | #key=id 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/acm_demo.metadata: -------------------------------------------------------------------------------- 1 | #key=id 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/dblp_demo.metadata: -------------------------------------------------------------------------------- 1 | #key=id 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/profiler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | 5 | # def profile_table(df, attribute, plot=True): 6 | # out_df = pd.DataFrame(columns=['Property', 'Value']) 7 | # unique_values = pd.unique(df[attribute]) 8 | # num_missing = sum(pd.isnull(df[attribute])) 9 | # 10 | # if not plot: 11 | # out_df.set_value(0, 'Property', 'Num. Missing Values') 12 | # out_df.set_value(0, 'Value', num_missing) 13 | # out_df.set_value(1, 'Property', 'Num. Unique Values') 14 | # out_df.set_value(1, 'Value', len(unique_values)) 15 | # out_df.set_value(2, 'Property', 'List of Unique Values') 16 | # out_df.set_value(2, 'Value', sorted(list(unique_values))) 17 | # return out_df 18 | # else: 19 | # print('Number of unique values: %d' % len(unique_values)) 20 | # print('Number of missing values: %d' % num_missing) 21 | # print('\nUnique values: ') 22 | # print(sorted(list(unique_values))) 23 | # print('\nFrequency plot:\n') 24 | # 25 | # d = (pd.DataFrame(df[attribute].value_counts())) 26 | # d.sort_index(inplace=True) 27 | # ax = sns.barplot(x="index", y=attribute, data=( 28 | # pd.DataFrame(df[attribute].value_counts())).reset_index()) 29 | # ax.set(xlabel=attribute, ylabel='count') 30 | # ax.grid(b=True, which='major', color='w', linewidth=1.0) 31 | # ax.set_xticklabels(labels=d.index.values, rotation=90) 32 | # plt.show() 33 | 34 | 35 | def profile_table(df, attribute, plot=True): 36 | 37 | unique_values = pd.unique(df[attribute]) 38 | num_missing = sum(pd.isnull(df[attribute])) 39 | 40 | if not plot: 41 | return pd.DataFrame({'Property':['Num. Missing Values', 'Num. 
Unique Values', 'List of Unique Values'], 42 | 'Value':[num_missing, len(unique_values), sorted(list(unique_values))]}) 43 | else: 44 | print('Number of unique values: %d\nNumber of missing values: ' 45 | '%d\n\nUnique values:' % (len(unique_values), num_missing)) 46 | print(sorted(list(unique_values))) 47 | print('\nFrequency plot:\n') 48 | d = (pd.DataFrame(df[attribute].value_counts())) 49 | ax = sns.barplot(x="index", y=attribute, data=(d).reset_index()) 50 | ax.set(xlabel=attribute, ylabel='count') 51 | ax.grid(b=True, which='major', color='w', linewidth=1.0) 52 | ax.set_xticklabels(labels=d.sort_index().index.values, rotation=90) 53 | plt.show() 54 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/restaurants/lbl_restnt_wf1.metadata: -------------------------------------------------------------------------------- 1 | #rtable=POINTER 2 | #fk_ltable=ltable_id 3 | #ltable=POINTER 4 | #key=_id 5 | #fk_rtable=rtable_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/restaurants/match_fodors_zagats_more_attrs.metadata: -------------------------------------------------------------------------------- 1 | #key=_id 2 | #ltable=POINTER 3 | #rtable=POINTER 4 | #fk_rtable=zagats_id 5 | #fk_ltable=fodors_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/restaurants/matches_fodors_zagats.csv: -------------------------------------------------------------------------------- 1 | fodors_id,zagats_id 2 | 534,219 3 | 535,220 4 | 536,221 5 | 537,222 6 | 538,223 7 | 539,224 8 | 540,225 9 | 541,226 10 | 542,227 11 | 543,228 12 | 544,229 13 | 545,230 14 | 546,231 15 | 547,232 16 | 548,233 17 | 549,234 18 | 550,235 19 | 551,236 20 | 552,237 21 | 553,238 22 | 554,239 23 | 555,240 24 | 556,241 25 | 557,242 26 | 558,243 27 | 559,244 28 | 560,245 29 | 561,246 30 | 562,247 31 | 563,248 32 | 564,249 33 | 565,250 34 | 566,251 35 | 567,252 36 | 568,253 37 | 569,254 38 | 570,255 39 | 571,256 40 | 572,257 41 | 573,258 42 | 574,259 43 | 575,260 44 | 576,261 45 | 577,262 46 | 578,263 47 | 579,264 48 | 580,265 49 | 581,266 50 | 582,267 51 | 583,268 52 | 584,269 53 | 585,270 54 | 586,271 55 | 587,272 56 | 588,273 57 | 589,274 58 | 590,275 59 | 591,276 60 | 592,277 61 | 593,278 62 | 594,279 63 | 595,280 64 | 596,281 65 | 597,282 66 | 598,283 67 | 599,284 68 | 600,285 69 | 601,286 70 | 602,287 71 | 603,288 72 | 604,289 73 | 605,290 74 | 606,291 75 | 607,292 76 | 608,293 77 | 609,294 78 | 610,295 79 | 611,296 80 | 612,297 81 | 613,298 82 | 614,299 83 | 615,300 84 | 616,301 85 | 617,302 86 | 618,303 87 | 619,304 88 | 620,305 89 | 621,306 90 | 622,307 91 | 623,308 92 | 624,309 93 | 625,310 94 | 626,311 95 | 627,312 96 | 628,313 97 | 629,314 98 | 630,315 99 | 631,316 100 | 632,317 101 | 633,318 102 | 634,319 103 | 635,320 104 | 636,321 105 | 637,322 106 | 638,323 107 | 639,324 108 | 640,325 109 | 641,326 110 | 642,327 111 | 643,328 112 | 644,329 113 | 645,330 114 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_A.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San 
Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_B.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | b1,Mark Levene,1987,29.5,"108 Clement St, San Francisco",94107 3 | b2,Bill Bridge,1986,32,"3131 Webster St, San Francisco",94107 4 | b3,Mike Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 5 | b4,Joseph Kuan,1982,26,"108 South Park, San Francisco",94122 6 | b5,Alfons Kemper,1984,35,"170 Post St, Apt 4, San Francisco",94122 7 | b6,Michael Brodie,1987,32.5,"133 Clement Street, San Francisco",94107 -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_B.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/GenerateRecomLists.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_GENERATERECOMLISTS_H 2 | #define TEST_GENERATERECOMLISTS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "TopkHeader.h" 12 | using namespace std; 13 | 14 | typedef map > CandSet; 15 | typedef vector > Table; 16 | typedef map, int> TopkRankList; 17 | 18 | double double_max(const double a, double b); 19 | 20 | class RecPair { 21 | public: 22 | int l_rec, r_rec, rank; 23 | RecPair(int l_rec, int r_rec, int rank) : l_rec(l_rec), r_rec(r_rec), rank(rank){ 24 | 25 | } 26 | }; 27 | class GenerateRecomLists { 28 | public: 29 | 30 | Table generate_config(const vector& field_list, const vector& ltoken_sum_vector, 31 | const vector& rtoken_sum_vector, const double field_remove_ratio, 32 | const unsigned int ltable_size, const unsigned int rtable_size); 33 | Table sort_config(Table& config_lists); 34 | 35 | TopkRankList generate_topk_with_config(vector& config, Table& ltoken_vector, Table& rtoken_vector, 36 | Table& lindex_vector, Table& rindex_vector, 37 | CandSet& cand_set, unsigned int output_size); 38 | 39 | vector generate_recom_lists(Table& ltoken_vector, Table& rtoken_vector, 40 | Table& lindex_vector, Table& rindex_vector, 41 | vector& ltoken_sum_vector, vector& rtoken_sum_vector, vector& field_list, 42 | CandSet& cand_set, double field_remove_ratio, 43 | unsigned int output_size); 44 | 45 | vector merge_topk_lists(vector& rec_lists); 46 | GenerateRecomLists(); 47 | ~GenerateRecomLists(); 48 | }; 49 | 50 | 51 | #endif //TEST_GENERATERECOMLISTS_H 52 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/PrefixEvent.cpp: -------------------------------------------------------------------------------- 1 | #include "PrefixEvent.h" 2 | 3 | PrefixEvent::PrefixEvent() { } 4 | 5 | PrefixEvent::PrefixEvent(double thres, int indicator, int rec, int tok) { 6 | threshold = thres; 7 | table_indicator = indicator; 8 | rec_idx = rec; 9 | tok_idx = tok; 10 | } 11 | 12 | 
PrefixEvent::~PrefixEvent() { } 13 | 14 | bool PrefixEvent::operator<(const PrefixEvent &other) const 15 | { 16 | return threshold < other.threshold; 17 | } 18 | 19 | bool PrefixEvent::operator>(const PrefixEvent &other) const 20 | { 21 | return threshold > other.threshold; 22 | } 23 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/PrefixEvent.h: -------------------------------------------------------------------------------- 1 | class PrefixEvent { 2 | public: 3 | double threshold; 4 | int table_indicator; 5 | int rec_idx; 6 | int tok_idx; 7 | 8 | PrefixEvent(); 9 | PrefixEvent(double threshold, int table_indicator, int rec_idx, int tok_idx); 10 | ~PrefixEvent(); 11 | 12 | bool operator<(const PrefixEvent &other) const; 13 | bool operator>(const PrefixEvent &other) const; 14 | }; 15 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/TopPair.cpp: -------------------------------------------------------------------------------- 1 | #include "TopPair.h" 2 | 3 | TopPair::TopPair() { } 4 | 5 | TopPair::TopPair(double similarity, int l_rec_idx, int r_rec_idx) { 6 | sim = similarity; 7 | l_rec = l_rec_idx; 8 | r_rec = r_rec_idx; 9 | } 10 | 11 | TopPair::~TopPair() { } 12 | 13 | bool TopPair::operator<(const TopPair &other) const 14 | { 15 | return sim > other.sim; 16 | } 17 | 18 | bool TopPair::operator>(const TopPair &other) const 19 | { 20 | return sim < other.sim; 21 | } 22 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/TopPair.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_TOPPAIR_H 2 | #define TEST_TOPPAIR_H 3 | 4 | class TopPair { 5 | public: 6 | double sim; 7 | int l_rec; 8 | int r_rec; 9 | 10 | TopPair(); 11 | TopPair(double similarity, int l_rec_idx, int r_rec_idx); 12 | ~TopPair(); 13 | 14 | bool operator<(const TopPair &other) const; 15 | bool operator>(const TopPair &other) const; 16 | }; 17 | 18 | 19 | #endif //TEST_TOPPAIR_H 20 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/TopkHeader.cpp: -------------------------------------------------------------------------------- 1 | #include "TopkHeader.h" 2 | 3 | 4 | void original_generate_prefix_events_impl(const Table& table, const int table_indicator, 5 | PrefixHeap& prefix_events) { 6 | for (unsigned int i = 0; i < table.size(); ++i) { 7 | unsigned long int length = table[i].size(); 8 | if (length > 0) { 9 | for (unsigned int j = 0; j < length; ++j) { 10 | prefix_events.push(PrefixEvent(1.0 - j * 1.0 / length, table_indicator, i, j)); 11 | } 12 | } 13 | } 14 | } 15 | 16 | void original_generate_prefix_events(const Table& ltable, const Table& rtable, 17 | PrefixHeap& prefix_events) { 18 | original_generate_prefix_events_impl(ltable, 0, prefix_events); 19 | original_generate_prefix_events_impl(rtable, 1, prefix_events); 20 | } 21 | 22 | 23 | int original_plain_get_overlap(const vector<int>& ltoken_list, const vector<int>& rtoken_list) { 24 | int overlap = 0; 25 | set<int> rset; 26 | 27 | for (unsigned int i = 0; i < rtoken_list.size(); ++i) { 28 | rset.insert(rtoken_list[i]); 29 | } 30 | 31 | for (unsigned int i = 0; i < ltoken_list.size(); ++i) { 32 | if (rset.count(ltoken_list[i])) { 33 | ++overlap; 34 | } 35 | } 36 | 37 | return overlap; 38 | } 39 | --------------------------------------------------------------------------------
/py_entitymatching/debugblocker/TopkHeader.h: -------------------------------------------------------------------------------- 1 | #ifndef __TOPKHEADER_H__ 2 | #define __TOPKHEADER_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "TopPair.h" 12 | #include "PrefixEvent.h" 13 | 14 | using namespace std; 15 | 16 | typedef priority_queue<TopPair> Heap; 17 | typedef map<int, set<int> > CandSet; 18 | typedef map<int, map<int, vector<int> > > InvertedIndex; 19 | typedef vector<vector<int> > Table; 20 | typedef priority_queue<PrefixEvent> PrefixHeap; 21 | 22 | 23 | Heap original_topk_sim_join_plain(const Table& ltoken_vector, const Table& rtoken_vector, 24 | CandSet& cand_set, const unsigned int output_size); 25 | 26 | 27 | int original_plain_get_overlap(const vector<int>& ltoken_list, const vector<int>& rtoken_list); 28 | 29 | 30 | void original_generate_prefix_events_impl(const Table& table, const int table_indicator, 31 | PrefixHeap& prefix_events); 32 | 33 | void original_generate_prefix_events(const Table& ltable, const Table& rtable, 34 | PrefixHeap& prefix_events); 35 | 36 | #endif //__TOPKHEADER_H__ 37 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/debugblocker/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/debugmatcher/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_entitymatching/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/evaluation/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/experimental/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/experimental/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/explorer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/explorer/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/explorer/openrefine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/explorer/openrefine/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/explorer/pandastable/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/explorer/pandastable/__init__.py --------------------------------------------------------------------------------
/py_entitymatching/explorer/pandastable/pandastable_wrapper.py: -------------------------------------------------------------------------------- 1 | try: 2 | from tkinter import * 3 | except ImportError as e: 4 | from Tkinter import * 5 | 6 | from py_entitymatching.utils.validation_helper import validate_object_type 7 | import pandas as pd 8 | 9 | 10 | def data_explore_pandastable(df): 11 | """ 12 | Wrapper function for pandastable. Gives user a GUI to examine and edit 13 | the dataframe passed in using pandastable. 14 | 15 | Args: 16 | df (Dataframe): The pandas dataframe to be explored with pandastable. 17 | 18 | Raises: 19 | AssertionError: If `df` is not of type pandas DataFrame. 20 | 21 | Examples: 22 | >>> import py_entitymatching as em 23 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 24 | >>> em.data_explore_pandastable(A) 25 | 26 | """ 27 | 28 | # Validate input parameters 29 | # # We expect the df to be of type pandas DataFrame 30 | validate_object_type(df, pd.DataFrame, 'Input df') 31 | DataExplorePandastable(df) 32 | 33 | 34 | class DataExplorePandastable(Frame): 35 | """ 36 | A wrapper for pandastable. 37 | """ 38 | 39 | def __init__(self, df): 40 | # Import 41 | try: 42 | from pandastable import Table, TableModel 43 | except ImportError: 44 | raise ImportError('Pandastable is not installed. Please install pandastable to use ' 45 | 'pandastable data exploration functions.') 46 | 47 | self.parent = None 48 | Frame.__init__(self) 49 | self.main = self.master 50 | self.main.geometry('600x400+200+100') 51 | self.main.title('Explore Data') 52 | f = Frame(self.main) 53 | f.pack(fill=BOTH, expand=1) 54 | # set the table in the GUI 55 | self.table = pt = Table(f, dataframe=df, 56 | showtoolbar=True, showstatusbar=True) 57 | pt.show() 58 | self.mainloop() 59 | 60 | -------------------------------------------------------------------------------- /py_entitymatching/feature/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/feature/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/gui/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /py_entitymatching/io/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import logging 3 | import os 4 | 5 | 6 | -------------------------------------------------------------------------------- /py_entitymatching/labeler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/labeler/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/matcher/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/dtmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Decision Tree learning-based matcher. 
3 | """ 4 | from py_entitymatching.matcher.mlmatcher import MLMatcher 5 | from py_entitymatching.matcher.matcherutils import get_ts 6 | 7 | from sklearn.tree import DecisionTreeClassifier 8 | 9 | class DTMatcher(MLMatcher): 10 | """ 11 | Decision Tree matcher. 12 | 13 | Args: 14 | *args,**kwargs: The arguments to scikit-learn's Decision Tree 15 | classifier. 16 | name (string): The name of this matcher (defaults to None). If the 17 | matcher name is None, the class automatically generates a string 18 | and assigns it as the name. 19 | Notes: 20 | For more details please see 21 | 22 | """ 23 | def __init__(self, *args, **kwargs): 24 | super(DTMatcher, self).__init__() 25 | # If the name is given, then pop it 26 | name = kwargs.pop('name', None) 27 | if name is None: 28 | # If the name of the matcher is give, then create one. 29 | # Currently, we use a constant string + a random number. 30 | self.name = 'DecisionTree' + '_' + get_ts() 31 | else: 32 | # Set the name of the matcher, with the given name. 33 | self.name = name 34 | # Set the classifier to the scikit-learn classifier. 35 | self.clf = DecisionTreeClassifier(*args, **kwargs) 36 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/ensemblematcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions for ensembe matcher. 3 | Note: This is not going to be there in the first version of py_entitymatching. 4 | """ 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import six 9 | 10 | from sklearn.base import BaseEstimator 11 | from sklearn.base import ClassifierMixin 12 | from sklearn.base import TransformerMixin 13 | from sklearn.base import clone 14 | from sklearn.pipeline import _name_estimators 15 | 16 | from py_entitymatching.matcher.mlmatcher import MLMatcher 17 | from py_entitymatching.matchercombiner.matchercombiner import MajorityVote, WeightedVote 18 | 19 | class EnsembleSKLearn(BaseEstimator, ClassifierMixin, TransformerMixin): 20 | def __init__(self, clfs, voting, weights=None, threshold=None): 21 | self.clfs = clfs 22 | self.named_clfs = {key:value for key,value in _name_estimators(clfs)} 23 | self.voting=voting 24 | if voting is 'weighted': 25 | self.combiner=WeightedVote(weights=weights, threshold=threshold) 26 | elif voting is 'majority': 27 | self.combiner=MajorityVote() 28 | else: 29 | raise AttributeError('Unrecognized voting method') 30 | 31 | def fit(self, X, y): 32 | self.clfs_ = [] 33 | for clf in self.clfs: 34 | fitted_clf = clone(clf).fit(X, y) 35 | self.clfs_.append(fitted_clf) 36 | return self 37 | 38 | def predict(self, X): 39 | return self._predict(X) 40 | 41 | def _predict(self, X): 42 | """ Collect results from clf.predict calls. 
""" 43 | predictions = np.asarray([clf.predict(X) for clf in self.clfs_]).T 44 | predicted_labels = self.combiner.combine(predictions) 45 | return predicted_labels 46 | 47 | def get_params(self, deep=True): 48 | """ Return estimator parameter names for GridSearch support""" 49 | if not deep: 50 | return super(EnsembleSKLearn, self).get_params(deep=False) 51 | else: 52 | out = self.named_clfs.copy() 53 | for name, step in six.iteritems(self.named_clfs): 54 | for key, value in six.iteritems(step.get_params(deep=True)): 55 | out['%s__%s' % (name, key)] = value 56 | return out 57 | 58 | class EnsembleMatcher(MLMatcher): 59 | def __init__(self, matchers, name=None, voting='weighted', weights=None, threshold=None): 60 | clfs = [m.clf for m in matchers] 61 | self.clf = EnsembleSKLearn(clfs, voting, weights, threshold) 62 | if name is None: 63 | names = [matcher.get_name() for matcher in matchers ] 64 | self.name = voting+':' 65 | self.name += ','.join(names) 66 | 67 | else: 68 | self.name = name 69 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/logregmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Logistic Regression classifier. 3 | """ 4 | from py_entitymatching.matcher.mlmatcher import MLMatcher 5 | from sklearn.linear_model import LogisticRegression 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | 8 | class LogRegMatcher(MLMatcher): 9 | """ 10 | Logistic Regression matcher. 11 | 12 | Args: 13 | *args,**kwargs: THe Arguments to scikit-learn's Logistic Regression 14 | classifier. 15 | name (string): The name of this matcher (defaults to None). If the 16 | matcher name is None, the class automatically generates a string 17 | and assigns it as the name. 18 | 19 | 20 | """ 21 | def __init__(self, *args, **kwargs): 22 | # If the name is given, then pop it 23 | name = kwargs.pop('name', None) 24 | if name is None: 25 | # If the name of the matcher is give, then create one. 26 | # Currently, we use a constant string + a random number. 27 | self.name = 'LogisticRegression'+ '_' + get_ts() 28 | else: 29 | # Set the name of the matcher, with the given name. 30 | self.name = name 31 | super(LogRegMatcher, self).__init__() 32 | # Set the classifier to the scikit-learn classifier. 33 | self.clf = LogisticRegression(*args, **kwargs) 34 | self.clf.classes_ = [0, 1] -------------------------------------------------------------------------------- /py_entitymatching/matcher/matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the definition for high level matcher class. 3 | """ 4 | class Matcher(object): 5 | pass 6 | 7 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/nbmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Naive Bayes classifier. 3 | """ 4 | 5 | from py_entitymatching.matcher.mlmatcher import MLMatcher 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | 8 | from sklearn.naive_bayes import GaussianNB 9 | 10 | class NBMatcher(MLMatcher): 11 | """ 12 | Naive Bayes matcher. 13 | 14 | Args: 15 | *args,**kwargs: The arguments to scikit-learn's Naive Bayes 16 | classifier. 17 | 18 | name (string): The name of this matcher (defaults to None). 
If the 19 | matcher name is None, the class automatically generates a string 20 | and assigns it as the name. 21 | 22 | 23 | """ 24 | def __init__(self, *args, **kwargs): 25 | # If the name is given, then pop it 26 | name = kwargs.pop('name', None) 27 | if name is None: 28 | # If the name of the matcher is not given, then create one. 29 | # Currently, we use a constant string + a timestamp-based suffix from get_ts. 30 | self.name = 'NaiveBayes' + '_' + get_ts() 31 | else: 32 | # Set the name of the matcher to the given name. 33 | self.name = name 34 | super(NBMatcher, self).__init__() 35 | # Set the classifier to the scikit-learn classifier. 36 | self.clf = GaussianNB(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/matcher/rfmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for the Random Forest classifier. 3 | """ 4 | 5 | from py_entitymatching.matcher.mlmatcher import MLMatcher 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | 8 | from sklearn.ensemble import RandomForestClassifier 9 | 10 | class RFMatcher(MLMatcher): 11 | """ 12 | Random Forest matcher. 13 | 14 | Args: 15 | *args,**kwargs: The arguments to scikit-learn's Random Forest 16 | classifier. 17 | 18 | name (string): The name of this matcher (defaults to None). If the 19 | matcher name is None, the class automatically generates a string 20 | and assigns it as the name. 21 | 22 | 23 | """ 24 | def __init__(self, *args, **kwargs): 25 | super(RFMatcher, self).__init__() 26 | # If the name is given, then pop it 27 | name = kwargs.pop('name', None) 28 | if name is None: 29 | # If the name of the matcher is not given, then create one. 30 | # Currently, we use a constant string + a timestamp-based suffix from get_ts. 31 | self.name = 'RandomForest' + '_' + get_ts() 32 | else: 33 | # Set the name of the matcher to the given name. 34 | self.name = name 35 | # Set the classifier to the scikit-learn classifier. 36 | self.clf = RandomForestClassifier(*args, **kwargs) 37 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/rulematcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions for the Rule Matcher. 3 | Note: This will not be included in the first version of py_entitymatching. 4 | """ 5 | from py_entitymatching.matcher.matcher import Matcher 6 | 7 | class RuleMatcher(Matcher): 8 | pass 9 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/svmmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for the SVM classifier. 3 | 4 | """ 5 | from py_entitymatching.matcher.mlmatcher import MLMatcher 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | from sklearn.svm import SVC 8 | 9 | 10 | class SVMMatcher(MLMatcher): 11 | """ 12 | SVM matcher. 13 | 14 | Args: 15 | *args,**kwargs: The arguments to scikit-learn's SVM 16 | classifier. 17 | name (string): The name of this matcher (defaults to None). If the 18 | matcher name is None, the class automatically generates a string 19 | and assigns it as the name.
20 | 21 | 22 | """ 23 | def __init__(self, *args, **kwargs): 24 | super(SVMMatcher, self).__init__() 25 | # If the name is given, then pop it 26 | name = kwargs.pop('name', None) 27 | if name is None: 28 | # If the name of the matcher is give, then create one. 29 | # Currently, we use a constant string + a random number. 30 | self.name = 'SVM'+ '_' + get_ts() 31 | else: 32 | # Set the name of the matcher, with the given name. 33 | self.name = name 34 | # Set the classifier to the scikit-learn classifier. 35 | self.clf = SVC(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/matcher/xgboostmatcher.py: -------------------------------------------------------------------------------- 1 | from py_entitymatching.matcher.mlmatcher import MLMatcher 2 | from py_entitymatching.matcher.matcherutils import get_ts 3 | # from sklearn.svm import SVC 4 | try: 5 | from xgboost.sklearn import XGBClassifier 6 | except ImportError: 7 | raise ImportError('Check if xgboost library is installed. You can install xgboost ' 8 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 9 | 10 | 11 | class XGBoostMatcher(MLMatcher): 12 | """ 13 | XGBoost matcher. 14 | 15 | Args: 16 | *args,**kwargs: The arguments to XGBoost 17 | classifier. 18 | name (string): The name of this matcher (defaults to None). If the 19 | matcher name is None, the class automatically generates a string 20 | and assigns it as the name. 21 | 22 | 23 | """ 24 | def __init__(self, *args, **kwargs): 25 | super(XGBoostMatcher, self).__init__() 26 | # If the name is given, then pop it 27 | name = kwargs.pop('name', None) 28 | if name is None: 29 | # If the name of the matcher is give, then create one. 30 | # Currently, we use a constant string + a random number. 31 | self.name = 'xgboost'+ '_' + get_ts() 32 | else: 33 | # Set the name of the matcher, with the given name. 34 | self.name = name 35 | # Set the classifier to the scikit-learn classifier. 36 | try: 37 | from xgboost.sklearn import XGBClassifier 38 | except ImportError: 39 | raise ImportError( 40 | 'Check if xgboost library is installed. You can install xgboost ' 41 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 42 | self.clf = XGBClassifier(*args, **kwargs) 43 | 44 | -------------------------------------------------------------------------------- /py_entitymatching/matchercombiner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/matchercombiner/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/matcherselector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/matcherselector/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/matcherselector/mlmatchercombinerselection.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions for ML-matcher combiner selection. 3 | Note: This is not going to be there for the first release of py_entitymatching. 
4 | """ 5 | 6 | import itertools 7 | import six 8 | 9 | from py_entitymatching.matcherselector.mlmatcherselection import select_matcher 10 | from py_entitymatching.matcher.ensemblematcher import EnsembleMatcher 11 | 12 | def selector_matcher_combiner(matchers, combiners, x=None, y=None, table=None, exclude_attrs=None, target_attr=None, 13 | weights=None, threshold=None, k=5): 14 | if not isinstance(matchers, list): 15 | matchers = [matchers] 16 | if not isinstance(combiners, list): 17 | combiners = [combiners] 18 | matcher_list = get_matcher_list(matchers, combiners, weights, threshold) 19 | return select_matcher(matcher_list, x=x, y=y, table=table, exclude_attrs=exclude_attrs, target_attr=target_attr, 20 | k=k) 21 | def get_matcher_list(matchers, combiners, weights, threshold): 22 | ensemble_len = range(2, len(matchers) + 1) 23 | matcher_list = [] 24 | matcher_list.extend(matchers) 25 | for l in ensemble_len: 26 | iter_combns = itertools.combinations(six.moves.xrange(0, 27 | len(matchers)), l) 28 | for ic in iter_combns: 29 | for c in combiners: 30 | m = [matchers[i] for i in ic] 31 | if c is 'Weighted': 32 | em = EnsembleMatcher(m, voting=c, weights=weights, threshold=threshold) 33 | else: 34 | em = EnsembleMatcher(m, voting=c) 35 | matcher_list.append(em) 36 | return matcher_list -------------------------------------------------------------------------------- /py_entitymatching/sampler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/sampler/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | #sys.path.append('/scratch/pradap/python-work/py_entitymatching') 3 | 4 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/A.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/B.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | b1,Mark Levene,1987,29.5,"108 Clement St, San Francisco",94107 3 | b2,Bill Bridge,1986,32,"3131 Webster St, San Francisco",94107 4 | b3,Mike Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 5 | b4,Joseph Kuan,1982,26,"108 South Park, San Francisco",94122 6 | b5,Alfons Kemper,1984,35,"170 Post St, Apt 4, San Francisco",94122 7 | b6,Michael Brodie,1987,32.5,"133 Clement Street, San Francisco",94107 -------------------------------------------------------------------------------- 
/py_entitymatching/tests/test_datasets/C.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 9 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 10 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 11 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 12 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 13 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 14 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 15 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 16 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/C.metadata: -------------------------------------------------------------------------------- 1 | #fk_ltable=ltable_ID 2 | #ltable=POINTER 3 | #key=_id 4 | #fk_rtable=rtable_ID 5 | #rtable=POINTER 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/C1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 9 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 10 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 11 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 12 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 13 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 14 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 15 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/C1.metadata: -------------------------------------------------------------------------------- 1 | #key=_id -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/D.csv: -------------------------------------------------------------------------------- 1 | _id,name,birth_year,hourly_wage,address,zipcode,label 2 | 1,Kevin Smith ,1989 ,30.0,"607 From St, San Francisco" ,94107 ,Yes 3 | 2,Michael Franklin,1988 ,27.5,"1652 Stockton St, San Francisco",94122 ,Not-Matched 4 | 3,William Bridge ,1986 ,32.0 ,"3131 Webster St, San Francisco" ,94107 ,Yes 5 | 4,Binto George ,1987 ,32.5 ,"423 Powell St, San Francisco" ,94122 ,Not-Matched 6 | 5,Alphonse Kemper ,1984 
,35.0 ,"1702 Post Street, San Francisco",94122 ,Yes 7 | 6,Kevin Smith ,1989 ,30.0 ,"607 From St, San Francisco" ,94107 ,Not-Matched 8 | 7,Michael Franklin,1988 ,27.5 ,"1652 Stockton St, San Francisco",94122 ,Yes 9 | 8,William Bridge ,1986 ,32.0 ,"3131 Webster St, San Francisco" ,94107 ,Not-Sure 10 | 9,Binto George ,1987 ,32.5 ,"423 Powell St, San Francisco" ,94122 ,Yes 11 | 10,Alphonse Kemper ,1984 ,35.0 ,"1702 Post Street, San Francisco",94122 ,Not-Labeled 12 | 11,Kevin Smith ,1989 ,30.0 ,"607 From St, San Francisco" ,94107 ,Yes 13 | 12,Michael Franklin,1988 ,27.5 ,"1652 Stockton St, San Francisco",94122 ,Not-Sure 14 | 13,William Bridge ,1986 ,32.0 ,"3131 Webster St, San Francisco" ,94107 ,Yes 15 | 14,Binto George ,1987 ,32.5 ,"423 Powell St, San Francisco" ,94122 ,Not-Sure -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/D.metadata: -------------------------------------------------------------------------------- 1 | #key=_id -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_A_wi_missing_vals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco", 3 | a2,Michael Franklin,,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,,1987,32.5,"423 Powell St, San Francisco", 6 | a5,Alphonse Kemper,1984,35,,94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_A_wi_missing_vals.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_B_wi_missing_vals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | b1,Mark Levene,1987,29.5,"108 Clement St, San Francisco",94107 3 | b2,Bill Bridge,1986,32,"3131 Webster St, San Francisco", 4 | b3,Mike Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 5 | b4,,1982,26,"108 South Park, San Francisco", 6 | b5,Alfons Kemper,1984,35,"170 Post St, Apt 4, San Francisco",94122 7 | b6,Michael Brodie,1987,32.5,,94107 8 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_B_wi_missing_vals.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 3 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 4 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 9 | 
10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 10 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 11 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 12 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 13 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 14 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1_ex_1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_address,rtable_name,rtable_address 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 5 | 6,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 6 | 7,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 7 | 8,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 8 | 3,a3,b1,William Bridge,"3131 Webster St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 9 | 4,a3,b2,William Bridge,"3131 Webster St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 10 | 5,a3,b6,William Bridge,"3131 Webster St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 11 | 9,a4,b3,Binto George,"423 Powell St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 12 | 10,a4,b4,Binto George,"423 Powell St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 13 | 11,a4,b5,Binto George,"423 Powell St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 14 | 12,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 15 | 13,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 16 | 14,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #fk_ltable=ltable_ID 2 | #ltable=POINTER 3 | #key=_id 4 | #fk_rtable=rtable_ID 5 | #rtable=POINTER 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph 
Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 9 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 10 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 11 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 12 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 13 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 14 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2_ex_1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_birth_year,ltable_zipcode,rtable_birth_year,rtable_zipcode 2 | 0,a2,b3,1988,94122,1988,94122 3 | 1,a3,b2,1986,94107,1986,94107 4 | 2,a4,b1,1987,94122,1987,94107 5 | 3,a4,b6,1987,94122,1987,94107 6 | 4,a5,b5,1984,94122,1984,94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 6 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 7 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 8 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 9 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 10 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 11 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 12 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 13 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 14 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3_ex_2.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_birth_year,ltable_zipcode,rtable_birth_year,rtable_zipcode 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3_ex_2.metadata: 
-------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_1.csv: -------------------------------------------------------------------------------- 1 | _id,l_ID,r_ID,l_name,l_address,r_name,r_address 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 5 | 3,a3,b1,William Bridge,"3131 Webster St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 6 | 4,a3,b2,William Bridge,"3131 Webster St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 7 | 5,a3,b6,William Bridge,"3131 Webster St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 8 | 6,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 9 | 7,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 10 | 8,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 11 | 9,a4,b3,Binto George,"423 Powell St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 12 | 10,a4,b4,Binto George,"423 Powell St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 13 | 11,a4,b5,Binto George,"423 Powell St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 14 | 12,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 15 | 13,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 16 | 14,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=r_ID 4 | #fk_ltable=l_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_2.csv: -------------------------------------------------------------------------------- 1 | _id,l_ID,r_ID,l_birth_year,l_zipcode,r_birth_year,r_zipcode 2 | 0,a2,b3,1988,94122,1988,94122 3 | 1,a3,b2,1986,94107,1986,94107 4 | 2,a4,b1,1987,94122,1987,94107 5 | 3,a4,b6,1987,94122,1987,94107 6 | 4,a5,b5,1984,94122,1984,94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_2.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=r_ID 4 | #fk_ltable=l_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_1.csv: -------------------------------------------------------------------------------- 1 | 
_id,ltable_ID,rtable_ID,ltable_name,ltable_address,ltable_birth_year,ltable_zipcode,rtable_name,rtable_address,rtable_birth_year,rtable_zipcode 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",1989,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",1989,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",1989,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 5 | 3,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 6 | 4,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 7 | 5,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 8 | 6,a3,b1,William Bridge,"3131 Webster St, San Francisco",1986,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 9 | 7,a3,b2,William Bridge,"3131 Webster St, San Francisco",1986,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 10 | 8,a3,b6,William Bridge,"3131 Webster St, San Francisco",1986,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 11 | 9,a4,b1,Binto George,"423 Powell St, San Francisco",1987,94122,Mark Levene,"108 Clement St, San Francisco",1987,94107 12 | 10,a4,b3,Binto George,"423 Powell St, San Francisco",1987,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 13 | 11,a4,b4,Binto George,"423 Powell St, San Francisco",1987,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 14 | 12,a4,b5,Binto George,"423 Powell St, San Francisco",1987,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 15 | 13,a4,b6,Binto George,"423 Powell St, San Francisco",1987,94122,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 16 | 14,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 17 | 15,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 18 | 16,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 19 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_2.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_address,ltable_birth_year,ltable_zipcode,rtable_name,rtable_address,rtable_birth_year,rtable_zipcode 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",1989,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",1989,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",1989,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 5 | 3,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 6 | 
4,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 7 | 5,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 8 | 6,a3,b1,William Bridge,"3131 Webster St, San Francisco",1986,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 9 | 7,a3,b2,William Bridge,"3131 Webster St, San Francisco",1986,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 10 | 8,a3,b6,William Bridge,"3131 Webster St, San Francisco",1986,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 11 | 9,a4,b3,Binto George,"423 Powell St, San Francisco",1987,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 12 | 10,a4,b4,Binto George,"423 Powell St, San Francisco",1987,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 13 | 11,a4,b5,Binto George,"423 Powell St, San Francisco",1987,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 14 | 12,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 15 | 13,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 16 | 14,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_2.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_4.csv: -------------------------------------------------------------------------------- 1 | _id,l_ID,r_ID,l_name,l_address,l_birth_year,l_zipcode,r_name,r_address,r_birth_year,r_zipcode 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",1989,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",1989,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",1989,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 5 | 3,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 6 | 4,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 7 | 5,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 8 | 6,a3,b1,William Bridge,"3131 Webster St, San Francisco",1986,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 9 | 7,a3,b2,William Bridge,"3131 Webster St, San Francisco",1986,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 10 | 8,a3,b6,William Bridge,"3131 Webster St, San Francisco",1986,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 11 | 9,a4,b1,Binto George,"423 Powell St, San Francisco",1987,94122,Mark Levene,"108 Clement St, San Francisco",1987,94107 12 | 10,a4,b3,Binto George,"423 Powell St, San Francisco",1987,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 13 | 11,a4,b4,Binto George,"423 Powell St, San 
Francisco",1987,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 14 | 12,a4,b5,Binto George,"423 Powell St, San Francisco",1987,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 15 | 13,a4,b6,Binto George,"423 Powell St, San Francisco",1987,94122,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 16 | 14,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 17 | 15,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 18 | 16,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 19 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_4.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=r_ID 4 | #fk_ltable=l_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A_dupid.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 7 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A_inv_fk.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A_mvals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | ,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_13.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- 
/py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_13_out.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_address,rtable_name,rtable_address 2 | 0,a2,b2,Michael Franklin,"1652 Stockton St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 3 | 1,a5,b6,Alphonse Kemper,"1702 Post Street, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 4 | 2,a2,b6,Michael Franklin,"1652 Stockton St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 5 | 3,a4,b2,Binto George,"423 Powell St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 6 | 4,a1,b3,Kevin Smith,"607 From St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 7 | 5,a4,b1,Binto George,"423 Powell St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 8 | 6,a2,b1,Michael Franklin,"1652 Stockton St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 9 | 7,a3,b3,William Bridge,"3131 Webster St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 10 | 8,a1,b5,Kevin Smith,"607 From St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 11 | 9,a3,b5,William Bridge,"3131 Webster St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 12 | 10,a1,b4,Kevin Smith,"607 From St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 13 | 11,a3,b4,William Bridge,"3131 Webster St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 14 | 12,a5,b2,Alphonse Kemper,"1702 Post Street, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 15 | 13,a4,b6,Binto George,"423 Powell St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 16 | 14,a5,b1,Alphonse Kemper,"1702 Post Street, San Francisco",Mark Levene,"108 Clement St, San Francisco" -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_cand.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_book_id 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_ltable.csv: -------------------------------------------------------------------------------- 1 | ID,title,author,publisher,price,desc,genre,year,lang 2 | 0,"intro to database","John Doe","ABC publisher",10.00,"introduction to database",,2010, 3 | 1,"data analysis","Jane Doe","BCD publisher",20.00,"introduction to data analysis",,2015,"ENG" 4 | 2,"Thinking in Java","Johnnie Doe",,10.00,"learn how to program in Java",,2000,"ENG" 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_rtable.csv: -------------------------------------------------------------------------------- 1 | book_id,book_title,author,publisher,price,pub_year,language,description,book_genre 2 | "B001","introduction to data analysis","John Doe","ABC publisher",10.00,2015,English,"introduction to data analysis", 3 | "B002","Thinking in C","Jane Doe","BCD publisher",15.00,1990,,"learn programming in C++", 4 | "B003","A brief history of time","Stephen Hawking",,20.00,1988,English,"from Big Bang to black holes", 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_get_tokenized_table_1.txt:
-------------------------------------------------------------------------------- 1 | a1 kevin smith 1989 30 607 from st, san francisco 94107 2 | a2 michael franklin 1988 28 1652 stockton st, san francisco 94122 3 | a3 william bridge 1986 32 3131 webster st, san francisco 94107 4 | a4 binto george 1987 32 423 powell st, san francisco 94122 5 | a5 alphonse kemper 1984 35 1702 post street, san francisco 94122 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_get_tokenized_table_2.txt: -------------------------------------------------------------------------------- 1 | b1 mark levene 30 2 | b2 bill bridge 32 3 | b3 mike franklin 28 4 | b4 joseph kuan 26 5 | b5 alfons kemper 35 6 | b6 michael brodie 32 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_topk_sim_join_1_A.txt: -------------------------------------------------------------------------------- 1 | a1 kevin smith 1989 30 607 from st, san francisco 94107 2 | a2 michael franklin 1988 28 1652 stockton st, san francisco 94122 3 | a3 william bridge 1986 32 3131 webster st, san francisco 94107 4 | a4 binto george 1987 32 423 powell st, san francisco 94122 5 | a5 alphonse kemper 1984 35 1702 post street, san francisco 94122 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_topk_sim_join_1_B.txt: -------------------------------------------------------------------------------- 1 | b1 mark levene 1987 30 108 clement st, san francisco 94107 2 | b2 bill bridge 1986 32 3131 webster st, san francisco 94107 3 | b3 mike franklin 1988 28 1652 stockton st, san francisco 94122 4 | b4 joseph kuan 1982 26 108 south park, san francisco 94122 5 | b5 alfons kemper 1984 35 170 post st, apt 4, san francisco 94122 6 | b6 michael brodie 1987 32 133 clement street, san francisco 94107 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_topk_sim_join_1_C.txt: -------------------------------------------------------------------------------- 1 | 0 0 2 | 0 1 3 | 0 5 4 | 1 2 5 | 1 3 6 | 1 4 7 | 2 0 8 | 2 1 9 | 2 5 10 | 3 2 11 | 3 3 12 | 3 4 13 | 4 2 14 | 4 3 15 | 4 4 16 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A.mdx: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_dupid.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San 
Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 7 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_key_zipcode.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_key_zipcode.metadata: -------------------------------------------------------------------------------- 1 | #key=zipcode 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_md_wrongformat.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_md_wrongformat.metadata: -------------------------------------------------------------------------------- 1 | #key=zipcode#=10 2 | %%10 3 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_mvals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | ,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/C_partialmeta.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 6 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 7 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 8 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 9 | 7,a2,b4,Michael 
Franklin,94122,1988,Joseph Kuan,94122,1982 10 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 11 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 12 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 13 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 14 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 15 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 16 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/C_partialmeta.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #key=_id 4 | #fk_rtable=rtable_ID 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata1.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata1.metadata: -------------------------------------------------------------------------------- 1 | #key1=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata2.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata2.metadata: -------------------------------------------------------------------------------- 1 | #key=ID1 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/expected_A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/expected_C.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/matcherselector/feat_vecs.metadata: -------------------------------------------------------------------------------- 1 | #key=_id 2 | #fk_ltable=ltable.id 3 | #ltable=POINTER 4 | #rtable=POINTER 5 | #fk_rtable=rtable.id 6 | -------------------------------------------------------------------------------- 
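Taken together, the .metadata sidecars above follow a single convention: #key names the key attribute, #fk_ltable/#fk_rtable name the foreign keys into the base tables, and #ltable/#rtable are POINTER placeholders that get resolved at load time from DataFrames supplied by the caller. A minimal sketch of that round trip, mirroring test_sampler_single_table.py further below:

import os
import py_entitymatching.catalog.catalog_manager as cm
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.sampler.single_table import sample_table
from py_entitymatching.utils.generic_helper import get_install_path

datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
A = read_csv_metadata(os.sep.join([datasets_path, 'A.csv']))
B = read_csv_metadata(os.sep.join([datasets_path, 'B.csv']), key='ID')
# The POINTER entries in C.metadata are replaced by the ltable/rtable
# arguments; fk_ltable/fk_rtable come straight from the sidecar.
C = read_csv_metadata(os.sep.join([datasets_path, 'C.csv']), ltable=A, rtable=B)
S = sample_table(C, 10, False)  # sample 10 pairs; C's metadata is copied to S
assert cm.get_fk_ltable(S) == cm.get_fk_ltable(C)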
/py_entitymatching/tests/test_datasets/sandbox/A.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/tests/test_datasets/sandbox/A.pkl -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/sandbox/A.pklmetadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/tests/test_datasets/sandbox/A.pklmetadata -------------------------------------------------------------------------------- /py_entitymatching/tests/test_feature_attributeutils.py: -------------------------------------------------------------------------------- 1 | import os 2 | # from nose.tools import * 3 | import unittest 4 | import pandas as pd 5 | import six 6 | from .utils import raises 7 | 8 | from py_entitymatching.utils.generic_helper import get_install_path 9 | from py_entitymatching.io.parsers import read_csv_metadata 10 | from py_entitymatching.feature.simfunctions import get_sim_funs_for_matching 11 | from py_entitymatching.feature.tokenizers import get_tokenizers_for_matching 12 | from py_entitymatching.feature.autofeaturegen import get_features_for_matching 13 | from py_entitymatching.feature.attributeutils import get_attr_corres, get_attr_types, _get_type, _len_handle_nan 14 | 15 | import py_entitymatching.catalog.catalog_manager as cm 16 | 17 | datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets']) 18 | path_a = os.sep.join([datasets_path, 'A.csv']) 19 | path_b = os.sep.join([datasets_path, 'B.csv']) 20 | 21 | 22 | class AttributeUtilsTestCases(unittest.TestCase): 23 | def test_get_attr_types_valid(self): 24 | A = read_csv_metadata(path_a) 25 | x = get_attr_types(A) 26 | 27 | @raises(AssertionError) 28 | def test_get_attr_types_invalid_df(self): 29 | x = get_attr_types(None) 30 | 31 | def test_get_attr_corres_valid_1(self): 32 | A = read_csv_metadata(path_a) 33 | B = read_csv_metadata(path_b, key='ID') 34 | ac = get_attr_corres(A, B) 35 | for c in ac['corres']: 36 | self.assertEqual(c[0], c[1]) 37 | 38 | self.assertEqual(all(ac['ltable'] == A), True) 39 | self.assertEqual(all(ac['rtable'] == B), True) 40 | 41 | def test_get_attr_corres_valid_2(self): 42 | A = read_csv_metadata(path_a) 43 | A['label'] = 0 44 | B = read_csv_metadata(path_b, key='ID') 45 | ac = get_attr_corres(A, B) 46 | for c in ac['corres']: 47 | self.assertEqual(c[0], c[1]) 48 | 49 | self.assertEqual(all(ac['ltable'] == A), True) 50 | self.assertEqual(all(ac['rtable'] == B), True) 51 | 52 | 53 | @raises(AssertionError) 54 | def test_get_attr_corres_invalid_df1(self): 55 | ac = get_attr_corres(None, pd.DataFrame()) 56 | 57 | @raises(AssertionError) 58 | def test_get_attr_corres_invalid_df2(self): 59 | ac = get_attr_corres(pd.DataFrame(), None) 60 | 61 | def test_get_type_valid(self): 62 | A = read_csv_metadata(path_a) 63 | t = _get_type(A['ID']) 64 | self.assertEqual(t, 'str_eq_1w') 65 | 66 | @raises(AssertionError) 67 | def test_get_type_invalid_series(self): 68 | _get_type(None) 69 | 70 | 71 | def test_get_type_empty_series(self): 72 | t = _get_type(pd.Series()) 73 | self.assertEqual(t, 'un_determined') 74 | 75 | @raises(AssertionError) 76 | def test_get_type_multiple_types(self): 77 | A = read_csv_metadata(path_a) 78 | A.loc[0, 'ID'] = 1000 79 | t = 
_get_type(A['ID']) 80 | 81 | def test_get_type_valid_2(self): 82 | A = read_csv_metadata(path_a) 83 | A['temp'] = True 84 | t = _get_type(A['temp']) 85 | self.assertEqual(t, 'boolean') 86 | 87 | def test_get_type_valid_3(self): 88 | A = read_csv_metadata(path_a) 89 | A['temp'] = "This is a very very very very very very very very very very very very very long string" 90 | t = _get_type(A['temp']) 91 | self.assertEqual(t, "str_gt_10w") 92 | 93 | def test_len_handle_nan_invalid(self): 94 | result = _len_handle_nan(None) 95 | self.assertEqual(pd.isnull(result), True) 96 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_feature_tokenizers.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | # from nose.tools import * 3 | import unittest 4 | import pandas as pd 5 | import numpy as np 6 | import six 7 | from .utils import raises 8 | 9 | import py_entitymatching.feature.tokenizers as tok 10 | 11 | class TokenizerTestCases(unittest.TestCase): 12 | def test_get_global_tokenizers(self): 13 | x = tok._global_tokenizers 14 | 15 | def test_get_tokenizers_for_blocking(self): 16 | x = tok.get_tokenizers_for_blocking() 17 | self.assertEqual(isinstance(x, dict), True) 18 | input = 'data science' 19 | for name, value in six.iteritems(x): 20 | self.assertEqual(isinstance(value(input), list), True) 21 | 22 | @raises(AssertionError) 23 | def test_get_tokenizers_for_blocking_invalid(self): 24 | tok.get_tokenizers_for_blocking(None, None) 25 | 26 | def test_get_tokenizers_for_matching(self): 27 | x = tok.get_tokenizers_for_matching() 28 | self.assertEqual(isinstance(x, dict), True) 29 | input = 'data science' 30 | for name, value in six.iteritems(x): 31 | self.assertEqual(isinstance(value(input), list), True) 32 | 33 | @raises(AssertionError) 34 | def test_get_tokenizers_for_matching_invalid(self): 35 | x = tok.get_tokenizers_for_matching(None, None) 36 | 37 | 38 | @raises(AssertionError) 39 | def test_get_single_arg_tokenizers_invalid_1(self): 40 | tok._get_single_arg_tokenizers(None, None) 41 | 42 | 43 | def test_get_single_arg_tokenizers_valid_2(self): 44 | tok._get_single_arg_tokenizers(q=3, dlm_char=' ') 45 | 46 | def test_get_single_arg_tokenizers_valid_3(self): 47 | tok._get_single_arg_tokenizers(q=[], dlm_char=[]) 48 | 49 | def test_get_single_arg_tokenizers_valid_4(self): 50 | tok._get_single_arg_tokenizers(q=None, dlm_char=[' ']) 51 | 52 | def test_get_single_arg_tokenizers_valid_5(self): 53 | tok._get_single_arg_tokenizers(q=3, dlm_char=None) 54 | 55 | def test_qgram_invalid(self): 56 | x = tok._make_tok_qgram(3) 57 | self.assertEqual(pd.isnull(x(np.NaN)), True) 58 | 59 | def test_qgram_delim(self): 60 | x = tok._make_tok_delim(' ') 61 | self.assertEqual(pd.isnull(x(np.NaN)), True) 62 | 63 | def test_tokqgram_valid(self): 64 | x = tok.tok_qgram('data science', 3) 65 | self.assertEqual(isinstance(x, list), True) 66 | 67 | def test_tokdelim_valid(self): 68 | x = tok.tok_delim('data science', ' ') 69 | self.assertEqual(isinstance(x, list), True) 70 | self.assertEqual(len(x), 2) 71 | 72 | def test_tokqgram_invalid(self): 73 | x = tok.tok_qgram(np.NaN, 3) 74 | self.assertEqual(pd.isnull(x), True) 75 | 76 | def test_tokdelim_invalid(self): 77 | x = tok.tok_delim(np.NaN, ' ') 78 | self.assertEqual(pd.isnull(x), True) 79 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_sampler_single_table.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | # from nose.tools import * 4 | import unittest 5 | import pandas as pd 6 | import six 7 | from .utils import raises 8 | 9 | from py_entitymatching.utils.generic_helper import get_install_path 10 | import py_entitymatching.catalog.catalog_manager as cm 11 | from py_entitymatching.io.parsers import read_csv_metadata 12 | from py_entitymatching.sampler.single_table import sample_table 13 | 14 | datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets']) 15 | path_a = os.sep.join([datasets_path, 'A.csv']) 16 | path_b = os.sep.join([datasets_path, 'B.csv']) 17 | path_c = os.sep.join([datasets_path, 'C.csv']) 18 | 19 | class SamplerSingleTableTestCases(unittest.TestCase): 20 | def test_sample_table_valid_1(self): 21 | A = read_csv_metadata(path_a) 22 | B = read_csv_metadata(path_b, key='ID') 23 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 24 | D = sample_table(C, 10, False) 25 | self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(D)) 26 | self.assertEqual(len(D), 10) 27 | 28 | def test_sample_table_valid_2(self): 29 | A = read_csv_metadata(path_a) 30 | B = read_csv_metadata(path_b, key='ID') 31 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 32 | D = sample_table(C, 10, True) 33 | self.assertEqual(id(cm.get_ltable(D)), id(cm.get_ltable(C))) 34 | self.assertEqual(id(cm.get_rtable(D)), id(cm.get_rtable(C))) 35 | self.assertEqual(cm.get_fk_ltable(D), cm.get_fk_ltable(C)) 36 | self.assertEqual(cm.get_fk_rtable(D), cm.get_fk_rtable(C)) 37 | self.assertEqual(len(D), 10) 38 | 39 | @raises(AssertionError) 40 | def test_sample_table_invalid_df(self): 41 | A = read_csv_metadata(path_a) 42 | B = read_csv_metadata(path_b, key='ID') 43 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 44 | D = sample_table(None, 10, True) 45 | # self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(D)) 46 | # self.assertEqual(len(D), 10) 47 | 48 | @raises(AssertionError) 49 | def test_sample_table_invalid_size(self): 50 | A = read_csv_metadata(path_a) 51 | B = read_csv_metadata(path_b, key='ID') 52 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 53 | D = sample_table(C, len(C)+1, True) 54 | 55 | @raises(AssertionError) 56 | def test_sample_table_invalid_df_sz0(self): 57 | # A = read_csv_metadata(path_a) 58 | # B = read_csv_metadata(path_b, key='ID') 59 | # C = read_csv_metadata(path_c, ltable=A, rtable=B) 60 | D = sample_table(pd.DataFrame(), 1, True) 61 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_validation_helper.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import unittest 4 | 5 | # from nose.tools import * 6 | import pandas as pd 7 | from .utils import raises 8 | 9 | from py_entitymatching.utils import validation_helper as vh 10 | 11 | 12 | class ValidationHelperTestCases(unittest.TestCase): 13 | def test_validate_object_type_with_valid_type(self): 14 | vh.validate_object_type('ABC', str) 15 | vh.validate_object_type(pd.DataFrame(), pd.DataFrame) 16 | vh.validate_object_type(list(), list) 17 | vh.validate_object_type(True, bool) 18 | vh.validate_object_type(123, int) 19 | vh.validate_object_type(dict(), dict) 20 | 21 | # Currently, can validate unexpected types 22 | class A(object): pass 23 | a = A() 24 | vh.validate_object_type(a, A) 25 | 26 | def test_validate_object_type_with_invalid_type(self): 27 | 
self.assertRaises(AssertionError, lambda: vh.validate_object_type('ABC', int)) 28 | self.assertRaises(AssertionError, lambda: vh.validate_object_type(123, str)) 29 | self.assertRaises(AssertionError, lambda: vh.validate_object_type(list(), dict)) 30 | self.assertRaises(AssertionError, lambda: vh.validate_object_type(dict(), list)) 31 | 32 | def test_validate_object_type_with_unexpected_type(self): 33 | class B(object): pass 34 | self.assertRaises(KeyError, lambda: vh.validate_object_type(123, B)) 35 | 36 | def test_validate_subclass_with_valid_class(self): 37 | class C(object): pass 38 | class D(C): pass 39 | class E(D): pass 40 | vh.validate_subclass(E, E) 41 | vh.validate_subclass(E, D) 42 | vh.validate_subclass(E, C) 43 | 44 | def test_validate_subclass_with_invalid_class(self): 45 | class F(object): pass 46 | class G(object): pass 47 | class H(G): pass 48 | self.assertRaises(AssertionError, lambda: vh.validate_subclass(G, F)) 49 | self.assertRaises(AssertionError, lambda: vh.validate_subclass(H, F)) 50 | -------------------------------------------------------------------------------- /py_entitymatching/tests/utils.py: -------------------------------------------------------------------------------- 1 | # Simplified knockoff of nose.tools.raises 2 | # Thanks to zware for writing this for py_stringmatching 3 | def raises(exc_type): 4 | def deco(f): 5 | def raises_wrapper(self): 6 | with self.assertRaises(exc_type): 7 | return f(self) 8 | return raises_wrapper 9 | return deco 10 | -------------------------------------------------------------------------------- /py_entitymatching/triggers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/triggers/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/tuner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/tuner/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # install path 2 | import os 3 | 4 | install_path = os.path.dirname(os.path.realpath(__file__)) 5 | -------------------------------------------------------------------------------- /py_entitymatching/utils/stop_words.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | across 5 | after 6 | afterwards 7 | again 8 | against 9 | all 10 | almost 11 | alone 12 | along 13 | already 14 | also 15 | although 16 | always 17 | am 18 | among 19 | amongst 20 | amoungst 21 | amount 22 | an 23 | and 24 | another 25 | any 26 | anyhow 27 | anyone 28 | anything 29 | anyway 30 | anywhere 31 | are 32 | around 33 | as 34 | at 35 | back 36 | be 37 | became 38 | because 39 | become 40 | becomes 41 | becoming 42 | been 43 | before 44 | beforehand 45 | behind 46 | being 47 | below 48 | beside 49 | besides 50 | between 51 | beyond 52 | bill 53 | both 54 | bottom 55 | but 56 | by 57 | call 58 | can 59 | cannot 60 | cant 61 | co 62 | con 63 | could 64 | couldnt 65 | cry 66 | de 67 | describe 68 | detail 69 | do 70 | done 71 | down 72 | due 73 | during 74 | each 75 | eg 76 | eight 77 | either 78 | eleven 
79 | else 80 | elsewhere 81 | empty 82 | enough 83 | etc 84 | even 85 | ever 86 | every 87 | everyone 88 | everything 89 | everywhere 90 | except 91 | few 92 | fifteen 93 | fify 94 | fill 95 | find 96 | fire 97 | first 98 | five 99 | for 100 | former 101 | formerly 102 | forty 103 | found 104 | four 105 | from 106 | front 107 | full 108 | further 109 | get 110 | give 111 | go 112 | had 113 | has 114 | hasnt 115 | have 116 | he 117 | hence 118 | her 119 | here 120 | hereafter 121 | hereby 122 | herein 123 | hereupon 124 | hers 125 | herself 126 | him 127 | himself 128 | his 129 | how 130 | however 131 | hundred 132 | ie 133 | if 134 | in 135 | inc 136 | indeed 137 | interest 138 | into 139 | is 140 | it 141 | its 142 | itself 143 | keep 144 | last 145 | latter 146 | latterly 147 | least 148 | less 149 | ltd 150 | made 151 | many 152 | may 153 | me 154 | meanwhile 155 | might 156 | mill 157 | mine 158 | more 159 | moreover 160 | most 161 | mostly 162 | move 163 | much 164 | must 165 | my 166 | myself 167 | name 168 | namely 169 | neither 170 | never 171 | nevertheless 172 | next 173 | nine 174 | no 175 | nobody 176 | none 177 | noone 178 | nor 179 | not 180 | nothing 181 | now 182 | nowhere 183 | of 184 | off 185 | often 186 | on 187 | once 188 | one 189 | only 190 | onto 191 | or 192 | other 193 | others 194 | otherwise 195 | our 196 | ours 197 | ourselves 198 | out 199 | over 200 | own 201 | part 202 | per 203 | perhaps 204 | please 205 | put 206 | rather 207 | re 208 | same 209 | see 210 | seem 211 | seemed 212 | seeming 213 | seems 214 | serious 215 | several 216 | she 217 | should 218 | show 219 | side 220 | since 221 | sincere 222 | six 223 | sixty 224 | so 225 | some 226 | somehow 227 | someone 228 | something 229 | sometime 230 | sometimes 231 | somewhere 232 | still 233 | such 234 | system 235 | take 236 | ten 237 | than 238 | that 239 | the 240 | their 241 | them 242 | themselves 243 | then 244 | thence 245 | there 246 | thereafter 247 | thereby 248 | therefore 249 | therein 250 | thereupon 251 | these 252 | they 253 | thickv 254 | thin 255 | third 256 | this 257 | those 258 | though 259 | three 260 | through 261 | throughout 262 | thru 263 | thus 264 | to 265 | together 266 | too 267 | top 268 | toward 269 | towards 270 | twelve 271 | twenty 272 | two 273 | un 274 | under 275 | until 276 | up 277 | upon 278 | us 279 | very 280 | via 281 | was 282 | we 283 | well 284 | were 285 | what 286 | whatever 287 | when 288 | whence 289 | whenever 290 | where 291 | whereafter 292 | whereas 293 | whereby 294 | wherein 295 | whereupon 296 | wherever 297 | whether 298 | which 299 | while 300 | whither 301 | who 302 | whoever 303 | whole 304 | whom 305 | whose 306 | why 307 | will 308 | with 309 | within 310 | without 311 | would 312 | yet 313 | you 314 | your 315 | yours 316 | yourself 317 | yourselves 318 | -------------------------------------------------------------------------------- /py_entitymatching/utils/validation_helper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import six 3 | 4 | 5 | def type_name(expected_type): 6 | messages = { 7 | six.string_types: 'string', 8 | pd.DataFrame: 'pandas dataframe', 9 | list: 'list', 10 | bool: 'bool', 11 | int: 'int', 12 | dict: 'dictionary', 13 | str: 'str', 14 | } 15 | return messages[expected_type] 16 | 17 | 18 | def validate_object_type(input_object, expected_type, error_prefix='Input object'): 19 | if not isinstance(input_object, expected_type): 20 | error_message = '{0}: {1} 
\nis not of type {2}'.format(error_prefix, str(input_object), type_name(expected_type)) 21 | raise AssertionError(error_message) 22 | 23 | 24 | def validate_subclass(input_class, expected_class, error_prefix='Input class'): 25 | if not issubclass(input_class, expected_class): 26 | error_message = f'{error_prefix}: {str(input_class)}\nis not a subclass of {str(expected_class)}' 27 | raise AssertionError(error_message) 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | ipython>=5.6 3 | matplotlib>=2.2.4 4 | PyPrind==2.9.8 5 | py-stringmatching>=0.2.1 6 | py-stringsimjoin>=0.3.0 7 | numpy 8 | scikit-learn>=0.22 9 | scipy>=1.3.2 10 | cloudpickle 11 | -------------------------------------------------------------------------------- /requirements.yml: -------------------------------------------------------------------------------- 1 | name: py_entitymatching_dev 2 | channels: 3 | - conda-forge 4 | - uwmagellan 5 | - defaults 6 | dependencies: 7 | - ipython == 5.6 8 | - matplotlib >= 2.2.4 9 | - setuptools 10 | - py_stringsimjoin >= 0.3.0 11 | - cloudpickle 12 | - pyparsing 13 | - scikit-learn >= 0.18 14 | - pyqt 15 | - py_stringmatching 16 | - requests 17 | # - xgboost 18 | --------------------------------------------------------------------------------
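
The tests in test_feature_tokenizers.py above pin down a contract worth spelling out: the tokenizers return a list of tokens for string input and pass NaN through unchanged for missing values. A minimal usage sketch of that behavior, assuming py_entitymatching is installed; the results noted in the comments are inferred from the assertions in the tests, not verified output.

import numpy as np
import pandas as pd
import py_entitymatching.feature.tokenizers as tok

# Delimiter tokenization: splits on the given character.
print(tok.tok_delim('data science', ' '))     # ['data', 'science']

# Q-gram tokenization: overlapping character 3-grams.
print(tok.tok_qgram('data science', 3))       # a list of 3-grams

# Missing values are not tokenized; NaN is passed through.
print(pd.isnull(tok.tok_qgram(np.NaN, 3)))    # True
print(pd.isnull(tok.tok_delim(np.NaN, ' ')))  # True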
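
Likewise, test_sampler_single_table.py documents that sample_table returns a sample whose catalog metadata (key, ltable, rtable, and both foreign keys) matches the input candidate set. A sketch under two assumptions: the CSV paths are placeholders for tables with accompanying metadata files, and the third positional argument toggles sampling with replacement (inferred from the tests, not confirmed by a docstring).

import py_entitymatching.catalog.catalog_manager as cm
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.sampler.single_table import sample_table

A = read_csv_metadata('A.csv')                      # key comes from the metadata file
B = read_csv_metadata('B.csv', key='ID')
C = read_csv_metadata('C.csv', ltable=A, rtable=B)  # candidate set linking A and B

D = sample_table(C, 10, True)                       # 10 rows, with replacement
assert len(D) == 10
assert cm.get_ltable(D) is cm.get_ltable(C)         # catalog metadata is carried over
assert cm.get_fk_ltable(D) == cm.get_fk_ltable(C)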
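
The raises decorator in tests/utils.py is the suite's stand-in for nose.tools.raises: it wraps a unittest method in assertRaises, so the test passes exactly when the body raises the named exception. A sketch of how a new test module would use it; the absolute import path is an assumption based on the package layout shown above.

import unittest
from py_entitymatching.tests.utils import raises

class RaisesDemoTestCases(unittest.TestCase):
    @raises(ZeroDivisionError)
    def test_divide_by_zero(self):
        # Passes because the body raises the declared exception;
        # any other outcome fails the test.
        1 / 0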
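
Finally, validation_helper.py above has two distinct failure modes that test_validation_helper.py exercises: a type mismatch for a known type raises AssertionError with a readable message, while a type missing from the type_name lookup table raises KeyError before the message can be built. A short sketch, assuming only the code shown above; MyType is a hypothetical class introduced for illustration.

import pandas as pd
from py_entitymatching.utils import validation_helper as vh

vh.validate_object_type(pd.DataFrame(), pd.DataFrame)  # passes silently

try:
    vh.validate_object_type(123, str, error_prefix='Candidate key')
except AssertionError as e:
    print(e)  # 'Candidate key: 123 \nis not of type str'

class MyType(object):
    pass

try:
    vh.validate_object_type(123, MyType)  # MyType has no entry in type_name
except KeyError:
    print('no display name registered for MyType')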